diff --git a/packages/CLI11/.all-contributorsrc b/packages/CLI11/.all-contributorsrc
index 7de67853e063fa7ea75d853506c59ea571d265ef..14ba0211e135005b41ac3ea91451afd3ee599f3f 100644
--- a/packages/CLI11/.all-contributorsrc
+++ b/packages/CLI11/.all-contributorsrc
@@ -476,8 +476,226 @@
       "contributions": [
         "code"
       ]
+    },
+    {
+      "login": "trokhymchuk",
+      "name": "Artem Trokhymchuk ",
+      "avatar_url": "https://avatars.githubusercontent.com/u/66204814?v=4",
+      "profile": "https://github.com/trokhymchuk",
+      "contributions": [
+        "code"
+      ]
+    },
+    {
+      "login": "dherrera-fb",
+      "name": "dherrera-fb",
+      "avatar_url": "https://avatars.githubusercontent.com/u/89840711?v=4",
+      "profile": "https://github.com/dherrera-fb",
+      "contributions": [
+        "code"
+      ]
+    },
+    {
+      "login": "VolkerChristian",
+      "name": "Volker Christian",
+      "avatar_url": "https://avatars.githubusercontent.com/u/18554540?v=4",
+      "profile": "https://github.com/VolkerChristian",
+      "contributions": [
+        "code"
+      ]
+    },
+    {
+      "login": "thewtex",
+      "name": "Matt McCormick",
+      "avatar_url": "https://avatars.githubusercontent.com/u/25432?v=4",
+      "profile": "https://www.mmmccormick.com/",
+      "contributions": [
+        "code"
+      ]
+    },
+    {
+      "login": "polistern",
+      "name": "polistern",
+      "avatar_url": "https://avatars.githubusercontent.com/u/55511995?v=4",
+      "profile": "http://polistern.i2p/",
+      "contributions": [
+        "code"
+      ]
+    },
+    {
+      "login": "andreasxp",
+      "name": "Andrey Zhukov",
+      "avatar_url": "https://avatars.githubusercontent.com/u/28830446?v=4",
+      "profile": "https://github.com/andreasxp",
+      "contributions": [
+        "code"
+      ]
+    },
+    {
+      "login": "SherlockInSpace",
+      "name": "Ryan Sherlock",
+      "avatar_url": "https://avatars.githubusercontent.com/u/5507786?v=4",
+      "profile": "https://github.com/SherlockInSpace",
+      "contributions": [
+        "code"
+      ]
+    },
+    {
+      "login": "Krzmbrzl",
+      "name": "Robert Adam",
+      "avatar_url": "https://avatars.githubusercontent.com/u/12751591?v=4",
+      "profile": "https://github.com/Krzmbrzl",
+      "contributions": [
+        "code"
+      ]
+    },
+    {
+      "login": "RangeMachine",
+      "name": "RangeMachine",
+      "avatar_url": "https://avatars.githubusercontent.com/u/11577601?v=4",
+      "profile": "https://github.com/RangeMachine",
+      "contributions": [
+        "code"
+      ]
+    },
+    {
+      "login": "ptheywood",
+      "name": "Peter Heywood",
+      "avatar_url": "https://avatars.githubusercontent.com/u/628937?v=4",
+      "profile": "http://ptheywood.uk/",
+      "contributions": [
+        "code"
+      ]
+    },
+    {
+      "login": "peterh",
+      "name": "Peter Harris",
+      "avatar_url": "https://avatars.githubusercontent.com/u/79339?v=4",
+      "profile": "https://github.com/peterh",
+      "contributions": [
+        "code"
+      ]
+    },
+    {
+      "login": "PeteAudinate",
+      "name": "PeteAudinate",
+      "avatar_url": "https://avatars.githubusercontent.com/u/99274874?v=4",
+      "profile": "https://github.com/PeteAudinate",
+      "contributions": [
+        "code"
+      ]
+    },
+    {
+      "login": "captainurist",
+      "name": "captainurist",
+      "avatar_url": "https://avatars.githubusercontent.com/u/73941350?v=4",
+      "profile": "https://github.com/captainurist",
+      "contributions": [
+        "code"
+      ]
+    },
+    {
+      "login": "djerius",
+      "name": "djerius",
+      "avatar_url": "https://avatars.githubusercontent.com/u/196875?v=4",
+      "profile": "https://github.com/djerius",
+      "contributions": [
+        "code"
+      ]
+    },
+    {
+      "login": "shameekganguly",
+      "name": "shameekganguly",
+      "avatar_url": "https://avatars.githubusercontent.com/u/2412842?v=4",
+      "profile": "https://github.com/shameekganguly",
+      "contributions": [
+        "code"
+      ]
+    },
+    {
+      "login": "ayum",
+      "name": "ayum",
+      "avatar_url": "https://avatars.githubusercontent.com/u/6747040?v=4",
+      "profile": "https://github.com/ayum",
+      "contributions": [
+        "code"
+      ]
+    },
+    {
+      "login": "BenjaminBeichler",
+      "name": "Benjamin Beichler",
+      "avatar_url": "https://avatars.githubusercontent.com/u/1441492?v=4",
+      "profile": "https://github.com/BenjaminBeichler",
+      "contributions": [
+        "code"
+      ]
+    },
+    {
+      "login": "DarkWingMcQuack",
+      "name": "DarkWingMcQuack",
+      "avatar_url": "https://avatars.githubusercontent.com/u/38857302?v=4",
+      "profile": "https://github.com/DarkWingMcQuack",
+      "contributions": [
+        "code"
+      ]
+    },
+    {
+      "login": "eli-schwartz",
+      "name": "Eli Schwartz",
+      "avatar_url": "https://avatars.githubusercontent.com/u/6551424?v=4",
+      "profile": "https://github.com/eli-schwartz",
+      "contributions": [
+        "code"
+      ]
+    },
+    {
+      "login": "bruxisma",
+      "name": "Izzy Muerte",
+      "avatar_url": "https://avatars.githubusercontent.com/u/63051?v=4",
+      "profile": "https://izzys.casa/",
+      "contributions": [
+        "code"
+      ]
+    },
+    {
+      "login": "j-rivero",
+      "name": "Jose Luis Rivero",
+      "avatar_url": "https://avatars.githubusercontent.com/u/2098802?v=4",
+      "profile": "https://github.com/j-rivero",
+      "contributions": [
+        "code"
+      ]
+    },
+    {
+      "login": "looopTools",
+      "name": "Lars Nielsen",
+      "avatar_url": "https://avatars.githubusercontent.com/u/1943536?v=4",
+      "profile": "https://github.com/looopTools",
+      "contributions": [
+        "code"
+      ]
+    },
+    {
+      "login": "cetius",
+      "name": "Marcin Ropa",
+      "avatar_url": "https://avatars.githubusercontent.com/u/6552472?v=4",
+      "profile": "https://github.com/cetius",
+      "contributions": [
+        "code"
+      ]
+    },
+    {
+      "login": "nathanielhourt",
+      "name": "Nathaniel Hourt",
+      "avatar_url": "https://avatars.githubusercontent.com/u/271977?v=4",
+      "profile": "https://github.com/nathanielhourt",
+      "contributions": [
+        "code"
+      ]
     }
   ],
+  "contributorsSortAlphabetically": true,
   "contributorsPerLine": 7,
-  "skipCi": true
+  "skipCi": true,
+  "commitType": "docs"
 }
diff --git a/packages/CLI11/.ci/azure-cmake-new.yml b/packages/CLI11/.ci/azure-cmake-new.yml
new file mode 100644
index 0000000000000000000000000000000000000000..56a2fb4d99627c1fa37ffc453586d7b030eb2e4f
--- /dev/null
+++ b/packages/CLI11/.ci/azure-cmake-new.yml
@@ -0,0 +1,17 @@
+steps:
+  # Note that silkeh/clang does not include ca-certificates, so check the shasum for verification
+  - bash: |
+      wget --no-check-certificate "https://cmake.org/files/v3.28/cmake-3.28.0-linux-x86_64.tar.gz"
+      echo "898f0b5ca6e2ea5286998e97bd33f030d7d09f18ca4b88be661fdfbad5dadd88  cmake-3.28.0-linux-x86_64.tar.gz" | shasum -sca 256
+    displayName: Download CMake
+
+  - task: ExtractFiles@1
+    inputs:
+      archiveFilePatterns: "cmake*.tar.gz"
+      destinationFolder: "cmake_program"
+      displayName: Extract CMake
+
+  - bash:
+      echo
+      "##vso[task.prependpath]$(Build.SourcesDirectory)/cmake_program/cmake-3.28.0-linux-x86_64/bin"
+    displayName: Add CMake to PATH
diff --git a/packages/CLI11/.clang-tidy b/packages/CLI11/.clang-tidy
index 82450d1b5a1047190bb6a7f02db561c582542fd8..727b76525e3bf579e222bd7945799ab07338ae86 100644
--- a/packages/CLI11/.clang-tidy
+++ b/packages/CLI11/.clang-tidy
@@ -6,6 +6,7 @@
 # modernize-avoid-c-arrays trips up in TEMPLATE_TEST_CASE catch macro
 # modernize-return-braced-init-list triggers on lambdas ?
 # modernize-make-unique requires C++14
+# modernize-type_traits requires C++17
 # readability-avoid-const-params-in-decls Affected by the pre-compile split
 
 Checks: |
@@ -37,6 +38,8 @@ Checks: |
   -modernize-concat-nested-namespaces,
   -modernize-return-braced-init-list,
   -modernize-make-unique,
+  -modernize-type-traits,
+  -modernize-macro-to-enum,
   *performance*,
   -performance-unnecessary-value-param,
   -performance-inefficient-string-concatenation,
diff --git a/packages/CLI11/.codacy.yml b/packages/CLI11/.codacy.yml
new file mode 100644
index 0000000000000000000000000000000000000000..03a1e522b2fbd30ea61dd173265d74cfbc0ff71e
--- /dev/null
+++ b/packages/CLI11/.codacy.yml
@@ -0,0 +1,18 @@
+---
+engines:
+  rubocop:
+    enabled: true
+  duplication:
+    enabled: true
+  metrics:
+    enabled: true
+  coverage:
+    enabled: false
+languages:
+
+exclude_paths:
+  - "fuzz/**/*"
+  - "fuzz/*"
+  - "scripts/**/*"
+  - "scripts/*"
+  - "**.md"
diff --git a/packages/CLI11/.github/actions/quick_cmake/action.yml b/packages/CLI11/.github/actions/quick_cmake/action.yml
index 8359fb0d01575c5225ab9bc09e657e58e76e4099..d2b3825fe5331e63b232dbe45750ef7c31410373 100644
--- a/packages/CLI11/.github/actions/quick_cmake/action.yml
+++ b/packages/CLI11/.github/actions/quick_cmake/action.yml
@@ -1,5 +1,5 @@
 name: Quick CMake config
-description: "Runs CMake 3.4+ (if already setup)"
+description: "Runs CMake 3.5+ (if already setup)"
 inputs:
   args:
     description: "Other arguments"
@@ -13,7 +13,7 @@ runs:
   using: composite
   steps:
     - name: CMake ${{ inputs.cmake-version }}
-      uses: jwlawson/actions-setup-cmake@v1.13
+      uses: jwlawson/actions-setup-cmake@v1.14
       with:
         cmake-version: "${{ inputs.cmake-version }}"
     - run: |
diff --git a/packages/CLI11/.github/dependabot.yml b/packages/CLI11/.github/dependabot.yml
index f265d88d94951cbeb868e505d142cfe6232955c4..40aaf098ff5d105969fe3bb9323e238acc20b3e4 100644
--- a/packages/CLI11/.github/dependabot.yml
+++ b/packages/CLI11/.github/dependabot.yml
@@ -7,3 +7,7 @@ updates:
       interval: "weekly"
     target-branch: "main"
     open-pull-requests-limit: 10
+    groups:
+      actions:
+        patterns:
+          - "*"
diff --git a/packages/CLI11/.github/workflows/build.yml b/packages/CLI11/.github/workflows/build.yml
index 625502c6b33751730894729cabff3dff86f23e2f..57d54dc16694a3a0e51aa144343d8edb334bbea2 100644
--- a/packages/CLI11/.github/workflows/build.yml
+++ b/packages/CLI11/.github/workflows/build.yml
@@ -13,11 +13,11 @@ jobs:
     name: Single header
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
         with:
           submodules: true
 
-      - uses: actions/setup-python@v4
+      - uses: actions/setup-python@v5
         with:
           python-version: "3.x"
 
@@ -39,12 +39,12 @@ jobs:
       - name: Copy file to main folder
         run: cp build/include/CLI11.hpp CLI11.hpp
 
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
         with:
           name: CLI11.hpp
           path: CLI11.hpp
 
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
         with:
           name: CLI11-Source
           path: CLI11-Source
diff --git a/packages/CLI11/.github/workflows/docs.yml b/packages/CLI11/.github/workflows/docs.yml
new file mode 100644
index 0000000000000000000000000000000000000000..6b79b352fa297d26ee77103e039d1fed4ed6c4d8
--- /dev/null
+++ b/packages/CLI11/.github/workflows/docs.yml
@@ -0,0 +1,91 @@
+name: Docs
+
+on:
+  workflow_dispatch:
+  pull_request:
+  push:
+    branches:
+      - main
+
+permissions:
+  contents: read
+  pages: write
+  id-token: write
+
+concurrency:
+  group: "pages"
+  cancel-in-progress: false
+
+jobs:
+  apidocs:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: mattnotmitt/doxygen-action@v1
+        with:
+          doxyfile-path: ./docs/Doxyfile
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: api-docs
+          path: html
+
+  gitbook:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-node@v4
+        with:
+          node-version: 16
+
+      - uses: awalsh128/cache-apt-pkgs-action@v1
+        with:
+          packages: calibre calibre-bin libxss1 libasound2
+          version: 1
+
+      - name: Install JS requirements
+        working-directory: book
+        run: |
+          npm install
+
+      - name: Build book
+        working-directory: book
+        run: |
+          npx gitbook build . public
+          npx gitbook pdf . public/cli11.pdf
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: gitbook
+          path: book/public
+
+  pages:
+    runs-on: ubuntu-latest
+    needs: [apidocs, gitbook]
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+    if: >
+      success()
+      && github.ref == 'refs/heads/main'
+      && github.repository == 'CLIUtils/CLI11'
+    steps:
+      - uses: actions/configure-pages@v4
+        id: pages
+
+      - uses: actions/download-artifact@v4
+        with:
+          name: api-docs
+          path: _site
+
+      - uses: actions/download-artifact@v4
+        with:
+          name: gitbook
+          path: _site/book
+
+      - uses: actions/upload-pages-artifact@v3
+
+      - uses: actions/deploy-pages@v4
+        id: deployment
diff --git a/packages/CLI11/.github/workflows/fuzz.yml b/packages/CLI11/.github/workflows/fuzz.yml
index 75d161b38a1f06cb91752e7b02e4f81e58071cca..413f150fbb80ac78d7832a1538d377f5251c7c56 100644
--- a/packages/CLI11/.github/workflows/fuzz.yml
+++ b/packages/CLI11/.github/workflows/fuzz.yml
@@ -16,7 +16,7 @@ jobs:
     name: quickfuzz1
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
         with:
           fetch-depth: 0
 
@@ -35,12 +35,12 @@ jobs:
       - name: Build
         run: cmake --build build -j4
 
-      - name: Test
+      - name: Test_app
         run: |
           cd build
           make QUICK_CLI11_APP_FUZZ
 
-      - name: Test2
+      - name: Test_file
         run: |
           cd build
           make QUICK_CLI11_FILE_FUZZ
@@ -48,7 +48,7 @@ jobs:
 
       - name: artifacts
         if: failure()
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: file_failure
           path: ./build/fuzz/cli11_*_fail_artifact.txt
diff --git a/packages/CLI11/.github/workflows/tests.yml b/packages/CLI11/.github/workflows/tests.yml
index 2ab5a66692d9f433ba88aa12fcfff05c679a1e99..460d2ebd36469c75d4b8bedc5398d38314138bdd 100644
--- a/packages/CLI11/.github/workflows/tests.yml
+++ b/packages/CLI11/.github/workflows/tests.yml
@@ -10,6 +10,9 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
 
+env:
+  CTEST_OUTPUT_ON_FAILURE: "1"
+
 jobs:
   coverage:
     name: Coverage
@@ -19,7 +22,7 @@ jobs:
         std: ["11", "14", "17", "20"]
         precompile: ["ON", "OFF"]
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
         with:
           fetch-depth: 0
 
@@ -35,7 +38,7 @@ jobs:
           cmake -S . -B build \
             -DCMAKE_CXX_STANDARD=${{matrix.std}} \
             -DCLI11_SINGLE_FILE_TESTS=OFF \
-            -DCLI11_EXAMPLES=OFF \
+            -DCLI11_BUILD_EXAMPLES=OFF \
             -DCLI11_PRECOMPILED=${{matrix.precompile}} \
             -DCMAKE_BUILD_TYPE=Coverage
 
@@ -55,15 +58,38 @@ jobs:
       - uses: codecov/codecov-action@v3
         with:
           files: build/coverage.info
-          fail_ci_if_error: true
           functionalities: fixes
 
+  catch2-3:
+    name: Catch 2 3.x
+    runs-on: macos-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Get Catch 2
+        run: brew install catch2
+
+      - name: Configure
+        run: |
+          cmake -S . -B build \
+            -DCMAKE_CXX_STANDARD=14 \
+            -DCLI11_SINGLE_FILE_TESTS=OFF \
+            -DCLI11_BUILD_EXAMPLES=OFF \
+            -DCLI11_PRECOMPILED=ON
+
+      - name: Build
+        run: cmake --build build -j4
+
+      - name: Test
+        run: cmake --build build --target test
+
+
   clang-tidy:
     name: Clang-Tidy
     runs-on: ubuntu-latest
-    container: silkeh/clang:14
+    container: silkeh/clang:17
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: Configure
         run: >
@@ -81,7 +107,7 @@ jobs:
     steps:
       - name: Add build tools
         run: apt-get update && apt-get install -y wget git cmake
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
         with:
           submodules: true
       - name: Configure
@@ -92,11 +118,11 @@ jobs:
   cuda12-build:
     name: CUDA 12 build only
     runs-on: ubuntu-latest
-    container: nvidia/cuda:12.1.0-devel-ubuntu22.04
+    container: nvidia/cuda:12.3.1-devel-ubuntu22.04
     steps:
       - name: Add build tools
         run: apt-get update && apt-get install -y wget git cmake
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
         with:
           submodules: true
       - name: Configure
@@ -106,9 +132,9 @@ jobs:
 
   boost-build:
     name: Boost build
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
         with:
           submodules: true
       - name: Add boost
@@ -127,7 +153,7 @@ jobs:
     name: Meson build
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: Prepare commands
         run: |
@@ -140,16 +166,62 @@ jobs:
       - name: Build
         run: meson compile -C build-meson
 
-  cmake-config-ubuntu-1804:
-    name: CMake config check (Ubuntu 18.04)
-    runs-on: ubuntu-18.04
+  install:
+    name: install tests
+    runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
+        with:
+          submodules: true
+      - name: Configure
+        run: cmake -S . -B build -DCLI11_INSTALL_PACKAGE_TESTS=ON -DCMAKE_INSTALL_PREFIX=/home/runner/work/install
+      - name: Build
+        run: cmake --build build  -j2
+      - name: install
+        run: cmake --install build
+      - name: Run tests
+        run: ctest --output-on-failure -L Packaging
+        working-directory: build
 
-      - name: Check CMake 3.4
+  install-precompiled:
+    name: install tests precompiled
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
         with:
-          cmake-version: "3.4"
-        uses: ./.github/actions/quick_cmake
+          submodules: true
+      - name: Configure
+        run: cmake -S . -B build -DCLI11_INSTALL_PACKAGE_TESTS=ON -DCMAKE_INSTALL_PREFIX=/home/runner/work/install -DCLI11_PRECOMPILED=ON
+      - name: Build
+        run: cmake --build build  -j2
+      - name: install
+        run: cmake --install build
+      - name: Run tests
+        run: ctest --output-on-failure -L Packaging
+        working-directory: build
+
+  install-single_file:
+    name: install tests single file
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: true
+      - name: Configure
+        run: cmake -S . -B build -DCLI11_INSTALL_PACKAGE_TESTS=ON -DCMAKE_INSTALL_PREFIX=/home/runner/work/install -DCLI11_SINGLE_FILE=ON
+      - name: Build
+        run: cmake --build build  -j2
+      - name: install
+        run: cmake --install build
+      - name: Run tests
+        run: ctest --output-on-failure -L Packaging
+        working-directory: build
+
+  cmake-config-ubuntu-2004:
+    name: CMake config check (Ubuntu 20.04)
+    runs-on: ubuntu-20.04
+    steps:
+      - uses: actions/checkout@v4
 
       - name: Check CMake 3.5
         uses: ./.github/actions/quick_cmake
@@ -187,17 +259,10 @@ jobs:
           cmake-version: "3.10"
         if: success() || failure()
 
-  cmake-config-ubuntu-2004:
-    name: CMake config check (Ubuntu 20.04)
-    runs-on: ubuntu-20.04
-    steps:
-      - uses: actions/checkout@v3
-
-      - name: Check CMake 3.11 (full)
+      - name: Check CMake 3.11
         uses: ./.github/actions/quick_cmake
         with:
           cmake-version: "3.11"
-          args: -DCLI11_SANITIZERS=ON -DCLI11_BUILD_EXAMPLES_JSON=ON
         if: success() || failure()
 
       - name: Check CMake 3.12
@@ -216,6 +281,7 @@ jobs:
         uses: ./.github/actions/quick_cmake
         with:
           cmake-version: "3.14"
+          args: -DCLI11_SANITIZERS=ON -DCLI11_BUILD_EXAMPLES_JSON=ON
         if: success() || failure()
 
       - name: Check CMake 3.15
@@ -234,7 +300,7 @@ jobs:
     name: CMake config check (Ubuntu 22.04)
     runs-on: ubuntu-22.04
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: Check CMake 3.17
         uses: ./.github/actions/quick_cmake
@@ -272,22 +338,40 @@ jobs:
           cmake-version: "3.22"
         if: success() || failure()
 
-      - name: Check CMake 3.23 
+      - name: Check CMake 3.23
         uses: ./.github/actions/quick_cmake
         with:
           cmake-version: "3.23"
         if: success() || failure()
 
-      - name: Check CMake 3.24 (full)
+      - name: Check CMake 3.24
         uses: ./.github/actions/quick_cmake
         with:
           cmake-version: "3.24"
-          args: -DCLI11_SANITIZERS=ON -DCLI11_BUILD_EXAMPLES_JSON=ON
         if: success() || failure()
 
-      - name: Check CMake 3.25 (full)
+      - name: Check CMake 3.25
         uses: ./.github/actions/quick_cmake
         with:
           cmake-version: "3.25"
+        if: success() || failure()
+
+      - name: Check CMake 3.26 (full)
+        uses: ./.github/actions/quick_cmake
+        with:
+          cmake-version: "3.26"
+          args: -DCLI11_SANITIZERS=ON -DCLI11_BUILD_EXAMPLES_JSON=ON
+        if: success() || failure()
+
+      - name: Check CMake 3.27
+        uses: ./.github/actions/quick_cmake
+        with:
+          cmake-version: "3.27"
+        if: success() || failure()
+
+      - name: Check CMake 3.28 (full)
+        uses: ./.github/actions/quick_cmake
+        with:
+          cmake-version: "3.28"
           args: -DCLI11_SANITIZERS=ON -DCLI11_BUILD_EXAMPLES_JSON=ON
         if: success() || failure()
diff --git a/packages/CLI11/.gitrepo b/packages/CLI11/.gitrepo
index 925d8dccefb3a338b66e4a57300b3d1e983bf56a..3c6f249cfbead662dc9b5c1abfb35929ace55549 100644
--- a/packages/CLI11/.gitrepo
+++ b/packages/CLI11/.gitrepo
@@ -6,7 +6,7 @@
 [subrepo]
 	remote = git@github.com:CLIUtils/CLI11.git
 	branch = main
-	commit = 784fa3ebd387e63feef41d174f587bbe4cfec4da
-	parent = 164bbb3a73dc902c29aa7ccabfaba50cefa6345d
+	commit = 20de8b73bbbabaf2f94dd07c4ece8ff3590af531
+	parent = 65932507ed66da9e0ada0d9294a336690069148a
 	method = merge
 	cmdver = 0.4.6
diff --git a/packages/CLI11/.pre-commit-config.yaml b/packages/CLI11/.pre-commit-config.yaml
index febf04dd23f4f524d646fefc2b093a35792dfdbe..0d271c913ccb1f4b1c13118ac7a4bcb48dfc56a5 100644
--- a/packages/CLI11/.pre-commit-config.yaml
+++ b/packages/CLI11/.pre-commit-config.yaml
@@ -5,12 +5,12 @@ ci:
 
 repos:
   - repo: https://github.com/psf/black
-    rev: 23.3.0
+    rev: 23.12.1
     hooks:
       - id: black
 
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.4.0
+    rev: v4.5.0
     hooks:
       - id: check-added-large-files
       - id: check-case-conflict
@@ -26,7 +26,7 @@ repos:
       - id: debug-statements
 
   - repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: v16.0.3
+    rev: v17.0.6
     hooks:
       - id: clang-format
         types_or: [c++, c, cuda]
@@ -38,7 +38,7 @@ repos:
         additional_dependencies: [pyyaml]
 
   - repo: https://github.com/pre-commit/mirrors-prettier
-    rev: "v3.0.0-alpha.9-for-vscode"
+    rev: "v4.0.0-alpha.8"
     hooks:
       - id: prettier
         types_or: [yaml, markdown, html, css, scss, javascript, json]
@@ -87,7 +87,7 @@ repos:
         exclude: .pre-commit-config.yaml
 
   - repo: https://github.com/codespell-project/codespell
-    rev: v2.2.4
+    rev: v2.2.6
     hooks:
       - id: codespell
         args: ["-L", "atleast,ans,doub,inout"]
diff --git a/packages/CLI11/CLI11.hpp.in b/packages/CLI11/CLI11.hpp.in
index edc16bb15f6468287a4410d9c14fa258e0356003..e84a20aac305056b0bc9cdc3ccc0a64bc8912f2c 100644
--- a/packages/CLI11/CLI11.hpp.in
+++ b/packages/CLI11/CLI11.hpp.in
@@ -5,7 +5,7 @@
 // This is a standalone header file generated by MakeSingleHeader.py in CLI11/scripts
 // from: {git}
 //
-// CLI11 {version} Copyright (c) 2017-2023 University of Cincinnati, developed by Henry
+// CLI11 {version} Copyright (c) 2017-2024 University of Cincinnati, developed by Henry
 // Schreiner under NSF AWARD 1414736. All rights reserved.
 //
 // Redistribution and use in source and binary forms of CLI11, with or without
diff --git a/packages/CLI11/CMakeLists.txt b/packages/CLI11/CMakeLists.txt
index cdc2011d8800199f8c752523446391905eba808a..512553f89e2e30e764d280a35e50ffcd088d7c68 100644
--- a/packages/CLI11/CMakeLists.txt
+++ b/packages/CLI11/CMakeLists.txt
@@ -1,15 +1,15 @@
-cmake_minimum_required(VERSION 3.4)
-# Note: this is a header only library. If you have an older CMake than 3.4,
+cmake_minimum_required(VERSION 3.5)
+# Note: this is a header only library. If you have an older CMake than 3.5,
 # just add the CLI11/include directory and that's all you need to do.
 
-# Make sure users don't get warnings on a tested (3.4 to 3.24) version
+# Make sure users don't get warnings on a tested (3.5 to 3.28) version
 # of CMake. For most of the policies, the new version is better (hence the change).
-# We don't use the 3.4...3.24 syntax because of a bug in an older MSVC's
+# We don't use the 3.5...3.28 syntax because of a bug in an older MSVC's
 # built-in and modified CMake 3.11
-if(${CMAKE_VERSION} VERSION_LESS 3.25)
+if(${CMAKE_VERSION} VERSION_LESS 3.28)
   cmake_policy(VERSION ${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION})
 else()
-  cmake_policy(VERSION 3.25)
+  cmake_policy(VERSION 3.28)
 endif()
 
 set(VERSION_REGEX "#define CLI11_VERSION[ \t]+\"(.+)\"")
@@ -81,7 +81,7 @@ option(CLI11_WARNINGS_AS_ERRORS "Turn all warnings into errors (for CI)")
 option(CLI11_SINGLE_FILE "Generate a single header file")
 option(CLI11_PRECOMPILED "Generate a precompiled static library instead of a header-only" OFF)
 cmake_dependent_option(CLI11_SANITIZERS "Download the sanitizers CMake config" OFF
-                       "NOT CMAKE_VERSION VERSION_LESS 3.11" OFF)
+                       "NOT CMAKE_VERSION VERSION_LESS 3.13" OFF)
 
 cmake_dependent_option(CLI11_BUILD_DOCS "Build CLI11 documentation" ON "${build-docs}" OFF)
 
@@ -198,39 +198,41 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME AND EXISTS "${CMAKE_CURRENT_SOURCE_D
 endif()
 
 # Packaging support
-set(CPACK_PACKAGE_VENDOR "github.com/CLIUtils/CLI11")
-set(CPACK_PACKAGE_CONTACT "https://${CPACK_PACKAGE_VENDOR}")
-set(CPACK_PACKAGE_VERSION_MAJOR ${PROJECT_VERSION_MAJOR}) # Automatic in CMake 3.12+
-set(CPACK_PACKAGE_VERSION_MINOR ${PROJECT_VERSION_MINOR}) # Automatic in CMake 3.12+
-set(CPACK_PACKAGE_VERSION_PATCH ${PROJECT_VERSION_PATCH}) # Automatic in CMake 3.12+
-set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Command line parser with simple and intuitive interface")
-set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
-set(CPACK_RESOURCE_FILE_README "${CMAKE_CURRENT_SOURCE_DIR}/README.md")
-set(CPACK_PACKAGE_DESCRIPTION_FILE "${CMAKE_CURRENT_SOURCE_DIR}/CLI11.CPack.Description.txt")
-set(CPACK_SOURCE_GENERATOR "TGZ;ZIP")
-
-# CPack collects *everything* except what's listed here.
-set(CPACK_SOURCE_IGNORE_FILES
-    /.git
-    /dist
-    /.*build.*
-    /\\\\.DS_Store
-    /.*\\\\.egg-info
-    /var
-    /azure-pipelines.yml
-    /.ci
-    /docs
-    /examples
-    /test_package
-    /book
-    /.travis.yml
-    .swp
-    /.all-contributorsrc
-    /.appveyor.yml
-    /.pre-commit.*yaml)
-
-set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE "all")
-set(CPACK_DEBIAN_COMPRESSION_TYPE "xz")
-set(CPACK_DEBIAN_PACKAGE_NAME "libcli11-dev")
-
-include(CPack)
+if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
+  set(CPACK_PACKAGE_VENDOR "github.com/CLIUtils/CLI11")
+  set(CPACK_PACKAGE_CONTACT "https://${CPACK_PACKAGE_VENDOR}")
+  set(CPACK_PACKAGE_VERSION_MAJOR ${PROJECT_VERSION_MAJOR}) # Automatic in CMake 3.12+
+  set(CPACK_PACKAGE_VERSION_MINOR ${PROJECT_VERSION_MINOR}) # Automatic in CMake 3.12+
+  set(CPACK_PACKAGE_VERSION_PATCH ${PROJECT_VERSION_PATCH}) # Automatic in CMake 3.12+
+  set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Command line parser with simple and intuitive interface")
+  set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
+  set(CPACK_RESOURCE_FILE_README "${CMAKE_CURRENT_SOURCE_DIR}/README.md")
+  set(CPACK_PACKAGE_DESCRIPTION_FILE "${CMAKE_CURRENT_SOURCE_DIR}/CLI11.CPack.Description.txt")
+  set(CPACK_SOURCE_GENERATOR "TGZ;ZIP")
+
+  # CPack collects *everything* except what's listed here.
+  set(CPACK_SOURCE_IGNORE_FILES
+      /.git
+      /dist
+      /.*build.*
+      /\\\\.DS_Store
+      /.*\\\\.egg-info
+      /var
+      /azure-pipelines.yml
+      /.ci
+      /docs
+      /examples
+      /test_package
+      /book
+      /.travis.yml
+      .swp
+      /.all-contributorsrc
+      /.appveyor.yml
+      /.pre-commit.*yaml)
+
+  set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE "all")
+  set(CPACK_DEBIAN_COMPRESSION_TYPE "xz")
+  set(CPACK_DEBIAN_PACKAGE_NAME "libcli11-dev")
+
+  include(CPack)
+endif()
diff --git a/packages/CLI11/CPPLINT.cfg b/packages/CLI11/CPPLINT.cfg
index 40bec3714ed59c7101b1d168b9ccdb087a611132..e1d27d9f5f770900c0b6c7fd48744f01cf2ddceb 100644
--- a/packages/CLI11/CPPLINT.cfg
+++ b/packages/CLI11/CPPLINT.cfg
@@ -9,6 +9,7 @@ filter=-readability/nolint  # Conflicts with clang-tidy
 filter=-readability/check  # Catch uses CHECK(a == b) (Tests only)
 filter=-build/namespaces  # Currently using it for one test (Tests only)
 filter=-runtime/references  # Requires fundamental change of API, don't see need for this
+filter=-runtime/string  # Requires not using static const strings which makes thing really annoying
 filter=-whitespace/blank_line  # Unnecessarily strict with blank lines that otherwise help with readability
 filter=-whitespace/indent  # Requires strange 3-space indent of private/protected/public markers
 filter=-whitespace/parens,-whitespace/braces  # Conflict with clang-format
diff --git a/packages/CLI11/LICENSE b/packages/CLI11/LICENSE
index aae15855ecf563d8e62cd6458a0c99b96be4e5b1..715be0bb493bea2a438781b4e9b4dfeaf4fc7634 100644
--- a/packages/CLI11/LICENSE
+++ b/packages/CLI11/LICENSE
@@ -1,4 +1,4 @@
-CLI11 2.2 Copyright (c) 2017-2023 University of Cincinnati, developed by Henry
+CLI11 2.2 Copyright (c) 2017-2024 University of Cincinnati, developed by Henry
 Schreiner under NSF AWARD 1414736. All rights reserved.
 
 Redistribution and use in source and binary forms of CLI11, with or without
diff --git a/packages/CLI11/README.md b/packages/CLI11/README.md
index bd9e58db71e02dfe4a479c283783b37bf281193f..bb75fd4928b405f87cbd60c29cf50b7a8befec3b 100644
--- a/packages/CLI11/README.md
+++ b/packages/CLI11/README.md
@@ -24,39 +24,42 @@ set with a simple and intuitive interface.
 
 ## Table of Contents
 
-- [Background](#background)
-  - [Introduction](#introduction)
-  - [Why write another CLI parser?](#why-write-another-cli-parser)
-  - [Other parsers](#other-parsers)
-  - [Features not supported by this library](#features-not-supported-by-this-library)
-- [Install](#install)
-- [Usage](#usage)
-  - [Adding options](#adding-options)
-    - [Option types](#option-types)
-    - [Example](#example)
-    - [Option options](#option-options)
-    - [Validators](#validators)
-      - [Transforming Validators](#transforming-validators)
-      - [Validator operations](#validator-operations)
-      - [Custom Validators](#custom-validators)
-      - [Querying Validators](#querying-validators)
-      - [Getting Results](#getting-results)
-  - [Subcommands](#subcommands)
-    - [Subcommand options](#subcommand-options)
-    - [Option groups](#option-groups)
-    - [Callbacks](#callbacks)
-  - [Configuration file](#configuration-file)
-  - [Inheriting defaults](#inheriting-defaults)
-  - [Formatting](#formatting)
-  - [Subclassing](#subclassing)
-  - [How it works](#how-it-works)
-  - [Unicode support](#unicode-support)
-  - [Utilities](#utilities)
-  - [Other libraries](#other-libraries)
-- [API](#api)
-- [Examples](#Examples)
-- [Contribute](#contribute)
-- [License](#license)
+- [CLI11: Command line parser for C++11](#cli11-command-line-parser-for-c11)
+  - [Table of Contents](#table-of-contents)
+  - [Background](#background)
+    - [Introduction](#introduction)
+    - [Why write another CLI parser?](#why-write-another-cli-parser)
+    - [Other parsers](#other-parsers)
+    - [Features not supported by this library](#features-not-supported-by-this-library)
+  - [Install](#install)
+  - [Usage](#usage)
+    - [Adding options](#adding-options)
+      - [Option types](#option-types)
+      - [Example](#example)
+      - [Option options](#option-options)
+      - [Validators](#validators)
+        - [Transforming Validators](#transforming-validators)
+        - [Validator operations](#validator-operations)
+        - [Custom Validators](#custom-validators)
+        - [Querying Validators](#querying-validators)
+      - [Getting results](#getting-results)
+    - [Subcommands](#subcommands)
+      - [Subcommand options](#subcommand-options)
+      - [Callbacks](#callbacks)
+      - [Option groups](#option-groups)
+    - [Configuration file](#configuration-file)
+    - [Inheriting defaults](#inheriting-defaults)
+    - [Formatting](#formatting)
+    - [Subclassing](#subclassing)
+    - [How it works](#how-it-works)
+    - [Unicode support](#unicode-support)
+      - [Note on using Unicode paths](#note-on-using-unicode-paths)
+    - [Utilities](#utilities)
+    - [Other libraries](#other-libraries)
+  - [API](#api)
+  - [Examples](#examples)
+  - [Contribute](#contribute)
+  - [License](#license)
 
 Features that were added in the last released minor version are marked with
 "πŸ†•". Features only available in main are marked with "🚧".
@@ -168,7 +171,8 @@ this library:
 
 ## Install
 
-To use, there are several methods:
+To use, the most common methods are described here additional methods and
+details are available at [installation][]:
 
 - All-in-one local header: Copy `CLI11.hpp` from the [most recent
   release][github releases] into your include directory, and you are set. This
@@ -178,95 +182,9 @@ To use, there are several methods:
   separately.
 - All-in-one global header: Like above, but copying the file to a shared folder
   location like `/opt/CLI11`. Then, the C++ include path has to be extended to
-  point at this folder. With CMake, use `include_directories(/opt/CLI11)`
-- Local headers and target: Use `CLI/*.hpp` files. You could check out the
-  repository as a git submodule, for example. With CMake, you can use
-  `add_subdirectory` and the `CLI11::CLI11` interface target when linking. If
-  not using a submodule, you must ensure that the copied files are located
-  inside the same tree directory than your current project, to prevent an error
-  with CMake and `add_subdirectory`.
-- Global headers: Use `CLI/*.hpp` files stored in a shared folder. You could
-  check out the git repository to a system-wide folder, for example `/opt/`.
-  With CMake, you could add to the include path via:
-
-```bash
-if(NOT DEFINED CLI11_DIR)
-set (CLI11_DIR "/opt/CLI11" CACHE STRING "CLI11 git repository")
-endif()
-include_directories(${CLI11_DIR}/include)
-```
-
-And then in the source code (adding several headers might be needed to prevent
-linker errors):
-
-```cpp
-#include "CLI/App.hpp"
-#include "CLI/Formatter.hpp"
-#include "CLI/Config.hpp"
-```
-
-- Global headers and target: configuring and installing the project is required
-  for linking CLI11 to your project in the same way as you would do with any
-  other external library. With CMake, this step allows using
-  `find_package(CLI11 CONFIG REQUIRED)` and then using the `CLI11::CLI11` target
-  when linking. If `CMAKE_INSTALL_PREFIX` was changed during install to a
-  specific folder like `/opt/CLI11`, then you have to pass
-  `-DCLI11_DIR=/opt/CLI11` when building your current project. You can also use
-  [Conan.io][conan-link] or [Hunter][]. (These are just conveniences to allow
-  you to use your favorite method of managing packages; it's just header only so
-  including the correct path and using C++11 is all you really need.)
-- Via FetchContent in CMake 3.14+ (or 3.11+ with more work): you can add this
-  with fetch-content, then use the `CLI11::CLI11` target as above, and CMake
-  will download the project in the configure stage:
-
-```cmake
-include(FetchContent)
-FetchContent_Declare(
-  cli11
-  GIT_REPOSITORY https://github.com/CLIUtils/CLI11
-  GIT_TAG        v2.2.0
-)
-
-FetchContent_MakeAvailable(cli11)
-```
-
-It is highly recommended that you use the git hash for `GIT_TAG` instead of a
-tag or branch, as that will both be more secure, as well as faster to
-reconfigure - CMake will not have to reach out to the internet to see if the tag
-moved. You can also download just the single header file from the releases using
-`file(DOWNLOAD`.
-
-To build the tests, checkout the repository and use CMake:
-
-```bash
-cmake -S . -B build
-cmake --build build
-CTEST_OUTPUT_ON_FAILURE=1 cmake --build build -t test
-```
-
-<details><summary>Note: Special instructions for GCC 8</summary><p>
-
-If you are using GCC 8 and using it in C++17 mode with CLI11. CLI11 makes use of
-the `<filesystem>` header if available, but specifically for this compiler, the
-`filesystem` library is separate from the standard library and needs to be
-linked separately. So it is available but CLI11 doesn't use it by default.
-
-Specifically `libstdc++fs` needs to be added to the linking list and
-`CLI11_HAS_FILESYSTEM=1` has to be defined. Then the filesystem variant of the
-Validators could be used on GCC 8. GCC 9+ does not have this issue so the
-`<filesystem>` is used by default.
-
-There may also be other cases where a specific library needs to be linked.
-
-Defining `CLI11_HAS_FILESYSTEM=0` which will remove the usage and hence any
-linking issue.
-
-In some cases certain clang compilations may require linking against `libc++fs`.
-These situations have not been encountered so the specific situations requiring
-them are unknown yet.
-
-</p></details>
-</br>
+  point at this folder. With CMake 3.5+, use `include_directories(/opt/CLI11)`
+- For other methods including using CMake or vcpkg and some specific
+  instructions for GCC 8 or WASI see [installation][].
 
 ## Usage
 
@@ -276,22 +194,27 @@ To set up, add options, and run, your main function will look something like
 this:
 
 ```cpp
-int main() {
+int main(int argc, char** argv) {
     CLI::App app{"App description"};
+    argv = app.ensure_utf8(argv);
 
     std::string filename = "default";
     app.add_option("-f,--file", filename, "A help string");
 
-    CLI11_PARSE(app);
+    CLI11_PARSE(app, argc, argv);
     return 0;
 }
 ```
 
+For more information about 🚧`ensure_utf8` the section on
+[Unicode support](#unicode-support) below. The 🚧`ensure_utf8` function is only
+available in main currently and not in a release.
+
 <details><summary>Note: If you don't like macros, this is what that macro expands to: (click to expand)</summary><p>
 
 ```cpp
 try {
-    app.parse();
+    app.parse(argc, argv);
 } catch (const CLI::ParseError &e) {
     return app.exit(e);
 }
@@ -304,25 +227,6 @@ inside the catch block; for example, help flags intentionally short-circuit all
 other processing for speed and to ensure required options and the like do not
 interfere.
 
-</p></details>
-
-<details><summary>Note: Why are argc and argv not used? (click to expand)</summary><p>
-
-`argc` and `argv` may contain incorrect information on Windows when unicode text
-is passed in. Check out a section on [unicode support](#unicode-support) below.
-
-If this is not a concern, you can explicitly pass `argc` and `argv` from main or
-from an external preprocessor of CLI arguments to `parse`:
-
-```cpp
-int main(int argc, char** argv) {
-    // ...
-
-    CLI11_PARSE(app, argc, argv);
-    return 0;
-}
-```
-
 </p></details>
 </br>
 
@@ -512,10 +416,11 @@ Before parsing, you can set the following options:
   option. Options can be removed from the excludes list with
   `->remove_excludes(opt)`
 - `->envname(name)`: Gets the value from the environment if present and not
-  passed on the command line.
+  passed on the command line. 🚧 The value must also pass any validators to be
+  used.
 - `->group(name)`: The help group to put the option in. No effect for positional
-  options. Defaults to `"Options"`. `""` will not show up in the help print
-  (hidden).
+  options. Defaults to `"Options"`. Options given an empty string will not show
+  up in the help print (hidden).
 - `->ignore_case()`: Ignore the case on the command line (also works on
   subcommands, does not affect arguments).
 - `->ignore_underscore()`: Ignore any underscores in the options names (also
@@ -546,8 +451,8 @@ Before parsing, you can set the following options:
   This equivalent to calling `->delimiter(delim)` and `->join()`. Valid values
   are `CLI::MultiOptionPolicy::Throw`, `CLI::MultiOptionPolicy::Throw`,
   `CLI::MultiOptionPolicy::TakeLast`, `CLI::MultiOptionPolicy::TakeFirst`,
-  `CLI::MultiOptionPolicy::Join`, `CLI::MultiOptionPolicy::TakeAll`, and
-  `CLI::MultiOptionPolicy::Sum` πŸ†•.
+  `CLI::MultiOptionPolicy::Join`, `CLI::MultiOptionPolicy::TakeAll`,
+  `CLI::MultiOptionPolicy::Sum` πŸ†•, and `CLI::MultiOptionPolicy::Reverse` 🚧.
 - `->check(std::string(const std::string &), validator_name="",validator_description="")`:
   Define a check function. The function should return a non empty string with
   the error message if the check fails
@@ -797,6 +702,17 @@ filters on the key values is performed.
   `CLI::FileOnDefaultPath(default_path, false)`. This allows multiple paths to
   be chained using multiple transform calls.
 
+- `CLI::EscapedString`: 🚧 can be used to process an escaped string. The
+  processing is equivalent to that used for TOML config files, see
+  [TOML strings](https://toml.io/en/v1.0.0#string). With 2 notable exceptions.
+  \` can also be used as a literal string notation, and it also allows binary
+  string notation see
+  [binary strings](https://cliutils.github.io/CLI11/book/chapters/config.html).
+  The escaped string processing will remove outer quotes if present, `"` will
+  indicate a string with potential escape sequences, `'` and \` will indicate a
+  literal string and the quotes removed but no escape sequences will be
+  processed. This is the same escape processing as used in config files.
+
 ##### Validator operations
 
 Validators are copyable and have a few operations that can be performed on them
@@ -901,12 +817,15 @@ not used in performance critical code:
 
 ### Subcommands
 
-Subcommands are supported, and can be nested infinitely. To add a subcommand,
-call the `add_subcommand` method with a name and an optional description. This
-gives a pointer to an `App` that behaves just like the main app, and can take
-options or further subcommands. Add `->ignore_case()` to a subcommand to allow
-any variation of caps to also be accepted. `->ignore_underscore()` is similar,
-but for underscores. Children inherit the current setting from the parent. You
+Subcommands are keywords that invoke a new set of options and features. For
+example, the `git` command has a long series of subcommands, like `add` and
+`commit`. Each can have its own options and implementations. Subcommands are
+supported in CLI11, and can be nested infinitely. To add a subcommand, call the
+`add_subcommand` method with a name and an optional description. This gives a
+pointer to an `App` that behaves just like the main app, and can take options or
+further subcommands. Add `->ignore_case()` to a subcommand to allow any
+variation of caps to also be accepted. `->ignore_underscore()` is similar, but
+for underscores. Children inherit the current setting from the parent. You
 cannot add multiple matching subcommand names at the same level (including
 `ignore_case` and `ignore_underscore`).
 
@@ -965,9 +884,11 @@ through the `add_subcommand` method have the same restrictions as option names.
 - `--subcommand1.subsub.f val` (short form nested subcommand option)
 
 The use of dot notation in this form is equivalent `--subcommand.long <args>` =>
-`subcommand --long <args> ++`. Nested subcommands also work `"sub1.subsub"`
-would trigger the subsub subcommand in `sub1`. This is equivalent to "sub1
-subsub"
+`subcommand --long <args> ++`. Nested subcommands also work `sub1.subsub` would
+trigger the subsub subcommand in `sub1`. This is equivalent to "sub1 subsub".
+Quotes around the subcommand names are permitted 🚧 following the TOML standard
+for such specification. This includes allowing escape sequences. For example
+`"subcommand".'f'` or `"subcommand.with.dots".arg1 = value`.
 
 #### Subcommand options
 
@@ -1093,10 +1014,10 @@ option_groups. These are:
 - `.prefix_command()`: Like `allow_extras`, but stop immediately on the first
   unrecognized item. It is ideal for allowing your app or subcommand to be a
   "prefix" to calling another app.
-- `.usage(message)`: Replace text to appear at the start of the help string
+- `.usage(message)`: 🚧 Replace text to appear at the start of the help string
   after description.
-- `.usage(std::string())`: Set a callback to generate a string that will appear
-  at the start of the help string after description.
+- `.usage(std::string())`: 🚧 Set a callback to generate a string that will
+  appear at the start of the help string after description.
 - `.footer(message)`: Set text to appear at the bottom of the help string.
 - `.footer(std::string())`: Set a callback to generate a string that will appear
   at the end of the help string.
@@ -1109,17 +1030,19 @@ option_groups. These are:
   returns a pointer to the created option. Expands subcommands.
 - `.failure_message(func)`: Set the failure message function. Two provided:
   `CLI::FailureMessage::help` and `CLI::FailureMessage::simple` (the default).
-- `.group(name)`: Set a group name, defaults to `"Subcommands"`. Setting `""`
-  will be hide the subcommand.
+- `.group(name)`: Set a group name, defaults to `"Subcommands"`. Setting an
+  empty string for the name will be hide the subcommand.
 - `[option_name]`: retrieve a const pointer to an option given by `option_name`
   for Example `app["--flag1"]` will get a pointer to the option for the
   "--flag1" value, `app["--flag1"]->as<bool>()` will get the results of the
   command line for a flag. The operation will throw an exception if the option
   name is not valid.
 
-> Note: if you have a fixed number of required positional options, that will
-> match before subcommand names. `{}` is an empty filter function, and any
-> positional argument will match before repeated subcommand names.
+> [!NOTE]
+>
+> If you have a fixed number of required positional options, that will match
+> before subcommand names. `{}` is an empty filter function, and any positional
+> argument will match before repeated subcommand names.
 
 #### Callbacks
 
@@ -1299,18 +1222,22 @@ option (like `set_help_flag`). Setting a configuration option is special. If it
 is present, it will be read along with the normal command line arguments. The
 file will be read if it exists, and does not throw an error unless `required` is
 `true`. Configuration files are in [TOML][] format by default, though the
-default reader can also accept files in INI format as well. It should be noted
-that CLI11 does not contain a full TOML parser but can read strings from most
-TOML file and run them through the CLI11 parser. Other formats can be added by
-an adept user, some variations are available through customization points in the
-default formatter. An example of a TOML file:
+default reader can also accept files in INI format as well. The config reader
+can read most aspects of TOML files including strings both literal 🚧 and with
+potential escape sequences 🚧, digit separators 🚧, and multi-line strings 🚧,
+and run them through the CLI11 parser. Other formats can be added by an adept
+user, some variations are available through customization points in the default
+formatter. An example of a TOML file:
 
 ```toml
 # Comments are supported, using a #
 # The default section is [default], case insensitive
 
 value = 1
+value2 = 123_456 # a string with separators
 str = "A string"
+str2 = "A string\nwith new lines"
+str3 = 'A literal "string"'
 vector = [1,2,3]
 str_vector = ["one","two","and three"]
 
@@ -1318,6 +1245,7 @@ str_vector = ["one","two","and three"]
 [subcommand]
 in_subcommand = Wow
 sub.subcommand = true
+"sub"."subcommand2" = "string_value"
 ```
 
 or equivalently in INI format
@@ -1476,8 +1404,6 @@ need to convert to. Some examples of some new parsers for `complex<double>` that
 support all of the features of a standard `add_options` call are in
 [one of the tests](./tests/NewParseTest.cpp). A simpler example is shown below:
 
-#### Example
-
 ```cpp
 app.add_option("--fancy-count", [](std::vector<std::string> val){
     std::cout << "This option was given " << val.size() << " times." << std::endl;
@@ -1501,49 +1427,76 @@ CLI11 supports Unicode and wide strings as defined in the
 
 When using the command line on Windows with unicode arguments, your `main`
 function may already receive broken Unicode. Parsing `argv` at that point will
-not give you a correct string. To fix this, you have three options:
-
-1. If you pass unmodified command-line arguments to CLI11, call `app.parse()`
-   instead of `app.parse(argc, argv)` (or `CLI11_PARSE(app)` instead of
-   `CLI11_PARSE(app, argc, argv)`). The library will find correct arguments
-   itself.
-
-   ```cpp
-   int main() {
-       CLI::App app;
-       // ...
-       CLI11_PARSE(app);
-   }
-   ```
-
-2. Get correct arguments with which the program was originally executed using
-   provided functions: `CLI::argc()` and `CLI::argv()`. These two methods are
-   the only cross-platform ways of handling unicode correctly.
-
-   ```cpp
-   int main() {
-       CLI::App app;
-       // ...
-       CLI11_PARSE(app, CLI::argc(), CLI::argv());
-   }
-   ```
-
-3. Use the Windows-only non-standard `wmain` function, which accepts
-   `wchar_t *argv[]` instead of `char* argv[]`. Parsing this will allow CLI to
-   convert wide strings to UTF-8 without losing information.
-
-   ```cpp
-   int wmain(int argc, wchar_t *argv[]) {
-       CLI::App app;
-       // ...
-       CLI11_PARSE(app, argc, argv);
-   }
-   ```
-
-4. Retrieve arguments yourself by using Windows APIs like
-   [`CommandLineToArgvW`](https://learn.microsoft.com/en-us/windows/win32/api/shellapi/nf-shellapi-commandlinetoargvw)
-   and pass them to CLI. This is what the library is doing under the hood in
-   `CLI::argv()`.
+not give you a correct string. To fix this, you have three good options and two
+bad ones:
+
+1\. Replace `argv` with `app.ensure_utf8(argv)` before any arguments are parsed.
+`ensure_utf8` will do nothing on systems where `argv` is already in UTF-8 (Such
+as Linux or macOS) and return `argv` unmodified. On Windows, it will discard
+`argv` and replace it with a correctly decoded array or arguments from win32
+API.
+
+```cpp
+int main(int argc, char** argv) {
+    CLI::App app;
+    argv = app.ensure_utf8(argv);  // new argv memory is held by app
+    // ...
+    CLI11_PARSE(app, argc, argv);
+}
+```
+
+2\. If you pass unmodified command-line arguments to CLI11, call `app.parse()`
+instead of `app.parse(argc, argv)` (or `CLI11_PARSE(app)` instead of
+`CLI11_PARSE(app, argc, argv)`). The library will find correct arguments by
+itself.
+
+> [!NOTE]
+>
+> This approach may not work on weird OS configurations, such as when the
+> `/proc` dir is missing on Linux systems (see also
+> [#845](https://github.com/CLIUtils/CLI11/issues/845)).
+>
+> ```cpp
+> int main() {
+>     CLI::App app;
+>     // ...
+>     CLI11_PARSE(app);
+> }
+> ```
+
+3\. Get correct arguments with which the program was originally executed using
+provided functions: `CLI::argc()` and `CLI::argv()`. These three methods are the
+only cross-platform ways of handling unicode correctly.
+
+```cpp
+int main() {
+    CLI::App app;
+    // ...
+    CLI11_PARSE(app, CLI::argc(), CLI::argv());
+}
+```
+
+<details><summary>Bad options (click to expand)</summary><p>
+
+4\. Use the Windows-only non-standard `wmain` function, which accepts
+`wchar_t *argv[]` instead of `char* argv[]`. Parsing this will allow CLI to
+convert wide strings to UTF-8 without losing information.
+
+```cpp
+int wmain(int argc, wchar_t *argv[]) {
+    CLI::App app;
+    // ...
+    CLI11_PARSE(app, argc, argv);
+}
+```
+
+5\. Retrieve arguments yourself by using Windows APIs like
+[`CommandLineToArgvW`](https://learn.microsoft.com/en-us/windows/win32/api/shellapi/nf-shellapi-commandlinetoargvw)
+and pass them to CLI. This is what the library is doing under the hood in
+`CLI::argv()`.
+
+</p></details>
+</br>
 
 The library provides functions to convert between UTF-8 and wide strings:
 
@@ -1648,6 +1601,9 @@ GitBook][gitbook].
 Several short examples of different features are included in the repository. A
 brief description of each is included here
 
+- [arg_capture](https://github.com/CLIUtils/CLI11/blob/main/examples/arg_capture.cpp):
+  Example of capturing all remaining arguments after a specific option, using
+  subcommand and prefix_command() with an alias.
 - [callback_passthrough](https://github.com/CLIUtils/CLI11/blob/main/examples/callback_passthrough.cpp):
   Example of directly passing remaining arguments through to a callback function
   which generates a CLI11 application based on existing arguments.
@@ -1727,75 +1683,107 @@ thanks to all the contributors
 <!-- prettier-ignore-start -->
 <!-- markdownlint-disable -->
 <table>
-  <tr>
-    <td align="center"><a href="http://iscinumpy.gitlab.io"><img src="https://avatars1.githubusercontent.com/u/4616906?v=4" width="100px;" alt=""/><br /><sub><b>Henry Schreiner</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/issues?q=author%3Ahenryiii" title="Bug reports">πŸ›</a> <a href="https://github.com/CLIUtils/CLI11/commits?author=henryiii" title="Documentation">πŸ“–</a> <a href="https://github.com/CLIUtils/CLI11/commits?author=henryiii" title="Code">πŸ’»</a></td>
-    <td align="center"><a href="https://github.com/phlptp"><img src="https://avatars0.githubusercontent.com/u/20667153?v=4" width="100px;" alt=""/><br /><sub><b>Philip Top</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/issues?q=author%3Aphlptp" title="Bug reports">πŸ›</a> <a href="https://github.com/CLIUtils/CLI11/commits?author=phlptp" title="Documentation">πŸ“–</a> <a href="https://github.com/CLIUtils/CLI11/commits?author=phlptp" title="Code">πŸ’»</a></td>
-    <td align="center"><a href="https://www.linkedin.com/in/cbachhuber/"><img src="https://avatars0.githubusercontent.com/u/27212661?v=4" width="100px;" alt=""/><br /><sub><b>Christoph Bachhuber</b></sub></a><br /><a href="#example-cbachhuber" title="Examples">πŸ’‘</a> <a href="https://github.com/CLIUtils/CLI11/commits?author=cbachhuber" title="Code">πŸ’»</a></td>
-    <td align="center"><a href="https://lambdafu.net/"><img src="https://avatars1.githubusercontent.com/u/1138455?v=4" width="100px;" alt=""/><br /><sub><b>Marcus Brinkmann</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/issues?q=author%3Alambdafu" title="Bug reports">πŸ›</a> <a href="https://github.com/CLIUtils/CLI11/commits?author=lambdafu" title="Code">πŸ’»</a></td>
-    <td align="center"><a href="https://github.com/SkyToGround"><img src="https://avatars1.githubusercontent.com/u/58835?v=4" width="100px;" alt=""/><br /><sub><b>Jonas Nilsson</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/issues?q=author%3ASkyToGround" title="Bug reports">πŸ›</a> <a href="https://github.com/CLIUtils/CLI11/commits?author=SkyToGround" title="Code">πŸ’»</a></td>
-    <td align="center"><a href="https://github.com/dvj"><img src="https://avatars2.githubusercontent.com/u/77217?v=4" width="100px;" alt=""/><br /><sub><b>Doug Johnston</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/issues?q=author%3Advj" title="Bug reports">πŸ›</a> <a href="https://github.com/CLIUtils/CLI11/commits?author=dvj" title="Code">πŸ’»</a></td>
-    <td align="center"><a href="http://lucas-czech.de"><img src="https://avatars0.githubusercontent.com/u/4741887?v=4" width="100px;" alt=""/><br /><sub><b>Lucas Czech</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/issues?q=author%3Alczech" title="Bug reports">πŸ›</a> <a href="https://github.com/CLIUtils/CLI11/commits?author=lczech" title="Code">πŸ’»</a></td>
-  </tr>
-  <tr>
-    <td align="center"><a href="https://github.com/rafiw"><img src="https://avatars3.githubusercontent.com/u/3034707?v=4" width="100px;" alt=""/><br /><sub><b>Rafi Wiener</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/issues?q=author%3Arafiw" title="Bug reports">πŸ›</a> <a href="https://github.com/CLIUtils/CLI11/commits?author=rafiw" title="Code">πŸ’»</a></td>
-    <td align="center"><a href="https://github.com/mensinda"><img src="https://avatars3.githubusercontent.com/u/3407462?v=4" width="100px;" alt=""/><br /><sub><b>Daniel Mensinger</b></sub></a><br /><a href="#platform-mensinda" title="Packaging/porting to new platform">πŸ“¦</a></td>
-    <td align="center"><a href="https://github.com/jbriales"><img src="https://avatars1.githubusercontent.com/u/6850478?v=4" width="100px;" alt=""/><br /><sub><b>Jesus Briales</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=jbriales" title="Code">πŸ’»</a> <a href="https://github.com/CLIUtils/CLI11/issues?q=author%3Ajbriales" title="Bug reports">πŸ›</a></td>
-    <td align="center"><a href="https://seanfisk.com/"><img src="https://avatars0.githubusercontent.com/u/410322?v=4" width="100px;" alt=""/><br /><sub><b>Sean Fisk</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/issues?q=author%3Aseanfisk" title="Bug reports">πŸ›</a> <a href="https://github.com/CLIUtils/CLI11/commits?author=seanfisk" title="Code">πŸ’»</a></td>
-    <td align="center"><a href="https://github.com/fpeng1985"><img src="https://avatars1.githubusercontent.com/u/87981?v=4" width="100px;" alt=""/><br /><sub><b>fpeng1985</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=fpeng1985" title="Code">πŸ’»</a></td>
-    <td align="center"><a href="https://github.com/almikhayl"><img src="https://avatars2.githubusercontent.com/u/6747040?v=4" width="100px;" alt=""/><br /><sub><b>almikhayl</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=almikhayl" title="Code">πŸ’»</a> <a href="#platform-almikhayl" title="Packaging/porting to new platform">πŸ“¦</a></td>
-    <td align="center"><a href="https://github.com/andrew-hardin"><img src="https://avatars0.githubusercontent.com/u/16496326?v=4" width="100px;" alt=""/><br /><sub><b>Andrew Hardin</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=andrew-hardin" title="Code">πŸ’»</a></td>
-  </tr>
-  <tr>
-    <td align="center"><a href="https://github.com/SX91"><img src="https://avatars2.githubusercontent.com/u/754754?v=4" width="100px;" alt=""/><br /><sub><b>Anton</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=SX91" title="Code">πŸ’»</a></td>
-    <td align="center"><a href="https://github.com/helmesjo"><img src="https://avatars0.githubusercontent.com/u/2501070?v=4" width="100px;" alt=""/><br /><sub><b>Fred HelmesjΓΆ</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/issues?q=author%3Ahelmesjo" title="Bug reports">πŸ›</a> <a href="https://github.com/CLIUtils/CLI11/commits?author=helmesjo" title="Code">πŸ’»</a></td>
-    <td align="center"><a href="https://github.com/skannan89"><img src="https://avatars0.githubusercontent.com/u/11918764?v=4" width="100px;" alt=""/><br /><sub><b>Kannan</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/issues?q=author%3Askannan89" title="Bug reports">πŸ›</a> <a href="https://github.com/CLIUtils/CLI11/commits?author=skannan89" title="Code">πŸ’»</a></td>
-    <td align="center"><a href="http://himvis.com"><img src="https://avatars3.githubusercontent.com/u/465279?v=4" width="100px;" alt=""/><br /><sub><b>Khem Raj</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=kraj" title="Code">πŸ’»</a></td>
-    <td align="center"><a href="https://www.mogigoma.com/"><img src="https://avatars2.githubusercontent.com/u/130862?v=4" width="100px;" alt=""/><br /><sub><b>Mak Kolybabi</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=mogigoma" title="Documentation">πŸ“–</a></td>
-    <td align="center"><a href="http://msoeken.github.io"><img src="https://avatars0.githubusercontent.com/u/1998245?v=4" width="100px;" alt=""/><br /><sub><b>Mathias Soeken</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=msoeken" title="Documentation">πŸ“–</a></td>
-    <td align="center"><a href="https://github.com/nathanhourt"><img src="https://avatars2.githubusercontent.com/u/271977?v=4" width="100px;" alt=""/><br /><sub><b>Nathan Hourt</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/issues?q=author%3Anathanhourt" title="Bug reports">πŸ›</a> <a href="https://github.com/CLIUtils/CLI11/commits?author=nathanhourt" title="Code">πŸ’»</a></td>
-  </tr>
-  <tr>
-    <td align="center"><a href="https://github.com/pleroux0"><img src="https://avatars2.githubusercontent.com/u/39619854?v=4" width="100px;" alt=""/><br /><sub><b>Paul le Roux</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=pleroux0" title="Code">πŸ’»</a> <a href="#platform-pleroux0" title="Packaging/porting to new platform">πŸ“¦</a></td>
-    <td align="center"><a href="https://github.com/chfast"><img src="https://avatars1.githubusercontent.com/u/573380?v=4" width="100px;" alt=""/><br /><sub><b>PaweΕ‚ Bylica</b></sub></a><br /><a href="#platform-chfast" title="Packaging/porting to new platform">πŸ“¦</a></td>
-    <td align="center"><a href="https://github.com/peterazmanov"><img src="https://avatars0.githubusercontent.com/u/15322318?v=4" width="100px;" alt=""/><br /><sub><b>Peter Azmanov</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=peterazmanov" title="Code">πŸ’»</a></td>
-    <td align="center"><a href="https://github.com/delpinux"><img src="https://avatars0.githubusercontent.com/u/35096584?v=4" width="100px;" alt=""/><br /><sub><b>StΓ©phane Del Pino</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=delpinux" title="Code">πŸ’»</a></td>
-    <td align="center"><a href="https://github.com/metopa"><img src="https://avatars2.githubusercontent.com/u/3974178?v=4" width="100px;" alt=""/><br /><sub><b>Viacheslav Kroilov</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=metopa" title="Code">πŸ’»</a></td>
-    <td align="center"><a href="http://cs.odu.edu/~ctsolakis"><img src="https://avatars0.githubusercontent.com/u/6725596?v=4" width="100px;" alt=""/><br /><sub><b>christos</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=ChristosT" title="Code">πŸ’»</a></td>
-    <td align="center"><a href="https://github.com/deining"><img src="https://avatars3.githubusercontent.com/u/18169566?v=4" width="100px;" alt=""/><br /><sub><b>deining</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=deining" title="Documentation">πŸ“–</a></td>
-  </tr>
-  <tr>
-    <td align="center"><a href="https://github.com/elszon"><img src="https://avatars0.githubusercontent.com/u/2971495?v=4" width="100px;" alt=""/><br /><sub><b>elszon</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=elszon" title="Code">πŸ’»</a></td>
-    <td align="center"><a href="https://github.com/ncihnegn"><img src="https://avatars3.githubusercontent.com/u/12021721?v=4" width="100px;" alt=""/><br /><sub><b>ncihnegn</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=ncihnegn" title="Code">πŸ’»</a></td>
-    <td align="center"><a href="https://github.com/nurelin"><img src="https://avatars3.githubusercontent.com/u/5276274?v=4" width="100px;" alt=""/><br /><sub><b>nurelin</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=nurelin" title="Code">πŸ’»</a></td>
-    <td align="center"><a href="https://github.com/ryan4729"><img src="https://avatars3.githubusercontent.com/u/40183301?v=4" width="100px;" alt=""/><br /><sub><b>ryan4729</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=ryan4729" title="Tests">⚠️</a></td>
-    <td align="center"><a href="https://izzys.casa"><img src="https://avatars0.githubusercontent.com/u/63051?v=4" width="100px;" alt=""/><br /><sub><b>Isabella Muerte</b></sub></a><br /><a href="#platform-slurps-mad-rips" title="Packaging/porting to new platform">πŸ“¦</a></td>
-    <td align="center"><a href="https://github.com/KOLANICH"><img src="https://avatars1.githubusercontent.com/u/240344?v=4" width="100px;" alt=""/><br /><sub><b>KOLANICH</b></sub></a><br /><a href="#platform-KOLANICH" title="Packaging/porting to new platform">πŸ“¦</a></td>
-    <td align="center"><a href="https://github.com/jgerityneurala"><img src="https://avatars2.githubusercontent.com/u/57360646?v=4" width="100px;" alt=""/><br /><sub><b>James Gerity</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=jgerityneurala" title="Documentation">πŸ“–</a></td>
-  </tr>
-  <tr>
-    <td align="center"><a href="https://github.com/jsoref"><img src="https://avatars0.githubusercontent.com/u/2119212?v=4" width="100px;" alt=""/><br /><sub><b>Josh Soref</b></sub></a><br /><a href="#tool-jsoref" title="Tools">πŸ”§</a></td>
-    <td align="center"><a href="https://github.com/geir-t"><img src="https://avatars3.githubusercontent.com/u/35292136?v=4" width="100px;" alt=""/><br /><sub><b>geir-t</b></sub></a><br /><a href="#platform-geir-t" title="Packaging/porting to new platform">πŸ“¦</a></td>
-    <td align="center"><a href="https://ondrejcertik.com/"><img src="https://avatars3.githubusercontent.com/u/20568?v=4" width="100px;" alt=""/><br /><sub><b>OndΕ™ej ČertΓ­k</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/issues?q=author%3Acertik" title="Bug reports">πŸ›</a></td>
-    <td align="center"><a href="http://sam.hocevar.net/"><img src="https://avatars2.githubusercontent.com/u/245089?v=4" width="100px;" alt=""/><br /><sub><b>Sam Hocevar</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=samhocevar" title="Code">πŸ’»</a></td>
-    <td align="center"><a href="http://www.ratml.org/"><img src="https://avatars0.githubusercontent.com/u/1845039?v=4" width="100px;" alt=""/><br /><sub><b>Ryan Curtin</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=rcurtin" title="Documentation">πŸ“–</a></td>
-    <td align="center"><a href="https://mbh.sh"><img src="https://avatars3.githubusercontent.com/u/20403931?v=4" width="100px;" alt=""/><br /><sub><b>Michael Hall</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=mbhall88" title="Documentation">πŸ“–</a></td>
-    <td align="center"><a href="https://github.com/ferdymercury"><img src="https://avatars3.githubusercontent.com/u/10653970?v=4" width="100px;" alt=""/><br /><sub><b>ferdymercury</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=ferdymercury" title="Documentation">πŸ“–</a></td>
-  </tr>
-  <tr>
-    <td align="center"><a href="https://github.com/jakoblover"><img src="https://avatars0.githubusercontent.com/u/14160441?v=4" width="100px;" alt=""/><br /><sub><b>Jakob Lover</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=jakoblover" title="Code">πŸ’»</a></td>
-    <td align="center"><a href="https://github.com/ZeeD26"><img src="https://avatars2.githubusercontent.com/u/2487468?v=4" width="100px;" alt=""/><br /><sub><b>Dominik Steinberger</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=ZeeD26" title="Code">πŸ’»</a></td>
-    <td align="center"><a href="https://github.com/dfleury2"><img src="https://avatars1.githubusercontent.com/u/4805384?v=4" width="100px;" alt=""/><br /><sub><b>D. Fleury</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=dfleury2" title="Code">πŸ’»</a></td>
-    <td align="center"><a href="https://github.com/dbarowy"><img src="https://avatars3.githubusercontent.com/u/573142?v=4" width="100px;" alt=""/><br /><sub><b>Dan Barowy</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=dbarowy" title="Documentation">πŸ“–</a></td>
-    <td align="center"><a href="https://github.com/paddy-hack"><img src="https://avatars.githubusercontent.com/u/6804372?v=4" width="100px;" alt=""/><br /><sub><b>Olaf Meeuwissen</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=paddy-hack" title="Code">πŸ’»</a></td>
-    <td align="center"><a href="https://github.com/dryleev"><img src="https://avatars.githubusercontent.com/u/83670813?v=4" width="100px;" alt=""/><br /><sub><b>dryleev</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=dryleev" title="Code">πŸ’»</a></td>
-    <td align="center"><a href="https://github.com/AnticliMaxtic"><img src="https://avatars.githubusercontent.com/u/43995389?v=4" width="100px;" alt=""/><br /><sub><b>Max</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=AnticliMaxtic" title="Code">πŸ’»</a></td>
-  </tr>
-  <tr>
-    <td align="center"><a href="https://profiles.sussex.ac.uk/p281168-alex-dewar/publications"><img src="https://avatars.githubusercontent.com/u/23149834?v=4" width="100px;" alt=""/><br /><sub><b>Alex Dewar</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=alexdewar" title="Code">πŸ’»</a></td>
-  </tr>
+  <tbody>
+    <tr>
+      <td align="center" valign="top" width="14.28%"><a href="https://profiles.sussex.ac.uk/p281168-alex-dewar/publications"><img src="https://avatars.githubusercontent.com/u/23149834?v=4?s=100" width="100px;" alt="Alex Dewar"/><br /><sub><b>Alex Dewar</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=alexdewar" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/andrew-hardin"><img src="https://avatars0.githubusercontent.com/u/16496326?v=4?s=100" width="100px;" alt="Andrew Hardin"/><br /><sub><b>Andrew Hardin</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=andrew-hardin" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/andreasxp"><img src="https://avatars.githubusercontent.com/u/28830446?v=4?s=100" width="100px;" alt="Andrey Zhukov"/><br /><sub><b>Andrey Zhukov</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=andreasxp" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/SX91"><img src="https://avatars2.githubusercontent.com/u/754754?v=4?s=100" width="100px;" alt="Anton"/><br /><sub><b>Anton</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=SX91" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/trokhymchuk"><img src="https://avatars.githubusercontent.com/u/66204814?v=4?s=100" width="100px;" alt="Artem Trokhymchuk "/><br /><sub><b>Artem Trokhymchuk </b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=trokhymchuk" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/BenjaminBeichler"><img src="https://avatars.githubusercontent.com/u/1441492?v=4?s=100" width="100px;" alt="Benjamin Beichler"/><br /><sub><b>Benjamin Beichler</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=BenjaminBeichler" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://www.linkedin.com/in/cbachhuber/"><img src="https://avatars0.githubusercontent.com/u/27212661?v=4?s=100" width="100px;" alt="Christoph Bachhuber"/><br /><sub><b>Christoph Bachhuber</b></sub></a><br /><a href="#example-cbachhuber" title="Examples">πŸ’‘</a> <a href="https://github.com/CLIUtils/CLI11/commits?author=cbachhuber" title="Code">πŸ’»</a></td>
+    </tr>
+    <tr>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/dfleury2"><img src="https://avatars1.githubusercontent.com/u/4805384?v=4?s=100" width="100px;" alt="D. Fleury"/><br /><sub><b>D. Fleury</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=dfleury2" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/dbarowy"><img src="https://avatars3.githubusercontent.com/u/573142?v=4?s=100" width="100px;" alt="Dan Barowy"/><br /><sub><b>Dan Barowy</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=dbarowy" title="Documentation">πŸ“–</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/mensinda"><img src="https://avatars3.githubusercontent.com/u/3407462?v=4?s=100" width="100px;" alt="Daniel Mensinger"/><br /><sub><b>Daniel Mensinger</b></sub></a><br /><a href="#platform-mensinda" title="Packaging/porting to new platform">πŸ“¦</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/DarkWingMcQuack"><img src="https://avatars.githubusercontent.com/u/38857302?v=4?s=100" width="100px;" alt="DarkWingMcQuack"/><br /><sub><b>DarkWingMcQuack</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=DarkWingMcQuack" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/ZeeD26"><img src="https://avatars2.githubusercontent.com/u/2487468?v=4?s=100" width="100px;" alt="Dominik Steinberger"/><br /><sub><b>Dominik Steinberger</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=ZeeD26" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/dvj"><img src="https://avatars2.githubusercontent.com/u/77217?v=4?s=100" width="100px;" alt="Doug Johnston"/><br /><sub><b>Doug Johnston</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/issues?q=author%3Advj" title="Bug reports">πŸ›</a> <a href="https://github.com/CLIUtils/CLI11/commits?author=dvj" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/eli-schwartz"><img src="https://avatars.githubusercontent.com/u/6551424?v=4?s=100" width="100px;" alt="Eli Schwartz"/><br /><sub><b>Eli Schwartz</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=eli-schwartz" title="Code">πŸ’»</a></td>
+    </tr>
+    <tr>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/helmesjo"><img src="https://avatars0.githubusercontent.com/u/2501070?v=4?s=100" width="100px;" alt="Fred HelmesjΓΆ"/><br /><sub><b>Fred HelmesjΓΆ</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/issues?q=author%3Ahelmesjo" title="Bug reports">πŸ›</a> <a href="https://github.com/CLIUtils/CLI11/commits?author=helmesjo" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="http://iscinumpy.gitlab.io"><img src="https://avatars1.githubusercontent.com/u/4616906?v=4?s=100" width="100px;" alt="Henry Schreiner"/><br /><sub><b>Henry Schreiner</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/issues?q=author%3Ahenryiii" title="Bug reports">πŸ›</a> <a href="https://github.com/CLIUtils/CLI11/commits?author=henryiii" title="Documentation">πŸ“–</a> <a href="https://github.com/CLIUtils/CLI11/commits?author=henryiii" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://izzys.casa"><img src="https://avatars0.githubusercontent.com/u/63051?v=4?s=100" width="100px;" alt="Isabella Muerte"/><br /><sub><b>Isabella Muerte</b></sub></a><br /><a href="#platform-slurps-mad-rips" title="Packaging/porting to new platform">πŸ“¦</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://izzys.casa/"><img src="https://avatars.githubusercontent.com/u/63051?v=4?s=100" width="100px;" alt="Izzy Muerte"/><br /><sub><b>Izzy Muerte</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=bruxisma" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/jakoblover"><img src="https://avatars0.githubusercontent.com/u/14160441?v=4?s=100" width="100px;" alt="Jakob Lover"/><br /><sub><b>Jakob Lover</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=jakoblover" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/jgerityneurala"><img src="https://avatars2.githubusercontent.com/u/57360646?v=4?s=100" width="100px;" alt="James Gerity"/><br /><sub><b>James Gerity</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=jgerityneurala" title="Documentation">πŸ“–</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/jbriales"><img src="https://avatars1.githubusercontent.com/u/6850478?v=4?s=100" width="100px;" alt="Jesus Briales"/><br /><sub><b>Jesus Briales</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=jbriales" title="Code">πŸ’»</a> <a href="https://github.com/CLIUtils/CLI11/issues?q=author%3Ajbriales" title="Bug reports">πŸ›</a></td>
+    </tr>
+    <tr>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/SkyToGround"><img src="https://avatars1.githubusercontent.com/u/58835?v=4?s=100" width="100px;" alt="Jonas Nilsson"/><br /><sub><b>Jonas Nilsson</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/issues?q=author%3ASkyToGround" title="Bug reports">πŸ›</a> <a href="https://github.com/CLIUtils/CLI11/commits?author=SkyToGround" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/j-rivero"><img src="https://avatars.githubusercontent.com/u/2098802?v=4?s=100" width="100px;" alt="Jose Luis Rivero"/><br /><sub><b>Jose Luis Rivero</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=j-rivero" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/jsoref"><img src="https://avatars0.githubusercontent.com/u/2119212?v=4?s=100" width="100px;" alt="Josh Soref"/><br /><sub><b>Josh Soref</b></sub></a><br /><a href="#tool-jsoref" title="Tools">πŸ”§</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/KOLANICH"><img src="https://avatars1.githubusercontent.com/u/240344?v=4?s=100" width="100px;" alt="KOLANICH"/><br /><sub><b>KOLANICH</b></sub></a><br /><a href="#platform-KOLANICH" title="Packaging/porting to new platform">πŸ“¦</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/skannan89"><img src="https://avatars0.githubusercontent.com/u/11918764?v=4?s=100" width="100px;" alt="Kannan"/><br /><sub><b>Kannan</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/issues?q=author%3Askannan89" title="Bug reports">πŸ›</a> <a href="https://github.com/CLIUtils/CLI11/commits?author=skannan89" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="http://himvis.com"><img src="https://avatars3.githubusercontent.com/u/465279?v=4?s=100" width="100px;" alt="Khem Raj"/><br /><sub><b>Khem Raj</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=kraj" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/looopTools"><img src="https://avatars.githubusercontent.com/u/1943536?v=4?s=100" width="100px;" alt="Lars Nielsen"/><br /><sub><b>Lars Nielsen</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=looopTools" title="Code">πŸ’»</a></td>
+    </tr>
+    <tr>
+      <td align="center" valign="top" width="14.28%"><a href="http://lucas-czech.de"><img src="https://avatars0.githubusercontent.com/u/4741887?v=4?s=100" width="100px;" alt="Lucas Czech"/><br /><sub><b>Lucas Czech</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/issues?q=author%3Alczech" title="Bug reports">πŸ›</a> <a href="https://github.com/CLIUtils/CLI11/commits?author=lczech" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://www.mogigoma.com/"><img src="https://avatars2.githubusercontent.com/u/130862?v=4?s=100" width="100px;" alt="Mak Kolybabi"/><br /><sub><b>Mak Kolybabi</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=mogigoma" title="Documentation">πŸ“–</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/cetius"><img src="https://avatars.githubusercontent.com/u/6552472?v=4?s=100" width="100px;" alt="Marcin Ropa"/><br /><sub><b>Marcin Ropa</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=cetius" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://lambdafu.net/"><img src="https://avatars1.githubusercontent.com/u/1138455?v=4?s=100" width="100px;" alt="Marcus Brinkmann"/><br /><sub><b>Marcus Brinkmann</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/issues?q=author%3Alambdafu" title="Bug reports">πŸ›</a> <a href="https://github.com/CLIUtils/CLI11/commits?author=lambdafu" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="http://msoeken.github.io"><img src="https://avatars0.githubusercontent.com/u/1998245?v=4?s=100" width="100px;" alt="Mathias Soeken"/><br /><sub><b>Mathias Soeken</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=msoeken" title="Documentation">πŸ“–</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://www.mmmccormick.com/"><img src="https://avatars.githubusercontent.com/u/25432?v=4?s=100" width="100px;" alt="Matt McCormick"/><br /><sub><b>Matt McCormick</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=thewtex" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/AnticliMaxtic"><img src="https://avatars.githubusercontent.com/u/43995389?v=4?s=100" width="100px;" alt="Max"/><br /><sub><b>Max</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=AnticliMaxtic" title="Code">πŸ’»</a></td>
+    </tr>
+    <tr>
+      <td align="center" valign="top" width="14.28%"><a href="https://mbh.sh"><img src="https://avatars3.githubusercontent.com/u/20403931?v=4?s=100" width="100px;" alt="Michael Hall"/><br /><sub><b>Michael Hall</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=mbhall88" title="Documentation">πŸ“–</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/nathanhourt"><img src="https://avatars2.githubusercontent.com/u/271977?v=4?s=100" width="100px;" alt="Nathan Hourt"/><br /><sub><b>Nathan Hourt</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/issues?q=author%3Anathanhourt" title="Bug reports">πŸ›</a> <a href="https://github.com/CLIUtils/CLI11/commits?author=nathanhourt" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/nathanielhourt"><img src="https://avatars.githubusercontent.com/u/271977?v=4?s=100" width="100px;" alt="Nathaniel Hourt"/><br /><sub><b>Nathaniel Hourt</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=nathanielhourt" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/paddy-hack"><img src="https://avatars.githubusercontent.com/u/6804372?v=4?s=100" width="100px;" alt="Olaf Meeuwissen"/><br /><sub><b>Olaf Meeuwissen</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=paddy-hack" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://ondrejcertik.com/"><img src="https://avatars3.githubusercontent.com/u/20568?v=4?s=100" width="100px;" alt="OndΕ™ej ČertΓ­k"/><br /><sub><b>OndΕ™ej ČertΓ­k</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/issues?q=author%3Acertik" title="Bug reports">πŸ›</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/pleroux0"><img src="https://avatars2.githubusercontent.com/u/39619854?v=4?s=100" width="100px;" alt="Paul le Roux"/><br /><sub><b>Paul le Roux</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=pleroux0" title="Code">πŸ’»</a> <a href="#platform-pleroux0" title="Packaging/porting to new platform">πŸ“¦</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/chfast"><img src="https://avatars1.githubusercontent.com/u/573380?v=4?s=100" width="100px;" alt="PaweΕ‚ Bylica"/><br /><sub><b>PaweΕ‚ Bylica</b></sub></a><br /><a href="#platform-chfast" title="Packaging/porting to new platform">πŸ“¦</a></td>
+    </tr>
+    <tr>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/PeteAudinate"><img src="https://avatars.githubusercontent.com/u/99274874?v=4?s=100" width="100px;" alt="PeteAudinate"/><br /><sub><b>PeteAudinate</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=PeteAudinate" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/peterazmanov"><img src="https://avatars0.githubusercontent.com/u/15322318?v=4?s=100" width="100px;" alt="Peter Azmanov"/><br /><sub><b>Peter Azmanov</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=peterazmanov" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/peterh"><img src="https://avatars.githubusercontent.com/u/79339?v=4?s=100" width="100px;" alt="Peter Harris"/><br /><sub><b>Peter Harris</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=peterh" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="http://ptheywood.uk/"><img src="https://avatars.githubusercontent.com/u/628937?v=4?s=100" width="100px;" alt="Peter Heywood"/><br /><sub><b>Peter Heywood</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=ptheywood" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/phlptp"><img src="https://avatars0.githubusercontent.com/u/20667153?v=4?s=100" width="100px;" alt="Philip Top"/><br /><sub><b>Philip Top</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/issues?q=author%3Aphlptp" title="Bug reports">πŸ›</a> <a href="https://github.com/CLIUtils/CLI11/commits?author=phlptp" title="Documentation">πŸ“–</a> <a href="https://github.com/CLIUtils/CLI11/commits?author=phlptp" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/rafiw"><img src="https://avatars3.githubusercontent.com/u/3034707?v=4?s=100" width="100px;" alt="Rafi Wiener"/><br /><sub><b>Rafi Wiener</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/issues?q=author%3Arafiw" title="Bug reports">πŸ›</a> <a href="https://github.com/CLIUtils/CLI11/commits?author=rafiw" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/RangeMachine"><img src="https://avatars.githubusercontent.com/u/11577601?v=4?s=100" width="100px;" alt="RangeMachine"/><br /><sub><b>RangeMachine</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=RangeMachine" title="Code">πŸ’»</a></td>
+    </tr>
+    <tr>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/Krzmbrzl"><img src="https://avatars.githubusercontent.com/u/12751591?v=4?s=100" width="100px;" alt="Robert Adam"/><br /><sub><b>Robert Adam</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=Krzmbrzl" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="http://www.ratml.org/"><img src="https://avatars0.githubusercontent.com/u/1845039?v=4?s=100" width="100px;" alt="Ryan Curtin"/><br /><sub><b>Ryan Curtin</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=rcurtin" title="Documentation">πŸ“–</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/SherlockInSpace"><img src="https://avatars.githubusercontent.com/u/5507786?v=4?s=100" width="100px;" alt="Ryan Sherlock"/><br /><sub><b>Ryan Sherlock</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=SherlockInSpace" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="http://sam.hocevar.net/"><img src="https://avatars2.githubusercontent.com/u/245089?v=4?s=100" width="100px;" alt="Sam Hocevar"/><br /><sub><b>Sam Hocevar</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=samhocevar" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://seanfisk.com/"><img src="https://avatars0.githubusercontent.com/u/410322?v=4?s=100" width="100px;" alt="Sean Fisk"/><br /><sub><b>Sean Fisk</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/issues?q=author%3Aseanfisk" title="Bug reports">πŸ›</a> <a href="https://github.com/CLIUtils/CLI11/commits?author=seanfisk" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/delpinux"><img src="https://avatars0.githubusercontent.com/u/35096584?v=4?s=100" width="100px;" alt="StΓ©phane Del Pino"/><br /><sub><b>StΓ©phane Del Pino</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=delpinux" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/metopa"><img src="https://avatars2.githubusercontent.com/u/3974178?v=4?s=100" width="100px;" alt="Viacheslav Kroilov"/><br /><sub><b>Viacheslav Kroilov</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=metopa" title="Code">πŸ’»</a></td>
+    </tr>
+    <tr>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/VolkerChristian"><img src="https://avatars.githubusercontent.com/u/18554540?v=4?s=100" width="100px;" alt="Volker Christian"/><br /><sub><b>Volker Christian</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=VolkerChristian" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/almikhayl"><img src="https://avatars2.githubusercontent.com/u/6747040?v=4?s=100" width="100px;" alt="almikhayl"/><br /><sub><b>almikhayl</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=almikhayl" title="Code">πŸ’»</a> <a href="#platform-almikhayl" title="Packaging/porting to new platform">πŸ“¦</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/ayum"><img src="https://avatars.githubusercontent.com/u/6747040?v=4?s=100" width="100px;" alt="ayum"/><br /><sub><b>ayum</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=ayum" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/captainurist"><img src="https://avatars.githubusercontent.com/u/73941350?v=4?s=100" width="100px;" alt="captainurist"/><br /><sub><b>captainurist</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=captainurist" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="http://cs.odu.edu/~ctsolakis"><img src="https://avatars0.githubusercontent.com/u/6725596?v=4?s=100" width="100px;" alt="christos"/><br /><sub><b>christos</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=ChristosT" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/deining"><img src="https://avatars3.githubusercontent.com/u/18169566?v=4?s=100" width="100px;" alt="deining"/><br /><sub><b>deining</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=deining" title="Documentation">πŸ“–</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/dherrera-fb"><img src="https://avatars.githubusercontent.com/u/89840711?v=4?s=100" width="100px;" alt="dherrera-fb"/><br /><sub><b>dherrera-fb</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=dherrera-fb" title="Code">πŸ’»</a></td>
+    </tr>
+    <tr>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/djerius"><img src="https://avatars.githubusercontent.com/u/196875?v=4?s=100" width="100px;" alt="djerius"/><br /><sub><b>djerius</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=djerius" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/dryleev"><img src="https://avatars.githubusercontent.com/u/83670813?v=4?s=100" width="100px;" alt="dryleev"/><br /><sub><b>dryleev</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=dryleev" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/elszon"><img src="https://avatars0.githubusercontent.com/u/2971495?v=4?s=100" width="100px;" alt="elszon"/><br /><sub><b>elszon</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=elszon" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/ferdymercury"><img src="https://avatars3.githubusercontent.com/u/10653970?v=4?s=100" width="100px;" alt="ferdymercury"/><br /><sub><b>ferdymercury</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=ferdymercury" title="Documentation">πŸ“–</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/fpeng1985"><img src="https://avatars1.githubusercontent.com/u/87981?v=4?s=100" width="100px;" alt="fpeng1985"/><br /><sub><b>fpeng1985</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=fpeng1985" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/geir-t"><img src="https://avatars3.githubusercontent.com/u/35292136?v=4?s=100" width="100px;" alt="geir-t"/><br /><sub><b>geir-t</b></sub></a><br /><a href="#platform-geir-t" title="Packaging/porting to new platform">πŸ“¦</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/ncihnegn"><img src="https://avatars3.githubusercontent.com/u/12021721?v=4?s=100" width="100px;" alt="ncihnegn"/><br /><sub><b>ncihnegn</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=ncihnegn" title="Code">πŸ’»</a></td>
+    </tr>
+    <tr>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/nurelin"><img src="https://avatars3.githubusercontent.com/u/5276274?v=4?s=100" width="100px;" alt="nurelin"/><br /><sub><b>nurelin</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=nurelin" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="http://polistern.i2p/"><img src="https://avatars.githubusercontent.com/u/55511995?v=4?s=100" width="100px;" alt="polistern"/><br /><sub><b>polistern</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=polistern" title="Code">πŸ’»</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/ryan4729"><img src="https://avatars3.githubusercontent.com/u/40183301?v=4?s=100" width="100px;" alt="ryan4729"/><br /><sub><b>ryan4729</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=ryan4729" title="Tests">⚠️</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/shameekganguly"><img src="https://avatars.githubusercontent.com/u/2412842?v=4?s=100" width="100px;" alt="shameekganguly"/><br /><sub><b>shameekganguly</b></sub></a><br /><a href="https://github.com/CLIUtils/CLI11/commits?author=shameekganguly" title="Code">πŸ’»</a></td>
+    </tr>
+  </tbody>
 </table>
 
-<!-- markdownlint-enable -->
+<!-- markdownlint-restore -->
 <!-- prettier-ignore-end -->
 
 <!-- ALL-CONTRIBUTORS-LIST:END -->
@@ -1890,3 +1878,4 @@ try! Feedback is always welcome.
 [argparse]: https://github.com/p-ranav/argparse
 [toml]: https://toml.io
 [lyra]: https://github.com/bfgroup/Lyra
+[installation]: https://cliutils.github.io/CLI11/book/chapters/installation.html
diff --git a/packages/CLI11/azure-pipelines.yml b/packages/CLI11/azure-pipelines.yml
index 1bb4d0771abf016d442eaa33300dc31698ca9825..647c7982b62a7cdff5e24f74c7acc57ae804d3b4 100644
--- a/packages/CLI11/azure-pipelines.yml
+++ b/packages/CLI11/azure-pipelines.yml
@@ -9,7 +9,6 @@ trigger:
 
 pr:
   - main
-  - "v*"
 
 variables:
   cli11.single: ON
@@ -28,6 +27,20 @@ jobs:
       - bash: cpplint --counting=detailed --recursive examples include/CLI tests
         displayName: Checking against google style guide
 
+  - job: build_only
+    strategy:
+      matrix:
+        visual_studio_arm64:
+          vmImage: "windows-2022"
+          cli11.std: 17
+          cli11.build_type: Debug
+          cli11.options: -G "Visual Studio 17 2022" -A ARM64
+    pool:
+      vmImage: $(vmImage)
+
+    steps:
+      - template: .ci/azure-build.yml
+
   - job: Native
     strategy:
       matrix:
@@ -137,3 +150,28 @@ jobs:
       - template: .ci/azure-cmake.yml
       - template: .ci/azure-build.yml
       - template: .ci/azure-test.yml
+
+  - job: Docker_new
+    variables:
+      cli11.single: OFF
+    pool:
+      vmImage: "ubuntu-latest"
+    strategy:
+      matrix:
+        gcc13:
+          containerImage: gcc:13
+          cli11.std: 17
+          cli11.options: -DCMAKE_CXX_FLAGS="-Wstrict-overflow=5"
+        gcc12:
+          containerImage: gcc:12
+          cli11.std: 20
+          cli11.options: -DCMAKE_CXX_FLAGS="-Wredundant-decls -Wconversion"
+        clang17_20:
+          containerImage: silkeh/clang:17
+          cli11.std: 23
+          cli11.options: -DCMAKE_CXX_FLAGS=-std=c++23
+    container: $[ variables['containerImage'] ]
+    steps:
+      - template: .ci/azure-cmake-new.yml
+      - template: .ci/azure-build.yml
+      - template: .ci/azure-test.yml
diff --git a/packages/CLI11/book/chapters/config.md b/packages/CLI11/book/chapters/config.md
index 30ca48effb6a5370bd6bdba1fc73e521b5737339..54f8661390c139ee02fb23dd4d1b5ad047233ae6 100644
--- a/packages/CLI11/book/chapters/config.md
+++ b/packages/CLI11/book/chapters/config.md
@@ -8,7 +8,9 @@ config flag. The second item is the default file name. If that is specified, the
 config will try to read that file. The third item is the help string, with a
 reasonable default, and the final argument is a boolean (default: false) that
 indicates that the configuration file is required and an error will be thrown if
-the file is not found and this is set to true.
+the file is not found and this is set to true. The option pointer returned by
+`set_config` is the same type as returned by `add_option` and all modifiers
+including validators, and checks are valid.
 
 ### Adding a default path
 
@@ -98,12 +100,29 @@ If it is needed to get the configuration file name used this can be obtained via
 `app["--config"]->as<std::string>()` assuming `--config` was the configuration
 option name.
 
+### Order of precedence
+
+By default if multiple configuration files are given they are read in reverse
+order. With the last one given taking precedence over the earlier ones. This
+behavior can be changed through the `multi_option_policy`. For example:
+
+```cpp
+app.set_config("--config")
+    ->multi_option_policy(CLI::MultiOptionPolicy::TakeAll);
+```
+
+will read the files in the order given, which may be useful in some
+circumstances. Using `CLI::MultiOptionPolicy::TakeLast` would work similarly
+getting the last `N` files given. The default policy for config options is
+`CLI::MultiOptionPolicy::Reverse` which takes the last expected `N` and reverses
+them so the last option given is given precedence.
+
 ## Configure file format
 
 Here is an example configuration file, in
 [TOML](https://github.com/toml-lang/toml) format:
 
-```ini
+```toml
 # Comments are supported, using a #
 # The default section is [default], case insensitive
 
@@ -148,6 +167,61 @@ The main differences are in vector notation and comment character. Note: CLI11
 is not a full TOML parser as it just reads values as strings. It is possible
 (but not recommended) to mix notation.
 
+### Multi-line strings
+
+The default config file parser supports multi-line strings like the toml
+standard [TOML](https://toml.io/en/). It also supports multiline comments like
+python doc strings.
+
+```toml
+"""
+this is a multine
+comment
+"""
+
+""" this is also
+a multiline comment"""
+
+''' and so is
+this
+'''
+
+value = 1
+str = """
+this is a multiline string value
+the first \n is removed and so is the last
+"""
+
+str2 = ''' this is also a mu-
+ltiline value '''
+
+str3 = """\
+    a line continuation \
+    will skip \
+    all white space between the '\' \
+    and the next non-whitespace character \
+    making this into a single line
+"""
+
+```
+
+The key is that the closing of the multiline string must be at the end of a line
+and match the starting 3 quote sequence. Multiline sequences using `"""` allow
+escape sequences. Following [TOML](https://toml.io/en/v1.0.0#string) with the
+addition of allowing '\0' for a null character, and binary Strings described in
+the next section. This same formatting also applies to single line strings.
+Multiline strings are not allowed as part of an array.
+
+### Binary Strings
+
+Config files have a binary conversion capability, this is mainly to support
+writing config files but can be used by user generated files as well. Strings
+with the form `B"(XXXXX)"` will convert any characters inside the parenthesis
+with the form `\xHH` to the equivalent binary value. The HH are hexadecimal
+characters. Characters not in this form will be translated as given. If argument
+values with unprintable characters are used to generate a config file this
+binary form will be used in the output string.
+
 ## Multiple configuration files
 
 If it is desired that multiple configuration be allowed. Use
@@ -206,8 +280,8 @@ char arraySeparator = ',';
 char valueDelimiter = '=';
 /// the character to use around strings
 char stringQuote = '"';
-/// the character to use around single characters
-char characterQuote = '\'';
+/// the character to use around single characters and literal strings
+char literalQuote = '\'';
 /// the maximum number of layers to allow
 uint8_t maximumLayers{255};
 /// the separator used to separator parent layers
@@ -228,8 +302,8 @@ These can be modified via setter functions
   an array
 - `ConfigBase *valueSeparator(char vSep)`: Specify the delimiter between a name
   and value
-- `ConfigBase *quoteCharacter(char qString, char qChar)` :specify the characters
-  to use around strings and single characters
+- `ConfigBase *quoteCharacter(char qString, char literalChar)` :specify the
+  characters to use around strings and single characters
 - `ConfigBase *maxLayers(uint8_t layers)` : specify the maximum number of parent
   layers to process. This is useful to limit processing for larger config files
 - `ConfigBase *parentSeparator(char sep)` : specify the character to separate
@@ -342,3 +416,6 @@ will create an option name in following priority.
 2. Positional name
 3. First short name
 4. Environment name
+
+In config files the name will be enclosed in quotes if there is any potential
+ambiguities in parsing the name.
diff --git a/packages/CLI11/book/chapters/flags.md b/packages/CLI11/book/chapters/flags.md
index 16134b26a450b2df13a798bdecd2e8fdabcd5976..c1318025394552cb88af42422d84a1991fbf43fe 100644
--- a/packages/CLI11/book/chapters/flags.md
+++ b/packages/CLI11/book/chapters/flags.md
@@ -21,7 +21,7 @@ passing something like `./my_app -f -f` or `./my_app -ff` will throw a
 `ParseError` with a nice help description. A flag name may start with any
 character except ('-', ' ', '\n', and '!'). For long flags, after the first
 character all characters are allowed except ('=',':','{',' ', '\n'). Names are
-given as a comma separated string, with the dash or dashes. An flag can have as
+given as a comma separated string, with the dash or dashes. A flag can have as
 many names as you want, and afterward, using `count`, you can use any of the
 names, with dashes as needed.
 
diff --git a/packages/CLI11/book/chapters/installation.md b/packages/CLI11/book/chapters/installation.md
index c8af7dfa2fcb46ff66c174ba1f7ceb80b0bdf491..e1678a52306b63fa169429a25c5c8ec9fcdfd46b 100644
--- a/packages/CLI11/book/chapters/installation.md
+++ b/packages/CLI11/book/chapters/installation.md
@@ -9,7 +9,8 @@
 This example uses the single file edition of CLI11. You can download `CLI11.hpp`
 from the latest release and put it into the same folder as your source code,
 then compile this with C++ enabled. For a larger project, you can just put this
-in an include folder and you are set.
+in an include folder and you are set. This is the simplest and most
+straightforward means of including CLI11 with a project.
 
 ## Full edition
 
@@ -24,7 +25,7 @@ include shown above.
 
 ### CMake support for the full edition
 
-If you use CMake 3.4+ for your project (highly recommended), CLI11 comes with a
+If you use CMake 3.5+ for your project (highly recommended), CLI11 comes with a
 powerful CMakeLists.txt file that was designed to also be used with
 `add_subproject`. You can add the repository to your code (preferably as a git
 submodule), then add the following line to your project (assuming your folder is
@@ -43,7 +44,83 @@ You can also configure and optionally install CLI11, and CMake will create the
 necessary `lib/cmake/CLI11/CLI11Config.cmake` files, so
 `find_package(CLI11 CONFIG REQUIRED)` also works.
 
-If you use conan.io, CLI11 supports that too.
+If you use conan.io, CLI11 supports that too. CLI11 also supports Meson and
+pkg-config if you are not using CMake.
+
+If the CMake option `CLI11_PRECOMPILED` is set then the library is compiled into
+a static library. This can be used to improve compile times if CLI11 is included
+in many different parts of a project.
+
+### Global Headers
+
+Use `CLI/*.hpp` files stored in a shared folder. You could check out the git
+repository to a system-wide folder, for example `/opt/`. With CMake, you could
+add to the include path via:
+
+```bash
+if(NOT DEFINED CLI11_DIR)
+set (CLI11_DIR "/opt/CLI11" CACHE STRING "CLI11 git repository")
+endif()
+include_directories(${CLI11_DIR}/include)
+```
+
+And then in the source code (adding several headers might be needed to prevent
+linker errors):
+
+```cpp
+#include "CLI/App.hpp"
+#include "CLI/Formatter.hpp"
+#include "CLI/Config.hpp"
+```
+
+#### Global Headers with Target
+
+configuring and installing the project is required for linking CLI11 to your
+project in the same way as you would do with any other external library. With
+CMake, this step allows using `find_package(CLI11 CONFIG REQUIRED)` and then
+using the `CLI11::CLI11` target when linking. If `CMAKE_INSTALL_PREFIX` was
+changed during install to a specific folder like `/opt/CLI11`, then you have to
+pass `-DCLI11_DIR=/opt/CLI11` when building your current project. You can also
+use [Conan.io](https://conan.io/center/cli11) or
+[Hunter](https://docs.hunter.sh/en/latest/packages/pkg/CLI11.html). (These are
+just conveniences to allow you to use your favorite method of managing packages;
+it's just header only so including the correct path and using C++11 is all you
+really need.)
+
+#### Using Fetchcontent
+
+If you do not want to add cmake as a submodule or include it with your code the
+project can be added using `FetchContent`. This capability requires CMake 3.14+
+(or 3.11+ with more work).
+
+An example CMake file would include:
+
+```cmake
+include(FetchContent)
+FetchContent_Declare(
+    cli11_proj
+    QUIET
+    GIT_REPOSITORY https://github.com/CLIUtils/CLI11.git
+    GIT_TAG v2.3.2
+)
+
+FetchContent_MakeAvailable(cli11_proj)
+
+# And now you can use it
+target_link_libraries(<your project> PRIVATE CLI11::CLI11)
+```
+
+And use
+
+```c++
+#include <CLI/CLI.hpp>
+```
+
+in your project. It is highly recommended that you use the git hash for
+`GIT_TAG` instead of a tag or branch, as that will both be more secure, as well
+as faster to reconfigure - CMake will not have to reach out to the internet to
+see if the tag moved. You can also download just the single header file from the
+releases using `file(DOWNLOAD)`.
 
 ### Running tests on the full edition
 
@@ -99,16 +176,63 @@ Total Test time (real) =   0.34 sec
 For the curious, the CMake options and defaults are listed below. Most options
 default to off if CLI11 is used as a subdirectory in another project.
 
-| Option                        | Description                                                                                     |
-| ----------------------------- | ----------------------------------------------------------------------------------------------- |
-| `CLI11_SINGLE_FILE=ON`        | Build the `CLI11.hpp` file from the sources. Requires Python (version 3 or 2.7).                |
-| `CLI11_SINGLE_FILE_TESTS=OFF` | Run the tests on the generated single file version as well                                      |
-| `CLI11_EXAMPLES=ON`           | Build the example programs.                                                                     |
-| `CLI11_TESTING=ON`            | Build the tests.                                                                                |
-| `CLI11_CLANG_TIDY=OFF`        | Run `clang-tidy` on the examples and headers. Requires CMake 3.6+.                              |
-| `CLI11_CLANG_TIDY_OPTIONS=""` | Options to pass to `clang-tidy`, such as `-fix` (single threaded build only if applying fixes!) |
+| Option                         | Description                                                                      |
+| ------------------------------ | -------------------------------------------------------------------------------- |
+| `CLI11_SINGLE_FILE=ON`         | Build the `CLI11.hpp` file from the sources. Requires Python (version 3 or 2.7). |
+| `CLI11_PRECOMPILED=OFF`        | generate a precompiled static library instead of header-only                     |
+| `CLI11_SINGLE_FILE_TESTS=OFF`  | Run the tests on the generated single file version as well                       |
+| `CLI11_BUILD_DOCS=ON`          | build CLI11 documentation and book                                               |
+| `CLI11_BUILD_EXAMPLES=ON`      | Build the example programs.                                                      |
+| `CLI11_BUILD_EXAMPLES_JSON=ON` | Build some additional example using json libraries                               |
+| `CLI11_INSTALL=ON`             | install CLI11 to the install folder during the install process                   |
+| `CLI11_FORCE_LIBCXX=OFF`       | use libc++ instead of libstdc++ if building with clang on linux                  |
+| `CLI11_CUDA_TESTS=OFF`         | build the tests with NVCC                                                        |
+| `CLI11_BUILD_TESTS=ON`         | Build the tests.                                                                 |
 
 [^1]:
     Docker is being used to create a pristine disposable environment; there is
     nothing special about this container. Alpine is being used because it is
     small, modern, and fast. Commands are similar on any other platform.
+
+## Installing cli11 using vcpkg
+
+You can download and install cli11 using the
+[vcpkg](https://github.com/Microsoft/vcpkg) dependency manager:
+
+```bash
+git clone https://github.com/Microsoft/vcpkg.git
+cd vcpkg
+./bootstrap-vcpkg.sh
+./vcpkg integrate install
+./vcpkg install cli11
+```
+
+The cli11 port in vcpkg is kept up to date by Microsoft team members and
+community contributors. If the version is out of date, please
+[create an issue or pull request](https://github.com/Microsoft/vcpkg) on the
+vcpkg repository.
+
+## Special instructions for GCC 8, Some clang, and WASI
+
+If you are using GCC 8 and using it in C++17 mode with CLI11. CLI11 makes use of
+the `<filesystem>` header if available, but specifically for this compiler, the
+`filesystem` library is separate from the standard library and needs to be
+linked separately. So it is available but CLI11 doesn't use it by default.
+
+Specifically `libstdc++fs` needs to be added to the linking list and
+`CLI11_HAS_FILESYSTEM=1` has to be defined. Then the filesystem variant of the
+Validators could be used on GCC 8. GCC 9+ does not have this issue so the
+`<filesystem>` is used by default.
+
+There may also be other cases where a specific library needs to be linked.
+
+Defining `CLI11_HAS_FILESYSTEM=0` which will remove the usage and hence any
+linking issue.
+
+In some cases certain clang compilations may require linking against `libc++fs`.
+These situations have not been encountered so the specific situations requiring
+them are unknown yet.
+
+If building with WASI it is necessary to add the flag
+`-lc-printscan-long-double` to the build to allow long double support. See #841
+for more details.
diff --git a/packages/CLI11/book/chapters/options.md b/packages/CLI11/book/chapters/options.md
index 39447113b4a7fde4660139f6b11b673e8e75351f..67fb9e5546d9718e35cdd85c29bb399c3abd0ca8 100644
--- a/packages/CLI11/book/chapters/options.md
+++ b/packages/CLI11/book/chapters/options.md
@@ -26,18 +26,18 @@ app.add_option("-i", int_option, "Optional description")->capture_default_str();
 You can use any C++ int-like type, not just `int`. CLI11 understands the
 following categories of types:
 
-| Type           | CLI11                                                                                                                                                                             |
-| -------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| number like    | Integers, floats, bools, or any type that can be constructed from an integer or floating point number. Accepts common numerical strings like `0xFF` as well as octal, and decimal |
-| string-like    | std::string, or anything that can be constructed from or assigned a std::string                                                                                                   |
-| char           | For a single char, single string values are accepted, otherwise longer strings are treated as integral values and a conversion is attempted                                       |
-| complex-number | std::complex or any type which has a real(), and imag() operations available, will allow 1 or 2 string definitions like "1+2j" or two arguments "1","2"                           |
-| enumeration    | any enum or enum class type is supported through conversion from the underlying type(typically int, though it can be specified otherwise)                                         |
-| container-like | a container(like vector) of any available types including other containers                                                                                                        |
-| wrapper        | any other object with a `value_type` static definition where the type specified by `value_type` is one of the type in this list, including `std::atomic<>`                        |
-| tuple          | a tuple, pair, or array, or other type with a tuple size and tuple_type operations defined and the members being a type contained in this list                                    |
-| function       | A function that takes an array of strings and returns a string that describes the conversion failure or empty for success. May be the empty function. (`{}`)                      |
-| streamable     | any other type with a `<<` operator will also work                                                                                                                                |
+| Type           | CLI11                                                                                                                                                                                                                                                                    |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| number like    | Integers, floats, bools, or any type that can be constructed from an integer or floating point number. Accepts common numerical strings like `0xFF` as well as octal[\0755, or \o755], decimal, and binary(0b011111100), supports value separators including `_` and `'` |
+| string-like    | std::string, or anything that can be constructed from or assigned a std::string                                                                                                                                                                                          |
+| char           | For a single char, single string values are accepted, otherwise longer strings are treated as integral values and a conversion is attempted                                                                                                                              |
+| complex-number | std::complex or any type which has a real(), and imag() operations available, will allow 1 or 2 string definitions like "1+2j" or two arguments "1","2"                                                                                                                  |
+| enumeration    | any enum or enum class type is supported through conversion from the underlying type(typically int, though it can be specified otherwise)                                                                                                                                |
+| container-like | a container(like vector) of any available types including other containers                                                                                                                                                                                               |
+| wrapper        | any other object with a `value_type` static definition where the type specified by `value_type` is one of the type in this list, including `std::atomic<>`                                                                                                               |
+| tuple          | a tuple, pair, or array, or other type with a tuple size and tuple_type operations defined and the members being a type contained in this list                                                                                                                           |
+| function       | A function that takes an array of strings and returns a string that describes the conversion failure or empty for success. May be the empty function. (`{}`)                                                                                                             |
+| streamable     | any other type with a `<<` operator will also work                                                                                                                                                                                                                       |
 
 By default, CLI11 will assume that an option is optional, and one value is
 expected if you do not use a vector. You can change this on a specific option
@@ -214,15 +214,15 @@ that to add option modifiers. A full listing of the option modifiers:
 | `->type_size(Nmin,Nmax)`                                | specify that each block of values would consist of between Nmin and Nmax elements                                                                                                                                                                                                                                                                                                                                                                         |
 | `->needs(opt)`                                          | This option requires another option to also be present, opt is an `Option` pointer or a string with the name of the option. Can be removed with `->remove_needs(opt)`                                                                                                                                                                                                                                                                                     |
 | `->excludes(opt)`                                       | This option cannot be given with `opt` present, opt is an `Option` pointer or a string with the name of the option. Can be removed with `->remove_excludes(opt)`                                                                                                                                                                                                                                                                                          |
-| `->envname(name)`                                       | Gets the value from the environment if present and not passed on the command line.                                                                                                                                                                                                                                                                                                                                                                        |
-| `->group(name)`                                         | The help group to put the option in. No effect for positional options. Defaults to `"Options"`. `"Hidden"` will not show up in the help print.                                                                                                                                                                                                                                                                                                            |
+| `->envname(name)`                                       | Gets the value from the environment if present and not passed on the command line and passes any validators.                                                                                                                                                                                                                                                                                                                                              |
+| `->group(name)`                                         | The help group to put the option in. No effect for positional options. Defaults to `"Options"`. Options given an empty string for the group name will not show up in the help print.                                                                                                                                                                                                                                                                      |
 | `->description(string)`                                 | Set/change the description                                                                                                                                                                                                                                                                                                                                                                                                                                |
 | `->ignore_case()`                                       | Ignore the case on the command line (also works on subcommands, does not affect arguments).                                                                                                                                                                                                                                                                                                                                                               |
 | `->ignore_underscore()`                                 | Ignore any underscores on the command line (also works on subcommands, does not affect arguments).                                                                                                                                                                                                                                                                                                                                                        |
 | `->allow_extra_args()`                                  | Allow extra argument values to be included when an option is passed. Enabled by default for vector options.                                                                                                                                                                                                                                                                                                                                               |
 | `->disable_flag_override()`                             | specify that flag options cannot be overridden on the command line use `=<newval>`                                                                                                                                                                                                                                                                                                                                                                        |
 | `->delimiter('<CH>')`                                   | specify a character that can be used to separate elements in a command line argument, default is <none>, common values are ',', and ';'                                                                                                                                                                                                                                                                                                                   |
-| `->multi_option_policy( CLI::MultiOptionPolicy::Throw)` | Sets the policy for handling multiple arguments if the option was received on the command line several times. `Throw`ing an error is the default, but `TakeLast`, `TakeFirst`, `TakeAll`, `Join`, and `Sum` are also available. See the next four lines for shortcuts to set this more easily.                                                                                                                                                            |
+| `->multi_option_policy( CLI::MultiOptionPolicy::Throw)` | Sets the policy for handling multiple arguments if the option was received on the command line several times. `Throw`ing an error is the default, but `TakeLast`, `TakeFirst`, `TakeAll`, `Join`, `Reverse`, and `Sum` are also available. See the next four lines for shortcuts to set this more easily.                                                                                                                                                 |
 | `->take_last()`                                         | Only use the last option if passed several times. This is always true by default for bool options, regardless of the app default, but can be set to false explicitly with `->multi_option_policy()`.                                                                                                                                                                                                                                                      |
 | `->take_first()`                                        | sets `->multi_option_policy(CLI::MultiOptionPolicy::TakeFirst)`                                                                                                                                                                                                                                                                                                                                                                                           |
 | `->take_all()`                                          | sets `->multi_option_policy(CLI::MultiOptionPolicy::TakeAll)`                                                                                                                                                                                                                                                                                                                                                                                             |
@@ -246,6 +246,28 @@ function of the form `bool function(std::string)` that runs on every value that
 the option receives, and returns a value that tells CLI11 whether the check
 passed or failed.
 
+### Multi Option policy
+
+The Multi option policy can be used to instruct CLI11 what to do when an option
+is called multiple times and how to return those values in a meaningful way.
+There are several options can be set through the
+`->multi_option_policy( CLI::MultiOptionPolicy::Throw)` option modifier.
+`Throw`ing an error is the default, but `TakeLast`, `TakeFirst`, `TakeAll`,
+`Join`, `Reverse`, and `Sum`
+
+| Value     | Description                                                                       |
+| --------- | --------------------------------------------------------------------------------- |
+| Throw     | Throws an error if more values are given then expected                            |
+| TakeLast  | Selects the last expected number of values given                                  |
+| TakeFirst | Selects the first expected number of of values given                              |
+| Join      | Joins the strings together using the `delimiter` given                            |
+| TakeAll   | Takes all the values                                                              |
+| Sum       | If the values are numeric, it sums them and returns the result                    |
+| Reverse   | Selects the last expected number of values given and return them in reverse order |
+
+NOTE: For reverse, the index used for an indexed validator is also applied in
+reverse order index 1 will be the last element and 2 second from last and so on.
+
 ## Using the `CLI::Option` pointer
 
 Each of the option creation mechanisms returns a pointer to the internally
@@ -261,7 +283,7 @@ CLI::Option* opt = app.add_flag("--opt");
 CLI11_PARSE(app, argv, argc);
 
 if(* opt)
-    std::cout << "Flag received " << opt->count() << " times." << std::endl;
+    std::cout << "Flag received " << opt->count() << " times." << '\n';
 ```
 
 ## Inheritance of defaults
diff --git a/packages/CLI11/book/package.json b/packages/CLI11/book/package.json
new file mode 100644
index 0000000000000000000000000000000000000000..493b5db9acab34c5273173f3fef2d2b29e854d02
--- /dev/null
+++ b/packages/CLI11/book/package.json
@@ -0,0 +1,14 @@
+{
+  "name": "cli11-gitbook",
+  "version": "1.0.0",
+  "dependencies": {
+    "gitbook-cli": "2.2.0",
+    "gitbook-plugin-hints": "^1.0.2",
+    "gitbook-plugin-include-codeblock": "^3.2.2",
+    "gitbook-plugin-term": "^0.5.1",
+    "svgexport": ">=0.4.2"
+  },
+  "scripts": {
+    "postinstall": "npx gitbook fetch 3.2.3 && npx gitbook install"
+  }
+}
diff --git a/packages/CLI11/cmake/CLI11GeneratePkgConfig.cmake b/packages/CLI11/cmake/CLI11GeneratePkgConfig.cmake
index 5abb03d165f908d73ba0c85b97dd156483100778..a9c5eb88525bd236f07adca8797669e93b7f9331 100644
--- a/packages/CLI11/cmake/CLI11GeneratePkgConfig.cmake
+++ b/packages/CLI11/cmake/CLI11GeneratePkgConfig.cmake
@@ -1,3 +1,7 @@
-configure_file("cmake/CLI11.pc.in" "CLI11.pc" @ONLY)
+if(CLI11_PRECOMPILED)
+  configure_file("cmake/CLI11precompiled.pc.in" "CLI11.pc" @ONLY)
+else()
+  configure_file("cmake/CLI11.pc.in" "CLI11.pc" @ONLY)
+endif()
 
-install(FILES "${PROJECT_BINARY_DIR}/CLI11.pc" DESTINATION "${CMAKE_INSTALL_DATADIR}/pkgconfig")
+install(FILES "${PROJECT_BINARY_DIR}/CLI11.pc" DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
diff --git a/packages/CLI11/cmake/CLI11precompiled.pc.in b/packages/CLI11/cmake/CLI11precompiled.pc.in
new file mode 100644
index 0000000000000000000000000000000000000000..df73d7780c02767b29807f9f333b1568e65d1a4f
--- /dev/null
+++ b/packages/CLI11/cmake/CLI11precompiled.pc.in
@@ -0,0 +1,11 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}
+includedir=${prefix}/include
+libdir=${exec_prefix}/lib
+
+Name: CLI11
+Description: C++ command line parser
+Version: @PROJECT_VERSION@
+
+Cflags: -I${includedir} -DCLI11_COMPILE
+Libs: -L${libdir} -lCLI11
diff --git a/packages/CLI11/cmake/CLIsingle.hpp.in b/packages/CLI11/cmake/CLIsingle.hpp.in
new file mode 100644
index 0000000000000000000000000000000000000000..a2d783c3006382a6c0d1f310d93d3f7f9796889e
--- /dev/null
+++ b/packages/CLI11/cmake/CLIsingle.hpp.in
@@ -0,0 +1,10 @@
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
+// under NSF AWARD 1414736 and by the respective contributors.
+// All rights reserved.
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#pragma once
+
+//single file header
+#include "../CLI11.hpp"
diff --git a/packages/CLI11/cmake/CodeCoverage.cmake b/packages/CLI11/cmake/CodeCoverage.cmake
index e011ef1342773b14b1e8dedebb3e3e83e4161f3c..1867b174270b0c9c246a952e5f3160cbb5800f29 100644
--- a/packages/CLI11/cmake/CodeCoverage.cmake
+++ b/packages/CLI11/cmake/CodeCoverage.cmake
@@ -88,7 +88,7 @@ elseif(NOT CMAKE_COMPILER_IS_GNUCXX)
 endif()
 
 set(COVERAGE_COMPILER_FLAGS
-    "-g -O0 --coverage -fprofile-arcs -ftest-coverage -fno-inline -fno-inline-small-functions -fno-default-inline"
+    "-g -O0 --coverage -fprofile-arcs -ftest-coverage -fno-inline -fno-inline-small-functions -fno-default-inline -fno-elide-constructors"
     CACHE INTERNAL "")
 
 set(CMAKE_CXX_FLAGS_COVERAGE
diff --git a/packages/CLI11/examples/CMakeLists.txt b/packages/CLI11/examples/CMakeLists.txt
index 131a9fd02514e1aa835b1d613698f0550f7a8799..0ed2f8ec8f77f4aa8b146983a61d17ca16875f85 100644
--- a/packages/CLI11/examples/CMakeLists.txt
+++ b/packages/CLI11/examples/CMakeLists.txt
@@ -189,6 +189,12 @@ add_test(NAME prefix_command COMMAND prefix_command -v 3 2 1 -- other one two 3)
 set_property(TEST prefix_command PROPERTY PASS_REGULAR_EXPRESSION "Prefix: 3 : 2 : 1"
                                           "Remaining commands: other one two 3")
 
+add_cli_exe(arg_capture arg_capture.cpp)
+add_test(NAME arg_capture COMMAND arg_capture -v 27 --sub -v 13 --val prefix)
+set_property(TEST arg_capture PROPERTY PASS_REGULAR_EXPRESSION "value=27")
+add_test(NAME arg_capture2 COMMAND arg_capture -v 27 --sub -v 13 --val prefix)
+set_property(TEST arg_capture2 PROPERTY PASS_REGULAR_EXPRESSION "after Args:-v 13 --val prefix")
+
 add_cli_exe(callback_passthrough callback_passthrough.cpp)
 add_test(NAME callback_passthrough1 COMMAND callback_passthrough --argname t2 --t2 test)
 set_property(TEST callback_passthrough1 PROPERTY PASS_REGULAR_EXPRESSION "the value is now test")
diff --git a/packages/CLI11/examples/arg_capture.cpp b/packages/CLI11/examples/arg_capture.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bdf3afebcdf7eb17de8608ac39cb215ab0e79c21
--- /dev/null
+++ b/packages/CLI11/examples/arg_capture.cpp
@@ -0,0 +1,34 @@
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
+// under NSF AWARD 1414736 and by the respective contributors.
+// All rights reserved.
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+// Code modified from https://github.com/CLIUtils/CLI11/issues/559
+
+#include <CLI/CLI.hpp>
+#include <iostream>
+#include <string>
+
+/** This example demonstrates the use of `prefix_command` on a subcommand
+to capture all subsequent arguments along with an alias to make it appear as a regular options.
+
+All the values after the "sub" or "--sub" are available in the remaining() method.
+*/
+int main(int argc, const char *argv[]) {
+
+    int value{0};
+    CLI::App app{"Test App"};
+    app.add_option("-v", value, "value");
+
+    auto *subcom = app.add_subcommand("sub", "")->prefix_command();
+    subcom->alias("--sub");
+    CLI11_PARSE(app, argc, argv);
+
+    std::cout << "value=" << value << '\n';
+    std::cout << "after Args:";
+    for(const auto &aarg : subcom->remaining()) {
+        std::cout << aarg << " ";
+    }
+    std::cout << '\n';
+}
diff --git a/packages/CLI11/examples/callback_passthrough.cpp b/packages/CLI11/examples/callback_passthrough.cpp
index 1aac0df6beef22f1431159bda5099e9c6142ad6b..234ed894b7d7cc232cda5e062358615f1a6ebd64 100644
--- a/packages/CLI11/examples/callback_passthrough.cpp
+++ b/packages/CLI11/examples/callback_passthrough.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/examples/config_app.cpp b/packages/CLI11/examples/config_app.cpp
index a0426ad616b7221533047ae16c1ae4a0f734712d..ccd9824259fd304f231928b3ea350f85307c705b 100644
--- a/packages/CLI11/examples/config_app.cpp
+++ b/packages/CLI11/examples/config_app.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -40,11 +40,11 @@ int main(int argc, char **argv) {
     }
 
     std::cout << "Working on file: " << file << ", direct count: " << app.count("--file")
-              << ", opt count: " << opt->count() << std::endl;
+              << ", opt count: " << opt->count() << '\n';
     std::cout << "Working on count: " << count << ", direct count: " << app.count("--count")
-              << ", opt count: " << copt->count() << std::endl;
+              << ", opt count: " << copt->count() << '\n';
     std::cout << "Received flag: " << v << " (" << flag->count() << ") times\n";
-    std::cout << "Some value: " << value << std::endl;
+    std::cout << "Some value: " << value << '\n';
 
     return 0;
 }
diff --git a/packages/CLI11/examples/custom_parse.cpp b/packages/CLI11/examples/custom_parse.cpp
index eaaedd552f65af734eec6c9d1a2e048cbfa53369..2f9c2a08f4ef3a5a731142462fc3d0fab5f884ad 100644
--- a/packages/CLI11/examples/custom_parse.cpp
+++ b/packages/CLI11/examples/custom_parse.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -23,7 +23,7 @@ using DoubleValues = Values<double>;
 
 // the lexical cast operator should be in the same namespace as the type for ADL to work properly
 bool lexical_cast(const std::string &input, Values<double> & /*v*/) {
-    std::cout << "called correct lexical_cast function ! val: " << input << std::endl;
+    std::cout << "called correct lexical_cast function ! val: " << input << '\n';
     return true;
 }
 
diff --git a/packages/CLI11/examples/digit_args.cpp b/packages/CLI11/examples/digit_args.cpp
index a0785ddbdf7c1df5c8123e7cb2b24241b0b4be66..2144f22d327bb8a54098d27e9ba046a2b8a67b29 100644
--- a/packages/CLI11/examples/digit_args.cpp
+++ b/packages/CLI11/examples/digit_args.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -16,6 +16,6 @@ int main(int argc, char **argv) {
 
     CLI11_PARSE(app, argc, argv);
 
-    std::cout << "value = " << val << std::endl;
+    std::cout << "value = " << val << '\n';
     return 0;
 }
diff --git a/packages/CLI11/examples/enum.cpp b/packages/CLI11/examples/enum.cpp
index 133adde9aca4c62e755b3597e6e299af2c0b7e54..863eda4dc1a8fbf73beedb1b4e87baead4496d07 100644
--- a/packages/CLI11/examples/enum.cpp
+++ b/packages/CLI11/examples/enum.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -27,7 +27,7 @@ int main(int argc, char **argv) {
 
     // CLI11's built in enum streaming can be used outside CLI11 like this:
     using CLI::enums::operator<<;
-    std::cout << "Enum received: " << level << std::endl;
+    std::cout << "Enum received: " << level << '\n';
 
     return 0;
 }
diff --git a/packages/CLI11/examples/enum_ostream.cpp b/packages/CLI11/examples/enum_ostream.cpp
index 1f8ac57e409d637f1533b4adbadf649d5976a134..939a3fa7d81834c677a41b9896351cd2acb02490 100644
--- a/packages/CLI11/examples/enum_ostream.cpp
+++ b/packages/CLI11/examples/enum_ostream.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -44,7 +44,7 @@ int main(int argc, char **argv) {
 
     // CLI11's built in enum streaming can be used outside CLI11 like this:
     using CLI::enums::operator<<;
-    std::cout << "Enum received: " << level << std::endl;
+    std::cout << "Enum received: " << level << '\n';
 
     return 0;
 }
diff --git a/packages/CLI11/examples/formatter.cpp b/packages/CLI11/examples/formatter.cpp
index 4973cf95f9ec27334fda2a02fe3aef461d84da43..b9afb1f96af2e19faf4dcdd0cb079de13484d01e 100644
--- a/packages/CLI11/examples/formatter.cpp
+++ b/packages/CLI11/examples/formatter.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -31,7 +31,7 @@ int main(int argc, char **argv) {
 
     CLI11_PARSE(app, argc, argv);
 
-    std::cout << "This app was meant to show off the formatter, run with -h" << std::endl;
+    std::cout << "This app was meant to show off the formatter, run with -h" << '\n';
 
     return 0;
 }
diff --git a/packages/CLI11/examples/groups.cpp b/packages/CLI11/examples/groups.cpp
index 09c5d6ba2bdfea9108ad9cbc4bbaaf092e4be4c0..8084f75186dfa805a6cbbb77651654c6b28017c9 100644
--- a/packages/CLI11/examples/groups.cpp
+++ b/packages/CLI11/examples/groups.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -30,10 +30,10 @@ int main(int argc, char **argv) {
     }
 
     std::cout << "Working on file: " << file << ", direct count: " << app.count("--file")
-              << ", opt count: " << opt->count() << std::endl;
+              << ", opt count: " << opt->count() << '\n';
     std::cout << "Working on count: " << count << ", direct count: " << app.count("--count")
-              << ", opt count: " << copt->count() << std::endl;
-    std::cout << "Some value: " << value << std::endl;
+              << ", opt count: " << copt->count() << '\n';
+    std::cout << "Some value: " << value << '\n';
 
     return 0;
 }
diff --git a/packages/CLI11/examples/inter_argument_order.cpp b/packages/CLI11/examples/inter_argument_order.cpp
index e8c489c2ad34e7d349655dca9ab9ef5bf2cfb3b1..d0a8ba55bf503c7f51e097edd7b2ca65c9ed2132 100644
--- a/packages/CLI11/examples/inter_argument_order.cpp
+++ b/packages/CLI11/examples/inter_argument_order.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -46,6 +46,6 @@ int main(int argc, char **argv) {
 
     // Prove the vector is correct
     for(auto &pair : keyval) {
-        std::cout << pair.first << " : " << pair.second << std::endl;
+        std::cout << pair.first << " : " << pair.second << '\n';
     }
 }
diff --git a/packages/CLI11/examples/modhelp.cpp b/packages/CLI11/examples/modhelp.cpp
index d0f8cf87525194e9f3ac69b8b9c80ea5c6efecac..472e302414fc3c577bea6e13b8aae2c78b8f2885 100644
--- a/packages/CLI11/examples/modhelp.cpp
+++ b/packages/CLI11/examples/modhelp.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -26,10 +26,10 @@ Note that this will not shortcut `->required` and other similar options.)raw"};
         if(*help)
             throw CLI::CallForHelp();
     } catch(const CLI::Error &e) {
-        std::cout << "Option -a string in help: " << some_option << std::endl;
+        std::cout << "Option -a string in help: " << some_option << '\n';
         return test.exit(e);
     }
 
-    std::cout << "Option -a string: " << some_option << std::endl;
+    std::cout << "Option -a string: " << some_option << '\n';
     return 0;
 }
diff --git a/packages/CLI11/examples/nested.cpp b/packages/CLI11/examples/nested.cpp
index 3587023ac3b813413d033b7ba1b3d80178bcc5dc..cfdb5736f94f56d3c5663ea0b4c0e6ff4bbe5e1a 100644
--- a/packages/CLI11/examples/nested.cpp
+++ b/packages/CLI11/examples/nested.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/examples/option_groups.cpp b/packages/CLI11/examples/option_groups.cpp
index 3a282536bc00ee11e97a15dbde36f6b846cf1e09..f95671a1dfb1a1e37981dbeca63842d5687c673f 100644
--- a/packages/CLI11/examples/option_groups.cpp
+++ b/packages/CLI11/examples/option_groups.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -33,13 +33,13 @@ int main(int argc, char **argv) {
     CLI11_PARSE(app, argc, argv);
 
     std::string format_type = (csv) ? std::string("CSV") : ((human) ? "human readable" : "binary");
-    std::cout << "Selected " << format_type << " format" << std::endl;
+    std::cout << "Selected " << format_type << " format\n";
     if(!fileLoc.empty()) {
-        std::cout << " sent to file " << fileLoc << std::endl;
+        std::cout << " sent to file " << fileLoc << '\n';
     } else if(!networkAddress.empty()) {
-        std::cout << " sent over network to " << networkAddress << std::endl;
+        std::cout << " sent over network to " << networkAddress << '\n';
     } else {
-        std::cout << " sent to std::cout" << std::endl;
+        std::cout << " sent to std::cout\n";
     }
 
     return 0;
diff --git a/packages/CLI11/examples/positional_arity.cpp b/packages/CLI11/examples/positional_arity.cpp
index d2d9b9c89a108cd82a04ae86f1375962889a90c9..0db7ce3be7d17a42614eb7c605eafcfef80fe8b2 100644
--- a/packages/CLI11/examples/positional_arity.cpp
+++ b/packages/CLI11/examples/positional_arity.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/examples/positional_validation.cpp b/packages/CLI11/examples/positional_validation.cpp
index 6b552daa536a4842b6600ced176ad37d30a3f0c1..ad283e98c633f53d0aeccd61324dad2d671a88e5 100644
--- a/packages/CLI11/examples/positional_validation.cpp
+++ b/packages/CLI11/examples/positional_validation.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/examples/prefix_command.cpp b/packages/CLI11/examples/prefix_command.cpp
index 843f40374616283f9208004237f8e95fdf651bb8..f681a04630a3576ce001107061dee546b747b9ca 100644
--- a/packages/CLI11/examples/prefix_command.cpp
+++ b/packages/CLI11/examples/prefix_command.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -25,11 +25,11 @@ int main(int argc, char **argv) {
     for(int v : vals)
         std::cout << ": " << v << " ";
 
-    std::cout << std::endl << "Remaining commands: ";
+    std::cout << '\n' << "Remaining commands: ";
 
     for(const auto &com : more_comms)
         std::cout << com << " ";
-    std::cout << std::endl;
+    std::cout << '\n';
 
     return 0;
 }
diff --git a/packages/CLI11/examples/ranges.cpp b/packages/CLI11/examples/ranges.cpp
index ec14905bf4be51920b6aee7df10cdb3dea3909aa..63ad06ab9e5a5022661552c67a3eb8bd91c4ad4b 100644
--- a/packages/CLI11/examples/ranges.cpp
+++ b/packages/CLI11/examples/ranges.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/examples/retired.cpp b/packages/CLI11/examples/retired.cpp
index 28f61da04dfc5f867e5f6c79cd67cf4c1fc35c2c..24f9585c11b756f8d43af1643cec4e7f055b18d0 100644
--- a/packages/CLI11/examples/retired.cpp
+++ b/packages/CLI11/examples/retired.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/examples/shapes.cpp b/packages/CLI11/examples/shapes.cpp
index d3f48ac73c40bb395321fbc88532b341af33eddc..39ea579285f7673464908d510f352a46557575ca 100644
--- a/packages/CLI11/examples/shapes.cpp
+++ b/packages/CLI11/examples/shapes.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -18,7 +18,7 @@ int main(int argc, char **argv) {
     int circle_counter{0};
     circle->callback([&radius, &circle_counter] {
         ++circle_counter;
-        std::cout << "circle" << circle_counter << " with radius " << radius << std::endl;
+        std::cout << "circle" << circle_counter << " with radius " << radius << '\n';
     });
 
     circle->add_option("radius", radius, "the radius of the circle")->required();
@@ -32,7 +32,7 @@ int main(int argc, char **argv) {
         if(edge2 == 0) {
             edge2 = edge1;
         }
-        std::cout << "rectangle" << rect_counter << " with edges [" << edge1 << ',' << edge2 << "]" << std::endl;
+        std::cout << "rectangle" << rect_counter << " with edges [" << edge1 << ',' << edge2 << "]\n";
         edge2 = 0;
     });
 
@@ -45,7 +45,7 @@ int main(int argc, char **argv) {
     tri->callback([&sides, &tri_counter] {
         ++tri_counter;
 
-        std::cout << "triangle" << tri_counter << " with sides [" << CLI::detail::join(sides) << "]" << std::endl;
+        std::cout << "triangle" << tri_counter << " with sides [" << CLI::detail::join(sides) << "]\n";
     });
 
     tri->add_option("sides", sides, "the side lengths of the triangle");
diff --git a/packages/CLI11/examples/simple.cpp b/packages/CLI11/examples/simple.cpp
index b7095dd2cc703648668e8921efa43b01c6308ad2..c33037d20451e4ab9f6d32ccb93d808c282769d9 100644
--- a/packages/CLI11/examples/simple.cpp
+++ b/packages/CLI11/examples/simple.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -28,11 +28,11 @@ int main(int argc, char **argv) {
     CLI11_PARSE(app, argc, argv);
 
     std::cout << "Working on file: " << file << ", direct count: " << app.count("--file")
-              << ", opt count: " << opt->count() << std::endl;
+              << ", opt count: " << opt->count() << '\n';
     std::cout << "Working on count: " << count << ", direct count: " << app.count("--count")
-              << ", opt count: " << copt->count() << std::endl;
+              << ", opt count: " << copt->count() << '\n';
     std::cout << "Received flag: " << v << " (" << flag->count() << ") times\n";
-    std::cout << "Some value: " << value << std::endl;
+    std::cout << "Some value: " << value << '\n';
 
     return 0;
 }
diff --git a/packages/CLI11/examples/subcom_help.cpp b/packages/CLI11/examples/subcom_help.cpp
index 65030eb86aa0bae0f9c6a29fe0fdc71ccc967adc..d7cfadaa33c9382c021f28ebe5f9b346654e043a 100644
--- a/packages/CLI11/examples/subcom_help.cpp
+++ b/packages/CLI11/examples/subcom_help.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -15,7 +15,7 @@ int main(int argc, char *argv[]) {
     cli_sub.add_option("sub_arg", sub_arg, "Argument for subcommand")->required();
     CLI11_PARSE(cli_global, argc, argv);
     if(cli_sub) {
-        std::cout << "Got: " << sub_arg << std::endl;
+        std::cout << "Got: " << sub_arg << '\n';
     }
     return 0;
 }
diff --git a/packages/CLI11/examples/subcom_in_files/subcommand_a.cpp b/packages/CLI11/examples/subcom_in_files/subcommand_a.cpp
index bb1a6a13d4bc1c6a7d2acffc2998caae8c8bb2f0..19d309148ab63100f82178b3a756d1ead6992a71 100644
--- a/packages/CLI11/examples/subcom_in_files/subcommand_a.cpp
+++ b/packages/CLI11/examples/subcom_in_files/subcommand_a.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -30,8 +30,8 @@ void setup_subcommand_a(CLI::App &app) {
 /// but having a separate function is cleaner.
 void run_subcommand_a(SubcommandAOptions const &opt) {
     // Do stuff...
-    std::cout << "Working on file: " << opt.file << std::endl;
+    std::cout << "Working on file: " << opt.file << '\n';
     if(opt.with_foo) {
-        std::cout << "Using foo!" << std::endl;
+        std::cout << "Using foo!" << '\n';
     }
 }
diff --git a/packages/CLI11/examples/subcom_in_files/subcommand_a.hpp b/packages/CLI11/examples/subcom_in_files/subcommand_a.hpp
index 6a8395d1a3219efc67417b563ecf75da3eea1943..ae08464afbebd25cd5d09a4f03ba2685bacffce5 100644
--- a/packages/CLI11/examples/subcom_in_files/subcommand_a.hpp
+++ b/packages/CLI11/examples/subcom_in_files/subcommand_a.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/examples/subcom_in_files/subcommand_main.cpp b/packages/CLI11/examples/subcom_in_files/subcommand_main.cpp
index e65339c901c52689c3e3b1b045d6d8ca463a70f0..795dfd486b238fa93082402d09c02aec9c824c20 100644
--- a/packages/CLI11/examples/subcom_in_files/subcommand_main.cpp
+++ b/packages/CLI11/examples/subcom_in_files/subcommand_main.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/examples/subcom_partitioned.cpp b/packages/CLI11/examples/subcom_partitioned.cpp
index b6273eaed6fdd1737e0a92008958a81237a457d2..a46eea1ce6dba31e7ae774357ff3addae94ff647 100644
--- a/packages/CLI11/examples/subcom_partitioned.cpp
+++ b/packages/CLI11/examples/subcom_partitioned.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -37,10 +37,10 @@ int main(int argc, char **argv) {
     }
 
     std::cout << "Working on file: " << file << ", direct count: " << impOpt->count("--file")
-              << ", opt count: " << opt->count() << std::endl;
+              << ", opt count: " << opt->count() << '\n';
     std::cout << "Working on count: " << count << ", direct count: " << impOpt->count("--count")
-              << ", opt count: " << copt->count() << std::endl;
-    std::cout << "Some value: " << value << std::endl;
+              << ", opt count: " << copt->count() << '\n';
+    std::cout << "Some value: " << value << '\n';
 
     return 0;
 }
diff --git a/packages/CLI11/examples/subcommands.cpp b/packages/CLI11/examples/subcommands.cpp
index e69c04eed06971905c53d0b4490028d373fd704b..fe93edc2f42ebfb2de3519e49c964d5a8e8be79b 100644
--- a/packages/CLI11/examples/subcommands.cpp
+++ b/packages/CLI11/examples/subcommands.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -24,12 +24,11 @@ int main(int argc, char **argv) {
 
     CLI11_PARSE(app, argc, argv);
 
-    std::cout << "Working on --file from start: " << file << std::endl;
-    std::cout << "Working on --count from stop: " << s->count() << ", direct count: " << stop->count("--count")
-              << std::endl;
-    std::cout << "Count of --random flag: " << app.count("--random") << std::endl;
+    std::cout << "Working on --file from start: " << file << '\n';
+    std::cout << "Working on --count from stop: " << s->count() << ", direct count: " << stop->count("--count") << '\n';
+    std::cout << "Count of --random flag: " << app.count("--random") << '\n';
     for(auto *subcom : app.get_subcommands())
-        std::cout << "Subcommand: " << subcom->get_name() << std::endl;
+        std::cout << "Subcommand: " << subcom->get_name() << '\n';
 
     return 0;
 }
diff --git a/packages/CLI11/examples/testEXE.cpp b/packages/CLI11/examples/testEXE.cpp
index b2cac7fbae1d93ef91eeb09a593c8bf8ee1e0e35..b42c60f2a73e2249ea27243bd8416869e5b46b21 100644
--- a/packages/CLI11/examples/testEXE.cpp
+++ b/packages/CLI11/examples/testEXE.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -12,15 +12,17 @@
 
 int main(int argc, const char *argv[]) {
 
-    int logLevel{0};
+    int value{0};
     CLI::App app{"Test App"};
+    app.add_option("-v", value, "value");
 
-    app.add_option("-v", logLevel, "level");
-
-    auto *subcom = app.add_subcommand("sub", "")->fallthrough();
-    subcom->preparse_callback([&app](size_t) { app.get_subcommand("sub")->add_option_group("group"); });
-
+    auto *subcom = app.add_subcommand("sub", "")->prefix_command();
     CLI11_PARSE(app, argc, argv);
 
-    std::cout << "level: " << logLevel << std::endl;
+    std::cout << "value =" << value << '\n';
+    std::cout << "after Args:";
+    for(const auto &aarg : subcom->remaining()) {
+        std::cout << aarg << " ";
+    }
+    std::cout << '\n';
 }
diff --git a/packages/CLI11/examples/validators.cpp b/packages/CLI11/examples/validators.cpp
index 87eb07ab20e09fbe3558c122406aa03ec52ca7e0..44ff15546c3392f7bd0bc09ff78332359750ecb3 100644
--- a/packages/CLI11/examples/validators.cpp
+++ b/packages/CLI11/examples/validators.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -19,7 +19,7 @@ int main(int argc, char **argv) {
     app.add_option("-v,--value", count, "Value in range")->check(CLI::Range(3, 6));
     CLI11_PARSE(app, argc, argv);
 
-    std::cout << "Try printing help or failing the validator" << std::endl;
+    std::cout << "Try printing help or failing the validator" << '\n';
 
     return 0;
 }
diff --git a/packages/CLI11/fuzz/CMakeLists.txt b/packages/CLI11/fuzz/CMakeLists.txt
index 21df4028f53bcb747c1716260238863dfb2c60b8..5f5cff5dc032b35cb69f848820741f92527f7ba4 100644
--- a/packages/CLI11/fuzz/CMakeLists.txt
+++ b/packages/CLI11/fuzz/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+# Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 # under NSF AWARD 1414736 and by the respective contributors.
 # All rights reserved.
 #
@@ -19,14 +19,17 @@ if(CMAKE_CXX_STANDARD GREATER 16)
       set(CLI11_FUZZ_ARTIFACT_PATH ${PROJECT_BINARY_DIR}/fuzz)
     endif()
 
-    if(NOT CLI11_FUZZ_TIME)
-      set(CLI11_FUZZ_TIME 360)
+    if(NOT CLI11_FUZZ_TIME_APP)
+      set(CLI11_FUZZ_TIME_APP 600)
+    endif()
+    if(NOT CLI11_FUZZ_TIME_FILE)
+      set(CLI11_FUZZ_TIME_FILE 240)
     endif()
     add_custom_target(
       QUICK_CLI11_APP_FUZZ
       COMMAND ${CMAKE_COMMAND} -E make_directory corp
       COMMAND
-        cli11_app_fuzzer corp -max_total_time=${CLI11_FUZZ_TIME} -max_len=2048
+        cli11_app_fuzzer corp -max_total_time=${CLI11_FUZZ_TIME_APP} -max_len=2148
         -dict=${CMAKE_CURRENT_SOURCE_DIR}/fuzz_dictionary1.txt
         -exact_artifact_path=${CLI11_FUZZ_ARTIFACT_PATH}/cli11_app_fail_artifact.txt)
 
@@ -34,7 +37,7 @@ if(CMAKE_CXX_STANDARD GREATER 16)
       QUICK_CLI11_FILE_FUZZ
       COMMAND ${CMAKE_COMMAND} -E make_directory corp
       COMMAND
-        cli11_file_fuzzer corp -max_total_time=${CLI11_FUZZ_TIME} -max_len=2048
+        cli11_file_fuzzer corp -max_total_time=${CLI11_FUZZ_TIME_FILE} -max_len=2048
         -dict=${CMAKE_CURRENT_SOURCE_DIR}/fuzz_dictionary2.txt
         -exact_artifact_path=${CLI11_FUZZ_ARTIFACT_PATH}/cli11_file_fail_artifact.txt)
 
diff --git a/packages/CLI11/fuzz/cli11_app_fuzz.cpp b/packages/CLI11/fuzz/cli11_app_fuzz.cpp
index 7cd10b889899141bd09d7811ea8c0106fb95b8ae..e6acc35b9f1cdbbc31c431b0c2aa82e31d5174ab 100644
--- a/packages/CLI11/fuzz/cli11_app_fuzz.cpp
+++ b/packages/CLI11/fuzz/cli11_app_fuzz.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -15,16 +15,41 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
         return 0;
     }
     std::string parseString(reinterpret_cast<const char *>(Data), Size);
-
+    std::string optionString;
+    std::string flagString;
+    if(parseString.size() > 25) {
+        optionString = parseString.substr(0, 25);
+        parseString.erase(0, 25);
+    }
+    if(parseString.size() > 25) {
+        flagString = parseString.substr(0, 25);
+        parseString.erase(0, 25);
+    }
     CLI::FuzzApp fuzzdata;
-
     auto app = fuzzdata.generateApp();
+    try {
+        if(!optionString.empty()) {
+            app->add_option(optionString, fuzzdata.buffer);
+        }
+        if(!flagString.empty()) {
+            app->add_flag(flagString, fuzzdata.intbuffer);
+        }
+    } catch(const CLI::ConstructionError &e) {
+        return 0;  // Non-zero return values are reserved for future use.
+    }
+
     try {
         app->parse(parseString);
+
     } catch(const CLI::ParseError &e) {
         //(app)->exit(e);
         // this just indicates we caught an error known by CLI
+        return 0;  // Non-zero return values are reserved for future use.
     }
-
-    return 0;  // Non-zero return values are reserved for future use.
+    // should be able to write the config to a file and read from it again
+    std::string configOut = app->config_to_str();
+    app->clear();
+    std::stringstream out(configOut);
+    app->parse_from_stream(out);
+    return 0;
 }
diff --git a/packages/CLI11/fuzz/cli11_file_fuzz.cpp b/packages/CLI11/fuzz/cli11_file_fuzz.cpp
index e769114eb9b1ab5ee2fcd8ffc60fed9b29f1710f..754108d17b2efb3c768f907621b9630d6b2b7ad9 100644
--- a/packages/CLI11/fuzz/cli11_file_fuzz.cpp
+++ b/packages/CLI11/fuzz/cli11_file_fuzz.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -22,8 +22,15 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
     auto app = fuzzdata.generateApp();
     try {
         app->parse_from_stream(out);
+        // should be able to write the config to a file and read from it again
+        std::string configOut = app->config_to_str();
+
+        app->clear();
+        std::stringstream out(configOut);
+        app->parse_from_stream(out);
+
     } catch(const CLI::ParseError &e) {
-        (app)->exit(e);
+        // (app)->exit(e);
         // this just indicates we caught an error known by CLI
     }
 
diff --git a/packages/CLI11/fuzz/fuzzApp.cpp b/packages/CLI11/fuzz/fuzzApp.cpp
index dc401f933a56921c37a45201e767c60fadbfde00..21d510126097bc4d364cac2e1e90419d4326d87c 100644
--- a/packages/CLI11/fuzz/fuzzApp.cpp
+++ b/packages/CLI11/fuzz/fuzzApp.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -44,9 +44,10 @@ std::shared_ptr<CLI::App> FuzzApp::generateApp() {
     auto fApp = std::make_shared<CLI::App>("fuzzing App", "fuzzer");
     fApp->set_config("--config");
     fApp->add_flag("-a,--flag");
-    fApp->add_flag("-b,--flag2", flag1);
+    fApp->add_flag("-b,--flag2,!--nflag2", flag1);
     fApp->add_flag("-c{34},--flag3{1}", flagCnt)->disable_flag_override();
     fApp->add_flag("-e,--flagA", flagAtomic);
+    fApp->add_flag("--atd", doubleAtomic);
 
     fApp->add_option("-d,--opt1", val8);
     fApp->add_option("--opt2", val16);
@@ -64,10 +65,12 @@ std::shared_ptr<CLI::App> FuzzApp::generateApp() {
     fApp->add_option("--dopt1", v1);
     fApp->add_option("--dopt2", v2);
 
-    fApp->add_option("--vopt1", vv1);
-    fApp->add_option("--vopt2", vvs);
-    fApp->add_option("--vopt3", vstr);
-    fApp->add_option("--vopt4", vecvecd);
+    auto *vgroup = fApp->add_option_group("vectors");
+
+    vgroup->add_option("--vopt1", vv1);
+    vgroup->add_option("--vopt2", vvs)->inject_separator();
+    vgroup->add_option("--vopt3", vstr);
+    vgroup->add_option("--vopt4", vecvecd)->inject_separator();
 
     fApp->add_option("--oopt1", od1);
     fApp->add_option("--oopt2", ods);
@@ -75,10 +78,73 @@ std::shared_ptr<CLI::App> FuzzApp::generateApp() {
     fApp->add_option("--tup1", p1);
     fApp->add_option("--tup2", t1);
     fApp->add_option("--tup4", tcomplex);
+    vgroup->add_option("--vtup", vectup);
 
     fApp->add_option("--dwrap", dwrap);
     fApp->add_option("--iwrap", iwrap);
-
+    fApp->add_option("--swrap", swrap);
+    // file checks
+    fApp->add_option("--dexists")->check(ExistingDirectory);
+    fApp->add_option("--fexists")->check(ExistingFile);
+    fApp->add_option("--fnexists")->check(NonexistentPath);
+
+    auto *sub = fApp->add_subcommand("sub1");
+
+    sub->add_option("--sopt2", val16)->check(Range(1, 10));
+    sub->add_option("--sopt3", val32)->check(PositiveNumber);
+    sub->add_option("--sopt4", val64)->check(NonNegativeNumber);
+
+    sub->add_option("--sopt5", uval8)->transform(Bound(6, 20));
+    sub->add_option("--sopt6", uval16);
+    sub->add_option("--sopt7", uval32);
+    sub->add_option("--sopt8", uval64);
+
+    sub->add_option("--saopt1", atomicval64);
+    sub->add_option("--saopt2", atomicuval64);
+
+    sub->add_option("--sdopt1", v1);
+    sub->add_option("--sdopt2", v2);
+
+    sub->add_option("--svopt1", vv1);
+    sub->add_option("--svopt2", vvs);
+    sub->add_option("--svopt3", vstr);
+    sub->add_option("--svopt4", vecvecd);
+
+    sub->add_option("--soopt1", od1);
+    sub->add_option("--soopt2", ods);
+
+    sub->add_option("--stup1", p1);
+    sub->add_option("--stup2", t1);
+    sub->add_option("--stup4", tcomplex);
+    sub->add_option("--svtup", vectup);
+
+    sub->add_option("--sdwrap", dwrap);
+    sub->add_option("--siwrap", iwrap);
+
+    auto *resgroup = fApp->add_option_group("outputOrder");
+
+    resgroup->add_option("--vA", vstrA)->expected(0, 2)->multi_option_policy(CLI::MultiOptionPolicy::TakeAll);
+    resgroup->add_option("--vB", vstrB)->expected(0, 2)->multi_option_policy(CLI::MultiOptionPolicy::TakeLast);
+    resgroup->add_option("--vC", vstrC)->expected(0, 2)->multi_option_policy(CLI::MultiOptionPolicy::TakeFirst);
+    resgroup->add_option("--vD", vstrD)->expected(0, 2)->multi_option_policy(CLI::MultiOptionPolicy::Reverse);
+    resgroup->add_option("--vS", val32)->expected(0, 2)->multi_option_policy(CLI::MultiOptionPolicy::Sum);
+    resgroup->add_option("--vM", mergeBuffer)->expected(0, 2)->multi_option_policy(CLI::MultiOptionPolicy::Join);
+    resgroup->add_option("--vE", vstrE)->expected(2, 4)->delimiter(',');
+
+    auto *vldtr = fApp->add_option_group("validators");
+
+    validator_strings.resize(10);
+    vldtr->add_option("--vdtr1", validator_strings[0])->join()->check(CLI::PositiveNumber);
+    vldtr->add_option("--vdtr2", validator_strings[1])->join()->check(CLI::NonNegativeNumber);
+    vldtr->add_option("--vdtr3", validator_strings[2])->join()->check(CLI::NonexistentPath);
+    vldtr->add_option("--vdtr4", validator_strings[3])->join()->check(CLI::Range(7, 3456));
+    vldtr->add_option("--vdtr5", validator_strings[4])
+        ->join()
+        ->check(CLI::Range(std::string("aa"), std::string("zz"), "string range"));
+    vldtr->add_option("--vdtr6", validator_strings[5])->join()->check(CLI::TypeValidator<double>());
+    vldtr->add_option("--vdtr7", validator_strings[6])->join()->check(CLI::TypeValidator<bool>());
+    vldtr->add_option("--vdtr8", validator_strings[7])->join()->check(CLI::ValidIPV4);
+    vldtr->add_option("--vdtr9", validator_strings[8])->join()->transform(CLI::Bound(2, 255));
     return fApp;
 }
 
diff --git a/packages/CLI11/fuzz/fuzzApp.hpp b/packages/CLI11/fuzz/fuzzApp.hpp
index 01600cc259dce0697c5202bc30a321920a9c36e9..73c8f7e9add3111a19d9b201bbb7d227e08fa963 100644
--- a/packages/CLI11/fuzz/fuzzApp.hpp
+++ b/packages/CLI11/fuzz/fuzzApp.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -41,6 +41,16 @@ class doubleWrapper {
     double val{0.0};
 };
 
+class stringWrapper {
+  public:
+    stringWrapper() = default;
+    explicit stringWrapper(std::string_view v) : val(v){};
+    CLI11_NODISCARD std::string value() const { return val; }
+
+  private:
+    std::string val{};
+};
+
 class FuzzApp {
   public:
     FuzzApp() = default;
@@ -65,6 +75,7 @@ class FuzzApp {
 
     std::vector<double> vv1{};
     std::vector<std::string> vstr{};
+
     std::vector<std::vector<double>> vecvecd{};
     std::vector<std::vector<std::string>> vvs{};
     std::optional<double> od1{};
@@ -80,6 +91,7 @@ class FuzzApp {
                std::vector<int>,
                std::optional<std::string>>
         tcomplex2{};
+    std::vector<std::tuple<std::string, double, char, std::vector<std::string>>> vectup{};
     std::string_view vstrv = "";
 
     bool flag1{false};
@@ -88,5 +100,19 @@ class FuzzApp {
 
     intWrapper64 iwrap{0};
     doubleWrapper dwrap{0.0};
+    stringWrapper swrap{};
+    std::string buffer{};
+    int intbuffer{0};
+    std::atomic<double> doubleAtomic{0.0};
+
+    // for testing restrictions and reduction methods
+    std::vector<std::string> vstrA{};
+    std::vector<std::string> vstrB{};
+    std::vector<std::string> vstrC{};
+    std::vector<std::string> vstrD{};
+    std::vector<std::string> vstrE{};
+    std::vector<std::string> vstrF{};
+    std::string mergeBuffer{};
+    std::vector<std::string> validator_strings{};
 };
 }  // namespace CLI
diff --git a/packages/CLI11/fuzz/fuzzCommand.cpp b/packages/CLI11/fuzz/fuzzCommand.cpp
index 07ab0df2e24d8390ef490fa09597cc991430d8a8..2ab9805c816c7f4018fadeddff985ce8f7a8f444 100644
--- a/packages/CLI11/fuzz/fuzzCommand.cpp
+++ b/packages/CLI11/fuzz/fuzzCommand.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -20,5 +20,6 @@ int main(int argc, char **argv) {
         (app)->exit(e);
         // this just indicates we caught an error known by CLI
     }
+
     return 0;
 }
diff --git a/packages/CLI11/fuzz/fuzz_dictionary1.txt b/packages/CLI11/fuzz/fuzz_dictionary1.txt
index c044eecd4b56e798960acdbfb87220203fa43031..327f658ba17c7f96055382be0c3a2357efc8bb6b 100644
--- a/packages/CLI11/fuzz/fuzz_dictionary1.txt
+++ b/packages/CLI11/fuzz/fuzz_dictionary1.txt
@@ -4,6 +4,7 @@
 "-c"
 "-d"
 "-e"
+" "
 "--flag1"
 "--flag"
 "--flag2"
@@ -32,3 +33,142 @@
 "--tup4"
 "--dwrap"
 "--iwrap"
+"--vtup"
+"--atd"
+"sub1"
+"--sflag1"
+"--sflag"
+"--sflag2"
+"--sflagA"
+"--sopt1"
+"--sopt2"
+"--sopt3"
+"--sopt4"
+"--sopt5"
+"--sopt6"
+"--sopt7"
+"--sopt8"
+"--sopt9"
+"--saopt1"
+"--saopt2"
+"--sdopt1"
+"--sdopt2"
+"--svopt1"
+"--svopt2"
+"--svopt3"
+"--svopt4"
+"--soopt1"
+"--soopt2"
+"--stup1"
+"--stup2"
+"--stup4"
+"--sdwrap"
+"--siwrap"
+"--svtup"
+"--satd"
+"--vA"
+"--vB"
+"--vC"
+"--vD"
+"--vS"
+"--vM"
+"--vE"
+"--vdtr"
+"nflag2"
+"stup1"
+"svtup"
+"sdwrap"
+"siwrap"
+"++"
+"="
+"vtup"
+"soopt2"
+"--"
+"svopt4"
+"opt8"
+"config"
+"dwrap"
+"soptneg"
+"flag1"
+"flag"
+"flag2"
+"flag3"
+"enable"
+"help"
+"flagA"
+"opt1"
+"opt2"
+"opt3"
+"opt4"
+"opt5"
+"opt6"
+"opt7"
+"opt8"
+"opt9"
+"aopt1"
+"aopt2"
+"dopt1"
+"dopt2"
+"vopt1"
+"vopt2"
+"vopt3"
+"vopt4"
+"oopt1"
+"oopt2"
+"tup1"
+"tup2"
+"tup4"
+"dwrap"
+"iwrap"
+"swrap"
+"vtup"
+"atd"
+"sflag1"
+"sflag"
+"sflag2"
+"sflagA"
+"sopt1"
+"sopt2"
+"sopt3"
+"sopt4"
+"sopt5"
+"sopt6"
+"sopt7"
+"sopt8"
+"sopt9"
+"saopt1"
+"saopt2"
+"sdopt1"
+"sdopt2"
+"svopt1"
+"svopt2"
+"svopt3"
+"svopt4"
+"soopt1"
+"soopt2"
+"stup1"
+"stup2"
+"stup4"
+"sdwrap"
+"siwrap"
+"vdtr"
+"svtup"
+"satd"
+"%%"
+"dexists"
+"fexists"
+"fnexists"
+",-"
+",--"
+"{false}"
+"{4}"
+"!"
+"{"
+"}"
+"vA"
+"vB"
+"vC"
+"vD"
+"vS"
+"vM"
+"vE"
diff --git a/packages/CLI11/fuzz/fuzz_dictionary2.txt b/packages/CLI11/fuzz/fuzz_dictionary2.txt
index 12dd8f1f6c8fb74cb207817ebd0ea5169354b512..828be3a3c19eedc84af592d68bcc6cd657126034 100644
--- a/packages/CLI11/fuzz/fuzz_dictionary2.txt
+++ b/packages/CLI11/fuzz/fuzz_dictionary2.txt
@@ -35,3 +35,58 @@
 "tup4"
 "dwrap"
 "iwrap"
+"vtup"
+"atd"
+"sub1"
+"soopt1"
+"soopt2"
+"stup1"
+"stup2"
+"stup4"
+"stup2"
+"stup4"
+"sdwrap"
+"siwrap"
+"svtup"
+"satd"
+"sflag1"
+"sflag"
+"sflag2"
+"sflagA"
+"sopt1"
+"sopt2"
+"sopt3"
+"sopt4"
+"sopt5"
+"sopt6"
+"sopt7"
+"sopt8"
+"sopt9"
+"saopt1"
+"saopt2"
+"sdopt1"
+"sdopt2"
+"svopt1"
+"svopt2"
+"svopt3"
+"svopt4"
+"config"
+"nflag2"
+"vdtr"
+"--"
+"fuzzer"
+"t-"
+"++"
+"su"
+"%%"
+"swrap"
+"dexists"
+"fexists"
+"fnexists"
+"vA"
+"vB"
+"vC"
+"vD"
+"vS"
+"vM"
+"vE"
diff --git a/packages/CLI11/include/CLI/App.hpp b/packages/CLI11/include/CLI/App.hpp
index 2676445d155562f77aee64738768c64d024e5ec3..b63be47e78270d5a3a139205b1308aa981b514fb 100644
--- a/packages/CLI11/include/CLI/App.hpp
+++ b/packages/CLI11/include/CLI/App.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -290,6 +290,14 @@ class App {
 
     ///@}
 
+#ifdef _WIN32
+    /// When normalizing argv to UTF-8 on Windows, this is the storage for normalized args.
+    std::vector<std::string> normalized_argv_{};
+
+    /// When normalizing argv to UTF-8 on Windows, this is the `char**` value returned to the user.
+    std::vector<char *> normalized_argv_view_{};
+#endif
+
     /// Special private constructor for subcommand
     App(std::string app_description, std::string app_name, App *parent);
 
@@ -309,6 +317,9 @@ class App {
     /// virtual destructor
     virtual ~App() = default;
 
+    /// Convert the contents of argv to UTF-8. Only does something on Windows, does nothing elsewhere.
+    CLI11_NODISCARD char **ensure_utf8(char **argv);
+
     /// Set a callback for execution when all parsing and processing has completed
     ///
     /// Due to a bug in c++11,
@@ -1223,6 +1234,9 @@ class App {
     /// Read and process a configuration file (main app only)
     void _process_config_file();
 
+    /// Read and process a particular configuration file
+    void _process_config_file(const std::string &config_file, bool throw_error);
+
     /// Get envname options if not yet passed. Runs on *all* subcommands.
     void _process_env();
 
diff --git a/packages/CLI11/include/CLI/Argv.hpp b/packages/CLI11/include/CLI/Argv.hpp
index 35d81a6eaf3d888ccd245469bce2386800053b95..545bd58d1a8b9775be01d93dcf316b3f0f1e2533 100644
--- a/packages/CLI11/include/CLI/Argv.hpp
+++ b/packages/CLI11/include/CLI/Argv.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -6,10 +6,21 @@
 
 #pragma once
 
+// [CLI11:public_includes:set]
+#include <string>
+#include <vector>
+// [CLI11:public_includes:end]
+
 #include <CLI/Macros.hpp>
 
 namespace CLI {
 // [CLI11:argv_hpp:verbatim]
+namespace detail {
+#ifdef _WIN32
+/// Decode and return UTF-8 argv from GetCommandLineW.
+CLI11_INLINE std::vector<std::string> compute_win32_argv();
+#endif
+}  // namespace detail
 
 /// argc as passed in to this executable.
 CLI11_INLINE int argc();
diff --git a/packages/CLI11/include/CLI/CLI.hpp b/packages/CLI11/include/CLI/CLI.hpp
index fa9d4bb5394e6a88f146ffde82b01bcfc2e26f75..df401e0039d17d3c516c970991a5753af28f8bcb 100644
--- a/packages/CLI11/include/CLI/CLI.hpp
+++ b/packages/CLI11/include/CLI/CLI.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/include/CLI/Config.hpp b/packages/CLI11/include/CLI/Config.hpp
index a91f0da6e8d371d6002066020df9f396260f32dd..942c43f4024e8f1048dd7e951e2adb2f61eedf9a 100644
--- a/packages/CLI11/include/CLI/Config.hpp
+++ b/packages/CLI11/include/CLI/Config.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -24,7 +24,10 @@ namespace CLI {
 // [CLI11:config_hpp:verbatim]
 namespace detail {
 
-std::string convert_arg_for_ini(const std::string &arg, char stringQuote = '"', char characterQuote = '\'');
+std::string convert_arg_for_ini(const std::string &arg,
+                                char stringQuote = '"',
+                                char literalQuote = '\'',
+                                bool disable_multi_line = false);
 
 /// Comma separated join, adds quotes if needed
 std::string ini_join(const std::vector<std::string> &args,
@@ -32,7 +35,9 @@ std::string ini_join(const std::vector<std::string> &args,
                      char arrayStart = '[',
                      char arrayEnd = ']',
                      char stringQuote = '"',
-                     char characterQuote = '\'');
+                     char literalQuote = '\'');
+
+void clean_name_string(std::string &name, const std::string &keyChars);
 
 std::vector<std::string> generate_parents(const std::string &section, std::string &name, char parentSeparator);
 
diff --git a/packages/CLI11/include/CLI/ConfigFwd.hpp b/packages/CLI11/include/CLI/ConfigFwd.hpp
index a9ae2176a953c40cd0f41cc5eb85c408f6174721..fabf84dcef33dfcd3f1a55ecb0b89f28f18ecc66 100644
--- a/packages/CLI11/include/CLI/ConfigFwd.hpp
+++ b/packages/CLI11/include/CLI/ConfigFwd.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -10,6 +10,7 @@
 #include <algorithm>
 #include <fstream>
 #include <iostream>
+#include <memory>
 #include <string>
 #include <vector>
 // [CLI11:public_includes:end]
@@ -29,7 +30,6 @@ struct ConfigItem {
 
     /// This is the name
     std::string name{};
-
     /// Listing of inputs
     std::vector<std::string> inputs{};
 
@@ -92,8 +92,8 @@ class ConfigBase : public Config {
     char valueDelimiter = '=';
     /// the character to use around strings
     char stringQuote = '"';
-    /// the character to use around single characters
-    char characterQuote = '\'';
+    /// the character to use around single characters and literal strings
+    char literalQuote = '\'';
     /// the maximum number of layers to allow
     uint8_t maximumLayers{255};
     /// the separator used to separator parent layers
@@ -129,10 +129,10 @@ class ConfigBase : public Config {
         valueDelimiter = vSep;
         return this;
     }
-    /// Specify the quote characters used around strings and characters
-    ConfigBase *quoteCharacter(char qString, char qChar) {
+    /// Specify the quote characters used around strings and literal strings
+    ConfigBase *quoteCharacter(char qString, char literalChar) {
         stringQuote = qString;
-        characterQuote = qChar;
+        literalQuote = literalChar;
         return this;
     }
     /// Specify the maximum number of parents
diff --git a/packages/CLI11/include/CLI/Encoding.hpp b/packages/CLI11/include/CLI/Encoding.hpp
index 379e33b20ce9e4ae58b49f15ce1f06eee4427107..d723878f372437677c3eade145b58aeecdd9d78f 100644
--- a/packages/CLI11/include/CLI/Encoding.hpp
+++ b/packages/CLI11/include/CLI/Encoding.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/include/CLI/Error.hpp b/packages/CLI11/include/CLI/Error.hpp
index 0900da53ce374d07fa1cceaa23a55bf4e19b2875..2d6f673e943090fa0e9f285cd3592271370f71fb 100644
--- a/packages/CLI11/include/CLI/Error.hpp
+++ b/packages/CLI11/include/CLI/Error.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -123,7 +123,13 @@ class BadNameString : public ConstructionError {
     CLI11_ERROR_DEF(ConstructionError, BadNameString)
     CLI11_ERROR_SIMPLE(BadNameString)
     static BadNameString OneCharName(std::string name) { return BadNameString("Invalid one char name: " + name); }
+    static BadNameString MissingDash(std::string name) {
+        return BadNameString("Long names strings require 2 dashes " + name);
+    }
     static BadNameString BadLongName(std::string name) { return BadNameString("Bad long name: " + name); }
+    static BadNameString BadPositionalName(std::string name) {
+        return BadNameString("Invalid positional Name: " + name);
+    }
     static BadNameString DashesOnly(std::string name) {
         return BadNameString("Must have a name, not just dashes: " + name);
     }
diff --git a/packages/CLI11/include/CLI/Formatter.hpp b/packages/CLI11/include/CLI/Formatter.hpp
index f58058f27add2842b57d776b54e91b7ea930e62a..bc54caf5529cc1afc2d29f83ac0e1f0045179b01 100644
--- a/packages/CLI11/include/CLI/Formatter.hpp
+++ b/packages/CLI11/include/CLI/Formatter.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/include/CLI/FormatterFwd.hpp b/packages/CLI11/include/CLI/FormatterFwd.hpp
index 5ef0a5b585e377554c036ea5e7f41994e476105a..a0949b49d1f1e10dd6ae7844cc73c517d9473a27 100644
--- a/packages/CLI11/include/CLI/FormatterFwd.hpp
+++ b/packages/CLI11/include/CLI/FormatterFwd.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/include/CLI/Macros.hpp b/packages/CLI11/include/CLI/Macros.hpp
index c7ac94e8741370f3a1633dfcf8568af8ac8339d8..3fd26475e5d644265029352a8ea34daa39fa46d3 100644
--- a/packages/CLI11/include/CLI/Macros.hpp
+++ b/packages/CLI11/include/CLI/Macros.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/include/CLI/Option.hpp b/packages/CLI11/include/CLI/Option.hpp
index d32350738e34031a14c3be68e59d75a09b20831b..a0fa7ceac3aa3c4fe3b21bafb0b920952afec067 100644
--- a/packages/CLI11/include/CLI/Option.hpp
+++ b/packages/CLI11/include/CLI/Option.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -41,7 +41,8 @@ enum class MultiOptionPolicy : char {
     TakeFirst,  //!< take only the first Expected number of arguments
     Join,       //!< merge all the arguments together into a single string via the delimiter character default('\n')
     TakeAll,    //!< just get all the passed argument regardless
-    Sum         //!< sum all the arguments together if numerical or concatenate directly without delimiter
+    Sum,        //!< sum all the arguments together if numerical or concatenate directly without delimiter
+    Reverse,    //!< take only the last Expected number of arguments in reverse order
 };
 
 /// This is the CRTP base class for Option and OptionDefaults. It was designed this way
@@ -549,12 +550,12 @@ class Option : public OptionBase<Option> {
         if(!lnames_.empty()) {
             return lnames_[0];
         }
-        if(!pname_.empty()) {
-            return pname_;
-        }
         if(!snames_.empty()) {
             return snames_[0];
         }
+        if(!pname_.empty()) {
+            return pname_;
+        }
         return envname_;
     }
     /// The number of times the option expects to be included
@@ -577,13 +578,13 @@ class Option : public OptionBase<Option> {
     CLI11_NODISCARD int get_items_expected() const { return get_items_expected_min(); }
 
     /// True if the argument can be given directly
-    CLI11_NODISCARD bool get_positional() const { return pname_.length() > 0; }
+    CLI11_NODISCARD bool get_positional() const { return !pname_.empty(); }
 
     /// True if option has at least one non-positional name
-    CLI11_NODISCARD bool nonpositional() const { return (snames_.size() + lnames_.size()) > 0; }
+    CLI11_NODISCARD bool nonpositional() const { return (!lnames_.empty() || !snames_.empty()); }
 
     /// True if option has description
-    CLI11_NODISCARD bool has_description() const { return description_.length() > 0; }
+    CLI11_NODISCARD bool has_description() const { return !description_.empty(); }
 
     /// Get the description
     CLI11_NODISCARD const std::string &get_description() const { return description_; }
diff --git a/packages/CLI11/include/CLI/Split.hpp b/packages/CLI11/include/CLI/Split.hpp
index d00e7f8cbe6b5bd5f0c395de7b96164e575e20e1..165575393a6c36d2be97fc2dc535d7140a27ab14 100644
--- a/packages/CLI11/include/CLI/Split.hpp
+++ b/packages/CLI11/include/CLI/Split.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/include/CLI/StringTools.hpp b/packages/CLI11/include/CLI/StringTools.hpp
index 2a31005c858a00a71f489cf4c485d7805cb410cb..fb0069b7161e8cda5d041ff4b8a1b8016c9f93db 100644
--- a/packages/CLI11/include/CLI/StringTools.hpp
+++ b/packages/CLI11/include/CLI/StringTools.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -120,6 +120,9 @@ inline std::string trim_copy(const std::string &str) {
 /// remove quotes at the front and back of a string either '"' or '\''
 CLI11_INLINE std::string &remove_quotes(std::string &str);
 
+/// remove quotes from all elements of a string vector and process escaped components
+CLI11_INLINE void remove_quotes(std::vector<std::string> &args);
+
 /// Add a leader to the beginning of all new lines (nothing is added
 /// at the start of the first line). `"; "` would be for ini files
 ///
@@ -140,14 +143,16 @@ CLI11_INLINE std::ostream &format_aliases(std::ostream &out, const std::vector<s
 
 /// Verify the first character of an option
 /// - is a trigger character, ! has special meaning and new lines would just be annoying to deal with
-template <typename T> bool valid_first_char(T c) { return ((c != '-') && (c != '!') && (c != ' ') && c != '\n'); }
+template <typename T> bool valid_first_char(T c) {
+    return ((c != '-') && (static_cast<unsigned char>(c) > 33));  // space and '!' not allowed
+}
 
 /// Verify following characters of an option
 template <typename T> bool valid_later_char(T c) {
     // = and : are value separators, { has special meaning for option defaults,
-    // and \n would just be annoying to deal with in many places allowing space here has too much potential for
-    // inadvertent entry errors and bugs
-    return ((c != '=') && (c != ':') && (c != '{') && (c != ' ') && c != '\n');
+    // and control codes other than tab would just be annoying to deal with in many places allowing space here has too
+    // much potential for inadvertent entry errors and bugs
+    return ((c != '=') && (c != ':') && (c != '{') && ((static_cast<unsigned char>(c) > 32) || c == '\t'));
 }
 
 /// Verify an option/subcommand name
@@ -210,18 +215,46 @@ template <typename Callable> inline std::string find_and_modify(std::string str,
     return str;
 }
 
+/// close a sequence of characters indicated by a closure character.  Brackets allows sub sequences
+/// recognized bracket sequences include "'`[(<{  other closure characters are assumed to be literal strings
+CLI11_INLINE std::size_t close_sequence(const std::string &str, std::size_t start, char closure_char);
+
 /// Split a string '"one two" "three"' into 'one two', 'three'
-/// Quote characters can be ` ' or "
+/// Quote characters can be ` ' or " or bracket characters [{(< with matching to the matching bracket
 CLI11_INLINE std::vector<std::string> split_up(std::string str, char delimiter = '\0');
 
+/// get the value of an environmental variable or empty string if empty
+CLI11_INLINE std::string get_environment_value(const std::string &env_name);
+
 /// This function detects an equal or colon followed by an escaped quote after an argument
 /// then modifies the string to replace the equality with a space.  This is needed
 /// to allow the split up function to work properly and is intended to be used with the find_and_modify function
 /// the return value is the offset+1 which is required by the find_and_modify function.
 CLI11_INLINE std::size_t escape_detect(std::string &str, std::size_t offset);
 
-/// Add quotes if the string contains spaces
-CLI11_INLINE std::string &add_quotes_if_needed(std::string &str);
+/// @brief  detect if a string has escapable characters
+/// @param str the string to do the detection on
+/// @return true if the string has escapable characters
+CLI11_INLINE bool has_escapable_character(const std::string &str);
+
+/// @brief escape all escapable characters
+/// @param str the string to escape
+/// @return a string with the escapble characters escaped with '\'
+CLI11_INLINE std::string add_escaped_characters(const std::string &str);
+
+/// @brief replace the escaped characters with their equivalent
+CLI11_INLINE std::string remove_escaped_characters(const std::string &str);
+
+/// generate a string with all non printable characters escaped to hex codes
+CLI11_INLINE std::string binary_escape_string(const std::string &string_to_escape);
+
+CLI11_INLINE bool is_binary_escaped_string(const std::string &escaped_string);
+
+/// extract an escaped binary_string
+CLI11_INLINE std::string extract_binary_string(const std::string &escaped_string);
+
+/// process a quoted string, remove the quotes and if appropriate handle escaped characters
+CLI11_INLINE bool process_quoted_string(std::string &str, char string_char = '\"', char literal_char = '\'');
 
 }  // namespace detail
 
diff --git a/packages/CLI11/include/CLI/Timer.hpp b/packages/CLI11/include/CLI/Timer.hpp
index b185d3302a7ef82703bc642a635f5224c6dcdf6d..7ffc2d9b2e5a62551f3c4a98947b498e9d454446 100644
--- a/packages/CLI11/include/CLI/Timer.hpp
+++ b/packages/CLI11/include/CLI/Timer.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -126,7 +126,7 @@ class AutoTimer : public Timer {
     // GCC 4.7 does not support using inheriting constructors.
 
     /// This destructor prints the string
-    ~AutoTimer() { std::cout << to_string() << std::endl; }
+    ~AutoTimer() { std::cout << to_string() << '\n'; }
 };
 
 }  // namespace CLI
diff --git a/packages/CLI11/include/CLI/TypeTools.hpp b/packages/CLI11/include/CLI/TypeTools.hpp
index 9d43ea3614011db9e59f9827676b5a34ed976294..7e66c6adc7c668f91762e076a06cf150073fb03e 100644
--- a/packages/CLI11/include/CLI/TypeTools.hpp
+++ b/packages/CLI11/include/CLI/TypeTools.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -7,6 +7,7 @@
 #pragma once
 
 // [CLI11:public_includes:set]
+#include <algorithm>
 #include <cmath>
 #include <cstdint>
 #include <exception>
@@ -607,12 +608,23 @@ template <typename T> struct classify_object<T, typename std::enable_if<is_bool<
 template <typename T> struct classify_object<T, typename std::enable_if<std::is_floating_point<T>::value>::type> {
     static constexpr object_category value{object_category::floating_point};
 };
+#if defined _MSC_VER
+// in MSVC wstring should take precedence if available this isn't as useful on other compilers due to the broader use of
+// utf-8 encoding
+#define WIDE_STRING_CHECK                                                                                              \
+    !std::is_assignable<T &, std::wstring>::value && !std::is_constructible<T, std::wstring>::value
+#define STRING_CHECK true
+#else
+#define WIDE_STRING_CHECK true
+#define STRING_CHECK !std::is_assignable<T &, std::string>::value && !std::is_constructible<T, std::string>::value
+#endif
 
 /// String and similar direct assignment
 template <typename T>
-struct classify_object<T,
-                       typename std::enable_if<!std::is_floating_point<T>::value && !std::is_integral<T>::value &&
-                                               std::is_assignable<T &, std::string>::value>::type> {
+struct classify_object<
+    T,
+    typename std::enable_if<!std::is_floating_point<T>::value && !std::is_integral<T>::value && WIDE_STRING_CHECK &&
+                            std::is_assignable<T &, std::string>::value>::type> {
     static constexpr object_category value{object_category::string_assignable};
 };
 
@@ -622,7 +634,7 @@ struct classify_object<
     T,
     typename std::enable_if<!std::is_floating_point<T>::value && !std::is_integral<T>::value &&
                             !std::is_assignable<T &, std::string>::value && (type_count<T>::value == 1) &&
-                            std::is_constructible<T, std::string>::value>::type> {
+                            WIDE_STRING_CHECK && std::is_constructible<T, std::string>::value>::type> {
     static constexpr object_category value{object_category::string_constructible};
 };
 
@@ -630,9 +642,7 @@ struct classify_object<
 template <typename T>
 struct classify_object<T,
                        typename std::enable_if<!std::is_floating_point<T>::value && !std::is_integral<T>::value &&
-                                               !std::is_assignable<T &, std::string>::value &&
-                                               !std::is_constructible<T, std::string>::value &&
-                                               std::is_assignable<T &, std::wstring>::value>::type> {
+                                               STRING_CHECK && std::is_assignable<T &, std::wstring>::value>::type> {
     static constexpr object_category value{object_category::wstring_assignable};
 };
 
@@ -640,10 +650,8 @@ template <typename T>
 struct classify_object<
     T,
     typename std::enable_if<!std::is_floating_point<T>::value && !std::is_integral<T>::value &&
-                            !std::is_assignable<T &, std::string>::value &&
-                            !std::is_constructible<T, std::string>::value &&
                             !std::is_assignable<T &, std::wstring>::value && (type_count<T>::value == 1) &&
-                            std::is_constructible<T, std::wstring>::value>::type> {
+                            STRING_CHECK && std::is_constructible<T, std::wstring>::value>::type> {
     static constexpr object_category value{object_category::wstring_constructible};
 };
 
@@ -854,7 +862,7 @@ bool integral_conversion(const std::string &input, T &output) noexcept {
     if(input.empty() || input.front() == '-') {
         return false;
     }
-    char *val = nullptr;
+    char *val{nullptr};
     errno = 0;
     std::uint64_t output_ll = std::strtoull(input.c_str(), &val, 0);
     if(errno == ERANGE) {
@@ -870,6 +878,33 @@ bool integral_conversion(const std::string &input, T &output) noexcept {
         output = (output_sll < 0) ? static_cast<T>(0) : static_cast<T>(output_sll);
         return (static_cast<std::int64_t>(output) == output_sll);
     }
+    // remove separators
+    if(input.find_first_of("_'") != std::string::npos) {
+        std::string nstring = input;
+        nstring.erase(std::remove(nstring.begin(), nstring.end(), '_'), nstring.end());
+        nstring.erase(std::remove(nstring.begin(), nstring.end(), '\''), nstring.end());
+        return integral_conversion(nstring, output);
+    }
+    if(input.compare(0, 2, "0o") == 0) {
+        val = nullptr;
+        errno = 0;
+        output_ll = std::strtoull(input.c_str() + 2, &val, 8);
+        if(errno == ERANGE) {
+            return false;
+        }
+        output = static_cast<T>(output_ll);
+        return (val == (input.c_str() + input.size()) && static_cast<std::uint64_t>(output) == output_ll);
+    }
+    if(input.compare(0, 2, "0b") == 0) {
+        val = nullptr;
+        errno = 0;
+        output_ll = std::strtoull(input.c_str() + 2, &val, 2);
+        if(errno == ERANGE) {
+            return false;
+        }
+        output = static_cast<T>(output_ll);
+        return (val == (input.c_str() + input.size()) && static_cast<std::uint64_t>(output) == output_ll);
+    }
     return false;
 }
 
@@ -894,11 +929,38 @@ bool integral_conversion(const std::string &input, T &output) noexcept {
         output = static_cast<T>(1);
         return true;
     }
+    // remove separators
+    if(input.find_first_of("_'") != std::string::npos) {
+        std::string nstring = input;
+        nstring.erase(std::remove(nstring.begin(), nstring.end(), '_'), nstring.end());
+        nstring.erase(std::remove(nstring.begin(), nstring.end(), '\''), nstring.end());
+        return integral_conversion(nstring, output);
+    }
+    if(input.compare(0, 2, "0o") == 0) {
+        val = nullptr;
+        errno = 0;
+        output_ll = std::strtoll(input.c_str() + 2, &val, 8);
+        if(errno == ERANGE) {
+            return false;
+        }
+        output = static_cast<T>(output_ll);
+        return (val == (input.c_str() + input.size()) && static_cast<std::int64_t>(output) == output_ll);
+    }
+    if(input.compare(0, 2, "0b") == 0) {
+        val = nullptr;
+        errno = 0;
+        output_ll = std::strtoll(input.c_str() + 2, &val, 2);
+        if(errno == ERANGE) {
+            return false;
+        }
+        output = static_cast<T>(output_ll);
+        return (val == (input.c_str() + input.size()) && static_cast<std::int64_t>(output) == output_ll);
+    }
     return false;
 }
 
-/// Convert a flag into an integer value  typically binary flags
-inline std::int64_t to_flag_value(std::string val) {
+/// Convert a flag into an integer value  typically binary flags sets errno to nonzero if conversion failed
+inline std::int64_t to_flag_value(std::string val) noexcept {
     static const std::string trueString("true");
     static const std::string falseString("false");
     if(val == trueString) {
@@ -926,7 +988,8 @@ inline std::int64_t to_flag_value(std::string val) {
             ret = 1;
             break;
         default:
-            throw std::invalid_argument("unrecognized character");
+            errno = EINVAL;
+            return -1;
         }
         return ret;
     }
@@ -935,7 +998,11 @@ inline std::int64_t to_flag_value(std::string val) {
     } else if(val == falseString || val == "off" || val == "no" || val == "disable") {
         ret = -1;
     } else {
-        ret = std::stoll(val);
+        char *loc_ptr{nullptr};
+        ret = std::strtoll(val.c_str(), &loc_ptr, 0);
+        if(loc_ptr != (val.c_str() + val.size()) && errno == 0) {
+            errno = EINVAL;
+        }
     }
     return ret;
 }
@@ -964,18 +1031,16 @@ bool lexical_cast(const std::string &input, T &output) {
 template <typename T,
           enable_if_t<classify_object<T>::value == object_category::boolean_value, detail::enabler> = detail::dummy>
 bool lexical_cast(const std::string &input, T &output) {
-    try {
-        auto out = to_flag_value(input);
+    errno = 0;
+    auto out = to_flag_value(input);
+    if(errno == 0) {
         output = (out > 0);
-        return true;
-    } catch(const std::invalid_argument &) {
-        return false;
-    } catch(const std::out_of_range &) {
-        // if the number is out of the range of a 64 bit value then it is still a number and for this purpose is still
-        // valid all we care about the sign
+    } else if(errno == ERANGE) {
         output = (input[0] != '-');
-        return true;
+    } else {
+        return false;
     }
+    return true;
 }
 
 /// Floats
@@ -988,7 +1053,17 @@ bool lexical_cast(const std::string &input, T &output) {
     char *val = nullptr;
     auto output_ld = std::strtold(input.c_str(), &val);
     output = static_cast<T>(output_ld);
-    return val == (input.c_str() + input.size());
+    if(val == (input.c_str() + input.size())) {
+        return true;
+    }
+    // remove separators
+    if(input.find_first_of("_'") != std::string::npos) {
+        std::string nstring = input;
+        nstring.erase(std::remove(nstring.begin(), nstring.end(), '_'), nstring.end());
+        nstring.erase(std::remove(nstring.begin(), nstring.end(), '\''), nstring.end());
+        return lexical_cast(nstring, output);
+    }
+    return false;
 }
 
 /// complex
@@ -1309,9 +1384,7 @@ bool lexical_conversion(const std::vector<std ::string> &strings, AssignTo &outp
     FirstType v1;
     SecondType v2;
     bool retval = lexical_assign<FirstType, FirstType>(strings[0], v1);
-    if(strings.size() > 1) {
-        retval = retval && lexical_assign<SecondType, SecondType>(strings[1], v2);
-    }
+    retval = retval && lexical_assign<SecondType, SecondType>((strings.size() > 1) ? strings[1] : std::string{}, v2);
     if(retval) {
         output = AssignTo{v1, v2};
     }
@@ -1326,6 +1399,9 @@ template <class AssignTo,
                       detail::enabler> = detail::dummy>
 bool lexical_conversion(const std::vector<std ::string> &strings, AssignTo &output) {
     output.erase(output.begin(), output.end());
+    if(strings.empty()) {
+        return true;
+    }
     if(strings.size() == 1 && strings[0] == "{}") {
         return true;
     }
@@ -1628,12 +1704,13 @@ inline std::string sum_string_vector(const std::vector<std::string> &values) {
         double tv{0.0};
         auto comp = lexical_cast(arg, tv);
         if(!comp) {
-            try {
-                tv = static_cast<double>(detail::to_flag_value(arg));
-            } catch(const std::exception &) {
-                fail = true;
+            errno = 0;
+            auto fv = detail::to_flag_value(arg);
+            fail = (errno != 0);
+            if(fail) {
                 break;
             }
+            tv = static_cast<double>(fv);
         }
         val += tv;
     }
@@ -1642,13 +1719,10 @@ inline std::string sum_string_vector(const std::vector<std::string> &values) {
             output.append(arg);
         }
     } else {
-        if(val <= static_cast<double>((std::numeric_limits<std::int64_t>::min)()) ||
-           val >= static_cast<double>((std::numeric_limits<std::int64_t>::max)()) ||
-           std::ceil(val) == std::floor(val)) {
-            output = detail::value_string(static_cast<int64_t>(val));
-        } else {
-            output = detail::value_string(val);
-        }
+        std::ostringstream out;
+        out.precision(16);
+        out << val;
+        output = out.str();
     }
     return output;
 }
diff --git a/packages/CLI11/include/CLI/Validators.hpp b/packages/CLI11/include/CLI/Validators.hpp
index 59d800de860e70b98005038038d77729cc895d33..bdddeb84f2ac95a303bb3a74b0b01df3e0c363bf 100644
--- a/packages/CLI11/include/CLI/Validators.hpp
+++ b/packages/CLI11/include/CLI/Validators.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -218,6 +218,11 @@ class IPV4Validator : public Validator {
     IPV4Validator();
 };
 
+class EscapedStringTransformer : public Validator {
+  public:
+    EscapedStringTransformer();
+};
+
 }  // namespace detail
 
 // Static is not needed here, because global const implies static.
@@ -237,6 +242,9 @@ const detail::NonexistentPathValidator NonexistentPath;
 /// Check for an IP4 address
 const detail::IPV4Validator ValidIPV4;
 
+/// convert escaped characters into their associated values
+const detail::EscapedStringTransformer EscapedString;
+
 /// Validate the input as a particular type
 template <typename DesiredType> class TypeValidator : public Validator {
   public:
diff --git a/packages/CLI11/include/CLI/Version.hpp b/packages/CLI11/include/CLI/Version.hpp
index d5c817a9c3d84c2b17d40e3d61e20d57e2697bf4..7e9db02a68f722f015724b57ff42e6e1bf76d7fc 100644
--- a/packages/CLI11/include/CLI/Version.hpp
+++ b/packages/CLI11/include/CLI/Version.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -9,8 +9,8 @@
 // [CLI11:version_hpp:verbatim]
 
 #define CLI11_VERSION_MAJOR 2
-#define CLI11_VERSION_MINOR 3
-#define CLI11_VERSION_PATCH 2
-#define CLI11_VERSION "2.3.2"
+#define CLI11_VERSION_MINOR 4
+#define CLI11_VERSION_PATCH 0
+#define CLI11_VERSION "2.4.0"
 
 // [CLI11:version_hpp:end]
diff --git a/packages/CLI11/include/CLI/impl/App_inl.hpp b/packages/CLI11/include/CLI/impl/App_inl.hpp
index 7d487442f924d56758e68d8d88eae1767a26daf0..ae8b5f33960021fc000c31d0f9d45a1525944836 100644
--- a/packages/CLI11/include/CLI/impl/App_inl.hpp
+++ b/packages/CLI11/include/CLI/impl/App_inl.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -57,10 +57,32 @@ CLI11_INLINE App::App(std::string app_description, std::string app_name, App *pa
     }
 }
 
+CLI11_NODISCARD CLI11_INLINE char **App::ensure_utf8(char **argv) {
+#ifdef _WIN32
+    (void)argv;
+
+    normalized_argv_ = detail::compute_win32_argv();
+
+    if(!normalized_argv_view_.empty()) {
+        normalized_argv_view_.clear();
+    }
+
+    normalized_argv_view_.reserve(normalized_argv_.size());
+    for(auto &arg : normalized_argv_) {
+        // using const_cast is well-defined, string is known to not be const.
+        normalized_argv_view_.push_back(const_cast<char *>(arg.data()));
+    }
+
+    return normalized_argv_view_.data();
+#else
+    return argv;
+#endif
+}
+
 CLI11_INLINE App *App::name(std::string app_name) {
 
     if(parent_ != nullptr) {
-        auto oname = name_;
+        std::string oname = name_;
         name_ = app_name;
         const auto &res = _compare_subcommand_names(*this, *_get_fallthrough_parent());
         if(!res.empty()) {
@@ -141,6 +163,32 @@ CLI11_INLINE Option *App::add_option(std::string option_name,
 
     if(std::find_if(std::begin(options_), std::end(options_), [&myopt](const Option_p &v) { return *v == myopt; }) ==
        std::end(options_)) {
+        if(myopt.lnames_.empty() && myopt.snames_.empty()) {
+            // if the option is positional only there is additional potential for ambiguities in config files and needs
+            // to be checked
+            std::string test_name = "--" + myopt.get_single_name();
+            if(test_name.size() == 3) {
+                test_name.erase(0, 1);
+            }
+
+            auto *op = get_option_no_throw(test_name);
+            if(op != nullptr) {
+                throw(OptionAlreadyAdded("added option positional name matches existing option: " + test_name));
+            }
+        } else if(parent_ != nullptr) {
+            for(auto &ln : myopt.lnames_) {
+                auto *op = parent_->get_option_no_throw(ln);
+                if(op != nullptr) {
+                    throw(OptionAlreadyAdded("added option matches existing positional option: " + ln));
+                }
+            }
+            for(auto &sn : myopt.snames_) {
+                auto *op = parent_->get_option_no_throw(sn);
+                if(op != nullptr) {
+                    throw(OptionAlreadyAdded("added option matches existing positional option: " + sn));
+                }
+            }
+        }
         options_.emplace_back();
         Option_p &option = options_.back();
         option.reset(new Option(option_name, option_description, option_callback, this));
@@ -315,8 +363,11 @@ CLI11_INLINE Option *App::set_config(std::string option_name,
         }
         if(!default_filename.empty()) {
             config_ptr_->default_str(std::move(default_filename));
+            config_ptr_->force_callback_ = true;
         }
         config_ptr_->configurable(false);
+        // set the option to take the last value and reverse given by default
+        config_ptr_->multi_option_policy(MultiOptionPolicy::Reverse);
     }
 
     return config_ptr_;
@@ -346,13 +397,14 @@ CLI11_INLINE bool App::remove_option(Option *opt) {
 CLI11_INLINE App *App::add_subcommand(std::string subcommand_name, std::string subcommand_description) {
     if(!subcommand_name.empty() && !detail::valid_name_string(subcommand_name)) {
         if(!detail::valid_first_char(subcommand_name[0])) {
-            throw IncorrectConstruction("Subcommand name starts with invalid character, '!' and '-' are not allowed");
+            throw IncorrectConstruction(
+                "Subcommand name starts with invalid character, '!' and '-' and control characters");
         }
         for(auto c : subcommand_name) {
             if(!detail::valid_later_char(c)) {
                 throw IncorrectConstruction(std::string("Subcommand name contains invalid character ('") + c +
                                             "'), all characters are allowed except"
-                                            "'=',':','{','}', and ' '");
+                                            "'=',':','{','}', ' ', and control characters");
             }
         }
     }
@@ -527,8 +579,12 @@ CLI11_INLINE void App::parse(std::string commandline, bool program_name_included
     auto args = detail::split_up(std::move(commandline));
     // remove all empty strings
     args.erase(std::remove(args.begin(), args.end(), std::string{}), args.end());
+    try {
+        detail::remove_quotes(args);
+    } catch(const std::invalid_argument &arg) {
+        throw CLI::ParseError(arg.what(), CLI::ExitCodes::InvalidError);
+    }
     std::reverse(args.begin(), args.end());
-
     parse(std::move(args));
 }
 
@@ -602,7 +658,7 @@ CLI11_INLINE int App::exit(const Error &e, std::ostream &out, std::ostream &err)
     }
 
     if(e.get_name() == "CallForVersion") {
-        out << e.what() << std::endl;
+        out << e.what() << '\n';
         return e.get_exit_code();
     }
 
@@ -698,7 +754,8 @@ CLI11_NODISCARD CLI11_INLINE std::string App::help(std::string prev, AppFormatMo
 CLI11_NODISCARD CLI11_INLINE std::string App::version() const {
     std::string val;
     if(version_ptr_ != nullptr) {
-        auto rv = version_ptr_->results();
+        // copy the results for reuse later
+        results_t rv = version_ptr_->results();
         version_ptr_->clear();
         version_ptr_->add_result("true");
         try {
@@ -807,7 +864,7 @@ CLI11_NODISCARD CLI11_INLINE bool App::check_name(std::string name_to_check) con
     if(local_name == name_to_check) {
         return true;
     }
-    for(auto les : aliases_) {  // NOLINT(performance-for-range-copy)
+    for(std::string les : aliases_) {  // NOLINT(performance-for-range-copy)
         if(ignore_underscore_) {
             les = detail::remove_underscore(les);
         }
@@ -1010,34 +1067,42 @@ CLI11_NODISCARD CLI11_INLINE detail::Classifier App::_recognize(const std::strin
     return detail::Classifier::NONE;
 }
 
+CLI11_INLINE void App::_process_config_file(const std::string &config_file, bool throw_error) {
+    auto path_result = detail::check_path(config_file.c_str());
+    if(path_result == detail::path_type::file) {
+        try {
+            std::vector<ConfigItem> values = config_formatter_->from_file(config_file);
+            _parse_config(values);
+        } catch(const FileError &) {
+            if(throw_error)
+                throw;
+        }
+    } else if(throw_error) {
+        throw FileError::Missing(config_file);
+    }
+}
+
 CLI11_INLINE void App::_process_config_file() {
     if(config_ptr_ != nullptr) {
         bool config_required = config_ptr_->get_required();
         auto file_given = config_ptr_->count() > 0;
+        if(!(file_given || config_ptr_->envname_.empty())) {
+            std::string ename_string = detail::get_environment_value(config_ptr_->envname_);
+            if(!ename_string.empty()) {
+                config_ptr_->add_result(ename_string);
+            }
+        }
+        config_ptr_->run_callback();
+
         auto config_files = config_ptr_->as<std::vector<std::string>>();
         if(config_files.empty() || config_files.front().empty()) {
             if(config_required) {
-                throw FileError::Missing("no specified config file");
+                throw FileError("config file is required but none was given");
             }
             return;
         }
-        for(auto rit = config_files.rbegin(); rit != config_files.rend(); ++rit) {
-            const auto &config_file = *rit;
-            auto path_result = detail::check_path(config_file.c_str());
-            if(path_result == detail::path_type::file) {
-                try {
-                    std::vector<ConfigItem> values = config_formatter_->from_file(config_file);
-                    _parse_config(values);
-                    if(!file_given) {
-                        config_ptr_->add_result(config_file);
-                    }
-                } catch(const FileError &) {
-                    if(config_required || file_given)
-                        throw;
-                }
-            } else if(config_required || file_given) {
-                throw FileError::Missing(config_file);
-            }
+        for(const auto &config_file : config_files) {
+            _process_config_file(config_file, config_required || file_given);
         }
     }
 }
@@ -1045,32 +1110,24 @@ CLI11_INLINE void App::_process_config_file() {
 CLI11_INLINE void App::_process_env() {
     for(const Option_p &opt : options_) {
         if(opt->count() == 0 && !opt->envname_.empty()) {
-            char *buffer = nullptr;
-            std::string ename_string;
-
-#ifdef _MSC_VER
-            // Windows version
-            std::size_t sz = 0;
-            if(_dupenv_s(&buffer, &sz, opt->envname_.c_str()) == 0 && buffer != nullptr) {
-                ename_string = std::string(buffer);
-                free(buffer);
-            }
-#else
-            // This also works on Windows, but gives a warning
-            buffer = std::getenv(opt->envname_.c_str());
-            if(buffer != nullptr)
-                ename_string = std::string(buffer);
-#endif
-
+            std::string ename_string = detail::get_environment_value(opt->envname_);
             if(!ename_string.empty()) {
-                opt->add_result(ename_string);
+                std::string result = ename_string;
+                result = opt->_validate(result, 0);
+                if(result.empty()) {
+                    opt->add_result(ename_string);
+                }
             }
         }
     }
 
     for(App_p &sub : subcommands_) {
-        if(sub->get_name().empty() || !sub->parse_complete_callback_)
-            sub->_process_env();
+        if(sub->get_name().empty() || !sub->parse_complete_callback_) {
+            if(sub->count_all() > 0) {
+                // only process environment variables if the callback has actually been triggered already
+                sub->_process_env();
+            }
+        }
     }
 }
 
@@ -1370,12 +1427,11 @@ CLI11_INLINE void App::_parse_config(const std::vector<ConfigItem> &args) {
 }
 
 CLI11_INLINE bool App::_parse_single_config(const ConfigItem &item, std::size_t level) {
+
     if(level < item.parents.size()) {
         try {
             auto *subcom = get_subcommand(item.parents.at(level));
-            auto result = subcom->_parse_single_config(item, level + 1);
-
-            return result;
+            return subcom->_parse_single_config(item, level + 1);
         } catch(const OptionNotFound &) {
             return false;
         }
@@ -1405,10 +1461,11 @@ CLI11_INLINE bool App::_parse_single_config(const ConfigItem &item, std::size_t
         if(item.name.size() == 1) {
             op = get_option_no_throw("-" + item.name);
         }
+        if(op == nullptr) {
+            op = get_option_no_throw(item.name);
+        }
     }
-    if(op == nullptr) {
-        op = get_option_no_throw(item.name);
-    }
+
     if(op == nullptr) {
         // If the option was not present
         if(get_allow_config_extras() == config_extras_mode::capture)
@@ -1435,29 +1492,54 @@ CLI11_INLINE bool App::_parse_single_config(const ConfigItem &item, std::size_t
                 auto res = config_formatter_->to_flag(item);
                 bool converted{false};
                 if(op->get_disable_flag_override()) {
-
-                    try {
-                        auto val = detail::to_flag_value(res);
-                        if(val == 1) {
-                            res = op->get_flag_value(item.name, "{}");
-                            converted = true;
-                        }
-                    } catch(...) {
+                    auto val = detail::to_flag_value(res);
+                    if(val == 1) {
+                        res = op->get_flag_value(item.name, "{}");
+                        converted = true;
                     }
                 }
 
                 if(!converted) {
+                    errno = 0;
                     res = op->get_flag_value(item.name, res);
                 }
 
                 op->add_result(res);
                 return true;
             }
-            if(static_cast<int>(item.inputs.size()) > op->get_items_expected_max()) {
+            if(static_cast<int>(item.inputs.size()) > op->get_items_expected_max() &&
+               op->get_multi_option_policy() != MultiOptionPolicy::TakeAll) {
                 if(op->get_items_expected_max() > 1) {
                     throw ArgumentMismatch::AtMost(item.fullname(), op->get_items_expected_max(), item.inputs.size());
                 }
-                throw ConversionError::TooManyInputsFlag(item.fullname());
+
+                if(!op->get_disable_flag_override()) {
+                    throw ConversionError::TooManyInputsFlag(item.fullname());
+                }
+                // if the disable flag override is set then we must have the flag values match a known flag value
+                // this is true regardless of the output value, so an array input is possible and must be accounted for
+                for(const auto &res : item.inputs) {
+                    bool valid_value{false};
+                    if(op->default_flag_values_.empty()) {
+                        if(res == "true" || res == "false" || res == "1" || res == "0") {
+                            valid_value = true;
+                        }
+                    } else {
+                        for(const auto &valid_res : op->default_flag_values_) {
+                            if(valid_res.second == res) {
+                                valid_value = true;
+                                break;
+                            }
+                        }
+                    }
+
+                    if(valid_value) {
+                        op->add_result(res);
+                    } else {
+                        throw InvalidError("invalid flag argument given");
+                    }
+                }
+                return true;
             }
         }
         op->add_result(item.inputs);
@@ -1492,7 +1574,7 @@ CLI11_INLINE bool App::_parse_single(std::vector<std::string> &args, bool &posit
     case detail::Classifier::SHORT:
     case detail::Classifier::WINDOWS_STYLE:
         // If already parsed a subcommand, don't accept options_
-        _parse_arg(args, classifier, false);
+        retval = _parse_arg(args, classifier, false);
         break;
     case detail::Classifier::NONE:
         // Probably a positional or something for a parent (sub)command
@@ -1534,6 +1616,7 @@ CLI11_NODISCARD CLI11_INLINE bool App::_has_remaining_positionals() const {
 CLI11_INLINE bool App::_parse_positional(std::vector<std::string> &args, bool haltOnSubcommand) {
 
     const std::string &positional = args.back();
+    Option *posOpt{nullptr};
 
     if(positionals_at_end_) {
         // deal with the case of required arguments at the end which should take precedence over other arguments
@@ -1550,56 +1633,47 @@ CLI11_INLINE bool App::_parse_positional(std::vector<std::string> &args, bool ha
                                 continue;
                             }
                         }
-
-                        parse_order_.push_back(opt.get());
-                        /// if we require a separator add it here
-                        if(opt->get_inject_separator()) {
-                            if(!opt->results().empty() && !opt->results().back().empty()) {
-                                opt->add_result(std::string{});
-                            }
-                        }
-                        if(opt->get_trigger_on_parse() &&
-                           opt->current_option_state_ == Option::option_state::callback_run) {
-                            opt->clear();
-                        }
-                        opt->add_result(positional);
-                        if(opt->get_trigger_on_parse()) {
-                            opt->run_callback();
-                        }
-                        args.pop_back();
-                        return true;
+                        posOpt = opt.get();
+                        break;
                     }
                 }
             }
         }
     }
-    for(const Option_p &opt : options_) {
-        // Eat options, one by one, until done
-        if(opt->get_positional() &&
-           (static_cast<int>(opt->count()) < opt->get_items_expected_min() || opt->get_allow_extra_args())) {
-            if(validate_positionals_) {
-                std::string pos = positional;
-                pos = opt->_validate(pos, 0);
-                if(!pos.empty()) {
-                    continue;
-                }
-            }
-            if(opt->get_inject_separator()) {
-                if(!opt->results().empty() && !opt->results().back().empty()) {
-                    opt->add_result(std::string{});
+    if(posOpt == nullptr) {
+        for(const Option_p &opt : options_) {
+            // Eat options, one by one, until done
+            if(opt->get_positional() &&
+               (static_cast<int>(opt->count()) < opt->get_items_expected_min() || opt->get_allow_extra_args())) {
+                if(validate_positionals_) {
+                    std::string pos = positional;
+                    pos = opt->_validate(pos, 0);
+                    if(!pos.empty()) {
+                        continue;
+                    }
                 }
+                posOpt = opt.get();
+                break;
             }
-            if(opt->get_trigger_on_parse() && opt->current_option_state_ == Option::option_state::callback_run) {
-                opt->clear();
-            }
-            opt->add_result(positional);
-            if(opt->get_trigger_on_parse()) {
-                opt->run_callback();
+        }
+    }
+    if(posOpt != nullptr) {
+        parse_order_.push_back(posOpt);
+        if(posOpt->get_inject_separator()) {
+            if(!posOpt->results().empty() && !posOpt->results().back().empty()) {
+                posOpt->add_result(std::string{});
             }
-            parse_order_.push_back(opt.get());
-            args.pop_back();
-            return true;
         }
+        if(posOpt->get_trigger_on_parse() && posOpt->current_option_state_ == Option::option_state::callback_run) {
+            posOpt->clear();
+        }
+        posOpt->add_result(positional);
+        if(posOpt->get_trigger_on_parse()) {
+            posOpt->run_callback();
+        }
+
+        args.pop_back();
+        return true;
     }
 
     for(auto &subc : subcommands_) {
@@ -1942,7 +2016,7 @@ CLI11_INLINE void App::_trigger_pre_parse(std::size_t remaining_args) {
     } else if(immediate_callback_) {
         if(!name_.empty()) {
             auto pcnt = parsed_;
-            auto extras = std::move(missing_);
+            missing_t extras = std::move(missing_);
             clear();
             parsed_ = pcnt;
             pre_parse_called_ = true;
@@ -2128,12 +2202,12 @@ CLI11_INLINE void retire_option(App *app, Option *opt) {
                             ->allow_extra_args(opt->get_allow_extra_args());
 
     app->remove_option(opt);
-    auto *opt2 = app->add_option(option_copy->get_name(false, true), "option has been retired and has no effect")
-                     ->type_name("RETIRED")
-                     ->default_str("RETIRED")
-                     ->type_size(option_copy->get_type_size_min(), option_copy->get_type_size_max())
-                     ->expected(option_copy->get_expected_min(), option_copy->get_expected_max())
-                     ->allow_extra_args(option_copy->get_allow_extra_args());
+    auto *opt2 = app->add_option(option_copy->get_name(false, true), "option has been retired and has no effect");
+    opt2->type_name("RETIRED")
+        ->default_str("RETIRED")
+        ->type_size(option_copy->get_type_size_min(), option_copy->get_type_size_max())
+        ->expected(option_copy->get_expected_min(), option_copy->get_expected_max())
+        ->allow_extra_args(option_copy->get_allow_extra_args());
 
     Validator retired_warning{[opt2](std::string &) {
                                   std::cout << "WARNING " << opt2->get_name() << " is retired and has no effect\n";
diff --git a/packages/CLI11/include/CLI/impl/Argv_inl.hpp b/packages/CLI11/include/CLI/impl/Argv_inl.hpp
index 3d00a570d1e5032ed72bd4a255c8e60be50934d2..620f1fb73faa35590f43cb9045605ccd54745aef 100644
--- a/packages/CLI11/include/CLI/impl/Argv_inl.hpp
+++ b/packages/CLI11/include/CLI/impl/Argv_inl.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -29,6 +29,10 @@
 #define _X86_
 #elif defined(__arm__) || defined(_M_ARM) || defined(_M_ARMT)
 #define _ARM_
+#elif defined(__aarch64__) || defined(_M_ARM64)
+#define _ARM64_
+#elif defined(_M_ARM64EC)
+#define _ARM64EC_
 #endif
 #endif
 
@@ -85,6 +89,29 @@ static const std::vector<const char *> static_args = [] {
 }();
 #endif
 
+#ifdef _WIN32
+CLI11_INLINE std::vector<std::string> compute_win32_argv() {
+    std::vector<std::string> result;
+    int argc = 0;
+
+    auto deleter = [](wchar_t **ptr) { LocalFree(ptr); };
+    // NOLINTBEGIN(*-avoid-c-arrays)
+    auto wargv = std::unique_ptr<wchar_t *[], decltype(deleter)>(CommandLineToArgvW(GetCommandLineW(), &argc), deleter);
+    // NOLINTEND(*-avoid-c-arrays)
+
+    if(wargv == nullptr) {
+        throw std::runtime_error("CommandLineToArgvW failed with code " + std::to_string(GetLastError()));
+    }
+
+    result.reserve(static_cast<size_t>(argc));
+    for(size_t i = 0; i < static_cast<size_t>(argc); ++i) {
+        result.push_back(narrow(wargv[i]));
+    }
+
+    return result;
+}
+#endif
+
 /// Command-line arguments, as passed in to this executable, converted to utf-8 on Windows.
 CLI11_INLINE const std::vector<const char *> &args() {
     // This function uses initialization via lambdas extensively to take advantage of the thread safety of static
@@ -92,28 +119,7 @@ CLI11_INLINE const std::vector<const char *> &args() {
 
 #ifdef _WIN32
     static const std::vector<const char *> static_args = [] {
-        static const std::vector<std::string> static_args_as_strings = [] {
-            // On Windows, take arguments from GetCommandLineW and convert them to utf-8.
-            std::vector<std::string> args_as_strings;
-            int argc = 0;
-
-            auto deleter = [](wchar_t **ptr) { LocalFree(ptr); };
-            // NOLINTBEGIN(*-avoid-c-arrays)
-            auto wargv =
-                std::unique_ptr<wchar_t *[], decltype(deleter)>(CommandLineToArgvW(GetCommandLineW(), &argc), deleter);
-            // NOLINTEND(*-avoid-c-arrays)
-
-            if(wargv == nullptr) {
-                throw std::runtime_error("CommandLineToArgvW failed with code " + std::to_string(GetLastError()));
-            }
-
-            args_as_strings.reserve(static_cast<size_t>(argc));
-            for(size_t i = 0; i < static_cast<size_t>(argc); ++i) {
-                args_as_strings.push_back(narrow(wargv[i]));
-            }
-
-            return args_as_strings;
-        }();
+        static const std::vector<std::string> static_args_as_strings = compute_win32_argv();
 
         std::vector<const char *> static_args_result;
         static_args_result.reserve(static_args_as_strings.size());
diff --git a/packages/CLI11/include/CLI/impl/Config_inl.hpp b/packages/CLI11/include/CLI/impl/Config_inl.hpp
index 8021d5f63aa34e1cfeac43e37b274829c49a7dd1..92537c0e8c2ad4cb9c40abe859c289af169cdc04 100644
--- a/packages/CLI11/include/CLI/impl/Config_inl.hpp
+++ b/packages/CLI11/include/CLI/impl/Config_inl.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -19,9 +19,19 @@
 namespace CLI {
 // [CLI11:config_inl_hpp:verbatim]
 
+static constexpr auto multiline_literal_quote = R"(''')";
+static constexpr auto multiline_string_quote = R"(""")";
+
 namespace detail {
 
-CLI11_INLINE std::string convert_arg_for_ini(const std::string &arg, char stringQuote, char characterQuote) {
+CLI11_INLINE bool is_printable(const std::string &test_string) {
+    return std::all_of(test_string.begin(), test_string.end(), [](char x) {
+        return (isprint(static_cast<unsigned char>(x)) != 0 || x == '\n' || x == '\t');
+    });
+}
+
+CLI11_INLINE std::string
+convert_arg_for_ini(const std::string &arg, char stringQuote, char literalQuote, bool disable_multi_line) {
     if(arg.empty()) {
         return std::string(2, stringQuote);
     }
@@ -34,12 +44,20 @@ CLI11_INLINE std::string convert_arg_for_ini(const std::string &arg, char string
         using CLI::detail::lexical_cast;
         double val = 0.0;
         if(lexical_cast(arg, val)) {
-            return arg;
+            if(arg.find_first_not_of("0123456789.-+eE") == std::string::npos) {
+                return arg;
+            }
         }
     }
     // just quote a single non numeric character
     if(arg.size() == 1) {
-        return std::string(1, characterQuote) + arg + characterQuote;
+        if(isprint(static_cast<unsigned char>(arg.front())) == 0) {
+            return binary_escape_string(arg);
+        }
+        if(arg == "'") {
+            return std::string(1, stringQuote) + "'" + stringQuote;
+        }
+        return std::string(1, literalQuote) + arg + literalQuote;
     }
     // handle hex, binary or octal arguments
     if(arg.front() == '0') {
@@ -59,10 +77,16 @@ CLI11_INLINE std::string convert_arg_for_ini(const std::string &arg, char string
             }
         }
     }
-    if(arg.find_first_of(stringQuote) == std::string::npos) {
-        return std::string(1, stringQuote) + arg + stringQuote;
+    if(!is_printable(arg)) {
+        return binary_escape_string(arg);
     }
-    return characterQuote + arg + characterQuote;
+    if(detail::has_escapable_character(arg)) {
+        if(arg.size() > 100 && !disable_multi_line) {
+            return std::string(multiline_literal_quote) + arg + multiline_literal_quote;
+        }
+        return std::string(1, stringQuote) + detail::add_escaped_characters(arg) + stringQuote;
+    }
+    return std::string(1, stringQuote) + arg + stringQuote;
 }
 
 CLI11_INLINE std::string ini_join(const std::vector<std::string> &args,
@@ -70,10 +94,12 @@ CLI11_INLINE std::string ini_join(const std::vector<std::string> &args,
                                   char arrayStart,
                                   char arrayEnd,
                                   char stringQuote,
-                                  char characterQuote) {
+                                  char literalQuote) {
+    bool disable_multi_line{false};
     std::string joined;
     if(args.size() > 1 && arrayStart != '\0') {
         joined.push_back(arrayStart);
+        disable_multi_line = true;
     }
     std::size_t start = 0;
     for(const auto &arg : args) {
@@ -83,7 +109,7 @@ CLI11_INLINE std::string ini_join(const std::vector<std::string> &args,
                 joined.push_back(' ');
             }
         }
-        joined.append(convert_arg_for_ini(arg, stringQuote, characterQuote));
+        joined.append(convert_arg_for_ini(arg, stringQuote, literalQuote, disable_multi_line));
     }
     if(args.size() > 1 && arrayEnd != '\0') {
         joined.push_back(arrayEnd);
@@ -96,22 +122,22 @@ generate_parents(const std::string &section, std::string &name, char parentSepar
     std::vector<std::string> parents;
     if(detail::to_lower(section) != "default") {
         if(section.find(parentSeparator) != std::string::npos) {
-            parents = detail::split(section, parentSeparator);
+            parents = detail::split_up(section, parentSeparator);
         } else {
             parents = {section};
         }
     }
     if(name.find(parentSeparator) != std::string::npos) {
-        std::vector<std::string> plist = detail::split(name, parentSeparator);
+        std::vector<std::string> plist = detail::split_up(name, parentSeparator);
         name = plist.back();
-        detail::remove_quotes(name);
         plist.pop_back();
         parents.insert(parents.end(), plist.begin(), plist.end());
     }
-
     // clean up quotes on the parents
-    for(auto &parent : parents) {
-        detail::remove_quotes(parent);
+    try {
+        detail::remove_quotes(parents);
+    } catch(const std::invalid_argument &iarg) {
+        throw CLI::ParseError(iarg.what(), CLI::ExitCodes::InvalidError);
     }
     return parents;
 }
@@ -164,30 +190,59 @@ checkParentSegments(std::vector<ConfigItem> &output, const std::string &currentS
     output.back().parents = std::move(parents);
     output.back().name = "++";
 }
+
+/// @brief  checks if a string represents a multiline comment
+CLI11_INLINE bool hasMLString(std::string const &fullString, char check) {
+    if(fullString.length() < 3) {
+        return false;
+    }
+    auto it = fullString.rbegin();
+    return (*it == check) && (*(it + 1) == check) && (*(it + 2) == check);
+}
 }  // namespace detail
 
 inline std::vector<ConfigItem> ConfigBase::from_config(std::istream &input) const {
     std::string line;
+    std::string buffer;
     std::string currentSection = "default";
     std::string previousSection = "default";
     std::vector<ConfigItem> output;
     bool isDefaultArray = (arrayStart == '[' && arrayEnd == ']' && arraySeparator == ',');
     bool isINIArray = (arrayStart == '\0' || arrayStart == ' ') && arrayStart == arrayEnd;
     bool inSection{false};
+    bool inMLineComment{false};
+    bool inMLineValue{false};
+
     char aStart = (isINIArray) ? '[' : arrayStart;
     char aEnd = (isINIArray) ? ']' : arrayEnd;
     char aSep = (isINIArray && arraySeparator == ' ') ? ',' : arraySeparator;
     int currentSectionIndex{0};
-    while(getline(input, line)) {
+
+    std::string line_sep_chars{parentSeparatorChar, commentChar, valueDelimiter};
+    while(getline(input, buffer)) {
         std::vector<std::string> items_buffer;
         std::string name;
-
-        detail::trim(line);
+        line = detail::trim_copy(buffer);
         std::size_t len = line.length();
         // lines have to be at least 3 characters to have any meaning to CLI just skip the rest
         if(len < 3) {
             continue;
         }
+        if(line.compare(0, 3, multiline_string_quote) == 0 || line.compare(0, 3, multiline_literal_quote) == 0) {
+            inMLineComment = true;
+            auto cchar = line.front();
+            while(inMLineComment) {
+                if(getline(input, line)) {
+                    detail::trim(line);
+                } else {
+                    break;
+                }
+                if(detail::hasMLString(line, cchar)) {
+                    inMLineComment = false;
+                }
+            }
+            continue;
+        }
         if(line.front() == '[' && line.back() == ']') {
             if(currentSection != "default") {
                 // insert a section end which is just an empty items_buffer
@@ -219,49 +274,130 @@ inline std::vector<ConfigItem> ConfigBase::from_config(std::istream &input) cons
         if(line.front() == ';' || line.front() == '#' || line.front() == commentChar) {
             continue;
         }
-
+        std::size_t search_start = 0;
+        if(line.find_first_of("\"'`") != std::string::npos) {
+            while(search_start < line.size()) {
+                auto test_char = line[search_start];
+                if(test_char == '\"' || test_char == '\'' || test_char == '`') {
+                    search_start = detail::close_sequence(line, search_start, line[search_start]);
+                    ++search_start;
+                } else if(test_char == valueDelimiter || test_char == commentChar) {
+                    --search_start;
+                    break;
+                } else if(test_char == ' ' || test_char == '\t' || test_char == parentSeparatorChar) {
+                    ++search_start;
+                } else {
+                    search_start = line.find_first_of(line_sep_chars, search_start);
+                }
+            }
+        }
         // Find = in string, split and recombine
-        auto pos = line.find(valueDelimiter);
-        if(pos != std::string::npos) {
-            name = detail::trim_copy(line.substr(0, pos));
-            std::string item = detail::trim_copy(line.substr(pos + 1));
-            auto cloc = item.find(commentChar);
-            if(cloc != std::string::npos) {
-                item.erase(cloc, std::string::npos);  // NOLINT(readability-suspicious-call-argument)
-                detail::trim(item);
+        auto delimiter_pos = line.find_first_of(valueDelimiter, search_start + 1);
+        auto comment_pos = line.find_first_of(commentChar, search_start);
+        if(comment_pos < delimiter_pos) {
+            delimiter_pos = std::string::npos;
+        }
+        if(delimiter_pos != std::string::npos) {
+
+            name = detail::trim_copy(line.substr(0, delimiter_pos));
+            std::string item = detail::trim_copy(line.substr(delimiter_pos + 1, std::string::npos));
+            bool mlquote =
+                (item.compare(0, 3, multiline_literal_quote) == 0 || item.compare(0, 3, multiline_string_quote) == 0);
+            if(!mlquote && comment_pos != std::string::npos) {
+                auto citems = detail::split_up(item, commentChar);
+                item = detail::trim_copy(citems.front());
             }
-            if(item.size() > 1 && item.front() == aStart) {
+            if(mlquote) {
+                // mutliline string
+                auto keyChar = item.front();
+                item = buffer.substr(delimiter_pos + 1, std::string::npos);
+                detail::ltrim(item);
+                item.erase(0, 3);
+                inMLineValue = true;
+                bool lineExtension{false};
+                bool firstLine = true;
+                if(!item.empty() && item.back() == '\\') {
+                    item.pop_back();
+                    lineExtension = true;
+                }
+                while(inMLineValue) {
+                    std::string l2;
+                    if(!std::getline(input, l2)) {
+                        break;
+                    }
+                    line = l2;
+                    detail::rtrim(line);
+                    if(detail::hasMLString(line, keyChar)) {
+                        line.pop_back();
+                        line.pop_back();
+                        line.pop_back();
+                        if(lineExtension) {
+                            detail::ltrim(line);
+                        } else if(!(firstLine && item.empty())) {
+                            item.push_back('\n');
+                        }
+                        firstLine = false;
+                        item += line;
+                        inMLineValue = false;
+                        if(!item.empty() && item.back() == '\n') {
+                            item.pop_back();
+                        }
+                        if(keyChar == '\"') {
+                            try {
+                                item = detail::remove_escaped_characters(item);
+                            } catch(const std::invalid_argument &iarg) {
+                                throw CLI::ParseError(iarg.what(), CLI::ExitCodes::InvalidError);
+                            }
+                        }
+                    } else {
+                        if(lineExtension) {
+                            detail::trim(l2);
+                        } else if(!(firstLine && item.empty())) {
+                            item.push_back('\n');
+                        }
+                        lineExtension = false;
+                        firstLine = false;
+                        if(!l2.empty() && l2.back() == '\\') {
+                            lineExtension = true;
+                            l2.pop_back();
+                        }
+                        item += l2;
+                    }
+                }
+                items_buffer = {item};
+            } else if(item.size() > 1 && item.front() == aStart) {
                 for(std::string multiline; item.back() != aEnd && std::getline(input, multiline);) {
                     detail::trim(multiline);
                     item += multiline;
                 }
-                items_buffer = detail::split_up(item.substr(1, item.length() - 2), aSep);
+                if(item.back() == aEnd) {
+                    items_buffer = detail::split_up(item.substr(1, item.length() - 2), aSep);
+                } else {
+                    items_buffer = detail::split_up(item.substr(1, std::string::npos), aSep);
+                }
             } else if((isDefaultArray || isINIArray) && item.find_first_of(aSep) != std::string::npos) {
                 items_buffer = detail::split_up(item, aSep);
             } else if((isDefaultArray || isINIArray) && item.find_first_of(' ') != std::string::npos) {
-                items_buffer = detail::split_up(item);
+                items_buffer = detail::split_up(item, '\0');
             } else {
                 items_buffer = {item};
             }
         } else {
-            name = detail::trim_copy(line);
-            auto cloc = name.find(commentChar);
-            if(cloc != std::string::npos) {
-                name.erase(cloc, std::string::npos);  // NOLINT(readability-suspicious-call-argument)
-                detail::trim(name);
-            }
-
+            name = detail::trim_copy(line.substr(0, comment_pos));
             items_buffer = {"true"};
         }
-        if(name.find(parentSeparatorChar) == std::string::npos) {
-            detail::remove_quotes(name);
-        }
-        // clean up quotes on the items
-        for(auto &it : items_buffer) {
-            detail::remove_quotes(it);
+        std::vector<std::string> parents;
+        try {
+            parents = detail::generate_parents(currentSection, name, parentSeparatorChar);
+            detail::process_quoted_string(name);
+            // clean up quotes on the items and check for escaped strings
+            for(auto &it : items_buffer) {
+                detail::process_quoted_string(it, stringQuote, literalQuote);
+            }
+        } catch(const std::invalid_argument &ia) {
+            throw CLI::ParseError(ia.what(), CLI::ExitCodes::InvalidError);
         }
 
-        std::vector<std::string> parents = detail::generate_parents(currentSection, name, parentSeparatorChar);
         if(parents.size() > maximumLayers) {
             continue;
         }
@@ -298,6 +434,23 @@ inline std::vector<ConfigItem> ConfigBase::from_config(std::istream &input) cons
     return output;
 }
 
+CLI11_INLINE std::string &clean_name_string(std::string &name, const std::string &keyChars) {
+    if(name.find_first_of(keyChars) != std::string::npos || (name.front() == '[' && name.back() == ']') ||
+       (name.find_first_of("'`\"\\") != std::string::npos)) {
+        if(name.find_first_of('\'') == std::string::npos) {
+            name.insert(0, 1, '\'');
+            name.push_back('\'');
+        } else {
+            if(detail::has_escapable_character(name)) {
+                name = detail::add_escaped_characters(name);
+            }
+            name.insert(0, 1, '\"');
+            name.push_back('\"');
+        }
+    }
+    return name;
+}
+
 CLI11_INLINE std::string
 ConfigBase::to_config(const App *app, bool default_also, bool write_description, std::string prefix) const {
     std::stringstream out;
@@ -305,6 +458,18 @@ ConfigBase::to_config(const App *app, bool default_also, bool write_description,
     commentLead.push_back(commentChar);
     commentLead.push_back(' ');
 
+    std::string commentTest = "#;";
+    commentTest.push_back(commentChar);
+    commentTest.push_back(parentSeparatorChar);
+
+    std::string keyChars = commentTest;
+    keyChars.push_back(literalQuote);
+    keyChars.push_back(stringQuote);
+    keyChars.push_back(arrayStart);
+    keyChars.push_back(arrayEnd);
+    keyChars.push_back(valueDelimiter);
+    keyChars.push_back(arraySeparator);
+
     std::vector<std::string> groups = app->get_groups();
     bool defaultUsed = false;
     groups.insert(groups.begin(), std::string("Options"));
@@ -330,13 +495,17 @@ ConfigBase::to_config(const App *app, bool default_also, bool write_description,
                         continue;
                     }
                 }
-                std::string name = prefix + opt->get_single_name();
+                std::string single_name = opt->get_single_name();
+                if(single_name.empty()) {
+                    continue;
+                }
+
                 std::string value = detail::ini_join(
-                    opt->reduced_results(), arraySeparator, arrayStart, arrayEnd, stringQuote, characterQuote);
+                    opt->reduced_results(), arraySeparator, arrayStart, arrayEnd, stringQuote, literalQuote);
 
                 if(value.empty() && default_also) {
                     if(!opt->get_default_str().empty()) {
-                        value = detail::convert_arg_for_ini(opt->get_default_str(), stringQuote, characterQuote);
+                        value = detail::convert_arg_for_ini(opt->get_default_str(), stringQuote, literalQuote, false);
                     } else if(opt->get_expected_min() == 0) {
                         value = "false";
                     } else if(opt->get_run_callback_for_default()) {
@@ -345,13 +514,35 @@ ConfigBase::to_config(const App *app, bool default_also, bool write_description,
                 }
 
                 if(!value.empty()) {
+
                     if(!opt->get_fnames().empty()) {
-                        value = opt->get_flag_value(name, value);
+                        try {
+                            value = opt->get_flag_value(single_name, value);
+                        } catch(const CLI::ArgumentMismatch &) {
+                            bool valid{false};
+                            for(const auto &test_name : opt->get_fnames()) {
+                                try {
+                                    value = opt->get_flag_value(test_name, value);
+                                    single_name = test_name;
+                                    valid = true;
+                                } catch(const CLI::ArgumentMismatch &) {
+                                    continue;
+                                }
+                            }
+                            if(!valid) {
+                                value = detail::ini_join(
+                                    opt->results(), arraySeparator, arrayStart, arrayEnd, stringQuote, literalQuote);
+                            }
+                        }
                     }
                     if(write_description && opt->has_description()) {
                         out << '\n';
                         out << commentLead << detail::fix_newlines(commentLead, opt->get_description()) << '\n';
                     }
+                    clean_name_string(single_name, keyChars);
+
+                    std::string name = prefix + single_name;
+
                     out << name << valueDelimiter << value << '\n';
                 }
             }
@@ -360,31 +551,56 @@ ConfigBase::to_config(const App *app, bool default_also, bool write_description,
     auto subcommands = app->get_subcommands({});
     for(const App *subcom : subcommands) {
         if(subcom->get_name().empty()) {
+            if(!default_also && (subcom->count_all() == 0)) {
+                continue;
+            }
             if(write_description && !subcom->get_group().empty()) {
                 out << '\n' << commentLead << subcom->get_group() << " Options\n";
             }
+            /*if (!prefix.empty() || app->get_parent() == nullptr) {
+                out << '[' << prefix << "___"<< subcom->get_group() << "]\n";
+            } else {
+                std::string subname = app->get_name() + parentSeparatorChar + "___"+subcom->get_group();
+                const auto *p = app->get_parent();
+                while(p->get_parent() != nullptr) {
+                    subname = p->get_name() + parentSeparatorChar +subname;
+                    p = p->get_parent();
+                }
+                out << '[' << subname << "]\n";
+            }
+            */
             out << to_config(subcom, default_also, write_description, prefix);
         }
     }
 
     for(const App *subcom : subcommands) {
         if(!subcom->get_name().empty()) {
+            if(!default_also && (subcom->count_all() == 0)) {
+                continue;
+            }
+            std::string subname = subcom->get_name();
+            clean_name_string(subname, keyChars);
+
             if(subcom->get_configurable() && app->got_subcommand(subcom)) {
                 if(!prefix.empty() || app->get_parent() == nullptr) {
-                    out << '[' << prefix << subcom->get_name() << "]\n";
+
+                    out << '[' << prefix << subname << "]\n";
                 } else {
-                    std::string subname = app->get_name() + parentSeparatorChar + subcom->get_name();
+                    std::string appname = app->get_name();
+                    clean_name_string(appname, keyChars);
+                    subname = appname + parentSeparatorChar + subname;
                     const auto *p = app->get_parent();
                     while(p->get_parent() != nullptr) {
-                        subname = p->get_name() + parentSeparatorChar + subname;
+                        std::string pname = p->get_name();
+                        clean_name_string(pname, keyChars);
+                        subname = pname + parentSeparatorChar + subname;
                         p = p->get_parent();
                     }
                     out << '[' << subname << "]\n";
                 }
                 out << to_config(subcom, default_also, write_description, "");
             } else {
-                out << to_config(
-                    subcom, default_also, write_description, prefix + subcom->get_name() + parentSeparatorChar);
+                out << to_config(subcom, default_also, write_description, prefix + subname + parentSeparatorChar);
             }
         }
     }
diff --git a/packages/CLI11/include/CLI/impl/Encoding_inl.hpp b/packages/CLI11/include/CLI/impl/Encoding_inl.hpp
index f5d7e9a83fbb42f7b47408cb02b67c9be48d8b3c..1c82f4fc57b11711202628cf7d9a249a386bed65 100644
--- a/packages/CLI11/include/CLI/impl/Encoding_inl.hpp
+++ b/packages/CLI11/include/CLI/impl/Encoding_inl.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/include/CLI/impl/Formatter_inl.hpp b/packages/CLI11/include/CLI/impl/Formatter_inl.hpp
index 84652fefa4aebcb2ef5257c887d0e5494d40c5f1..3b1c8c288882326cebc31adcf82c0b7a266946f4 100644
--- a/packages/CLI11/include/CLI/impl/Formatter_inl.hpp
+++ b/packages/CLI11/include/CLI/impl/Formatter_inl.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -132,7 +132,7 @@ CLI11_INLINE std::string Formatter::make_usage(const App *app, std::string name)
             << (app->get_require_subcommand_min() == 0 ? "]" : "");
     }
 
-    out << std::endl;
+    out << '\n';
 
     return out.str();
 }
diff --git a/packages/CLI11/include/CLI/impl/Option_inl.hpp b/packages/CLI11/include/CLI/impl/Option_inl.hpp
index a24df9ab293700a31bba830581247d5b01d5b412..aa6f4657d8ff8987d3ab4fe2e09d0bc7d1c2f65b 100644
--- a/packages/CLI11/include/CLI/impl/Option_inl.hpp
+++ b/packages/CLI11/include/CLI/impl/Option_inl.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -309,13 +309,29 @@ CLI11_INLINE void Option::run_callback() {
 
 CLI11_NODISCARD CLI11_INLINE const std::string &Option::matching_name(const Option &other) const {
     static const std::string estring;
-    for(const std::string &sname : snames_)
+    for(const std::string &sname : snames_) {
         if(other.check_sname(sname))
             return sname;
-    for(const std::string &lname : lnames_)
+        if(other.check_lname(sname))
+            return sname;
+    }
+    for(const std::string &lname : lnames_) {
         if(other.check_lname(lname))
             return lname;
-
+        if(lname.size() == 1) {
+            if(other.check_sname(lname)) {
+                return lname;
+            }
+        }
+    }
+    if(snames_.empty() && lnames_.empty() && !pname_.empty()) {
+        if(other.check_sname(pname_) || other.check_lname(pname_) || pname_ == other.pname_)
+            return pname_;
+    }
+    if(other.snames_.empty() && other.fnames_.empty() && !other.pname_.empty()) {
+        if(check_sname(other.pname_) || check_lname(other.pname_) || (pname_ == other.pname_))
+            return other.pname_;
+    }
     if(ignore_case_ ||
        ignore_underscore_) {  // We need to do the inverse, in case we are ignore_case or ignore underscore
         for(const std::string &sname : other.snames_)
@@ -369,6 +385,9 @@ CLI11_NODISCARD CLI11_INLINE std::string Option::get_flag_value(const std::strin
             if(default_ind >= 0) {
                 // We can static cast this to std::size_t because it is more than 0 in this block
                 if(default_flag_values_[static_cast<std::size_t>(default_ind)].second != input_value) {
+                    if(input_value == default_str_ && force_callback_) {
+                        return input_value;
+                    }
                     throw(ArgumentMismatch::FlagOverride(name));
                 }
             } else {
@@ -389,15 +408,15 @@ CLI11_NODISCARD CLI11_INLINE std::string Option::get_flag_value(const std::strin
         return input_value;
     }
     if(default_flag_values_[static_cast<std::size_t>(ind)].second == falseString) {
-        try {
-            auto val = detail::to_flag_value(input_value);
-            return (val == 1) ? falseString : (val == (-1) ? trueString : std::to_string(-val));
-        } catch(const std::invalid_argument &) {
+        errno = 0;
+        auto val = detail::to_flag_value(input_value);
+        if(errno != 0) {
+            errno = 0;
             return input_value;
         }
-    } else {
-        return input_value;
+        return (val == 1) ? falseString : (val == (-1) ? trueString : std::to_string(-val));
     }
+    return input_value;
 }
 
 CLI11_INLINE Option *Option::add_result(std::string s) {
@@ -500,7 +519,8 @@ CLI11_INLINE void Option::_validate_results(results_t &res) const {
         if(type_size_max_ > 1) {  // in this context index refers to the index in the type
             int index = 0;
             if(get_items_expected_max() < static_cast<int>(res.size()) &&
-               multi_option_policy_ == CLI::MultiOptionPolicy::TakeLast) {
+               (multi_option_policy_ == CLI::MultiOptionPolicy::TakeLast ||
+                multi_option_policy_ == CLI::MultiOptionPolicy::Reverse)) {
                 // create a negative index for the earliest ones
                 index = get_items_expected_max() - static_cast<int>(res.size());
             }
@@ -518,7 +538,8 @@ CLI11_INLINE void Option::_validate_results(results_t &res) const {
         } else {
             int index = 0;
             if(expected_max_ < static_cast<int>(res.size()) &&
-               multi_option_policy_ == CLI::MultiOptionPolicy::TakeLast) {
+               (multi_option_policy_ == CLI::MultiOptionPolicy::TakeLast ||
+                multi_option_policy_ == CLI::MultiOptionPolicy::Reverse)) {
                 // create a negative index for the earliest ones
                 index = expected_max_ - static_cast<int>(res.size());
             }
@@ -550,6 +571,15 @@ CLI11_INLINE void Option::_reduce_results(results_t &out, const results_t &origi
             out.assign(original.end() - static_cast<results_t::difference_type>(trim_size), original.end());
         }
     } break;
+    case MultiOptionPolicy::Reverse: {
+        // Allow multi-option sizes (including 0)
+        std::size_t trim_size = std::min<std::size_t>(
+            static_cast<std::size_t>(std::max<int>(get_items_expected_max(), 1)), original.size());
+        if(original.size() != trim_size || trim_size > 1) {
+            out.assign(original.end() - static_cast<results_t::difference_type>(trim_size), original.end());
+        }
+        std::reverse(out.begin(), out.end());
+    } break;
     case MultiOptionPolicy::TakeFirst: {
         std::size_t trim_size = std::min<std::size_t>(
             static_cast<std::size_t>(std::max<int>(get_items_expected_max(), 1)), original.size());
@@ -579,7 +609,12 @@ CLI11_INLINE void Option::_reduce_results(results_t &out, const results_t &origi
             throw ArgumentMismatch::AtLeast(get_name(), static_cast<int>(num_min), original.size());
         }
         if(original.size() > num_max) {
-            throw ArgumentMismatch::AtMost(get_name(), static_cast<int>(num_max), original.size());
+            if(original.size() == 2 && num_max == 1 && original[1] == "%%" && original[0] == "{}") {
+                // this condition is a trap for the following empty indicator check on config files
+                out = original;
+            } else {
+                throw ArgumentMismatch::AtMost(get_name(), static_cast<int>(num_max), original.size());
+            }
         }
         break;
     }
@@ -588,11 +623,11 @@ CLI11_INLINE void Option::_reduce_results(results_t &out, const results_t &origi
     // {} is the indicator for an empty container
     if(out.empty()) {
         if(original.size() == 1 && original[0] == "{}" && get_items_expected_min() > 0) {
-            out.push_back("{}");
-            out.push_back("%%");
+            out.emplace_back("{}");
+            out.emplace_back("%%");
         }
     } else if(out.size() == 1 && out[0] == "{}" && get_items_expected_min() > 0) {
-        out.push_back("%%");
+        out.emplace_back("%%");
     }
 }
 
diff --git a/packages/CLI11/include/CLI/impl/Split_inl.hpp b/packages/CLI11/include/CLI/impl/Split_inl.hpp
index d974f80a6f7618de7a8fb2cdf2ddade30cb48fc7..7cd8e5b3705673f4d927392c69581ff2e00cc45c 100644
--- a/packages/CLI11/include/CLI/impl/Split_inl.hpp
+++ b/packages/CLI11/include/CLI/impl/Split_inl.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -106,7 +106,6 @@ get_names(const std::vector<std::string> &input) {
     std::vector<std::string> short_names;
     std::vector<std::string> long_names;
     std::string pos_name;
-
     for(std::string name : input) {
         if(name.length() == 0) {
             continue;
@@ -114,6 +113,8 @@ get_names(const std::vector<std::string> &input) {
         if(name.length() > 1 && name[0] == '-' && name[1] != '-') {
             if(name.length() == 2 && valid_first_char(name[1]))
                 short_names.emplace_back(1, name[1]);
+            else if(name.length() > 2)
+                throw BadNameString::MissingDash(name);
             else
                 throw BadNameString::OneCharName(name);
         } else if(name.length() > 2 && name.substr(0, 2) == "--") {
@@ -125,12 +126,15 @@ get_names(const std::vector<std::string> &input) {
         } else if(name == "-" || name == "--") {
             throw BadNameString::DashesOnly(name);
         } else {
-            if(pos_name.length() > 0)
+            if(!pos_name.empty())
                 throw BadNameString::MultiPositionalNames(name);
-            pos_name = name;
+            if(valid_name_string(name)) {
+                pos_name = name;
+            } else {
+                throw BadNameString::BadPositionalName(name);
+            }
         }
     }
-
     return std::make_tuple(short_names, long_names, pos_name);
 }
 
diff --git a/packages/CLI11/include/CLI/impl/StringTools_inl.hpp b/packages/CLI11/include/CLI/impl/StringTools_inl.hpp
index 9b81fbde3475b03f67bc1f4330b6efdfe0d87d93..5a120c39e6f55685853bffa2f6ba110642002545 100644
--- a/packages/CLI11/include/CLI/impl/StringTools_inl.hpp
+++ b/packages/CLI11/include/CLI/impl/StringTools_inl.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -11,6 +11,7 @@
 
 // [CLI11:public_includes:set]
 #include <string>
+#include <utility>
 #include <vector>
 // [CLI11:public_includes:end]
 
@@ -60,7 +61,17 @@ CLI11_INLINE std::string &rtrim(std::string &str, const std::string &filter) {
 }
 
 CLI11_INLINE std::string &remove_quotes(std::string &str) {
-    if(str.length() > 1 && (str.front() == '"' || str.front() == '\'')) {
+    if(str.length() > 1 && (str.front() == '"' || str.front() == '\'' || str.front() == '`')) {
+        if(str.front() == str.back()) {
+            str.pop_back();
+            str.erase(str.begin(), str.begin() + 1);
+        }
+    }
+    return str;
+}
+
+CLI11_INLINE std::string &remove_outer(std::string &str, char key) {
+    if(str.length() > 1 && (str.front() == key)) {
         if(str.front() == str.back()) {
             str.pop_back();
             str.erase(str.begin(), str.begin() + 1);
@@ -180,37 +191,220 @@ find_member(std::string name, const std::vector<std::string> names, bool ignore_
     return (it != std::end(names)) ? (it - std::begin(names)) : (-1);
 }
 
+static const std::string escapedChars("\b\t\n\f\r\"\\");
+static const std::string escapedCharsCode("btnfr\"\\");
+static const std::string bracketChars{"\"'`[(<{"};
+static const std::string matchBracketChars("\"'`])>}");
+
+CLI11_INLINE bool has_escapable_character(const std::string &str) {
+    return (str.find_first_of(escapedChars) != std::string::npos);
+}
+
+CLI11_INLINE std::string add_escaped_characters(const std::string &str) {
+    std::string out;
+    out.reserve(str.size() + 4);
+    for(char s : str) {
+        auto sloc = escapedChars.find_first_of(s);
+        if(sloc != std::string::npos) {
+            out.push_back('\\');
+            out.push_back(escapedCharsCode[sloc]);
+        } else {
+            out.push_back(s);
+        }
+    }
+    return out;
+}
+
+CLI11_INLINE std::uint32_t hexConvert(char hc) {
+    int hcode{0};
+    if(hc >= '0' && hc <= '9') {
+        hcode = (hc - '0');
+    } else if(hc >= 'A' && hc <= 'F') {
+        hcode = (hc - 'A' + 10);
+    } else if(hc >= 'a' && hc <= 'f') {
+        hcode = (hc - 'a' + 10);
+    } else {
+        hcode = -1;
+    }
+    return static_cast<uint32_t>(hcode);
+}
+
+CLI11_INLINE char make_char(std::uint32_t code) { return static_cast<char>(static_cast<unsigned char>(code)); }
+
+CLI11_INLINE void append_codepoint(std::string &str, std::uint32_t code) {
+    if(code < 0x80) {  // ascii code equivalent
+        str.push_back(static_cast<char>(code));
+    } else if(code < 0x800) {  // \u0080 to \u07FF
+        // 110yyyyx 10xxxxxx; 0x3f == 0b0011'1111
+        str.push_back(make_char(0xC0 | code >> 6));
+        str.push_back(make_char(0x80 | (code & 0x3F)));
+    } else if(code < 0x10000) {  // U+0800...U+FFFF
+        if(0xD800 <= code && code <= 0xDFFF) {
+            throw std::invalid_argument("[0xD800, 0xDFFF] are not valid UTF-8.");
+        }
+        // 1110yyyy 10yxxxxx 10xxxxxx
+        str.push_back(make_char(0xE0 | code >> 12));
+        str.push_back(make_char(0x80 | (code >> 6 & 0x3F)));
+        str.push_back(make_char(0x80 | (code & 0x3F)));
+    } else if(code < 0x110000) {  // U+010000 ... U+10FFFF
+        // 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
+        str.push_back(make_char(0xF0 | code >> 18));
+        str.push_back(make_char(0x80 | (code >> 12 & 0x3F)));
+        str.push_back(make_char(0x80 | (code >> 6 & 0x3F)));
+        str.push_back(make_char(0x80 | (code & 0x3F)));
+    }
+}
+
+CLI11_INLINE std::string remove_escaped_characters(const std::string &str) {
+
+    std::string out;
+    out.reserve(str.size());
+    for(auto loc = str.begin(); loc < str.end(); ++loc) {
+        if(*loc == '\\') {
+            if(str.end() - loc < 2) {
+                throw std::invalid_argument("invalid escape sequence " + str);
+            }
+            auto ecloc = escapedCharsCode.find_first_of(*(loc + 1));
+            if(ecloc != std::string::npos) {
+                out.push_back(escapedChars[ecloc]);
+                ++loc;
+            } else if(*(loc + 1) == 'u') {
+                // must have 4 hex characters
+                if(str.end() - loc < 6) {
+                    throw std::invalid_argument("unicode sequence must have 4 hex codes " + str);
+                }
+                std::uint32_t code{0};
+                std::uint32_t mplier{16 * 16 * 16};
+                for(int ii = 2; ii < 6; ++ii) {
+                    std::uint32_t res = hexConvert(*(loc + ii));
+                    if(res > 0x0F) {
+                        throw std::invalid_argument("unicode sequence must have 4 hex codes " + str);
+                    }
+                    code += res * mplier;
+                    mplier = mplier / 16;
+                }
+                append_codepoint(out, code);
+                loc += 5;
+            } else if(*(loc + 1) == 'U') {
+                // must have 8 hex characters
+                if(str.end() - loc < 10) {
+                    throw std::invalid_argument("unicode sequence must have 8 hex codes " + str);
+                }
+                std::uint32_t code{0};
+                std::uint32_t mplier{16 * 16 * 16 * 16 * 16 * 16 * 16};
+                for(int ii = 2; ii < 10; ++ii) {
+                    std::uint32_t res = hexConvert(*(loc + ii));
+                    if(res > 0x0F) {
+                        throw std::invalid_argument("unicode sequence must have 8 hex codes " + str);
+                    }
+                    code += res * mplier;
+                    mplier = mplier / 16;
+                }
+                append_codepoint(out, code);
+                loc += 9;
+            } else if(*(loc + 1) == '0') {
+                out.push_back('\0');
+                ++loc;
+            } else {
+                throw std::invalid_argument(std::string("unrecognized escape sequence \\") + *(loc + 1) + " in " + str);
+            }
+        } else {
+            out.push_back(*loc);
+        }
+    }
+    return out;
+}
+
+CLI11_INLINE std::size_t close_string_quote(const std::string &str, std::size_t start, char closure_char) {
+    std::size_t loc{0};
+    for(loc = start + 1; loc < str.size(); ++loc) {
+        if(str[loc] == closure_char) {
+            break;
+        }
+        if(str[loc] == '\\') {
+            // skip the next character for escaped sequences
+            ++loc;
+        }
+    }
+    return loc;
+}
+
+CLI11_INLINE std::size_t close_literal_quote(const std::string &str, std::size_t start, char closure_char) {
+    auto loc = str.find_first_of(closure_char, start + 1);
+    return (loc != std::string::npos ? loc : str.size());
+}
+
+CLI11_INLINE std::size_t close_sequence(const std::string &str, std::size_t start, char closure_char) {
+
+    auto bracket_loc = matchBracketChars.find(closure_char);
+    switch(bracket_loc) {
+    case 0:
+        return close_string_quote(str, start, closure_char);
+    case 1:
+    case 2:
+    case std::string::npos:
+        return close_literal_quote(str, start, closure_char);
+    default:
+        break;
+    }
+
+    std::string closures(1, closure_char);
+    auto loc = start + 1;
+
+    while(loc < str.size()) {
+        if(str[loc] == closures.back()) {
+            closures.pop_back();
+            if(closures.empty()) {
+                return loc;
+            }
+        }
+        bracket_loc = bracketChars.find(str[loc]);
+        if(bracket_loc != std::string::npos) {
+            switch(bracket_loc) {
+            case 0:
+                loc = close_string_quote(str, loc, str[loc]);
+                break;
+            case 1:
+            case 2:
+                loc = close_literal_quote(str, loc, str[loc]);
+                break;
+            default:
+                closures.push_back(matchBracketChars[bracket_loc]);
+                break;
+            }
+        }
+        ++loc;
+    }
+    if(loc > str.size()) {
+        loc = str.size();
+    }
+    return loc;
+}
+
 CLI11_INLINE std::vector<std::string> split_up(std::string str, char delimiter) {
 
-    const std::string delims("\'\"`");
     auto find_ws = [delimiter](char ch) {
         return (delimiter == '\0') ? std::isspace<char>(ch, std::locale()) : (ch == delimiter);
     };
     trim(str);
 
     std::vector<std::string> output;
-    bool embeddedQuote = false;
-    char keyChar = ' ';
     while(!str.empty()) {
-        if(delims.find_first_of(str[0]) != std::string::npos) {
-            keyChar = str[0];
-            auto end = str.find_first_of(keyChar, 1);
-            while((end != std::string::npos) && (str[end - 1] == '\\')) {  // deal with escaped quotes
-                end = str.find_first_of(keyChar, end + 1);
-                embeddedQuote = true;
-            }
-            if(end != std::string::npos) {
-                output.push_back(str.substr(1, end - 1));
+        if(bracketChars.find_first_of(str[0]) != std::string::npos) {
+            auto bracketLoc = bracketChars.find_first_of(str[0]);
+            auto end = close_sequence(str, 0, matchBracketChars[bracketLoc]);
+            if(end >= str.size()) {
+                output.push_back(std::move(str));
+                str.clear();
+            } else {
+                output.push_back(str.substr(0, end + 1));
                 if(end + 2 < str.size()) {
                     str = str.substr(end + 2);
                 } else {
                     str.clear();
                 }
-
-            } else {
-                output.push_back(str.substr(1));
-                str = "";
             }
+
         } else {
             auto it = std::find_if(std::begin(str), std::end(str), find_ws);
             if(it != std::end(str)) {
@@ -219,14 +413,9 @@ CLI11_INLINE std::vector<std::string> split_up(std::string str, char delimiter)
                 str = std::string(it + 1, str.end());
             } else {
                 output.push_back(str);
-                str = "";
+                str.clear();
             }
         }
-        // transform any embedded quotes into the regular character
-        if(embeddedQuote) {
-            output.back() = find_and_replace(output.back(), std::string("\\") + keyChar, std::string(1, keyChar));
-            embeddedQuote = false;
-        }
         trim(str);
     }
     return output;
@@ -244,15 +433,140 @@ CLI11_INLINE std::size_t escape_detect(std::string &str, std::size_t offset) {
     return offset + 1;
 }
 
-CLI11_INLINE std::string &add_quotes_if_needed(std::string &str) {
-    if((str.front() != '"' && str.front() != '\'') || str.front() != str.back()) {
-        char quote = str.find('"') < str.find('\'') ? '\'' : '"';
-        if(str.find(' ') != std::string::npos) {
-            str.insert(0, 1, quote);
-            str.append(1, quote);
+CLI11_INLINE std::string binary_escape_string(const std::string &string_to_escape) {
+    // s is our escaped output string
+    std::string escaped_string{};
+    // loop through all characters
+    for(char c : string_to_escape) {
+        // check if a given character is printable
+        // the cast is necessary to avoid undefined behaviour
+        if(isprint(static_cast<unsigned char>(c)) == 0) {
+            std::stringstream stream;
+            // if the character is not printable
+            // we'll convert it to a hex string using a stringstream
+            // note that since char is signed we have to cast it to unsigned first
+            stream << std::hex << static_cast<unsigned int>(static_cast<unsigned char>(c));
+            std::string code = stream.str();
+            escaped_string += std::string("\\x") + (code.size() < 2 ? "0" : "") + code;
+
+        } else {
+            escaped_string.push_back(c);
         }
     }
-    return str;
+    if(escaped_string != string_to_escape) {
+        auto sqLoc = escaped_string.find('\'');
+        while(sqLoc != std::string::npos) {
+            escaped_string.replace(sqLoc, sqLoc + 1, "\\x27");
+            sqLoc = escaped_string.find('\'');
+        }
+        escaped_string.insert(0, "'B\"(");
+        escaped_string.push_back(')');
+        escaped_string.push_back('"');
+        escaped_string.push_back('\'');
+    }
+    return escaped_string;
+}
+
+CLI11_INLINE bool is_binary_escaped_string(const std::string &escaped_string) {
+    size_t ssize = escaped_string.size();
+    if(escaped_string.compare(0, 3, "B\"(") == 0 && escaped_string.compare(ssize - 2, 2, ")\"") == 0) {
+        return true;
+    }
+    return (escaped_string.compare(0, 4, "'B\"(") == 0 && escaped_string.compare(ssize - 3, 3, ")\"'") == 0);
+}
+
+CLI11_INLINE std::string extract_binary_string(const std::string &escaped_string) {
+    std::size_t start{0};
+    std::size_t tail{0};
+    size_t ssize = escaped_string.size();
+    if(escaped_string.compare(0, 3, "B\"(") == 0 && escaped_string.compare(ssize - 2, 2, ")\"") == 0) {
+        start = 3;
+        tail = 2;
+    } else if(escaped_string.compare(0, 4, "'B\"(") == 0 && escaped_string.compare(ssize - 3, 3, ")\"'") == 0) {
+        start = 4;
+        tail = 3;
+    }
+
+    if(start == 0) {
+        return escaped_string;
+    }
+    std::string outstring;
+
+    outstring.reserve(ssize - start - tail);
+    std::size_t loc = start;
+    while(loc < ssize - tail) {
+        // ssize-2 to skip )" at the end
+        if(escaped_string[loc] == '\\' && (escaped_string[loc + 1] == 'x' || escaped_string[loc + 1] == 'X')) {
+            auto c1 = escaped_string[loc + 2];
+            auto c2 = escaped_string[loc + 3];
+
+            std::uint32_t res1 = hexConvert(c1);
+            std::uint32_t res2 = hexConvert(c2);
+            if(res1 <= 0x0F && res2 <= 0x0F) {
+                loc += 4;
+                outstring.push_back(static_cast<char>(res1 * 16 + res2));
+                continue;
+            }
+        }
+        outstring.push_back(escaped_string[loc]);
+        ++loc;
+    }
+    return outstring;
+}
+
+CLI11_INLINE void remove_quotes(std::vector<std::string> &args) {
+    for(auto &arg : args) {
+        if(arg.front() == '\"' && arg.back() == '\"') {
+            remove_quotes(arg);
+            // only remove escaped for string arguments not literal strings
+            arg = remove_escaped_characters(arg);
+        } else {
+            remove_quotes(arg);
+        }
+    }
+}
+
+CLI11_INLINE bool process_quoted_string(std::string &str, char string_char, char literal_char) {
+    if(str.size() <= 1) {
+        return false;
+    }
+    if(detail::is_binary_escaped_string(str)) {
+        str = detail::extract_binary_string(str);
+        return true;
+    }
+    if(str.front() == string_char && str.back() == string_char) {
+        detail::remove_outer(str, string_char);
+        if(str.find_first_of('\\') != std::string::npos) {
+            str = detail::remove_escaped_characters(str);
+        }
+        return true;
+    }
+    if((str.front() == literal_char || str.front() == '`') && str.back() == str.front()) {
+        detail::remove_outer(str, str.front());
+        return true;
+    }
+    return false;
+}
+
+std::string get_environment_value(const std::string &env_name) {
+    char *buffer = nullptr;
+    std::string ename_string;
+
+#ifdef _MSC_VER
+    // Windows version
+    std::size_t sz = 0;
+    if(_dupenv_s(&buffer, &sz, env_name.c_str()) == 0 && buffer != nullptr) {
+        ename_string = std::string(buffer);
+        free(buffer);
+    }
+#else
+    // This also works on Windows, but gives a warning
+    buffer = std::getenv(env_name.c_str());
+    if(buffer != nullptr) {
+        ename_string = std::string(buffer);
+    }
+#endif
+    return ename_string;
 }
 
 }  // namespace detail
diff --git a/packages/CLI11/include/CLI/impl/Validators_inl.hpp b/packages/CLI11/include/CLI/impl/Validators_inl.hpp
index a2295ecdf8954d63cede377731710397ecd85de3..bf73e21a9ab53daacd7d3761970530338c85153c 100644
--- a/packages/CLI11/include/CLI/impl/Validators_inl.hpp
+++ b/packages/CLI11/include/CLI/impl/Validators_inl.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -135,7 +135,7 @@ CLI11_INLINE path_type check_path(const char *file) noexcept {
     switch(stat.type()) {
     case std::filesystem::file_type::none:  // LCOV_EXCL_LINE
     case std::filesystem::file_type::not_found:
-        return path_type::nonexistent;
+        return path_type::nonexistent;  // LCOV_EXCL_LINE
     case std::filesystem::file_type::directory:
         return path_type::directory;
     case std::filesystem::file_type::symlink:
@@ -229,10 +229,29 @@ CLI11_INLINE IPV4Validator::IPV4Validator() : Validator("IPV4") {
                 return std::string("Each IP number must be between 0 and 255 ") + var;
             }
         }
-        return std::string();
+        return std::string{};
     };
 }
 
+CLI11_INLINE EscapedStringTransformer::EscapedStringTransformer() {
+    func_ = [](std::string &str) {
+        try {
+            if(str.size() > 1 && (str.front() == '\"' || str.front() == '\'' || str.front() == '`') &&
+               str.front() == str.back()) {
+                process_quoted_string(str);
+            } else if(str.find_first_of('\\') != std::string::npos) {
+                if(detail::is_binary_escaped_string(str)) {
+                    str = detail::extract_binary_string(str);
+                } else {
+                    str = remove_escaped_characters(str);
+                }
+            }
+            return std::string{};
+        } catch(const std::invalid_argument &ia) {
+            return std::string(ia.what());
+        }
+    };
+}
 }  // namespace detail
 
 CLI11_INLINE FileOnDefaultPath::FileOnDefaultPath(std::string default_path, bool enableErrorReturn)
diff --git a/packages/CLI11/src/CMakeLists.txt b/packages/CLI11/src/CMakeLists.txt
index 4f7af6ad190295688e2cdad3d8bd6fa7845ede5a..f62c895c2e7e19527b48b092d8183240c8f8f0a4 100644
--- a/packages/CLI11/src/CMakeLists.txt
+++ b/packages/CLI11/src/CMakeLists.txt
@@ -112,6 +112,10 @@ if(CLI11_SINGLE_FILE)
   if(CLI11_INSTALL)
     install(FILES "${PROJECT_BINARY_DIR}/include/CLI11.hpp"
             DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+    configure_file("${CLI11_SOURCE_DIR}/cmake/CLIsingle.hpp.in"
+                   "${PROJECT_BINARY_DIR}/include/CLI/CLI.hpp" @ONLY)
+    install(FILES "${PROJECT_BINARY_DIR}/include/CLI/CLI.hpp"
+            DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/CLI)
   endif()
   add_library(CLI11_SINGLE INTERFACE)
   target_link_libraries(CLI11_SINGLE INTERFACE CLI11)
@@ -129,7 +133,7 @@ if(CLI11_INSTALL)
   if(NOT CLI11_SINGLE_FILE)
     install(FILES ${CLI11_headers} ${CLI11_library_headers}
             DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/CLI")
-    if(NOT CLI11_COMPILE)
+    if(NOT CLI11_PRECOMPILED)
       install(FILES ${CLI11_impl_headers} DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/CLI/impl")
     endif()
   endif()
diff --git a/packages/CLI11/src/Precompile.cpp b/packages/CLI11/src/Precompile.cpp
index 5afd54cb99b1a9511a36e7b30de3a5317d1d8bad..effb5f36e020645432f25fe00280e32e82ad8a88 100644
--- a/packages/CLI11/src/Precompile.cpp
+++ b/packages/CLI11/src/Precompile.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/tests/AppTest.cpp b/packages/CLI11/tests/AppTest.cpp
index 2cdefc4ee61d0770802c491d698e52c9a8b8e4a3..4e716a8e3075f24940fe3f88cbdd093984bbe159 100644
--- a/packages/CLI11/tests/AppTest.cpp
+++ b/packages/CLI11/tests/AppTest.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -11,6 +11,8 @@
 #include <complex>
 #include <cstdint>
 #include <cstdlib>
+#include <limits>
+#include <map>
 
 TEST_CASE_METHOD(TApp, "OneFlagShort", "[app]") {
     app.add_flag("-c,--count");
@@ -26,7 +28,7 @@ TEST_CASE_METHOD(TApp, "OneFlagShortValues", "[app]") {
     run();
     CHECK(app.count("-c") == 1u);
     CHECK(app.count("--count") == 1u);
-    auto v = app["-c"]->results();
+    const auto &v = app["-c"]->results();
     CHECK("v1" == v[0]);
 
     CHECK_THROWS_AS(app["--invalid"], CLI::OptionNotFound);
@@ -53,10 +55,21 @@ TEST_CASE_METHOD(TApp, "OneFlagShortValuesAs", "[app]") {
     auto vec = opt->as<std::vector<int>>();
     CHECK(1 == vec[0]);
     CHECK(2 == vec[1]);
+
+    flg->multi_option_policy(CLI::MultiOptionPolicy::Sum);
+    vec = opt->as<std::vector<int>>();
+    CHECK(3 == vec[0]);
+    CHECK(vec.size() == 1);
+
     flg->multi_option_policy(CLI::MultiOptionPolicy::Join);
     CHECK("1\n2" == opt->as<std::string>());
     flg->delimiter(',');
     CHECK("1,2" == opt->as<std::string>());
+    flg->multi_option_policy(CLI::MultiOptionPolicy::Reverse)->expected(1, 300);
+    vec = opt->as<std::vector<int>>();
+    REQUIRE(vec.size() == 2U);
+    CHECK(2 == vec[0]);
+    CHECK(1 == vec[1]);
 }
 
 TEST_CASE_METHOD(TApp, "OneFlagShortWindows", "[app]") {
@@ -68,28 +81,6 @@ TEST_CASE_METHOD(TApp, "OneFlagShortWindows", "[app]") {
     CHECK(app.count("--count") == 1u);
 }
 
-TEST_CASE_METHOD(TApp, "WindowsLongShortMix1", "[app]") {
-    app.allow_windows_style_options();
-
-    auto *a = app.add_flag("-c");
-    auto *b = app.add_flag("--c");
-    args = {"/c"};
-    run();
-    CHECK(a->count() == 1u);
-    CHECK(b->count() == 0u);
-}
-
-TEST_CASE_METHOD(TApp, "WindowsLongShortMix2", "[app]") {
-    app.allow_windows_style_options();
-
-    auto *a = app.add_flag("--c");
-    auto *b = app.add_flag("-c");
-    args = {"/c"};
-    run();
-    CHECK(a->count() == 1u);
-    CHECK(b->count() == 0u);
-}
-
 TEST_CASE_METHOD(TApp, "CountNonExist", "[app]") {
     app.add_flag("-c,--count");
     args = {"-c"};
@@ -424,10 +415,10 @@ TEST_CASE_METHOD(TApp, "OneStringEqualVersionSingleStringQuotedEscapedCharacters
     app.add_option("-s,--string", str);
     app.add_option("-t,--tstr", str2);
     app.add_option("-m,--mstr", str3);
-    app.parse(R"raw(--string="this is my \"quoted\" string" -t 'qst\'ring 2' -m=`"quoted\` string"`")raw");
-    CHECK("this is my \"quoted\" string" == str);
-    CHECK("qst\'ring 2" == str2);
-    CHECK("\"quoted` string\"" == str3);
+    app.parse(R"raw(--string="this is my \n\"quoted\" string" -t 'qst\ring 2' -m=`"quoted\n string"`")raw");
+    CHECK("this is my \n\"quoted\" string" == str);  // escaped
+    CHECK("qst\\ring 2" == str2);                    // literal
+    CHECK("\"quoted\\n string\"" == str3);           // double quoted literal
 }
 
 TEST_CASE_METHOD(TApp, "OneStringEqualVersionSingleStringQuotedMultipleWithEqual", "[app]") {
@@ -640,6 +631,28 @@ TEST_CASE_METHOD(TApp, "StrangeOptionNames", "[app]") {
     CHECK(app["--{}"]->as<int>() == 5);
 }
 
+TEST_CASE_METHOD(TApp, "singledash", "[app]") {
+    app.add_option("-t");
+    try {
+        app.add_option("-test");
+    } catch(const CLI::BadNameString &e) {
+        std::string str = e.what();
+        CHECK_THAT(str, Contains("2 dashes"));
+        CHECK_THAT(str, Contains("-test"));
+    } catch(...) {
+        CHECK(false);
+    }
+    try {
+        app.add_option("-!");
+    } catch(const CLI::BadNameString &e) {
+        std::string str = e.what();
+        CHECK_THAT(str, Contains("one char"));
+        CHECK_THAT(str, Contains("-!"));
+    } catch(...) {
+        CHECK(false);
+    }
+}
+
 TEST_CASE_METHOD(TApp, "FlagLikeOption", "[app]") {
     bool val{false};
     auto *opt = app.add_option("--flag", val)->type_size(0)->default_str("true");
@@ -828,7 +841,7 @@ TEST_CASE_METHOD(TApp, "SumOptFloat", "[app]") {
 
     run();
 
-    CHECK(0.6 == val);
+    CHECK(std::fabs(0.6 - val) <= std::numeric_limits<double>::epsilon());
 }
 
 TEST_CASE_METHOD(TApp, "SumOptString", "[app]") {
@@ -843,6 +856,29 @@ TEST_CASE_METHOD(TApp, "SumOptString", "[app]") {
     CHECK("i2" == val);
 }
 
+TEST_CASE_METHOD(TApp, "ReverseOpt", "[app]") {
+
+    std::vector<std::string> val;
+    auto *opt1 = app.add_option("--val", val)->multi_option_policy(CLI::MultiOptionPolicy::Reverse);
+
+    args = {"--val=string1", "--val=string2", "--val", "string3", "string4"};
+
+    run();
+
+    CHECK(val.size() == 4U);
+
+    CHECK(val.front() == "string4");
+    CHECK(val.back() == "string1");
+
+    opt1->expected(1, 2);
+    run();
+    CHECK(val.size() == 2U);
+
+    CHECK(val.front() == "string4");
+    CHECK(val.back() == "string3");
+    CHECK(opt1->get_multi_option_policy() == CLI::MultiOptionPolicy::Reverse);
+}
+
 TEST_CASE_METHOD(TApp, "JoinOpt2", "[app]") {
 
     std::string str;
@@ -1044,6 +1080,42 @@ TEST_CASE_METHOD(TApp, "emptyVectorReturn", "[app]") {
     CHECK_FALSE(strs3.empty());
 }
 
+TEST_CASE_METHOD(TApp, "emptyVectorReturnReduce", "[app]") {
+
+    std::vector<std::string> strs;
+    std::vector<std::string> strs2;
+    std::vector<std::string> strs3;
+    auto *opt1 = app.add_option("--str", strs)->required()->expected(0, 2);
+    app.add_option("--str3", strs3)->expected(1, 3);
+    app.add_option("--str2", strs2)->expected(1, 1)->take_first();
+    args = {"--str"};
+
+    CHECK_NOTHROW(run());
+    CHECK(std::vector<std::string>({""}) == strs);
+    args = {"--str", "one", "two"};
+
+    run();
+
+    CHECK(std::vector<std::string>({"one", "two"}) == strs);
+
+    args = {"--str", "{}", "--str2", "{}", "test"};
+
+    run();
+
+    CHECK(strs.empty());
+    CHECK(std::vector<std::string>{"{}"} == strs2);
+    opt1->default_str("{}");
+    args = {"--str"};
+
+    CHECK_NOTHROW(run());
+    CHECK(strs.empty());
+    opt1->required(false);
+    args = {"--str3", "{}"};
+
+    CHECK_NOTHROW(run());
+    CHECK_FALSE(strs3.empty());
+}
+
 TEST_CASE_METHOD(TApp, "RequiredOptsDoubleShort", "[app]") {
 
     std::vector<std::string> strs;
@@ -1118,6 +1190,21 @@ TEST_CASE_METHOD(TApp, "PositionalAtEnd", "[app]") {
     CHECK_THROWS_AS(run(), CLI::ExtrasError);
 }
 
+// Tests positionals at end
+TEST_CASE_METHOD(TApp, "PositionalInjectSeparator", "[app]") {
+    std::string options;
+    std::vector<std::vector<std::string>> foo;
+
+    app.add_option("-O", options);
+    auto *fooopt = app.add_option("foo", foo);
+    fooopt->inject_separator();
+    args = {"test1", "-O", "Test", "test2"};
+    run();
+
+    CHECK("Test" == options);
+    CHECK(foo.size() == 2U);
+}
+
 // Tests positionals at end
 TEST_CASE_METHOD(TApp, "RequiredPositionals", "[app]") {
     std::vector<std::string> sources;
@@ -1715,6 +1802,30 @@ TEST_CASE_METHOD(TApp, "FileExists", "[app]") {
     CHECK(!CLI::ExistingFile(myfile).empty());
 }
 
+#if defined CLI11_HAS_FILESYSTEM && CLI11_HAS_FILESYSTEM > 0 && defined(_MSC_VER)
+TEST_CASE_METHOD(TApp, "filesystemWideName", "[app]") {
+    std::filesystem::path myfile{L"voil\u20ac.txt"};
+
+    std::filesystem::path fpath;
+    app.add_option("--file", fpath)->check(CLI::ExistingFile, "existing file");
+
+    CHECK_THROWS_AS(app.parse(L"--file voil\u20ac.txt"), CLI::ValidationError);
+
+    bool ok = static_cast<bool>(std::ofstream(myfile).put('a'));  // create file
+    CHECK(ok);
+
+    // deactivate the check, so it should run now
+
+    CHECK_NOTHROW(app.parse(L"--file voil\u20ac.txt"));
+
+    CHECK(fpath == myfile);
+
+    CHECK(std::filesystem::exists(fpath));
+    std::filesystem::remove(myfile);
+    CHECK(!std::filesystem::exists(fpath));
+}
+#endif
+
 TEST_CASE_METHOD(TApp, "NotFileExists", "[app]") {
     std::string myfile{"TestNonFileNotUsed.txt"};
     CHECK(!CLI::ExistingFile(myfile).empty());
@@ -1991,6 +2102,28 @@ TEST_CASE_METHOD(TApp, "RangeDouble", "[app]") {
     run();
 }
 
+TEST_CASE_METHOD(TApp, "RangeFloat", "[app]") {
+
+    float x{0.0f};
+    /// Note that this must be a float in Range, too
+    app.add_option("--one", x, "testing floats")->check(CLI::Range(3.0, 6.0));
+
+    args = {"--one=1"};
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
+
+    args = {"--one=7"};
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
+
+    args = {"--one=3"};
+    run();
+
+    args = {"--one=5"};
+    run();
+
+    args = {"--one=6"};
+    run();
+}
+
 TEST_CASE_METHOD(TApp, "NonNegative", "[app]") {
 
     std::string res;
@@ -2512,3 +2645,39 @@ TEST_CASE("System Args", "[app]") {
         FAIL("Executable '" << commandline << "' failed with an unknown return code");
     }
 }
+
+// #845
+TEST_CASE("Ensure UTF-8", "[app]") {
+    const char *commandline = CLI11_ENSURE_UTF8_EXE " 1234 false \"hello world\"";
+    int retval = std::system(commandline);
+
+    if(retval == -1) {
+        FAIL("Executable '" << commandline << "' reported that argv pointer changed where it should not have been");
+    }
+
+    if(retval > 0) {
+        FAIL("Executable '" << commandline << "' reported different argv at index " << (retval - 1));
+    }
+
+    if(retval != 0) {
+        FAIL("Executable '" << commandline << "' failed with an unknown return code");
+    }
+}
+
+// #845
+TEST_CASE("Ensure UTF-8 called twice", "[app]") {
+    const char *commandline = CLI11_ENSURE_UTF8_TWICE_EXE " 1234 false \"hello world\"";
+    int retval = std::system(commandline);
+
+    if(retval == -1) {
+        FAIL("Executable '" << commandline << "' reported that argv pointer changed where it should not have been");
+    }
+
+    if(retval > 0) {
+        FAIL("Executable '" << commandline << "' reported different argv at index " << (retval - 1));
+    }
+
+    if(retval != 0) {
+        FAIL("Executable '" << commandline << "' failed with an unknown return code");
+    }
+}
diff --git a/packages/CLI11/tests/BoostOptionTypeTest.cpp b/packages/CLI11/tests/BoostOptionTypeTest.cpp
index 1dabc37da9204b754bca3dc923a4e6a431e65507..9b0ff3c6e2e2d8fc3baf4056472ce4877f50aafc 100644
--- a/packages/CLI11/tests/BoostOptionTypeTest.cpp
+++ b/packages/CLI11/tests/BoostOptionTypeTest.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/tests/CMakeLists.txt b/packages/CLI11/tests/CMakeLists.txt
index 7bd47744cb7e6e625b55e6fccad83faf4eedf565..621584807298930e590e2e17fa5048530d471aa9 100644
--- a/packages/CLI11/tests/CMakeLists.txt
+++ b/packages/CLI11/tests/CMakeLists.txt
@@ -1,10 +1,10 @@
-if(CLI11_SANITIZERS)
+if(CLI11_SANITIZERS AND ${CMAKE_VERSION} VERSION_GREATER "3.13.0")
   message(STATUS "Using arsenm/sanitizers-cmake")
   FetchContent_Declare(
     sanitizers
     GIT_REPOSITORY https://github.com/arsenm/sanitizers-cmake.git
     GIT_SHALLOW 1
-    GIT_TAG c3dc841)
+    GIT_TAG 3f0542e)
 
   FetchContent_GetProperties(sanitizers)
 
@@ -115,16 +115,15 @@ foreach(DATA_FILE IN LISTS DATA_FILES)
             "${CMAKE_CURRENT_BINARY_DIR}/${DATA_FILE}"
     MAIN_DEPENDENCY "${CMAKE_CURRENT_SOURCE_DIR}/${DATA_FILE}"
     VERBATIM)
-  target_sources(catch_main PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/${DATA_FILE}")
 endforeach()
+add_custom_target(cli11_test_data DEPENDS ${DATA_FILES})
 
 # Build dependent applications which are launched from test code
-set(CLI11_DEPENDENT_APPLICATIONS system_args)
+set(CLI11_DEPENDENT_APPLICATIONS system_args ensure_utf8 ensure_utf8_twice)
 
 foreach(APP IN LISTS CLI11_DEPENDENT_APPLICATIONS)
   add_executable(${APP} applications/${APP}.cpp)
   target_include_directories(${APP} PRIVATE ${CMAKE_SOURCE_DIR}/include)
-  add_dependencies(catch_main ${APP})
 endforeach()
 
 function(add_dependent_application_definitions TARGET)
@@ -138,6 +137,7 @@ endfunction()
 # Target must already exist
 macro(add_catch_test TESTNAME)
   target_link_libraries(${TESTNAME} PUBLIC catch_main)
+  add_dependencies(${TESTNAME} cli11_test_data)
 
   add_test(${TESTNAME} ${TESTNAME})
   set_target_properties(${TESTNAME} PROPERTIES FOLDER "Tests")
@@ -277,3 +277,80 @@ if(CMAKE_BUILD_TYPE STREQUAL Coverage)
     ${CLI11_TESTS}
     ${CLI11_MULTIONLY_TESTS})
 endif()
+
+set(CLI11_PACKAGE_SEARCH_LOC "")
+
+# tests of the cmake package and pkg-config package
+if(CLI11_INSTALL_PACKAGE_TESTS)
+  if(NOT MSVC)
+    set(package_test_command --test-command "${CMAKE_CTEST_COMMAND}")
+  else() # don't try to run the tests on MSVC since that would require copying the dll's and doing
+    # some other setup that isn't that important to run on all OS
+    set(package_test_command)
+  endif()
+
+  if(CMAKE_BUILD_TYPE)
+    set(CLI11_PACKAGE_TEST_BUILD_TYPE ${CMAKE_BUILD_TYPE})
+  else()
+    set(CLI11_PACKAGE_TEST_BUILD_TYPE Release)
+  endif()
+
+  file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/find_package_tests")
+
+  if(MSVC AND ${CMAKE_VERSION} VERSION_GREATER 3.12.9)
+    # Tests for other CMake projects including and using CLI11 using find_package
+    add_test(
+      NAME find-package-testsA
+      COMMAND
+        ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" -A "${CMAKE_GENERATOR_PLATFORM}"
+        "-DCLI11_DIR=${CMAKE_INSTALL_PREFIX}" "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
+        "-DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD}"
+        "-DCMAKE_RUNTIME_OUTPUT_DIRECTORY=${CMAKE_RUNTIME_OUTPUT_DIRECTORY}"
+        "${CMAKE_CURRENT_SOURCE_DIR}/find_package_tests"
+      WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/find_package_tests")
+  else()
+    add_test(
+      NAME find-package-testsA
+      COMMAND
+        ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" "-DCLI11_DIR=${CMAKE_INSTALL_PREFIX}"
+        "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" "-DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD}"
+        "-DCMAKE_RUNTIME_OUTPUT_DIRECTORY=${CMAKE_RUNTIME_OUTPUT_DIRECTORY}"
+        "${CMAKE_CURRENT_SOURCE_DIR}/find_package_tests"
+      WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/find_package_tests")
+  endif()
+
+  add_test(NAME find-package-testsB
+           COMMAND ${CMAKE_COMMAND} --build "${CMAKE_CURRENT_BINARY_DIR}/find_package_tests"
+                   --config ${CLI11_PACKAGE_TEST_BUILD_TYPE})
+
+  add_test(
+    NAME find-package-testsC
+    COMMAND ${CMAKE_CTEST_COMMAND} -C ${CLI11_PACKAGE_TEST_BUILD_TYPE}
+    WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/find_package_tests")
+  set_property(TEST find-package-testsA PROPERTY LABELS Packaging)
+  set_property(TEST find-package-testsB PROPERTY LABELS Packaging)
+  set_property(TEST find-package-testsB PROPERTY DEPENDS find-package-testsA)
+  set_property(TEST find-package-testsC PROPERTY LABELS Packaging)
+  set_property(TEST find-package-testsC PROPERTY DEPENDS find-package-testsB)
+
+  if(NOT MSVC)
+    # Tests for other CMake projects using the package_config files
+    add_test(
+      package-config-tests
+      ${CMAKE_CTEST_COMMAND}
+      -C
+      --build-and-test
+      "${CMAKE_CURRENT_SOURCE_DIR}/package_config_tests"
+      "${CMAKE_CURRENT_BINARY_DIR}/package_config_tests"
+      --build-generator
+      "${CMAKE_GENERATOR}"
+      --build-generator-platform
+      "${CMAKE_GENERATOR_PLATFORM}"
+      --build-options
+      "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
+      "-DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD}"
+      "-DCLI11_DIR=${CMAKE_INSTALL_PREFIX}"
+      ${package_test_command})
+    set_property(TEST package-config-tests PROPERTY LABELS Packaging)
+  endif()
+endif()
diff --git a/packages/CLI11/tests/ComplexTypeTest.cpp b/packages/CLI11/tests/ComplexTypeTest.cpp
index adcd26c4b24da2a5d4205a8a3d262b0d8c237dcc..4747f64fca3595951f11e9f9045e28b55d802be0 100644
--- a/packages/CLI11/tests/ComplexTypeTest.cpp
+++ b/packages/CLI11/tests/ComplexTypeTest.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/tests/ConfigFileTest.cpp b/packages/CLI11/tests/ConfigFileTest.cpp
index 206872728c1734d1632000d9f9c208cb23839060..708e71ee224d02fb6876d0b82ae644c61c434214 100644
--- a/packages/CLI11/tests/ConfigFileTest.cpp
+++ b/packages/CLI11/tests/ConfigFileTest.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -27,6 +27,15 @@ TEST_CASE("StringBased: convert_arg_for_ini", "[config]") {
     CHECK("-22E14" == CLI::detail::convert_arg_for_ini("-22E14"));
 
     CHECK("'a'" == CLI::detail::convert_arg_for_ini("a"));
+
+    CHECK("'\\'" == CLI::detail::convert_arg_for_ini("\\"));
+
+    CHECK("\"'\"" == CLI::detail::convert_arg_for_ini("'"));
+
+    std::string tstring1;
+    tstring1.push_back('\0');
+    // binary string conversion single character
+    CHECK("'B\"(\\x00)\"'" == CLI::detail::convert_arg_for_ini(tstring1));
     // hex
     CHECK("0x5461FAED" == CLI::detail::convert_arg_for_ini("0x5461FAED"));
     // hex fail
@@ -84,6 +93,7 @@ TEST_CASE("StringBased: FirstWithComments", "[config]") {
     ofile << "one=three\n";
     ofile << "two=four\n";
     ofile << "; and another one\n";
+    ofile << "   ; and yet another one\n";
 
     ofile.seekg(0, std::ios::beg);
 
@@ -187,6 +197,135 @@ TEST_CASE("StringBased: TomlVector", "[config]") {
     CHECK(output.at(4).inputs.at(2) == "three");
 }
 
+TEST_CASE("StringBased: TomlMultiLineString1", "[config]") {
+    std::stringstream ofile;
+
+    ofile << "one = [three]\n";
+    ofile << "two = \"\"\"test\n";
+    ofile << "five = [six, and, seven]\n";
+    ofile << "eight\"\"\"\n";
+    ofile << "three=7    \n";
+
+    ofile.seekg(0, std::ios::beg);
+
+    std::vector<CLI::ConfigItem> output = CLI::ConfigINI().from_config(ofile);
+
+    CHECK(output.size() == 3u);
+    CHECK(output.at(0).name == "one");
+    CHECK(output.at(0).inputs.size() == 1u);
+    CHECK(output.at(0).inputs.at(0) == "three");
+    CHECK(output.at(1).name == "two");
+    CHECK(output.at(1).inputs.size() == 1u);
+    CHECK(output.at(1).inputs.at(0) == "test\nfive = [six, and, seven]\neight");
+    CHECK(output.at(2).name == "three");
+    CHECK(output.at(2).inputs.size() == 1u);
+    CHECK(output.at(2).inputs.at(0) == "7");
+}
+
+TEST_CASE("StringBased: TomlMultiLineString2", "[config]") {
+    std::stringstream ofile;
+
+    ofile << "one = [three]\n";
+    ofile << "two = '''test  \n";
+    ofile << "five = [six, and, seven] \n";
+    ofile << "'''\n";
+    ofile << "three=7    \n";
+
+    ofile.seekg(0, std::ios::beg);
+
+    std::vector<CLI::ConfigItem> output = CLI::ConfigINI().from_config(ofile);
+
+    CHECK(output.size() == 3u);
+    CHECK(output.at(0).name == "one");
+    CHECK(output.at(0).inputs.size() == 1u);
+    CHECK(output.at(0).inputs.at(0) == "three");
+    CHECK(output.at(1).name == "two");
+    CHECK(output.at(1).inputs.size() == 1u);
+    CHECK(output.at(1).inputs.at(0) == "test  \nfive = [six, and, seven] ");
+    CHECK(output.at(2).name == "three");
+    CHECK(output.at(2).inputs.size() == 1u);
+    CHECK(output.at(2).inputs.at(0) == "7");
+}
+
+TEST_CASE("StringBased: TomlMultiLineString3", "[config]") {
+    std::stringstream ofile;
+
+    ofile << "one = [three]\n";
+    ofile << "two = \"\"\"\n";
+    ofile << "test \\\n";
+    ofile << "     five = [six, and, seven] \\\n";
+    ofile << "eight\"\"\"\n";
+    ofile << "three=7    \n";
+
+    ofile.seekg(0, std::ios::beg);
+
+    std::vector<CLI::ConfigItem> output = CLI::ConfigINI().from_config(ofile);
+
+    CHECK(output.size() == 3u);
+    CHECK(output.at(0).name == "one");
+    CHECK(output.at(0).inputs.size() == 1u);
+    CHECK(output.at(0).inputs.at(0) == "three");
+    CHECK(output.at(1).name == "two");
+    CHECK(output.at(1).inputs.size() == 1u);
+    CHECK(output.at(1).inputs.at(0) == "test five = [six, and, seven] eight");
+    CHECK(output.at(2).name == "three");
+    CHECK(output.at(2).inputs.size() == 1u);
+    CHECK(output.at(2).inputs.at(0) == "7");
+}
+
+TEST_CASE("StringBased: TomlMultiLineString4", "[config]") {
+    std::stringstream ofile;
+
+    ofile << "one = [three]\n";
+    ofile << "two = \"\"\"\n";
+    ofile << "test\n";
+    ofile << "five = [six, and, seven]\n";
+    ofile << "\"\"\"\n";
+    ofile << "three=7    \n";
+
+    ofile.seekg(0, std::ios::beg);
+
+    std::vector<CLI::ConfigItem> output = CLI::ConfigINI().from_config(ofile);
+
+    CHECK(output.size() == 3u);
+    CHECK(output.at(0).name == "one");
+    CHECK(output.at(0).inputs.size() == 1u);
+    CHECK(output.at(0).inputs.at(0) == "three");
+    CHECK(output.at(1).name == "two");
+    CHECK(output.at(1).inputs.size() == 1u);
+    CHECK(output.at(1).inputs.at(0) == "test\nfive = [six, and, seven]");
+    CHECK(output.at(2).name == "three");
+    CHECK(output.at(2).inputs.size() == 1u);
+    CHECK(output.at(2).inputs.at(0) == "7");
+}
+
+TEST_CASE("StringBased: TomlMultiLineString5", "[config]") {
+    std::stringstream ofile;
+
+    ofile << "one = [three]\n";
+    ofile << "two = \"\"\" mline \\\n";
+    ofile << "test\n";
+    ofile << '\n';
+    ofile << "five = [six, and, seven]\n";
+    ofile << "\"\"\"\n";
+    ofile << "three=7    \n";
+
+    ofile.seekg(0, std::ios::beg);
+
+    std::vector<CLI::ConfigItem> output = CLI::ConfigINI().from_config(ofile);
+
+    CHECK(output.size() == 3u);
+    CHECK(output.at(0).name == "one");
+    CHECK(output.at(0).inputs.size() == 1u);
+    CHECK(output.at(0).inputs.at(0) == "three");
+    CHECK(output.at(1).name == "two");
+    CHECK(output.at(1).inputs.size() == 1u);
+    CHECK(output.at(1).inputs.at(0) == " mline test\n\nfive = [six, and, seven]");
+    CHECK(output.at(2).name == "three");
+    CHECK(output.at(2).inputs.size() == 1u);
+    CHECK(output.at(2).inputs.at(0) == "7");
+}
+
 TEST_CASE("StringBased: Spaces", "[config]") {
     std::stringstream ofile;
 
@@ -362,6 +501,38 @@ TEST_CASE("StringBased: Layers2LevelChange", "[config]") {
     CHECK(checkSections(output));
 }
 
+TEST_CASE("StringBased: Layers2LevelChangeInQuotes", "[config]") {
+    std::stringstream ofile;
+
+    ofile << "simple = true\n\n";
+    ofile << "[\"other\".\"sub2\".cmd]\n";
+    ofile << "[other.\"sub3\".\"cmd\"]\n";
+    ofile << "absolute_newest = true\n";
+    ofile.seekg(0, std::ios::beg);
+
+    std::vector<CLI::ConfigItem> output = CLI::ConfigINI().from_config(ofile);
+
+    // 2 flags and 5 openings and 5 closings
+    CHECK(output.size() == 12u);
+    CHECK(checkSections(output));
+}
+
+TEST_CASE("StringBased: Layers2LevelChangeInQuotesWithDot", "[config]") {
+    std::stringstream ofile;
+
+    ofile << "simple = true\n\n";
+    ofile << "[\"other\".\"sub2.cmd\"]\n";
+    ofile << "[other.\"sub3.cmd\"]\n";
+    ofile << "absolute_newest = true\n";
+    ofile.seekg(0, std::ios::beg);
+
+    std::vector<CLI::ConfigItem> output = CLI::ConfigINI().from_config(ofile);
+
+    // 2 flags and 3 openings and 3 closings
+    CHECK(output.size() == 8u);
+    CHECK(checkSections(output));
+}
+
 TEST_CASE("StringBased: Layers3LevelChange", "[config]") {
     std::stringstream ofile;
 
@@ -443,9 +614,9 @@ TEST_CASE_METHOD(TApp, "IniNotRequired", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "two=99" << std::endl;
-        out << "three=3" << std::endl;
+        out << "[default]" << '\n';
+        out << "two=99" << '\n';
+        out << "three=3" << '\n';
     }
 
     int one = 0, two = 0, three = 0;
@@ -480,8 +651,8 @@ TEST_CASE_METHOD(TApp, "IniSuccessOnUnknownOption", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "three=3" << std::endl;
-        out << "two=99" << std::endl;
+        out << "three=3" << '\n';
+        out << "two=99" << '\n';
     }
 
     int two{0};
@@ -500,8 +671,8 @@ TEST_CASE_METHOD(TApp, "IniGetRemainingOption", "[config]") {
     std::string ExtraOptionValue = "3";
     {
         std::ofstream out{tmpini};
-        out << ExtraOption << "=" << ExtraOptionValue << std::endl;
-        out << "two=99" << std::endl;
+        out << ExtraOption << "=" << ExtraOptionValue << '\n';
+        out << "two=99" << '\n';
     }
 
     int two{0};
@@ -523,7 +694,7 @@ TEST_CASE_METHOD(TApp, "IniRemainingSub", "[config]") {
         out << "[map]\n";
         out << "a = 1\n";
         out << "b=[1,2,3]\n";
-        out << "c = 3" << std::endl;
+        out << "c = 3" << '\n';
     }
 
     REQUIRE_NOTHROW(run());
@@ -556,7 +727,7 @@ TEST_CASE_METHOD(TApp, "IniGetNoRemaining", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "two=99" << std::endl;
+        out << "two=99" << '\n';
     }
 
     int two{0};
@@ -602,9 +773,9 @@ TEST_CASE_METHOD(TApp, "IniRequiredbadConfigurator", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "two=99" << std::endl;
-        out << "three=3" << std::endl;
+        out << "[default]" << '\n';
+        out << "two=99" << '\n';
+        out << "three=3" << '\n';
     }
 
     app.set_config("--config", tmpini)->required();
@@ -620,9 +791,9 @@ TEST_CASE_METHOD(TApp, "IniNotRequiredbadConfigurator", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "two=99" << std::endl;
-        out << "three=3" << std::endl;
+        out << "[default]" << '\n';
+        out << "two=99" << '\n';
+        out << "three=3" << '\n';
     }
 
     app.set_config("--config", tmpini);
@@ -643,16 +814,16 @@ TEST_CASE_METHOD(TApp, "IniNotRequiredNotDefault", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "two=99" << std::endl;
-        out << "three=3" << std::endl;
+        out << "[default]" << '\n';
+        out << "two=99" << '\n';
+        out << "three=3" << '\n';
     }
 
     {
         std::ofstream out{tmpini2};
-        out << "[default]" << std::endl;
-        out << "two=98" << std::endl;
-        out << "three=4" << std::endl;
+        out << "[default]" << '\n';
+        out << "two=98" << '\n';
+        out << "three=4" << '\n';
     }
 
     int one{0}, two{0}, three{0};
@@ -673,6 +844,36 @@ TEST_CASE_METHOD(TApp, "IniNotRequiredNotDefault", "[config]") {
     CHECK(tmpini2.c_str() == app.get_config_ptr()->as<std::string>());
 }
 
+TEST_CASE_METHOD(TApp, "IniEnvironmentalFileName", "[config]") {
+
+    TempFile tmpini{"TestIniTmp.ini"};
+
+    app.set_config("--config", "")->envname("CONFIG")->required();
+
+    {
+        std::ofstream out{tmpini};
+        out << "[default]" << '\n';
+        out << "two=99" << '\n';
+        out << "three=3" << '\n';
+    }
+
+    int one{0}, two{0}, three{0};
+    app.add_option("--one", one);
+    app.add_option("--two", two);
+    app.add_option("--three", three);
+
+    put_env("CONFIG", tmpini);
+
+    CHECK_NOTHROW(run());
+
+    CHECK(two == 99);
+    CHECK(three == 3);
+
+    unset_env("CONFIG");
+
+    CHECK_THROWS_AS(run(), CLI::FileError);
+}
+
 TEST_CASE_METHOD(TApp, "MultiConfig", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
@@ -682,16 +883,16 @@ TEST_CASE_METHOD(TApp, "MultiConfig", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "two=99" << std::endl;
-        out << "three=3" << std::endl;
+        out << "[default]" << '\n';
+        out << "two=99" << '\n';
+        out << "three=3" << '\n';
     }
 
     {
         std::ofstream out{tmpini2};
-        out << "[default]" << std::endl;
-        out << "one=55" << std::endl;
-        out << "three=4" << std::endl;
+        out << "[default]" << '\n';
+        out << "one=55" << '\n';
+        out << "three=4" << '\n';
     }
 
     int one{0}, two{0}, three{0};
@@ -714,6 +915,90 @@ TEST_CASE_METHOD(TApp, "MultiConfig", "[config]") {
     CHECK(one == 55);
 }
 
+TEST_CASE_METHOD(TApp, "MultiConfig_takelast", "[config]") {
+
+    TempFile tmpini{"TestIniTmp.ini"};
+    TempFile tmpini2{"TestIniTmp2.ini"};
+
+    app.set_config("--config")->multi_option_policy(CLI::MultiOptionPolicy::TakeLast)->expected(1, 3);
+
+    {
+        std::ofstream out{tmpini};
+        out << "[default]" << '\n';
+        out << "two=99" << '\n';
+        out << "three=3" << '\n';
+    }
+
+    {
+        std::ofstream out{tmpini2};
+        out << "[default]" << '\n';
+        out << "one=55" << '\n';
+        out << "three=4" << '\n';
+    }
+
+    int one{0}, two{0}, three{0};
+    app.add_option("--one", one);
+    app.add_option("--two", two);
+    app.add_option("--three", three);
+
+    args = {"--config", tmpini, "--config", tmpini2};
+    run();
+
+    CHECK(two == 99);
+    CHECK(three == 3);
+    CHECK(one == 55);
+
+    two = 0;
+    args = {"--config", tmpini2, "--config", tmpini};
+    run();
+
+    CHECK(two == 99);
+    CHECK(three == 4);
+    CHECK(one == 55);
+}
+
+TEST_CASE_METHOD(TApp, "MultiConfig_takeAll", "[config]") {
+
+    TempFile tmpini{"TestIniTmp.ini"};
+    TempFile tmpini2{"TestIniTmp2.ini"};
+
+    app.set_config("--config")->multi_option_policy(CLI::MultiOptionPolicy::TakeAll);
+
+    {
+        std::ofstream out{tmpini};
+        out << "[default]" << '\n';
+        out << "two=99" << '\n';
+        out << "three=3" << '\n';
+    }
+
+    {
+        std::ofstream out{tmpini2};
+        out << "[default]" << '\n';
+        out << "one=55" << '\n';
+        out << "three=4" << '\n';
+    }
+
+    int one{0}, two{0}, three{0};
+    app.add_option("--one", one);
+    app.add_option("--two", two);
+    app.add_option("--three", three);
+
+    args = {"--config", tmpini, "--config", tmpini2};
+    run();
+
+    CHECK(two == 99);
+    CHECK(three == 3);
+    CHECK(one == 55);
+
+    two = 0;
+    args = {"--config", tmpini2, "--config", tmpini};
+    run();
+
+    CHECK(two == 99);
+    CHECK(three == 4);
+    CHECK(one == 55);
+}
+
 TEST_CASE_METHOD(TApp, "MultiConfig_single", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
@@ -723,16 +1008,16 @@ TEST_CASE_METHOD(TApp, "MultiConfig_single", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "two=99" << std::endl;
-        out << "three=3" << std::endl;
+        out << "[default]" << '\n';
+        out << "two=99" << '\n';
+        out << "three=3" << '\n';
     }
 
     {
         std::ofstream out{tmpini2};
-        out << "[default]" << std::endl;
-        out << "one=55" << std::endl;
-        out << "three=4" << std::endl;
+        out << "[default]" << '\n';
+        out << "one=55" << '\n';
+        out << "three=4" << '\n';
     }
 
     int one{0}, two{0}, three{0};
@@ -778,8 +1063,8 @@ TEST_CASE_METHOD(TApp, "IniOverwrite", "[config]") {
     TempFile tmpini{"TestIniTmp.ini"};
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "two=99" << std::endl;
+        out << "[default]" << '\n';
+        out << "two=99" << '\n';
     }
 
     std::string orig = "filename_not_exist.ini";
@@ -803,9 +1088,9 @@ TEST_CASE_METHOD(TApp, "IniRequired", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "two=99" << std::endl;
-        out << "three=3" << std::endl;
+        out << "[default]" << '\n';
+        out << "two=99" << '\n';
+        out << "three=3" << '\n';
     }
 
     int one{0}, two{0}, three{0};
@@ -846,9 +1131,9 @@ TEST_CASE_METHOD(TApp, "IniInlineComment", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "two=99 ; this is a two" << std::endl;
-        out << "three=3; this is a three" << std::endl;
+        out << "[default]" << '\n';
+        out << "two=99 ; this is a two" << '\n';
+        out << "three=3; this is a three" << '\n';
     }
 
     int one{0}, two{0}, three{0};
@@ -888,9 +1173,9 @@ TEST_CASE_METHOD(TApp, "TomlInlineComment", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "two=99 # this is a two" << std::endl;
-        out << "three=3# this is a three" << std::endl;
+        out << "[default]" << '\n';
+        out << "two=99 # this is a two" << '\n';
+        out << "three=3# this is a three" << '\n';
     }
 
     int one{0}, two{0}, three{0};
@@ -922,6 +1207,86 @@ TEST_CASE_METHOD(TApp, "TomlInlineComment", "[config]") {
     CHECK_THROWS_AS(run(), CLI::RequiredError);
 }
 
+TEST_CASE_METHOD(TApp, "TomlDocStringComment", "[config]") {
+
+    TempFile tmpini{"TestIniTmp.ini"};
+
+    app.set_config("--config", tmpini, "", true);
+
+    {
+        std::ofstream out{tmpini};
+        out << "[default]" << '\n';
+        out << "two=99" << '\n';
+        out << "three=3" << '\n';
+        out << R"(""")" << '\n';
+        out << "one=35" << '\n';
+        out << R"(""")" << '\n';
+    }
+
+    int one{0}, two{0}, three{0};
+    app.add_option("--one", one);
+    app.add_option("--two", two);
+    app.add_option("--three", three);
+
+    CHECK_NOTHROW(run());
+    CHECK(0 == one);
+    CHECK(99 == two);
+    CHECK(3 == three);
+}
+
+TEST_CASE_METHOD(TApp, "TomlDocStringComment2", "[config]") {
+
+    TempFile tmpini{"TestIniTmp.ini"};
+
+    app.set_config("--config", tmpini, "", true);
+
+    {
+        std::ofstream out{tmpini};
+        out << "[default]" << '\n';
+        out << "two=99" << '\n';
+        out << "'''" << '\n';
+        out << "one=35" << '\n';
+        out << "last comment line three=6 '''" << '\n';
+        out << "three=3" << '\n';
+    }
+
+    int one{0}, two{0}, three{0};
+    app.add_option("--one", one);
+    app.add_option("--two", two);
+    app.add_option("--three", three);
+
+    CHECK_NOTHROW(run());
+    CHECK(0 == one);
+    CHECK(99 == two);
+    CHECK(3 == three);
+}
+
+TEST_CASE_METHOD(TApp, "TomlDocStringComment3", "[config]") {
+
+    TempFile tmpini{"TestIniTmp.ini"};
+
+    app.set_config("--config", tmpini, "", true);
+
+    {
+        std::ofstream out{tmpini};
+        out << "[default]" << '\n';
+        out << "two=99" << '\n';
+        out << "three=3" << '\n';
+        out << "'''" << '\n';
+        out << "one=35" << '\n';
+    }
+
+    int one{0}, two{0}, three{0};
+    app.add_option("--one", one);
+    app.add_option("--two", two);
+    app.add_option("--three", three);
+
+    CHECK_NOTHROW(run());
+    CHECK(0 == one);
+    CHECK(99 == two);
+    CHECK(3 == three);
+}
+
 TEST_CASE_METHOD(TApp, "ConfigModifiers", "[config]") {
 
     app.set_config("--config", "test.ini", "", true);
@@ -953,9 +1318,9 @@ TEST_CASE_METHOD(TApp, "IniVector", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "two=2 3" << std::endl;
-        out << "three=1 2 3" << std::endl;
+        out << "[default]" << '\n';
+        out << "two=2 3" << '\n';
+        out << "three=1 2 3" << '\n';
     }
 
     std::vector<int> two, three;
@@ -1141,12 +1506,155 @@ TEST_CASE_METHOD(TApp, "IniVectorMultiple", "[config]") {
     app.add_option("--three", three)->required();
 
     run();
-
-    CHECK(two == std::vector<int>({2, 3}));
-    CHECK(three == std::vector<int>({1, 2, 3}));
+
+    CHECK(two == std::vector<int>({2, 3}));
+    CHECK(three == std::vector<int>({1, 2, 3}));
+}
+
+TEST_CASE_METHOD(TApp, "IniLayered", "[config]") {
+
+    TempFile tmpini{"TestIniTmp.ini"};
+
+    app.set_config("--config", tmpini);
+
+    {
+        std::ofstream out{tmpini};
+        out << "[default]" << '\n';
+        out << "val=1" << '\n';
+        out << "[subcom]" << '\n';
+        out << "val=2" << '\n';
+        out << "subsubcom.val=3" << '\n';
+    }
+
+    int one{0}, two{0}, three{0};
+    app.add_option("--val", one);
+    auto *subcom = app.add_subcommand("subcom");
+    subcom->add_option("--val", two);
+    auto *subsubcom = subcom->add_subcommand("subsubcom");
+    subsubcom->add_option("--val", three);
+
+    run();
+
+    CHECK(one == 1);
+    CHECK(two == 2);
+    CHECK(three == 3);
+
+    CHECK(0U == subcom->count());
+    CHECK(!*subcom);
+}
+
+TEST_CASE_METHOD(TApp, "IniLayeredStream", "[config]") {
+
+    TempFile tmpini{"TestIniTmp.ini"};
+
+    app.set_config("--config", tmpini);
+
+    {
+        std::ofstream out{tmpini};
+        out << "[default]" << '\n';
+        out << "val=1" << '\n';
+        out << "[subcom]" << '\n';
+        out << "val=2" << '\n';
+        out << "subsubcom.val=3" << '\n';
+    }
+
+    int one{0}, two{0}, three{0};
+    app.add_option("--val", one);
+    auto *subcom = app.add_subcommand("subcom");
+    subcom->add_option("--val", two);
+    auto *subsubcom = subcom->add_subcommand("subsubcom");
+    subsubcom->add_option("--val", three);
+
+    std::ifstream in{tmpini};
+    app.parse_from_stream(in);
+
+    CHECK(one == 1);
+    CHECK(two == 2);
+    CHECK(three == 3);
+
+    CHECK(0U == subcom->count());
+    CHECK(!*subcom);
+}
+
+TEST_CASE_METHOD(TApp, "IniLayeredDotSection", "[config]") {
+
+    TempFile tmpini{"TestIniTmp.ini"};
+
+    app.set_config("--config", tmpini);
+
+    {
+        std::ofstream out{tmpini};
+        out << "[default]" << '\n';
+        out << "val=1" << '\n';
+        out << "[subcom]" << '\n';
+        out << "val=2" << '\n';
+        out << "[subcom.subsubcom]" << '\n';
+        out << "val=3" << '\n';
+    }
+
+    int one{0}, two{0}, three{0};
+    app.add_option("--val", one);
+    auto *subcom = app.add_subcommand("subcom");
+    subcom->add_option("--val", two);
+    auto *subsubcom = subcom->add_subcommand("subsubcom");
+    subsubcom->add_option("--val", three);
+
+    run();
+
+    CHECK(one == 1);
+    CHECK(two == 2);
+    CHECK(three == 3);
+
+    CHECK(0U == subcom->count());
+    CHECK(!*subcom);
+
+    three = 0;
+    // check maxlayers
+    app.get_config_formatter_base()->maxLayers(1);
+    run();
+    CHECK(three == 0);
+}
+
+TEST_CASE_METHOD(TApp, "IniLayeredDotSectionInQuotes", "[config]") {
+
+    TempFile tmpini{"TestIniTmp.ini"};
+
+    app.set_config("--config", tmpini);
+
+    {
+        std::ofstream out{tmpini};
+        out << "[default]" << '\n';
+        out << "val=1" << '\n';
+        out << "['subcom']" << '\n';
+        out << "val=2" << '\n';
+        out << "['subcom'.\"subsubcom\"]" << '\n';
+        out << "val=3" << '\n';
+    }
+
+    int one{0}, two{0}, three{0};
+    app.add_option("--val", one);
+    auto *subcom = app.add_subcommand("subcom");
+    subcom->add_option("--val", two);
+    auto *subsubcom = subcom->add_subcommand("subsubcom");
+    subsubcom->add_option("--val", three);
+
+    run();
+
+    CHECK(one == 1);
+    CHECK(two == 2);
+    CHECK(three == 3);
+
+    CHECK(0U == subcom->count());
+    CHECK(!*subcom);
+
+    three = 0;
+    // check maxlayers
+    app.get_config_formatter_base()->maxLayers(1);
+    run();
+    CHECK(three == 0);
 }
 
-TEST_CASE_METHOD(TApp, "IniLayered", "[config]") {
+TEST_CASE_METHOD(TApp, "IniLayeredCustomSectionSeparator", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -1154,13 +1662,14 @@ TEST_CASE_METHOD(TApp, "IniLayered", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "val=1" << std::endl;
-        out << "[subcom]" << std::endl;
-        out << "val=2" << std::endl;
-        out << "subsubcom.val=3" << std::endl;
+        out << "[default]" << '\n';
+        out << "val=1" << '\n';
+        out << "[subcom]" << '\n';
+        out << "val=2" << '\n';
+        out << "[subcom|subsubcom]" << '\n';
+        out << "val=3" << '\n';
     }
-
+    app.get_config_formatter_base()->parentSeparator('|');
     int one{0}, two{0}, three{0};
     app.add_option("--val", one);
     auto *subcom = app.add_subcommand("subcom");
@@ -1178,7 +1687,31 @@ TEST_CASE_METHOD(TApp, "IniLayered", "[config]") {
     CHECK(!*subcom);
 }
 
-TEST_CASE_METHOD(TApp, "IniLayeredStream", "[config]") {
+TEST_CASE_METHOD(TApp, "IniLayeredOptionGroupAlias", "[config]") {
+
+    TempFile tmpini{"TestIniTmp.ini"};
+
+    app.set_config("--config", tmpini);
+
+    {
+        std::ofstream out{tmpini};
+        out << "[default]" << '\n';
+        out << "val=1" << '\n';
+        out << "[ogroup]" << '\n';
+        out << "val2=2" << '\n';
+    }
+    int one{0}, two{0};
+    app.add_option("--val", one);
+    auto *subcom = app.add_option_group("ogroup")->alias("ogroup");
+    subcom->add_option("--val2", two);
+
+    run();
+
+    CHECK(one == 1);
+    CHECK(two == 2);
+}
+
+TEST_CASE_METHOD(TApp, "IniSubcommandConfigurable", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -1186,32 +1719,33 @@ TEST_CASE_METHOD(TApp, "IniLayeredStream", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "val=1" << std::endl;
-        out << "[subcom]" << std::endl;
-        out << "val=2" << std::endl;
-        out << "subsubcom.val=3" << std::endl;
+        out << "[default]" << '\n';
+        out << "val=1" << '\n';
+        out << "[subcom]" << '\n';
+        out << "val=2" << '\n';
+        out << "subsubcom.val=3" << '\n';
     }
 
     int one{0}, two{0}, three{0};
     app.add_option("--val", one);
     auto *subcom = app.add_subcommand("subcom");
+    subcom->configurable();
     subcom->add_option("--val", two);
     auto *subsubcom = subcom->add_subcommand("subsubcom");
     subsubcom->add_option("--val", three);
 
-    std::ifstream in{tmpini};
-    app.parse_from_stream(in);
+    run();
 
     CHECK(one == 1);
     CHECK(two == 2);
     CHECK(three == 3);
 
-    CHECK(0U == subcom->count());
-    CHECK(!*subcom);
+    CHECK(1U == subcom->count());
+    CHECK(*subcom);
+    CHECK(app.got_subcommand(subcom));
 }
 
-TEST_CASE_METHOD(TApp, "IniLayeredDotSection", "[config]") {
+TEST_CASE_METHOD(TApp, "IniSubcommandConfigurableInQuotes", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -1219,17 +1753,17 @@ TEST_CASE_METHOD(TApp, "IniLayeredDotSection", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "val=1" << std::endl;
-        out << "[subcom]" << std::endl;
-        out << "val=2" << std::endl;
-        out << "[subcom.subsubcom]" << std::endl;
-        out << "val=3" << std::endl;
+        out << "[default]" << '\n';
+        out << "val=1" << '\n';
+        out << "[subcom]" << '\n';
+        out << "val=2" << '\n';
+        out << "\"subsubcom\".'val'=3" << '\n';
     }
 
     int one{0}, two{0}, three{0};
     app.add_option("--val", one);
     auto *subcom = app.add_subcommand("subcom");
+    subcom->configurable();
     subcom->add_option("--val", two);
     auto *subsubcom = subcom->add_subcommand("subsubcom");
     subsubcom->add_option("--val", three);
@@ -1240,17 +1774,12 @@ TEST_CASE_METHOD(TApp, "IniLayeredDotSection", "[config]") {
     CHECK(two == 2);
     CHECK(three == 3);
 
-    CHECK(0U == subcom->count());
-    CHECK(!*subcom);
-
-    three = 0;
-    // check maxlayers
-    app.get_config_formatter_base()->maxLayers(1);
-    run();
-    CHECK(three == 0);
+    CHECK(1U == subcom->count());
+    CHECK(*subcom);
+    CHECK(app.got_subcommand(subcom));
 }
 
-TEST_CASE_METHOD(TApp, "IniLayeredCustomSectionSeparator", "[config]") {
+TEST_CASE_METHOD(TApp, "IniSubcommandConfigurableInQuotesAlias", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -1258,19 +1787,19 @@ TEST_CASE_METHOD(TApp, "IniLayeredCustomSectionSeparator", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "val=1" << std::endl;
-        out << "[subcom]" << std::endl;
-        out << "val=2" << std::endl;
-        out << "[subcom|subsubcom]" << std::endl;
-        out << "val=3" << std::endl;
+        out << "[default]" << '\n';
+        out << "val=1" << '\n';
+        out << "[subcom]" << '\n';
+        out << "val=2" << '\n';
+        out << R"("sub\tsub\t.com".'val'=3)" << '\n';
     }
-    app.get_config_formatter_base()->parentSeparator('|');
+
     int one{0}, two{0}, three{0};
     app.add_option("--val", one);
     auto *subcom = app.add_subcommand("subcom");
+    subcom->configurable();
     subcom->add_option("--val", two);
-    auto *subsubcom = subcom->add_subcommand("subsubcom");
+    auto *subsubcom = subcom->add_subcommand("subsubcom")->alias("sub\tsub\t.com");
     subsubcom->add_option("--val", three);
 
     run();
@@ -1279,11 +1808,12 @@ TEST_CASE_METHOD(TApp, "IniLayeredCustomSectionSeparator", "[config]") {
     CHECK(two == 2);
     CHECK(three == 3);
 
-    CHECK(0U == subcom->count());
-    CHECK(!*subcom);
+    CHECK(1U == subcom->count());
+    CHECK(*subcom);
+    CHECK(app.got_subcommand(subcom));
 }
 
-TEST_CASE_METHOD(TApp, "IniLayeredOptionGroupAlias", "[config]") {
+TEST_CASE_METHOD(TApp, "IniSubcommandConfigurableInQuotesAliasWithEquals", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -1291,23 +1821,33 @@ TEST_CASE_METHOD(TApp, "IniLayeredOptionGroupAlias", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "val=1" << std::endl;
-        out << "[ogroup]" << std::endl;
-        out << "val2=2" << std::endl;
+        out << "[default]" << '\n';
+        out << "val=1" << '\n';
+        out << "[subcom]" << '\n';
+        out << "val=2" << '\n';
+        out << R"("sub=sub=.com".'val'=3)" << '\n';
     }
-    int one{0}, two{0};
+
+    int one{0}, two{0}, three{0};
     app.add_option("--val", one);
-    auto *subcom = app.add_option_group("ogroup")->alias("ogroup");
-    subcom->add_option("--val2", two);
+    auto *subcom = app.add_subcommand("subcom");
+    subcom->configurable();
+    subcom->add_option("--val", two);
+    auto *subsubcom = subcom->add_subcommand("subsubcom")->alias("sub=sub=.com");
+    subsubcom->add_option("--val", three);
 
     run();
 
     CHECK(one == 1);
     CHECK(two == 2);
+    CHECK(three == 3);
+
+    CHECK(1U == subcom->count());
+    CHECK(*subcom);
+    CHECK(app.got_subcommand(subcom));
 }
 
-TEST_CASE_METHOD(TApp, "IniSubcommandConfigurable", "[config]") {
+TEST_CASE_METHOD(TApp, "IniSubcommandConfigurableInQuotesAliasWithComment", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -1315,11 +1855,11 @@ TEST_CASE_METHOD(TApp, "IniSubcommandConfigurable", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "val=1" << std::endl;
-        out << "[subcom]" << std::endl;
-        out << "val=2" << std::endl;
-        out << "subsubcom.val=3" << std::endl;
+        out << "[default]" << '\n';
+        out << "val=1" << '\n';
+        out << "[subcom]" << '\n';
+        out << "val=2" << '\n';
+        out << R"("sub#sub;.com".'val'=3)" << '\n';
     }
 
     int one{0}, two{0}, three{0};
@@ -1327,7 +1867,7 @@ TEST_CASE_METHOD(TApp, "IniSubcommandConfigurable", "[config]") {
     auto *subcom = app.add_subcommand("subcom");
     subcom->configurable();
     subcom->add_option("--val", two);
-    auto *subsubcom = subcom->add_subcommand("subsubcom");
+    auto *subsubcom = subcom->add_subcommand("subsubcom")->alias("sub#sub;.com");
     subsubcom->add_option("--val", three);
 
     run();
@@ -1335,10 +1875,6 @@ TEST_CASE_METHOD(TApp, "IniSubcommandConfigurable", "[config]") {
     CHECK(one == 1);
     CHECK(two == 2);
     CHECK(three == 3);
-
-    CHECK(1U == subcom->count());
-    CHECK(*subcom);
-    CHECK(app.got_subcommand(subcom));
 }
 
 TEST_CASE_METHOD(TApp, "IniSubcommandConfigurablePreParse", "[config]") {
@@ -1349,11 +1885,11 @@ TEST_CASE_METHOD(TApp, "IniSubcommandConfigurablePreParse", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "val=1" << std::endl;
-        out << "[subcom]" << std::endl;
-        out << "val=2" << std::endl;
-        out << "subsubcom.val=3" << std::endl;
+        out << "[default]" << '\n';
+        out << "val=1" << '\n';
+        out << "[subcom]" << '\n';
+        out << "val=2" << '\n';
+        out << "subsubcom.val=3" << '\n';
     }
 
     int one{0}, two{0}, three{0}, four{0};
@@ -1391,11 +1927,11 @@ TEST_CASE_METHOD(TApp, "IniSection", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "[config]" << std::endl;
-        out << "val=2" << std::endl;
-        out << "subsubcom.val=3" << std::endl;
-        out << "[default]" << std::endl;
-        out << "val=1" << std::endl;
+        out << "[config]" << '\n';
+        out << "val=2" << '\n';
+        out << "subsubcom.val=3" << '\n';
+        out << "[default]" << '\n';
+        out << "val=1" << '\n';
     }
 
     int val{0};
@@ -1415,11 +1951,11 @@ TEST_CASE_METHOD(TApp, "IniSection2", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "val=1" << std::endl;
-        out << "[config]" << std::endl;
-        out << "val=2" << std::endl;
-        out << "subsubcom.val=3" << std::endl;
+        out << "[default]" << '\n';
+        out << "val=1" << '\n';
+        out << "[config]" << '\n';
+        out << "val=2" << '\n';
+        out << "subsubcom.val=3" << '\n';
     }
 
     int val{0};
@@ -1439,11 +1975,11 @@ TEST_CASE_METHOD(TApp, "jsonLikeParsing", "[config]") {
 
     {
         std::ofstream out{tmpjson};
-        out << "{" << std::endl;
-        out << "\"val\":1," << std::endl;
-        out << R"("val2":"test",)" << std::endl;
-        out << "\"flag\":true" << std::endl;
-        out << "}" << std::endl;
+        out << "{" << '\n';
+        out << "\"val\":1," << '\n';
+        out << R"("val2":"test",)" << '\n';
+        out << "\"flag\":true" << '\n';
+        out << "}" << '\n';
     }
 
     int val{0};
@@ -1470,17 +2006,17 @@ TEST_CASE_METHOD(TApp, "TomlSectionNumber", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "val=1" << std::endl;
-        out << "[[config]]" << std::endl;
-        out << "val=2" << std::endl;
-        out << "subsubcom.val=3" << std::endl;
-        out << "[[config]]" << std::endl;
-        out << "val=4" << std::endl;
-        out << "subsubcom.val=3" << std::endl;
-        out << "[[config]]" << std::endl;
-        out << "val=6" << std::endl;
-        out << "subsubcom.val=3" << std::endl;
+        out << "[default]" << '\n';
+        out << "val=1" << '\n';
+        out << "[[config]]" << '\n';
+        out << "val=2" << '\n';
+        out << "subsubcom.val=3" << '\n';
+        out << "[[config]]" << '\n';
+        out << "val=4" << '\n';
+        out << "subsubcom.val=3" << '\n';
+        out << "[[config]]" << '\n';
+        out << "val=6" << '\n';
+        out << "subsubcom.val=3" << '\n';
     }
 
     int val{0};
@@ -1514,12 +2050,12 @@ TEST_CASE_METHOD(TApp, "IniSubcommandConfigurableParseComplete", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "val=1" << std::endl;
-        out << "[subcom]" << std::endl;
-        out << "val=2" << std::endl;
-        out << "[subcom.subsubcom]" << std::endl;
-        out << "val=3" << std::endl;
+        out << "[default]" << '\n';
+        out << "val=1" << '\n';
+        out << "[subcom]" << '\n';
+        out << "val=2" << '\n';
+        out << "[subcom.subsubcom]" << '\n';
+        out << "val=3" << '\n';
     }
 
     int one{0}, two{0}, three{0}, four{0};
@@ -1559,14 +2095,14 @@ TEST_CASE_METHOD(TApp, "IniSubcommandMultipleSections", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "val=1" << std::endl;
-        out << "[subcom]" << std::endl;
-        out << "val=2" << std::endl;
-        out << "[subcom.subsubcom]" << std::endl;
-        out << "val=3" << std::endl;
-        out << "[subcom2]" << std::endl;
-        out << "val=4" << std::endl;
+        out << "[default]" << '\n';
+        out << "val=1" << '\n';
+        out << "[subcom]" << '\n';
+        out << "val=2" << '\n';
+        out << "[subcom.subsubcom]" << '\n';
+        out << "val=3" << '\n';
+        out << "[subcom2]" << '\n';
+        out << "val=4" << '\n';
     }
 
     int one{0}, two{0}, three{0}, four{0};
@@ -1607,9 +2143,9 @@ TEST_CASE_METHOD(TApp, "DuplicateSubcommandCallbacks", "[config]") {
 
     {
         std::ofstream out{tmptoml};
-        out << "[[foo]]" << std::endl;
-        out << "[[foo]]" << std::endl;
-        out << "[[foo]]" << std::endl;
+        out << "[[foo]]" << '\n';
+        out << "[[foo]]" << '\n';
+        out << "[[foo]]" << '\n';
     }
 
     auto *foo = app.add_subcommand("foo");
@@ -1631,7 +2167,7 @@ TEST_CASE_METHOD(TApp, "SubcommandCallbackSingle", "[config]") {
 
     {
         std::ofstream out{tmptoml};
-        out << "[foo]" << std::endl;
+        out << "[foo]" << '\n';
     }
     int count{0};
     auto *foo = app.add_subcommand("foo");
@@ -1650,8 +2186,8 @@ TEST_CASE_METHOD(TApp, "IniFailure", "[config]") {
     app.allow_config_extras(false);
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "val=1" << std::endl;
+        out << "[default]" << '\n';
+        out << "val=1" << '\n';
     }
 
     CHECK_THROWS_AS(run(), CLI::ConfigError);
@@ -1667,8 +2203,8 @@ TEST_CASE_METHOD(TApp, "IniConfigurable", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "val=1" << std::endl;
+        out << "[default]" << '\n';
+        out << "val=1" << '\n';
     }
 
     REQUIRE_NOTHROW(run());
@@ -1685,11 +2221,48 @@ TEST_CASE_METHOD(TApp, "IniNotConfigurable", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "val=1" << std::endl;
+        out << "[default]" << '\n';
+        out << "val=1" << '\n';
     }
 
     CHECK_THROWS_AS(run(), CLI::ConfigError);
+    app.allow_config_extras(CLI::config_extras_mode::ignore_all);
+    CHECK_NOTHROW(run());
+}
+
+TEST_CASE_METHOD(TApp, "IniFlagDisableOverrideFlagArray", "[config]") {
+
+    TempFile tmpini{"TestIniTmp.ini"};
+
+    app.set_config("--config", tmpini);
+    int value{0};
+    app.add_flag("--val", value)->configurable(true)->disable_flag_override();
+
+    {
+        std::ofstream out{tmpini};
+        out << "[default]" << '\n';
+        out << "val=[1,true,false,true]" << '\n';
+    }
+
+    REQUIRE_NOTHROW(run());
+    CHECK(value == 2);
+}
+
+TEST_CASE_METHOD(TApp, "IniFlagInvalidDisableOverrideFlagArray", "[config]") {
+
+    TempFile tmpini{"TestIniTmp.ini"};
+
+    app.set_config("--config", tmpini);
+    int value{0};
+    app.add_flag("--val", value)->configurable(true)->disable_flag_override();
+
+    {
+        std::ofstream out{tmpini};
+        out << "[default]" << '\n';
+        out << "val=[1,true,false,not_valid]" << '\n';
+    }
+
+    CHECK_THROWS_AS(run(), CLI::InvalidError);
 }
 
 TEST_CASE_METHOD(TApp, "IniSubFailure", "[config]") {
@@ -1701,8 +2274,8 @@ TEST_CASE_METHOD(TApp, "IniSubFailure", "[config]") {
     app.allow_config_extras(false);
     {
         std::ofstream out{tmpini};
-        out << "[other]" << std::endl;
-        out << "val=1" << std::endl;
+        out << "[other]" << '\n';
+        out << "val=1" << '\n';
     }
 
     CHECK_THROWS_AS(run(), CLI::ConfigError);
@@ -1716,8 +2289,8 @@ TEST_CASE_METHOD(TApp, "IniNoSubFailure", "[config]") {
     app.allow_config_extras(CLI::config_extras_mode::error);
     {
         std::ofstream out{tmpini};
-        out << "[other]" << std::endl;
-        out << "val=1" << std::endl;
+        out << "[other]" << '\n';
+        out << "val=1" << '\n';
     }
 
     CHECK_THROWS_AS(run(), CLI::ConfigError);
@@ -1732,7 +2305,7 @@ TEST_CASE_METHOD(TApp, "IniFlagConvertFailure", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "flag=moobook" << std::endl;
+        out << "flag=moobook" << '\n';
     }
     run();
     bool result{false};
@@ -1753,7 +2326,7 @@ TEST_CASE_METHOD(TApp, "IniFlagNumbers", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "flag=3" << std::endl;
+        out << "flag=3" << '\n';
     }
 
     REQUIRE_NOTHROW(run());
@@ -1771,7 +2344,7 @@ TEST_CASE_METHOD(TApp, "IniFlagDual", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "flag=1 1" << std::endl;
+        out << "flag=1 1" << '\n';
     }
 
     CHECK_THROWS_AS(run(), CLI::ConversionError);
@@ -1788,7 +2361,7 @@ TEST_CASE_METHOD(TApp, "IniVectorMax", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "vec=[a,b,c]" << std::endl;
+        out << "vec=[a,b,c]" << '\n';
     }
 
     CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
@@ -1804,7 +2377,58 @@ TEST_CASE_METHOD(TApp, "IniShort", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "f=3" << std::endl;
+        out << "f=3" << '\n';
+    }
+
+    REQUIRE_NOTHROW(run());
+    CHECK(3 == key);
+}
+
+TEST_CASE_METHOD(TApp, "IniShortQuote1", "[config]") {
+
+    TempFile tmpini{"TestIniTmp.ini"};
+
+    int key{0};
+    app.add_option("--flag,-f", key);
+    app.set_config("--config", tmpini);
+
+    {
+        std::ofstream out{tmpini};
+        out << "\"f\"=3" << '\n';
+    }
+
+    REQUIRE_NOTHROW(run());
+    CHECK(3 == key);
+}
+
+TEST_CASE_METHOD(TApp, "IniShortQuote2", "[config]") {
+
+    TempFile tmpini{"TestIniTmp.ini"};
+
+    int key{0};
+    app.add_option("--flag,-f", key);
+    app.set_config("--config", tmpini);
+
+    {
+        std::ofstream out{tmpini};
+        out << "'f'=3" << '\n';
+    }
+
+    REQUIRE_NOTHROW(run());
+    CHECK(3 == key);
+}
+
+TEST_CASE_METHOD(TApp, "IniShortQuote3", "[config]") {
+
+    TempFile tmpini{"TestIniTmp.ini"};
+
+    int key{0};
+    app.add_option("--flag,-f", key);
+    app.set_config("--config", tmpini);
+
+    {
+        std::ofstream out{tmpini};
+        out << "`f`=3" << '\n';
     }
 
     REQUIRE_NOTHROW(run());
@@ -1821,7 +2445,7 @@ TEST_CASE_METHOD(TApp, "IniDefaultPath", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "f=3" << std::endl;
+        out << "f=3" << '\n';
     }
 
     REQUIRE_NOTHROW(run());
@@ -1840,7 +2464,7 @@ TEST_CASE_METHOD(TApp, "IniMultipleDefaultPath", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "f=3" << std::endl;
+        out << "f=3" << '\n';
     }
 
     args = {"--config", "TestIniTmp.ini"};
@@ -1860,7 +2484,7 @@ TEST_CASE_METHOD(TApp, "IniMultipleDefaultPathAlternate", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "f=3" << std::endl;
+        out << "f=3" << '\n';
     }
 
     args = {"--config", "TestIniTmp.ini"};
@@ -1879,7 +2503,7 @@ TEST_CASE_METHOD(TApp, "IniPositional", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "key=3" << std::endl;
+        out << "key=3" << '\n';
     }
 
     REQUIRE_NOTHROW(run());
@@ -1896,7 +2520,7 @@ TEST_CASE_METHOD(TApp, "IniEnvironmental", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "CLI11_TEST_ENV_KEY_TMP=3" << std::endl;
+        out << "CLI11_TEST_ENV_KEY_TMP=3" << '\n';
     }
 
     REQUIRE_NOTHROW(run());
@@ -1916,10 +2540,10 @@ TEST_CASE_METHOD(TApp, "IniFlagText", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "flag1=true" << std::endl;
-        out << "flag2=on" << std::endl;
-        out << "flag3=off" << std::endl;
-        out << "flag4=1" << std::endl;
+        out << "flag1=true" << '\n';
+        out << "flag2=on" << '\n';
+        out << "flag3=off" << '\n';
+        out << "flag4=1" << '\n';
     }
 
     run();
@@ -1936,13 +2560,73 @@ TEST_CASE_METHOD(TApp, "IniFlags", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "two=2" << std::endl;
-        out << "three=true" << std::endl;
-        out << "four=on" << std::endl;
-        out << "five" << std::endl;
+        out << "[default]" << '\n';
+        out << "two=2" << '\n';
+        out << "three=true" << '\n';
+        out << "four=on" << '\n';
+        out << "five" << '\n';
+    }
+
+    int two{0};
+    bool three{false}, four{false}, five{false};
+    app.add_flag("--two", two);
+    app.add_flag("--three", three);
+    app.add_flag("--four", four);
+    app.add_flag("--five", five);
+
+    run();
+
+    CHECK(two == 2);
+    CHECK(three);
+    CHECK(four);
+    CHECK(five);
+}
+
+TEST_CASE_METHOD(TApp, "IniFlagsComment", "[config]") {
+    TempFile tmpini{"TestIniTmp.ini"};
+    app.set_config("--config", tmpini);
+
+    {
+        std::ofstream out{tmpini};
+        out << "[default]" << '\n';
+        out << "two=2 # comment 1" << '\n';
+        out << "three=true" << '\n';
+        out << "four=on #comment 2" << '\n';
+        out << "five #comment 3" << '\n';
+        out << '\n';
+    }
+
+    int two{0};
+    bool three{false}, four{false}, five{false};
+    app.add_flag("--two", two);
+    app.add_flag("--three", three);
+    app.add_flag("--four", four);
+    app.add_flag("--five", five);
+
+    run();
+
+    CHECK(two == 2);
+    CHECK(three);
+    CHECK(four);
+    CHECK(five);
+}
+
+TEST_CASE_METHOD(TApp, "IniFlagsAltComment", "[config]") {
+    TempFile tmpini{"TestIniTmp.ini"};
+    app.set_config("--config", tmpini);
+
+    {
+        std::ofstream out{tmpini};
+        out << "[default]" << '\n';
+        out << "two=2 % comment 1" << '\n';
+        out << "three=true" << '\n';
+        out << "four=on %% comment 2" << '\n';
+        out << "five %= 3" << '\n';
+        out << '\n';
     }
 
+    auto config = app.get_config_formatter_base();
+    config->comment('%');
     int two{0};
     bool three{false}, four{false}, five{false};
     app.add_flag("--two", two);
@@ -1964,11 +2648,11 @@ TEST_CASE_METHOD(TApp, "IniFalseFlags", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "two=-2" << std::endl;
-        out << "three=false" << std::endl;
-        out << "four=1" << std::endl;
-        out << "five" << std::endl;
+        out << "[default]" << '\n';
+        out << "two=-2" << '\n';
+        out << "three=false" << '\n';
+        out << "four=1" << '\n';
+        out << "five" << '\n';
     }
 
     int two{0};
@@ -1992,11 +2676,11 @@ TEST_CASE_METHOD(TApp, "IniFalseFlagsDef", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "two=2" << std::endl;
-        out << "three=true" << std::endl;
-        out << "four=on" << std::endl;
-        out << "five" << std::endl;
+        out << "[default]" << '\n';
+        out << "two=2" << '\n';
+        out << "three=true" << '\n';
+        out << "four=on" << '\n';
+        out << "five" << '\n';
     }
 
     int two{0};
@@ -2020,10 +2704,10 @@ TEST_CASE_METHOD(TApp, "IniFalseFlagsDefDisableOverrideError", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "two=2" << std::endl;
-        out << "four=on" << std::endl;
-        out << "five" << std::endl;
+        out << "[default]" << '\n';
+        out << "two=2" << '\n';
+        out << "four=on" << '\n';
+        out << "five" << '\n';
     }
 
     int two{0};
@@ -2041,10 +2725,10 @@ TEST_CASE_METHOD(TApp, "IniFalseFlagsDefDisableOverrideSuccess", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "two=2" << std::endl;
-        out << "four={}" << std::endl;
-        out << "val=15" << std::endl;
+        out << "[default]" << '\n';
+        out << "two=2" << '\n';
+        out << "four={}" << '\n';
+        out << "val=15" << '\n';
     }
 
     int two{0}, four{0}, val{0};
@@ -2071,20 +2755,20 @@ TEST_CASE_METHOD(TApp, "IniDisableFlagOverride", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "[default]" << std::endl;
-        out << "two=2" << std::endl;
+        out << "[default]" << '\n';
+        out << "two=2" << '\n';
     }
 
     {
         std::ofstream out{tmpini2};
-        out << "[default]" << std::endl;
-        out << "two=7" << std::endl;
+        out << "[default]" << '\n';
+        out << "two=7" << '\n';
     }
 
     {
         std::ofstream out{tmpini3};
-        out << "[default]" << std::endl;
-        out << "three=true" << std::endl;
+        out << "[default]" << '\n';
+        out << "three=true" << '\n';
     }
 
     int val{0};
@@ -2185,6 +2869,17 @@ TEST_CASE_METHOD(TApp, "TomlOutputShortSingleDescription", "[config]") {
     CHECK_THAT(str, Contains("# " + description + "\n" + flag + "=false\n"));
 }
 
+TEST_CASE_METHOD(TApp, "TomlOutputdefaultOptionString", "[config]") {
+    std::string option = "some_option";
+    const std::string description = "Some short description.";
+    app.add_option("--" + option, description)->run_callback_for_default();
+
+    run();
+
+    std::string str = app.config_to_str(true, true);
+    CHECK_THAT(str, Contains("# " + description + "\n" + option + "=\"\"\n"));
+}
+
 TEST_CASE_METHOD(TApp, "TomlOutputShortDoubleDescription", "[config]") {
     std::string flag1 = "flagnr1";
     std::string flag2 = "flagnr2";
@@ -2269,6 +2964,7 @@ TEST_CASE_METHOD(TApp, "TomlOutputOptionGroupMultiLineDescription", "[config]")
     og->description("Option group description.\n"
                     "That has multiple lines.");
     og->add_flag("--" + flag, description);
+    args = {"--" + flag};
     run();
 
     std::string str = app.config_to_str(true, true);
@@ -2276,6 +2972,26 @@ TEST_CASE_METHOD(TApp, "TomlOutputOptionGroupMultiLineDescription", "[config]")
     CHECK_THAT(str, Contains("# That has multiple lines.\n"));
 }
 
+TEST_CASE_METHOD(TApp, "TomlOutputMultilineString", "[config]") {
+    std::string desc = "flag";
+    app.add_option("--opt", desc);
+
+    std::string argString = "this is a very long string \n that covers multiple lines \nand should be longer than 100 "
+                            "characters \nto trigger the multiline string";
+    args = {"--opt", argString};
+
+    run();
+
+    std::string str = app.config_to_str(true, true);
+
+    std::istringstream nfile(str);
+
+    app.clear();
+    desc = "";
+    app.parse_from_stream(nfile);
+    CHECK(desc == argString);
+}
+
 TEST_CASE_METHOD(TApp, "TomlOutputSubcommandMultiLineDescription", "[config]") {
     std::string flag = "flag";
     const std::string description = "Short flag description.\n";
@@ -2539,7 +3255,7 @@ TEST_CASE_METHOD(TApp, "TomlOutputQuoted", "[config]") {
 
     std::string str = app.config_to_str();
     CHECK_THAT(str, Contains("val1=\"I am a string\""));
-    CHECK_THAT(str, Contains("val2='I am a \"confusing\" string'"));
+    CHECK_THAT(str, Contains("val2=\"I am a \\\"confusing\\\" string\""));
 }
 
 TEST_CASE_METHOD(TApp, "DefaultsTomlOutputQuoted", "[config]") {
@@ -2554,7 +3270,7 @@ TEST_CASE_METHOD(TApp, "DefaultsTomlOutputQuoted", "[config]") {
 
     std::string str = app.config_to_str(true);
     CHECK_THAT(str, Contains("val1=\"I am a string\""));
-    CHECK_THAT(str, Contains("val2='I am a \"confusing\" string'"));
+    CHECK_THAT(str, Contains("val2=\"I am a \\\"confusing\\\" string\""));
 }
 
 // #298
@@ -2568,7 +3284,7 @@ TEST_CASE_METHOD(TApp, "StopReadingConfigOnClear", "[config]") {
 
     {
         std::ofstream out{tmpini};
-        out << "volume=1" << std::endl;
+        out << "volume=1" << '\n';
     }
 
     int volume{0};
@@ -2590,7 +3306,7 @@ TEST_CASE_METHOD(TApp, "ConfigWriteReadWrite", "[config]") {
     std::string config1 = app.config_to_str(true, true);
     {
         std::ofstream out{tmpini};
-        out << config1 << std::endl;
+        out << config1 << '\n';
     }
 
     app.set_config("--config", tmpini, "Read an ini file", true);
@@ -2613,7 +3329,7 @@ TEST_CASE_METHOD(TApp, "ConfigWriteReadNegated", "[config]") {
     std::string config1 = app.config_to_str(false, false);
     {
         std::ofstream out{tmpini};
-        out << config1 << std::endl;
+        out << config1 << '\n';
     }
     CHECK_FALSE(flag);
     args.clear();
@@ -2926,6 +3642,23 @@ TEST_CASE_METHOD(TApp, "IniOutputSubsubcom", "[config]") {
     CHECK_THAT(str, Contains("other.sub2.newest=true"));
 }
 
+TEST_CASE_METHOD(TApp, "IniOutputSubsubcomWithDot", "[config]") {
+
+    app.add_flag("--simple");
+    auto *subcom = app.add_subcommand("other");
+    subcom->add_flag("--newer");
+    auto *subsubcom = subcom->add_subcommand("sub2.bb");
+    subsubcom->add_flag("--newest");
+    app.config_formatter(std::make_shared<CLI::ConfigINI>());
+    args = {"--simple", "other", "--newer", "sub2.bb", "--newest"};
+    run();
+
+    std::string str = app.config_to_str();
+    CHECK_THAT(str, Contains("simple=true"));
+    CHECK_THAT(str, Contains("other.newer=true"));
+    CHECK_THAT(str, Contains("other.'sub2.bb'.newest=true"));
+}
+
 TEST_CASE_METHOD(TApp, "IniOutputSubsubcomCustomSep", "[config]") {
 
     app.add_flag("--simple");
@@ -2944,6 +3677,42 @@ TEST_CASE_METHOD(TApp, "IniOutputSubsubcomCustomSep", "[config]") {
     CHECK_THAT(str, Contains("other|sub2|newest=true"));
 }
 
+TEST_CASE_METHOD(TApp, "IniOutputSubsubcomCustomSepWithInternalSep", "[config]") {
+
+    app.add_flag("--simple");
+    auto *subcom = app.add_subcommand("other");
+    subcom->add_flag("--newer");
+    auto *subsubcom = subcom->add_subcommand("sub2|BB");
+    subsubcom->add_flag("--newest");
+    app.config_formatter(std::make_shared<CLI::ConfigINI>());
+    app.get_config_formatter_base()->parentSeparator('|');
+    args = {"--simple", "other", "--newer", "sub2|BB", "--newest"};
+    run();
+
+    std::string str = app.config_to_str();
+    CHECK_THAT(str, Contains("simple=true"));
+    CHECK_THAT(str, Contains("other|newer=true"));
+    CHECK_THAT(str, Contains("other|'sub2|BB'|newest=true"));
+}
+
+TEST_CASE_METHOD(TApp, "IniOutputSubsubcomCustomSepWithInternalQuote", "[config]") {
+
+    app.add_flag("--simple");
+    auto *subcom = app.add_subcommand("other");
+    subcom->add_flag("--newer");
+    auto *subsubcom = subcom->add_subcommand("sub2'BB");
+    subsubcom->add_flag("--newest");
+    app.config_formatter(std::make_shared<CLI::ConfigINI>());
+    app.get_config_formatter_base()->parentSeparator('|');
+    args = {"--simple", "other", "--newer", "sub2'BB", "--newest"};
+    run();
+
+    std::string str = app.config_to_str();
+    CHECK_THAT(str, Contains("simple=true"));
+    CHECK_THAT(str, Contains("other|newer=true"));
+    CHECK_THAT(str, Contains("other|\"sub2'BB\"|newest=true"));
+}
+
 TEST_CASE_METHOD(TApp, "IniOutputSubsubcomConfigurable", "[config]") {
 
     app.add_flag("--simple");
@@ -3005,7 +3774,7 @@ TEST_CASE_METHOD(TApp, "IniOutputQuoted", "[config]") {
 
     std::string str = app.config_to_str();
     CHECK_THAT(str, Contains("val1=\"I am a string\""));
-    CHECK_THAT(str, Contains("val2='I am a \"confusing\" string'"));
+    CHECK_THAT(str, Contains("val2=\"I am a \\\"confusing\\\" string\""));
 }
 
 TEST_CASE_METHOD(TApp, "DefaultsIniOutputQuoted", "[config]") {
@@ -3020,5 +3789,5 @@ TEST_CASE_METHOD(TApp, "DefaultsIniOutputQuoted", "[config]") {
 
     std::string str = app.config_to_str(true);
     CHECK_THAT(str, Contains("val1=\"I am a string\""));
-    CHECK_THAT(str, Contains("val2='I am a \"confusing\" string'"));
+    CHECK_THAT(str, Contains("val2=\"I am a \\\"confusing\\\" string\""));
 }
diff --git a/packages/CLI11/tests/CreationTest.cpp b/packages/CLI11/tests/CreationTest.cpp
index a51abd4889244d4fbd453db9b98d7a6a0d9c7ba0..46f57770fb3873a8716e9a423826c519c266271b 100644
--- a/packages/CLI11/tests/CreationTest.cpp
+++ b/packages/CLI11/tests/CreationTest.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -40,6 +40,16 @@ TEST_CASE_METHOD(TApp, "AddingExistingWithCase", "[creation]") {
     CHECK_NOTHROW(app.add_flag("--Cat,-C"));
 }
 
+TEST_CASE_METHOD(TApp, "AddingExistingShortLong", "[creation]") {
+    app.add_flag("-c");
+    CHECK_THROWS_AS(app.add_flag("--c"), CLI::OptionAlreadyAdded);
+}
+
+TEST_CASE_METHOD(TApp, "AddingExistingLongShort", "[creation]") {
+    app.add_flag("--c");
+    CHECK_THROWS_AS(app.add_option("-c"), CLI::OptionAlreadyAdded);
+}
+
 TEST_CASE_METHOD(TApp, "AddingExistingWithCaseAfter", "[creation]") {
     auto *count = app.add_flag("-c,--count");
     app.add_flag("--Cat,-C");
@@ -68,6 +78,37 @@ TEST_CASE_METHOD(TApp, "AddingExistingWithUnderscoreAfter2", "[creation]") {
     CHECK_THROWS_AS(count->ignore_underscore(), CLI::OptionAlreadyAdded);
 }
 
+TEST_CASE_METHOD(TApp, "matchPositional", "[creation]") {
+    app.add_option("firstoption");
+    CHECK_THROWS_AS(app.add_option("--firstoption"), CLI::OptionAlreadyAdded);
+}
+
+TEST_CASE_METHOD(TApp, "matchPositional2", "[creation]") {
+    app.add_option("--firstoption");
+    CHECK_THROWS_AS(app.add_option("firstoption"), CLI::OptionAlreadyAdded);
+}
+
+TEST_CASE_METHOD(TApp, "matchPositionalInOptionGroup1", "[creation]") {
+
+    auto *g1 = app.add_option_group("group_b");
+    g1->add_option("--firstoption");
+    CHECK_THROWS_AS(app.add_option("firstoption"), CLI::OptionAlreadyAdded);
+}
+
+TEST_CASE_METHOD(TApp, "matchPositionalInOptionGroup2", "[creation]") {
+
+    app.add_option("firstoption");
+    auto *g1 = app.add_option_group("group_b");
+    CHECK_THROWS_AS(g1->add_option("--firstoption"), CLI::OptionAlreadyAdded);
+}
+
+TEST_CASE_METHOD(TApp, "matchPositionalInOptionGroup3", "[creation]") {
+
+    app.add_option("f");
+    auto *g1 = app.add_option_group("group_b");
+    CHECK_THROWS_AS(g1->add_option("-f"), CLI::OptionAlreadyAdded);
+}
+
 TEST_CASE_METHOD(TApp, "AddingMultipleInfPositionals", "[creation]") {
     std::vector<std::string> one, two;
     app.add_option("one", one);
diff --git a/packages/CLI11/tests/DeprecatedTest.cpp b/packages/CLI11/tests/DeprecatedTest.cpp
index 063c67e5e884dc6f004b23176eabb9b57bc63f3c..e4a151f2cf04142c8994ab539a72d1514ceddb93 100644
--- a/packages/CLI11/tests/DeprecatedTest.cpp
+++ b/packages/CLI11/tests/DeprecatedTest.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/tests/EncodingTest.cpp b/packages/CLI11/tests/EncodingTest.cpp
index b026ee0143dc3331f8b49f306358db7031233bf2..6b84fc20eb27e5085ad573f6a4f98124764a2358 100644
--- a/packages/CLI11/tests/EncodingTest.cpp
+++ b/packages/CLI11/tests/EncodingTest.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/tests/FormatterTest.cpp b/packages/CLI11/tests/FormatterTest.cpp
index 2563c9421c54ea03c2a5a9ea7284a8a42db11288..215dcb1022e2e019486c803aaca3b7ded835ca08 100644
--- a/packages/CLI11/tests/FormatterTest.cpp
+++ b/packages/CLI11/tests/FormatterTest.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/tests/FuzzFailTest.cpp b/packages/CLI11/tests/FuzzFailTest.cpp
index 22148368819a9a4763dde949b68c5082c445694d..124c8f4287e0818033561ae2f81500c71cefecba 100644
--- a/packages/CLI11/tests/FuzzFailTest.cpp
+++ b/packages/CLI11/tests/FuzzFailTest.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -23,15 +23,78 @@ std::string loadFailureFile(const std::string &type, int index) {
 
 TEST_CASE("app_fail") {
     CLI::FuzzApp fuzzdata;
-
     auto app = fuzzdata.generateApp();
 
-    int index = GENERATE(range(1, 3));
-
+    int index = GENERATE(range(1, 4));
+    std::string optionString;
     auto parseData = loadFailureFile("fuzz_app_fail", index);
+    if(index >= 3 && parseData.size() > 25) {
+        optionString = parseData.substr(0, 25);
+        parseData.erase(0, 25);
+    }
+
     try {
 
-        app->parse(parseData);
+        if(!optionString.empty()) {
+            app->add_option(optionString, fuzzdata.buffer);
+        }
+        try {
+            app->parse(parseData);
+        } catch(const CLI::ParseError & /*e*/) {
+            CHECK(true);
+        }
+    } catch(const CLI::ConstructionError & /*e*/) {
+        CHECK(true);
+    }
+}
+
+TEST_CASE("file_fail") {
+    CLI::FuzzApp fuzzdata;
+    auto app = fuzzdata.generateApp();
+
+    int index = GENERATE(range(1, 9));
+    auto parseData = loadFailureFile("fuzz_file_fail", index);
+    std::stringstream out(parseData);
+    try {
+        app->parse_from_stream(out);
     } catch(const CLI::ParseError & /*e*/) {
+        CHECK(true);
+    }
+}
+
+TEST_CASE("app_file_gen_fail") {
+    CLI::FuzzApp fuzzdata;
+    auto app = fuzzdata.generateApp();
+
+    int index = GENERATE(range(1, 40));
+    std::string optionString, flagString;
+    auto parseData = loadFailureFile("fuzz_app_file_fail", index);
+    if(parseData.size() > 25) {
+        optionString = parseData.substr(0, 25);
+        parseData.erase(0, 25);
+    }
+    if(parseData.size() > 25) {
+        flagString = parseData.substr(0, 25);
+        parseData.erase(0, 25);
+    }
+    try {
+
+        if(!optionString.empty()) {
+            app->add_option(optionString, fuzzdata.buffer);
+        }
+        if(!flagString.empty()) {
+            app->add_flag(flagString, fuzzdata.intbuffer);
+        }
+        try {
+            app->parse(parseData);
+        } catch(const CLI::ParseError & /*e*/) {
+            return;
+        }
+    } catch(const CLI::ConstructionError & /*e*/) {
+        return;
     }
+    std::string configOut = app->config_to_str();
+    app->clear();
+    std::stringstream out(configOut);
+    app->parse_from_stream(out);
 }
diff --git a/packages/CLI11/tests/HelpTest.cpp b/packages/CLI11/tests/HelpTest.cpp
index c4403f754f48ed99c8188bf3cdecae64448da4ac..e21b29b0a764bb2e974d5b60ba0ba61a9e3bcfcf 100644
--- a/packages/CLI11/tests/HelpTest.cpp
+++ b/packages/CLI11/tests/HelpTest.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -10,6 +10,8 @@
 #include "CLI/CLI.hpp"
 #endif
 
+#include "app_helper.hpp"
+
 #include "catch.hpp"
 #include <fstream>
 
@@ -718,6 +720,22 @@ TEST_CASE("THelp: CustomHelp", "[help]") {
     }
 }
 
+TEST_CASE("THelp: HelpSubcommandPriority", "[help]") {
+    CLI::App app{"My prog"};
+
+    app.set_help_flag("-h", "display help and exit");
+
+    auto *sub1 = app.add_subcommand("sub1");
+    std::string someFile = "";
+
+    put_env("SOME_FILE", "NOT_A_FILE");
+    sub1->add_option("-f,--file", someFile)->envname("SOME_FILE")->required()->expected(1)->check(CLI::ExistingFile);
+
+    std::string input{"sub1 -h"};
+    CHECK_THROWS_AS(app.parse(input), CLI::CallForHelp);
+    unset_env("SOME_FILE");
+}
+
 TEST_CASE("THelp: NextLineShouldBeAlignmentInMultilineDescription", "[help]") {
     CLI::App app;
     int i{0};
@@ -1318,3 +1336,19 @@ TEST_CASE("TVersion: parse_throw", "[help]") {
         CHECK(1U == cptr->count());
     }
 }
+
+TEST_CASE("TVersion: exit", "[help]") {
+
+    CLI::App app;
+
+    app.set_version_flag("--version", CLI11_VERSION);
+
+    try {
+        app.parse("--version");
+    } catch(const CLI::CallForVersion &v) {
+        std::ostringstream out;
+        auto ret = app.exit(v, out);
+        CHECK_THAT(out.str(), Contains(CLI11_VERSION));
+        CHECK(0 == ret);
+    }
+}
diff --git a/packages/CLI11/tests/HelpersTest.cpp b/packages/CLI11/tests/HelpersTest.cpp
index 5186b47fcfc2a5648048d42cb82570437d1962ac..44262417adfb6c0b34f499129312294a8aef0503 100644
--- a/packages/CLI11/tests/HelpersTest.cpp
+++ b/packages/CLI11/tests/HelpersTest.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -165,6 +165,7 @@ TEST_CASE("String: InvalidName", "[helpers]") {
     CHECK(CLI::detail::valid_name_string("b@d2?"));
     CHECK(CLI::detail::valid_name_string("2vali?d"));
     CHECK_FALSE(CLI::detail::valid_name_string("!valid"));
+    CHECK_FALSE(CLI::detail::valid_name_string("!va\nlid"));
 }
 
 TEST_CASE("StringTools: Modify", "[helpers]") {
@@ -201,15 +202,26 @@ TEST_CASE("StringTools: Modify3", "[helpers]") {
 }
 
 TEST_CASE("StringTools: flagValues", "[helpers]") {
+    errno = 0;
     CHECK(-1 == CLI::detail::to_flag_value("0"));
+    CHECK(errno == 0);
     CHECK(1 == CLI::detail::to_flag_value("t"));
     CHECK(1 == CLI::detail::to_flag_value("1"));
     CHECK(6 == CLI::detail::to_flag_value("6"));
     CHECK(-6 == CLI::detail::to_flag_value("-6"));
     CHECK(-1 == CLI::detail::to_flag_value("false"));
     CHECK(1 == CLI::detail::to_flag_value("YES"));
-    CHECK_THROWS_AS(CLI::detail::to_flag_value("frog"), std::invalid_argument);
-    CHECK_THROWS_AS(CLI::detail::to_flag_value("q"), std::invalid_argument);
+    errno = 0;
+    CLI::detail::to_flag_value("frog");
+    CHECK(errno == EINVAL);
+    errno = 0;
+    CLI::detail::to_flag_value("q");
+    CHECK(errno == EINVAL);
+    errno = 0;
+    CLI::detail::to_flag_value(
+        "77777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777");
+    CHECK(errno == ERANGE);
+    errno = 0;
     CHECK(-1 == CLI::detail::to_flag_value("NO"));
     CHECK(475555233 == CLI::detail::to_flag_value("475555233"));
 }
@@ -226,6 +238,189 @@ TEST_CASE("StringTools: Validation", "[helpers]") {
     CHECK_FALSE(CLI::detail::isalpha("test2"));
 }
 
+TEST_CASE("StringTools: binaryEscapseConversion", "[helpers]") {
+    std::string testString("string1");
+    std::string estring = CLI::detail::binary_escape_string(testString);
+    CHECK(testString == estring);
+    CHECK_FALSE(CLI::detail::is_binary_escaped_string(estring));
+
+    std::string testString2("\nstring1\n");
+    estring = CLI::detail::binary_escape_string(testString2);
+    CHECK_FALSE(testString == estring);
+    CHECK(CLI::detail::is_binary_escaped_string(estring));
+    std::string rstring = CLI::detail::extract_binary_string(estring);
+    CHECK(rstring == testString2);
+
+    CLI::detail::remove_quotes(estring);
+    CHECK(CLI::detail::is_binary_escaped_string(estring));
+    std::string rstringrq = CLI::detail::extract_binary_string(estring);
+    CHECK(rstringrq == testString2);
+
+    testString2.push_back(0);
+    testString2.push_back(static_cast<char>(197));
+    testString2.push_back(78);
+    testString2.push_back(-34);
+
+    rstring = CLI::detail::extract_binary_string(CLI::detail::binary_escape_string(testString2));
+    CHECK(rstring == testString2);
+
+    testString2.push_back('b');
+    testString2.push_back('G');
+
+    rstring = CLI::detail::extract_binary_string(CLI::detail::binary_escape_string(testString2));
+    CHECK(rstring == testString2);
+    auto rstring2 = CLI::detail::extract_binary_string(rstring);
+    CHECK(rstring == rstring2);
+}
+
+TEST_CASE("StringTools: binaryStrings", "[helpers]") {
+    std::string rstring = "B\"()\"";
+    CHECK(CLI::detail::extract_binary_string(rstring).empty());
+
+    rstring = "B\"(\\x35\\xa7)\"";
+    CHECK(CLI::detail::is_binary_escaped_string(rstring));
+    auto result = CLI::detail::extract_binary_string(rstring);
+    CHECK(result[0] == static_cast<char>(0x35));
+    CHECK(result[1] == static_cast<char>(0xa7));
+
+    rstring = "'B\"(\\x3e\\xf7)\"'";
+    CHECK(CLI::detail::is_binary_escaped_string(rstring));
+    result = CLI::detail::extract_binary_string(rstring);
+    CHECK(result[0] == static_cast<char>(0x3e));
+    CHECK(result[1] == static_cast<char>(0xf7));
+
+    rstring = "B\"(\\x3E\\xf7)\"";
+    result = CLI::detail::extract_binary_string(rstring);
+    CHECK(result[0] == static_cast<char>(0x3e));
+    CHECK(result[1] == static_cast<char>(0xf7));
+
+    rstring = "B\"(\\X3E\\XF7)\"";
+    result = CLI::detail::extract_binary_string(rstring);
+    CHECK(result[0] == static_cast<char>(0x3e));
+    CHECK(result[1] == static_cast<char>(0xf7));
+
+    rstring = "B\"(\\XME\\XK7)\"";
+    result = CLI::detail::extract_binary_string(rstring);
+    CHECK(result == "\\XME\\XK7");
+
+    rstring = "B\"(\\XEM\\X7K)\"";
+    result = CLI::detail::extract_binary_string(rstring);
+    CHECK(result == "\\XEM\\X7K");
+}
+
+TEST_CASE("StringTools: escapeConversion", "[helpers]") {
+    CHECK(CLI::detail::remove_escaped_characters("test\\\"") == "test\"");
+    CHECK(CLI::detail::remove_escaped_characters("test\\\\") == "test\\");
+    CHECK(CLI::detail::remove_escaped_characters("test\\b") == "test\b");
+    CHECK(CLI::detail::remove_escaped_characters("test\\t") == "test\t");
+    CHECK(CLI::detail::remove_escaped_characters("test\\n\\r\\t\\f") == "test\n\r\t\f");
+    CHECK(CLI::detail::remove_escaped_characters("test\\r") == "test\r");
+    CHECK(CLI::detail::remove_escaped_characters("test\\f") == "test\f");
+    std::string zstring = "test";
+    zstring.push_back('\0');
+    zstring.append("test\n");
+    CHECK(CLI::detail::remove_escaped_characters("test\\0test\\n") == zstring);
+
+    CHECK_THROWS_AS(CLI::detail::remove_escaped_characters("test\\m_bad"), std::invalid_argument);
+    CHECK_THROWS_AS(CLI::detail::remove_escaped_characters("test\\"), std::invalid_argument);
+}
+
+TEST_CASE("StringTools: quotedString", "[helpers]") {
+
+    std::string rstring = "'B\"(\\x35\\xa7)\"'";
+    auto s2 = rstring;
+    CLI::detail::process_quoted_string(s2);
+    CHECK(s2[0] == static_cast<char>(0x35));
+    CHECK(s2[1] == static_cast<char>(0xa7));
+    s2 = rstring;
+    CLI::detail::remove_quotes(s2);
+    CLI::detail::process_quoted_string(s2);
+    CHECK(s2[0] == static_cast<char>(0x35));
+    CHECK(s2[1] == static_cast<char>(0xa7));
+
+    std::string qbase = R"("this\nis\na\nfour\tline test")";
+    std::string qresult = "this\nis\na\nfour\tline test";
+
+    std::string q1 = qbase;
+
+    // test remove quotes and escape processing
+    CLI::detail::process_quoted_string(q1);
+    CHECK(q1 == qresult);
+
+    std::string q2 = qbase;
+    q2.front() = '\'';
+    q2.pop_back();
+    q2.push_back('\'');
+    std::string qliteral = qbase.substr(1);
+    qliteral.pop_back();
+
+    // test remove quotes for literal string
+    CHECK(CLI::detail::process_quoted_string(q2));
+    CHECK(q2 == qliteral);
+
+    std::string q3 = qbase;
+    q3.front() = '`';
+    q3.pop_back();
+    q3.push_back('`');
+
+    // test remove quotes for literal string
+    CHECK(CLI::detail::process_quoted_string(q3));
+    CHECK(q3 == qliteral);
+
+    std::string q4 = qbase;
+    q4.front() = '|';
+    q4.pop_back();
+    q4.push_back('|');
+
+    // check that it doesn't process
+    CHECK_FALSE(CLI::detail::process_quoted_string(q4));
+    // test custom string quote character
+    CHECK(CLI::detail::process_quoted_string(q4, '|'));
+    CHECK(q4 == qresult);
+
+    std::string q5 = qbase;
+    q5.front() = '?';
+    q5.pop_back();
+    q5.push_back('?');
+
+    // test custom literal quote character
+    CHECK(CLI::detail::process_quoted_string(q5, '|', '?'));
+    CHECK(q5 == qliteral);
+
+    q3 = qbase;
+    q3.front() = '`';
+    q3.pop_back();
+    q3.push_back('`');
+
+    // test that '`' still works regardless of the other specified characters
+    CHECK(CLI::detail::process_quoted_string(q3));
+    CHECK(q3 == qliteral);
+}
+
+TEST_CASE("StringTools: unicode_literals", "[helpers]") {
+
+    CHECK(CLI::detail::remove_escaped_characters("test\\u03C0\\u00e9") == from_u8string(u8"test\u03C0\u00E9"));
+    CHECK(CLI::detail::remove_escaped_characters("test\\u73C0\\u0057") == from_u8string(u8"test\u73C0\u0057"));
+
+    CHECK(CLI::detail::remove_escaped_characters("test\\U0001F600\\u00E9") == from_u8string(u8"test\U0001F600\u00E9"));
+
+    CHECK_THROWS_AS(CLI::detail::remove_escaped_characters("test\\U0001M600\\u00E9"), std::invalid_argument);
+    CHECK_THROWS_AS(CLI::detail::remove_escaped_characters("test\\U0001E600\\u00M9"), std::invalid_argument);
+    CHECK_THROWS_AS(CLI::detail::remove_escaped_characters("test\\U0001E600\\uD8E9"), std::invalid_argument);
+
+    CHECK_THROWS_AS(CLI::detail::remove_escaped_characters("test\\U0001E600\\uD8"), std::invalid_argument);
+    CHECK_THROWS_AS(CLI::detail::remove_escaped_characters("test\\U0001E60"), std::invalid_argument);
+}
+
+TEST_CASE("StringTools: close_sequence", "[helpers]") {
+    CHECK(CLI::detail::close_sequence("[test]", 0, ']') == 5U);
+    CHECK(CLI::detail::close_sequence("[\"test]\"]", 0, ']') == 8U);
+    CHECK(CLI::detail::close_sequence("[\"test]\"],[t2]", 0, ']') == 8U);
+    CHECK(CLI::detail::close_sequence("[\"test]\"],[t2]", 10, ']') == 13U);
+    CHECK(CLI::detail::close_sequence("{\"test]\"],[t2]", 0, '}') == 14U);
+    CHECK(CLI::detail::close_sequence("[(),(),{},\"]]52{}\",[],[54],[[],[],()]]", 0, ']') == 37U);
+}
+
 TEST_CASE("Trim: Various", "[helpers]") {
     std::string s1{"  sdlfkj sdflk sd s  "};
     std::string a1{"sdlfkj sdflk sd s"};
@@ -501,7 +696,7 @@ TEST_CASE("Validators: ProgramNameSplit", "[helpers]") {
     TempFile myfile{"program_name1.exe"};
     {
         std::ofstream out{myfile};
-        out << "useless string doesn't matter" << std::endl;
+        out << "useless string doesn't matter" << '\n';
     }
     auto res =
         CLI::detail::split_program_name(std::string("./") + std::string(myfile) + " this is a bunch of extra stuff  ");
@@ -511,7 +706,7 @@ TEST_CASE("Validators: ProgramNameSplit", "[helpers]") {
     TempFile myfile2{"program name1.exe"};
     {
         std::ofstream out{myfile2};
-        out << "useless string doesn't matter" << std::endl;
+        out << "useless string doesn't matter" << '\n';
     }
     res = CLI::detail::split_program_name(std::string("   ") + std::string("./") + std::string(myfile2) +
                                           "      this is a bunch of extra stuff  ");
@@ -777,7 +972,7 @@ TEST_CASE("AppHelper: Ofstream", "[helpers]") {
 
         {
             std::ofstream out{myfile};
-            out << "this is output" << std::endl;
+            out << "this is output" << '\n';
         }
 
         CHECK(CLI::ExistingFile(myfile).empty());
@@ -885,47 +1080,96 @@ TEST_CASE("Join: Backward", "[helpers]") {
 }
 
 TEST_CASE("SplitUp: Simple", "[helpers]") {
-    std::vector<std::string> oput = {"one", "two three"};
+    std::vector<std::string> oput = {"one", "\"two three\""};
     std::string orig{R"(one "two three")"};
     std::vector<std::string> result = CLI::detail::split_up(orig);
     CHECK(result == oput);
 }
 
 TEST_CASE("SplitUp: SimpleDifferentQuotes", "[helpers]") {
-    std::vector<std::string> oput = {"one", "two three"};
+    std::vector<std::string> oput = {"one", "`two three`"};
     std::string orig{R"(one `two three`)"};
     std::vector<std::string> result = CLI::detail::split_up(orig);
     CHECK(result == oput);
 }
 
+TEST_CASE("SplitUp: SimpleMissingQuotes", "[helpers]") {
+    std::vector<std::string> oput = {"one", "`two three"};
+    std::string orig{R"(one `two three)"};
+    std::vector<std::string> result = CLI::detail::split_up(orig);
+    CHECK(result == oput);
+}
+
+TEST_CASE("SplitUp: SimpleMissingQuotesEscaped", "[helpers]") {
+    std::vector<std::string> oput = {"one", R"("two three\"")"};
+    std::string orig{R"(one "two three\"")"};
+    std::vector<std::string> result = CLI::detail::split_up(orig);
+    CHECK(result == oput);
+}
+
 TEST_CASE("SplitUp: SimpleDifferentQuotes2", "[helpers]") {
-    std::vector<std::string> oput = {"one", "two three"};
+    std::vector<std::string> oput = {"one", "'two three'"};
     std::string orig{R"(one 'two three')"};
     std::vector<std::string> result = CLI::detail::split_up(orig);
     CHECK(result == oput);
 }
 
+TEST_CASE("SplitUp: Bracket1", "[helpers]") {
+    std::vector<std::string> oput = {"one", "[two, three]"};
+    std::string orig{"one, [two, three]"};
+    std::vector<std::string> result = CLI::detail::split_up(orig, ',');
+    CHECK(result == oput);
+}
+
+TEST_CASE("SplitUp: Bracket2", "[helpers]") {
+    std::vector<std::string> oput = {"one", "<two, three>"};
+    std::string orig{"one, <two, three>"};
+    std::vector<std::string> result = CLI::detail::split_up(orig, ',');
+    CHECK(result == oput);
+}
+
+TEST_CASE("SplitUp: Bracket3", "[helpers]") {
+    std::vector<std::string> oput = {"one", "(two, three)"};
+    std::string orig{"one, (two, three)"};
+    std::vector<std::string> result = CLI::detail::split_up(orig, ',');
+    CHECK(result == oput);
+}
+
+TEST_CASE("SplitUp: Bracket4", "[helpers]") {
+    std::vector<std::string> oput = {"one", "{two, three}"};
+    std::string orig{"one, {two, three}"};
+    std::vector<std::string> result = CLI::detail::split_up(orig, ',');
+    CHECK(result == oput);
+}
+
+TEST_CASE("SplitUp: Comment", "[helpers]") {
+    std::vector<std::string> oput = {R"(["quote1", "#"])"};
+    std::string orig{R"(["quote1", "#"])"};
+    std::vector<std::string> result = CLI::detail::split_up(orig, '#');
+    CHECK(result == oput);
+}
+
 TEST_CASE("SplitUp: Layered", "[helpers]") {
-    std::vector<std::string> output = {R"(one 'two three')"};
+    std::vector<std::string> output = {R"("one 'two three'")"};
     std::string orig{R"("one 'two three'")"};
     std::vector<std::string> result = CLI::detail::split_up(orig);
     CHECK(result == output);
 }
 
 TEST_CASE("SplitUp: Spaces", "[helpers]") {
-    std::vector<std::string> oput = {"one", "  two three"};
+    std::vector<std::string> oput = {"one", "\"  two three\""};
     std::string orig{R"(  one  "  two three" )"};
     std::vector<std::string> result = CLI::detail::split_up(orig);
     CHECK(result == oput);
 }
 
 TEST_CASE("SplitUp: BadStrings", "[helpers]") {
-    std::vector<std::string> oput = {"one", "  two three"};
+    std::vector<std::string> oput = {"one", "\"  two three"};
     std::string orig{R"(  one  "  two three )"};
     std::vector<std::string> result = CLI::detail::split_up(orig);
     CHECK(result == oput);
 
-    oput = {"one", "  two three"};
+    oput = {"one", "'  two three"};
     orig = R"(  one  '  two three )";
     result = CLI::detail::split_up(orig);
     CHECK(result == oput);
@@ -1033,6 +1277,19 @@ TEST_CASE("Types: TypeName", "[helpers]") {
     CHECK((atomic_name == "INT" || atomic_name == "TEXT"));
 }
 
+TEST_CASE("Types: TypeNameStrings", "[helpers]") {
+    auto sclass = CLI::detail::classify_object<std::string>::value;
+    CHECK(CLI::detail::object_category::string_assignable == sclass);
+
+    auto wsclass = CLI::detail::classify_object<std::wstring>::value;
+    CHECK(CLI::detail::object_category::wstring_assignable == wsclass);
+
+#if defined CLI11_HAS_FILEYSTEM && CLI11_HAS_FILESYSTEM > 0 && defined(_MSC_VER)
+    auto fspclass = CLI::detail::classify_object<std::filesystem::path>::value;
+    CHECK(CLI::detail::object_category::wstring_assignable == fspclass);
+#endif
+}
+
 TEST_CASE("Types: OverflowSmall", "[helpers]") {
     signed char x = 0;
     auto strmax = std::to_string((std::numeric_limits<signed char>::max)() + 1);
@@ -1341,3 +1598,14 @@ TEST_CASE("FixNewLines: EdgesCheck", "[helpers]") {
     std::string result = CLI::detail::fix_newlines("; ", input);
     CHECK(output == result);
 }
+
+TEST_CASE("String: environment", "[helpers]") {
+    put_env("TEST1", "TESTS");
+
+    auto value = CLI::detail::get_environment_value("TEST1");
+    CHECK(value == "TESTS");
+    unset_env("TEST1");
+
+    value = CLI::detail::get_environment_value("TEST2");
+    CHECK(value.empty());
+}
diff --git a/packages/CLI11/tests/NewParseTest.cpp b/packages/CLI11/tests/NewParseTest.cpp
index a72af823bb8ce9c596351f3bb77fa67dde750325..9f5aea20a44927b1110955d04c12a0fcf2500fb5 100644
--- a/packages/CLI11/tests/NewParseTest.cpp
+++ b/packages/CLI11/tests/NewParseTest.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -193,7 +193,7 @@ TEST_CASE_METHOD(TApp, "custom_string_converterFail", "[newparse]") {
     CHECK_THROWS_AS(run(), CLI::ConversionError);
 }
 
-/// Wrapper with an unconvenient interface
+/// Wrapper with an inconvenient interface
 template <class T> class badlywrapped {
   public:
     badlywrapped() : value() {}
diff --git a/packages/CLI11/tests/OptionGroupTest.cpp b/packages/CLI11/tests/OptionGroupTest.cpp
index ab4d3c638d29c07ba0d0c54f3d9b216b870f6af3..9112b0be214bd6c6c1d203e8053f8031568f9a9d 100644
--- a/packages/CLI11/tests/OptionGroupTest.cpp
+++ b/packages/CLI11/tests/OptionGroupTest.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/tests/OptionTypeTest.cpp b/packages/CLI11/tests/OptionTypeTest.cpp
index 6d06a5af3ebabbe695825183d936cb81f0c2a4d2..5068a8c65d201ed6c702c332d919cb610f2da2b0 100644
--- a/packages/CLI11/tests/OptionTypeTest.cpp
+++ b/packages/CLI11/tests/OptionTypeTest.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -6,6 +6,9 @@
 
 #include "app_helper.hpp"
 
+#include "catch.hpp"
+
+#include <algorithm>
 #include <atomic>
 #include <cmath>
 #include <complex>
@@ -218,6 +221,145 @@ TEST_CASE_METHOD(TApp, "atomic_int_option", "[optiontype]") {
     CHECK(0 == i);
 }
 
+static const std::map<std::string, double> testValuesDouble{
+    {"3.14159", 3.14159},
+    {"-3.14159", -3.14159},
+    {"+1.0", 1.0},
+    {"-0.01", -0.01},
+    {"5e22", 5e22},
+    {"-2E-2", -2e-2},
+    {"5e+22", 5e22},
+    {"1e06", 1e6},
+    {"6.626e-34", 6.626e-34},
+    {"6.626e+34", 6.626e34},
+    {"-6.626e-34", -6.626e-34},
+    {"224_617.445_991", 224617.445991},
+    {"224'617.445'991", 224617.445991},
+    {"inf", std::numeric_limits<double>::infinity()},
+    {"+inf", std::numeric_limits<double>::infinity()},
+    {"-inf", -std::numeric_limits<double>::infinity()},
+    {"nan", std::numeric_limits<double>::signaling_NaN()},
+    {"+nan", std::numeric_limits<double>::signaling_NaN()},
+    {"-nan", -std::numeric_limits<double>::signaling_NaN()},
+
+};
+
+TEST_CASE_METHOD(TApp, "floatingConversions", "[optiontype]") {
+    auto test_data = GENERATE(from_range(testValuesDouble));
+
+    double val{0};
+    app.add_option("--val", val);
+
+    args = {"--val", test_data.first};
+
+    run();
+    if(std::isnan(test_data.second)) {
+        CHECK(std::isnan(val));
+    } else {
+
+        CHECK_THAT(val, WithinRel(test_data.second, 1e-11));
+    }
+}
+
+static const std::map<std::string, std::int64_t> testValuesInt{
+    {"+99", 99},
+    {"99", 99},
+    {"-99", -99},
+    {"0xDEADBEEF", 0xDEADBEEF},
+    {"0xdeadbeef", 0xDEADBEEF},
+    {"0XDEADBEEF", 0xDEADBEEF},
+    {"0Xdeadbeef", 0xDEADBEEF},
+    {"0xdead_beef", 0xDEADBEEF},
+    {"0xdead'beef", 0xDEADBEEF},
+    {"0o01234567", 001234567},
+    {"0o755", 0755},
+    {"0755", 0755},
+    {"995862_262", 995862262},
+    {"995862262", 995862262},
+    {"-995862275", -995862275},
+    {"-995'862'275", -995862275},
+    {"0b11010110", 0xD6},
+    {"0b1101'0110", 0xD6},
+    {"1_2_3_4_5", 12345},
+};
+
+TEST_CASE_METHOD(TApp, "intConversions", "[optiontype]") {
+
+    auto test_data = GENERATE(from_range(testValuesInt));
+
+    std::int64_t val{0};
+    app.add_option("--val", val);
+
+    args = {"--val", test_data.first};
+
+    run();
+
+    CHECK(val == test_data.second);
+}
+
+TEST_CASE_METHOD(TApp, "intConversionsErange", "[optiontype]") {
+
+    std::int64_t val{0};
+    app.add_option("--val", val);
+
+    args = {"--val", "0o11545241241415151512312415123125667"};
+
+    CHECK_THROWS_AS(run(), CLI::ParseError);
+
+    args = {"--val", "0b1011000001101011001100110011111000101010101011111111111111111111111001010111011100"};
+
+    CHECK_THROWS_AS(run(), CLI::ParseError);
+}
+
+static const std::map<std::string, std::uint64_t> testValuesUInt{
+    {"+99", 99},
+    {"99", 99},
+    {"0xDEADBEEF", 0xDEADBEEF},
+    {"0xdeadbeef", 0xDEADBEEF},
+    {"0XDEADBEEF", 0xDEADBEEF},
+    {"0Xdeadbeef", 0xDEADBEEF},
+    {"0xdead_beef", 0xDEADBEEF},
+    {"0xdead'beef", 0xDEADBEEF},
+    {"0o01234567", 001234567},
+    {"0o755", 0755},
+    {"0755", 0755},
+    {"995862_262", 995862262},
+    {"995862262", 995862262},
+    {"+995862275", +995862275},
+    {"995'862'275", 995862275},
+    {"0b11010110", 0xD6},
+    {"0b1101'0110", 0xD6},
+    {"1_2_3_4_5", 12345},
+};
+
+TEST_CASE_METHOD(TApp, "uintConversions", "[optiontype]") {
+
+    auto test_data = GENERATE(from_range(testValuesUInt));
+
+    std::uint64_t val{0};
+    app.add_option("--val", val);
+
+    args = {"--val", test_data.first};
+
+    run();
+
+    CHECK(val == test_data.second);
+}
+
+TEST_CASE_METHOD(TApp, "uintConversionsErange", "[optiontype]") {
+
+    std::uint64_t val{0};
+    app.add_option("--val", val);
+
+    args = {"--val", "0o11545241241415151512312415123125667"};
+
+    CHECK_THROWS_AS(run(), CLI::ParseError);
+
+    args = {"--val", "0b1011000001101011001100110011111000101010101011111111111111111111111001010111011100"};
+
+    CHECK_THROWS_AS(run(), CLI::ParseError);
+}
+
 TEST_CASE_METHOD(TApp, "CharOption", "[optiontype]") {
     char c1{'t'};
     app.add_option("-c", c1);
diff --git a/packages/CLI11/tests/OptionalTest.cpp b/packages/CLI11/tests/OptionalTest.cpp
index 3d78e3498c9a3ff99096867786552fc548d40212..986272d01bcb3912588d5ffac9321640df62d557 100644
--- a/packages/CLI11/tests/OptionalTest.cpp
+++ b/packages/CLI11/tests/OptionalTest.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -53,6 +53,8 @@
 #endif
 // [CLI11:verbatim]
 
+TEST_CASE("OptionalNoEmpty") { CHECK(1 == 1); }
+
 #if CLI11_STD_OPTIONAL
 
 #ifdef _MSC_VER
@@ -70,13 +72,11 @@ TEST_CASE_METHOD(TApp, "StdOptionalTest", "[optional]") {
 
     args = {"-c", "1"};
     run();
-    CHECK(opt);
-    CHECK(1 == *opt);
+    CHECK((opt && (1 == *opt)));
 
     args = {"--count", "3"};
     run();
-    CHECK(opt);
-    CHECK(3 == *opt);
+    CHECK((opt && (3 == *opt)));
 }
 
 TEST_CASE_METHOD(TApp, "StdOptionalVectorEmptyDirect", "[optional]") {
@@ -91,7 +91,7 @@ TEST_CASE_METHOD(TApp, "StdOptionalVectorEmptyDirect", "[optional]") {
     CHECK(!opt);
     args = {"-v", "1", "4", "5"};
     run();
-    CHECK(opt);
+    REQUIRE(opt);
     std::vector<int> expV{1, 4, 5};
     CHECK(expV == *opt);
 }
@@ -125,7 +125,7 @@ TEST_CASE_METHOD(TApp, "StdOptionalUint", "[optional]") {
 
     args = {"-i", "15"};
     run();
-    CHECK(15U == *opt);
+    CHECK((opt && (15U == *opt)));
     static_assert(CLI::detail::classify_object<std::optional<std::uint64_t>>::value ==
                   CLI::detail::object_category::wrapper_value);
 }
@@ -140,13 +140,14 @@ TEST_CASE_METHOD(TApp, "StdOptionalbool", "[optional]") {
 
     args = {"--opt"};
     run();
-    CHECK(opt);
-    CHECK(*opt);
+    CHECK((opt && *opt));
 
     args = {"--no-opt"};
     run();
-    CHECK(opt);
-    CHECK_FALSE(*opt);
+    REQUIRE(opt);
+    if(opt) {
+        CHECK_FALSE(*opt);
+    }
     static_assert(CLI::detail::classify_object<std::optional<bool>>::value ==
                   CLI::detail::object_category::wrapper_value);
 }
@@ -186,12 +187,12 @@ TEST_CASE_METHOD(TApp, "BoostOptionalTest", "[optional]") {
 
     args = {"-c", "1"};
     run();
-    CHECK(opt);
+    REQUIRE(opt);
     CHECK(1 == *opt);
     opt = {};
     args = {"--count", "3"};
     run();
-    CHECK(opt);
+    REQUIRE(opt);
     CHECK(3 == *opt);
 }
 
@@ -203,7 +204,7 @@ TEST_CASE_METHOD(TApp, "BoostOptionalTestZarg", "[optional]") {
 
     args = {"-c", "1"};
     run();
-    CHECK(opt);
+    REQUIRE(opt);
     CHECK(1 == *opt);
     opt = {};
     args = {"--count"};
@@ -219,12 +220,12 @@ TEST_CASE_METHOD(TApp, "BoostOptionalint64Test", "[optional]") {
 
     args = {"-c", "1"};
     run();
-    CHECK(opt);
+    REQUIRE(opt);
     CHECK(1 == *opt);
     opt = {};
     args = {"--count", "3"};
     run();
-    CHECK(opt);
+    REQUIRE(opt);
     CHECK(3 == *opt);
 }
 
@@ -236,12 +237,12 @@ TEST_CASE_METHOD(TApp, "BoostOptionalStringTest", "[optional]") {
 
     args = {"-s", "strval"};
     run();
-    CHECK(opt);
+    REQUIRE(opt);
     CHECK("strval" == *opt);
     opt = {};
     args = {"--string", "strv"};
     run();
-    CHECK(opt);
+    REQUIRE(opt);
     CHECK("strv" == *opt);
 }
 namespace boost {
@@ -266,13 +267,13 @@ TEST_CASE_METHOD(TApp, "BoostOptionalEnumTest", "[optional]") {
     args = {"-v", "3"};
     run();
     checkOpt = static_cast<bool>(opt);
-    CHECK(checkOpt);
+    REQUIRE(checkOpt);
     CHECK(*opt == eval::val3);
     opt = {};
     args = {"--val", "1"};
     run();
     checkOpt = static_cast<bool>(opt);
-    CHECK(checkOpt);
+    REQUIRE(checkOpt);
     CHECK(*opt == eval::val1);
 }
 
@@ -288,7 +289,7 @@ TEST_CASE_METHOD(TApp, "BoostOptionalVector", "[optional]") {
     args = {"-v", "1", "4", "5"};
     run();
     checkOpt = static_cast<bool>(opt);
-    CHECK(checkOpt);
+    REQUIRE(checkOpt);
     std::vector<int> expV{1, 4, 5};
     CHECK(expV == *opt);
 }
@@ -308,7 +309,7 @@ TEST_CASE_METHOD(TApp, "BoostOptionalVectorEmpty", "[optional]") {
     args = {"-v", "1", "4", "5"};
     run();
     checkOpt = static_cast<bool>(opt);
-    CHECK(checkOpt);
+    REQUIRE(checkOpt);
     std::vector<int> expV{1, 4, 5};
     CHECK(expV == *opt);
 }
@@ -328,7 +329,7 @@ TEST_CASE_METHOD(TApp, "BoostOptionalVectorEmptyDirect", "[optional]") {
     args = {"-v", "1", "4", "5"};
     run();
     checkOpt = static_cast<bool>(opt);
-    CHECK(checkOpt);
+    REQUIRE(checkOpt);
     std::vector<int> expV{1, 4, 5};
     CHECK(expV == *opt);
 }
@@ -344,12 +345,12 @@ TEST_CASE_METHOD(TApp, "BoostOptionalComplexDirect", "[optional]") {
     CHECK(!opt);
     args = {"-c", "1+2j"};
     run();
-    CHECK(opt);
+    REQUIRE(opt);
     std::complex<double> val{1, 2};
     CHECK(val == *opt);
     args = {"-c", "3", "-4"};
     run();
-    CHECK(opt);
+    REQUIRE(opt);
     std::complex<double> val2{3, -4};
     CHECK(val2 == *opt);
 }
diff --git a/packages/CLI11/tests/SetTest.cpp b/packages/CLI11/tests/SetTest.cpp
index b326989997824f18fdca60c47a09b38c243df9e0..3afa4759ea234d75b0ccb2396f38757234e07b6c 100644
--- a/packages/CLI11/tests/SetTest.cpp
+++ b/packages/CLI11/tests/SetTest.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/tests/SimpleTest.cpp b/packages/CLI11/tests/SimpleTest.cpp
index 14d6558b46c843a10763a0c6e0891b37806bf184..2b0127b52044bb659cc1df716c4ad8a0da7b3d67 100644
--- a/packages/CLI11/tests/SimpleTest.cpp
+++ b/packages/CLI11/tests/SimpleTest.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/tests/StringParseTest.cpp b/packages/CLI11/tests/StringParseTest.cpp
index cc1205fe3b7b6050e4e2e56f242dc024c3694b8b..839baf7a9923fcf577ef2b1685ea1ec5e3731090 100644
--- a/packages/CLI11/tests/StringParseTest.cpp
+++ b/packages/CLI11/tests/StringParseTest.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -20,7 +20,7 @@ TEST_CASE_METHOD(TApp, "ExistingExeCheck", "[stringparse]") {
 
     {
         std::ofstream out{tmpexe};
-        out << "useless string doesn't matter" << std::endl;
+        out << "useless string doesn't matter" << '\n';
     }
 
     app.parse(std::string("./") + std::string(tmpexe) +
@@ -42,7 +42,7 @@ TEST_CASE_METHOD(TApp, "ExistingExeCheckWithSpace", "[stringparse]") {
 
     {
         std::ofstream out{tmpexe};
-        out << "useless string doesn't matter" << std::endl;
+        out << "useless string doesn't matter" << '\n';
     }
 
     app.parse(std::string("./") + std::string(tmpexe) +
@@ -66,7 +66,7 @@ TEST_CASE_METHOD(TApp, "ExistingExeCheckWithLotsOfSpace", "[stringparse]") {
 
     {
         std::ofstream out{tmpexe};
-        out << "useless string doesn't matter" << std::endl;
+        out << "useless string doesn't matter" << '\n';
     }
 
     app.parse(std::string("./") + std::string(tmpexe) +
diff --git a/packages/CLI11/tests/SubcommandTest.cpp b/packages/CLI11/tests/SubcommandTest.cpp
index 25415eaa7cf1241f46ac6788f6cfca285543c594..cfe532386f6bd106c2917e03ebc9c991cee81ff1 100644
--- a/packages/CLI11/tests/SubcommandTest.cpp
+++ b/packages/CLI11/tests/SubcommandTest.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -187,6 +187,10 @@ TEST_CASE_METHOD(TApp, "DuplicateSubcommands", "[subcom]") {
     run();
     CHECK(*foo);
     CHECK(3u == foo->count());
+
+    auto subs = app.get_subcommands();
+    // subcommands only get triggered once
+    CHECK(subs.size() == 1U);
 }
 
 TEST_CASE_METHOD(TApp, "DuplicateSubcommandCallbacks", "[subcom]") {
@@ -2114,3 +2118,26 @@ TEST_CASE_METHOD(TApp, "DotNotationSubcommandRecusive2", "[subcom]") {
     CHECK(extras.size() == 1);
     CHECK(extras.front() == "sub1.sub2.sub3.bob");
 }
+
+// Reported bug #903 on github
+TEST_CASE_METHOD(TApp, "subcommandEnvironmentName", "[subcom]") {
+    auto *sub1 = app.add_subcommand("sub1");
+    std::string someFile;
+    int sub1value{0};
+    sub1->add_option("-f,--file", someFile)->envname("SOME_FILE")->required()->check(CLI::ExistingFile);
+    sub1->add_option("-v", sub1value);
+    auto *sub2 = app.add_subcommand("sub2");
+    int completelyUnrelatedToSub1 = 0;
+    sub2->add_option("-v,--value", completelyUnrelatedToSub1)->required();
+
+    args = {"sub2", "-v", "111"};
+    CHECK_NOTHROW(run());
+
+    put_env("SOME_FILE", "notafile.txt");
+
+    CHECK_NOTHROW(run());
+
+    args = {"sub1", "-v", "111"};
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
+    unset_env("SOME_FILE");
+}
diff --git a/packages/CLI11/tests/TimerTest.cpp b/packages/CLI11/tests/TimerTest.cpp
index e15d928cf7ee2a49d5176641e77d601bf74c76eb..0dc2ca94c4a5bdac05e8b4f88993d6c69e3474ef 100644
--- a/packages/CLI11/tests/TimerTest.cpp
+++ b/packages/CLI11/tests/TimerTest.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -63,6 +63,6 @@ TEST_CASE("Timer: PrintTimer", "[timer]") {
 TEST_CASE("Timer: TimeItTimer", "[timer]") {
     CLI::Timer timer;
     std::string output = timer.time_it([]() { std::this_thread::sleep_for(std::chrono::milliseconds(10)); }, .1);
-    std::cout << output << std::endl;
+    std::cout << output << '\n';
     CHECK_THAT(output, Contains("ms"));
 }
diff --git a/packages/CLI11/tests/TransformTest.cpp b/packages/CLI11/tests/TransformTest.cpp
index 9406e0254d0a5d57b34ff005aa6c2bd89cb727a8..97935f21c7df229516e4d62f2c819c9c0eac97ef 100644
--- a/packages/CLI11/tests/TransformTest.cpp
+++ b/packages/CLI11/tests/TransformTest.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -706,6 +706,53 @@ TEST_CASE_METHOD(TApp, "NumberWithUnitBadInput", "[transform]") {
     CHECK_THROWS_AS(run(), CLI::ValidationError);
 }
 
+static const std::map<std::string, std::string> validValues = {
+    {"test\\u03C0\\u00e9", from_u8string(u8"test\u03C0\u00E9")},
+    {"test\\u03C0\\u00e9", from_u8string(u8"test\u73C0\u0057")},
+    {"test\\U0001F600\\u00E9", from_u8string(u8"test\U0001F600\u00E9")},
+    {R"("this\nis\na\nfour\tline test")", "this\nis\na\nfour\tline test"},
+    {"'B\"(\\x35\\xa7\\x46)\"'", std::string{0x35, static_cast<char>(0xa7), 0x46}},
+    {"B\"(\\x35\\xa7\\x46)\"", std::string{0x35, static_cast<char>(0xa7), 0x46}},
+    {"test\\ntest", "test\ntest"},
+    {"\"test\\ntest", "\"test\ntest"},
+    {R"('this\nis\na\nfour\tline test')", R"(this\nis\na\nfour\tline test)"},
+    {R"("this\nis\na\nfour\tline test")", "this\nis\na\nfour\tline test"},
+    {R"(`this\nis\na\nfour\tline test`)", R"(this\nis\na\nfour\tline test)"}};
+
+TEST_CASE_METHOD(TApp, "StringEscapeValid", "[transform]") {
+
+    auto test_data = GENERATE(from_range(validValues));
+
+    std::string value{};
+
+    app.add_option("-n", value)->transform(CLI::EscapedString);
+
+    args = {"-n", test_data.first};
+
+    run();
+    CHECK(test_data.second == value);
+}
+
+static const std::vector<std::string> invalidValues = {"test\\U0001M600\\u00E9",
+                                                       "test\\U0001E600\\u00M9",
+                                                       "test\\U0001E600\\uD8E9",
+                                                       "test\\U0001E600\\uD8",
+                                                       "test\\U0001E60",
+                                                       "test\\qbad"};
+
+TEST_CASE_METHOD(TApp, "StringEscapeInvalid", "[transform]") {
+
+    auto test_data = GENERATE(from_range(invalidValues));
+
+    std::string value{};
+
+    app.add_option("-n", value)->transform(CLI::EscapedString);
+
+    args = {"-n", test_data};
+
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
+}
+
 TEST_CASE_METHOD(TApp, "NumberWithUnitIntOverflow", "[transform]") {
     std::map<std::string, int> mapping{{"a", 1000000}, {"b", 100}, {"c", 101}};
 
diff --git a/packages/CLI11/tests/TrueFalseTest.cpp b/packages/CLI11/tests/TrueFalseTest.cpp
index 93f2f3fb8109ccb40370bbf7ed330b3360c33e89..b14ef29835af4eb19923f3a5bd896a5da509de42 100644
--- a/packages/CLI11/tests/TrueFalseTest.cpp
+++ b/packages/CLI11/tests/TrueFalseTest.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/tests/WindowsTest.cpp b/packages/CLI11/tests/WindowsTest.cpp
index a17d58735a66a29fa5b2a02d614420396f6943eb..647a12a6ea8c5603c7b58edc2999b4a905a7de22 100644
--- a/packages/CLI11/tests/WindowsTest.cpp
+++ b/packages/CLI11/tests/WindowsTest.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/tests/app_helper.hpp b/packages/CLI11/tests/app_helper.hpp
index 5479e486392ba432f6f2d25f45d6bcf40812a315..fbe2555eb717c5c43e08b248224ae724372742f9 100644
--- a/packages/CLI11/tests/app_helper.hpp
+++ b/packages/CLI11/tests/app_helper.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2021, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -71,6 +71,15 @@ inline void unset_env(std::string name) {
 #endif
 }
 
+/// these are provided for compatibility with the char8_t for C++20 that breaks stuff
+CLI11_INLINE std::string from_u8string(const std::string &s) { return s; }
+CLI11_INLINE std::string from_u8string(std::string &&s) { return std::move(s); }
+#if defined(__cpp_lib_char8_t)
+CLI11_INLINE std::string from_u8string(const std::u8string &s) { return std::string(s.begin(), s.end()); }
+#elif defined(__cpp_char8_t)
+CLI11_INLINE std::string from_u8string(const char8_t *s) { return std::string(reinterpret_cast<const char *>(s)); }
+#endif
+
 CLI11_INLINE void check_identical_files(const char *path1, const char *path2) {
     std::string err1 = CLI::ExistingFile(path1);
     if(!err1.empty()) {
diff --git a/packages/CLI11/tests/applications/ensure_utf8.cpp b/packages/CLI11/tests/applications/ensure_utf8.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..84fd5f2013a32541bdcac674d81809947a72c374
--- /dev/null
+++ b/packages/CLI11/tests/applications/ensure_utf8.cpp
@@ -0,0 +1,35 @@
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
+// under NSF AWARD 1414736 and by the respective contributors.
+// All rights reserved.
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <CLI/CLI.hpp>
+#include <cstring>
+#include <iostream>
+
+int main(int argc, char **argv) {
+    CLI::App app{"App description"};
+    char **original_argv = argv;
+    argv = app.ensure_utf8(argv);
+
+#ifdef _WIN32
+    for(int i = 0; i < argc; i++) {
+        if(std::strcmp(argv[i], original_argv[i]) != 0) {
+            std::cerr << argv[i] << "\n";
+            std::cerr << original_argv[i] << "\n";
+            return i + 1;
+        }
+        argv[i][0] = 'x';  // access it to check that it is accessible
+    }
+
+#else
+    (void)argc;
+
+    if(original_argv != argv) {
+        return -1;
+    }
+#endif
+
+    return 0;
+}
diff --git a/packages/CLI11/tests/applications/ensure_utf8_twice.cpp b/packages/CLI11/tests/applications/ensure_utf8_twice.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a7d0e38029e68c53235de300a4c0a340480d9689
--- /dev/null
+++ b/packages/CLI11/tests/applications/ensure_utf8_twice.cpp
@@ -0,0 +1,36 @@
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
+// under NSF AWARD 1414736 and by the respective contributors.
+// All rights reserved.
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <CLI/CLI.hpp>
+#include <cstring>
+#include <iostream>
+
+int main(int argc, char **argv) {
+    CLI::App app{"App description"};
+    char **original_argv = argv;
+    argv = app.ensure_utf8(argv);
+    argv = app.ensure_utf8(argv);  // completely useless but works ok
+
+#ifdef _WIN32
+    for(int i = 0; i < argc; i++) {
+        if(std::strcmp(argv[i], original_argv[i]) != 0) {
+            std::cerr << argv[i] << "\n";
+            std::cerr << original_argv[i] << "\n";
+            return i + 1;
+        }
+        argv[i][0] = 'x';  // access it to check that it is accessible
+    }
+
+#else
+    (void)argc;
+
+    if(original_argv != argv) {
+        return -1;
+    }
+#endif
+
+    return 0;
+}
diff --git a/packages/CLI11/tests/applications/system_args.cpp b/packages/CLI11/tests/applications/system_args.cpp
index e1e77ba673ce56f69117569b731475372d618a02..2cad18b19e786861ef9ca8c2a9a640a436de143f 100644
--- a/packages/CLI11/tests/applications/system_args.cpp
+++ b/packages/CLI11/tests/applications/system_args.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/tests/catch.hpp b/packages/CLI11/tests/catch.hpp
index e6de667325f48d65ea5145207bb4a608ae22c209..4dd233603720755dd04a30c1894aa9c3ce1ded32 100644
--- a/packages/CLI11/tests/catch.hpp
+++ b/packages/CLI11/tests/catch.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -14,10 +14,14 @@
 #include <catch2/catch_template_test_macros.hpp>
 #include <catch2/catch_test_macros.hpp>
 #include <catch2/generators/catch_generators.hpp>
+#include <catch2/generators/catch_generators_range.hpp>
+#include <catch2/matchers/catch_matchers_floating_point.hpp>
 #include <catch2/matchers/catch_matchers_string.hpp>
 
-using Catch::Approx;            // NOLINT(google-global-names-in-headers)
-using Catch::Matchers::Equals;  // NOLINT(google-global-names-in-headers)
+using Catch::Approx;                  // NOLINT(google-global-names-in-headers)
+using Catch::Generators::from_range;  // NOLINT(google-global-names-in-headers)
+using Catch::Matchers::Equals;        // NOLINT(google-global-names-in-headers)
+using Catch::Matchers::WithinRel;     // NOLINT(google-global-names-in-headers)
 
 inline auto Contains(const std::string &x) { return Catch::Matchers::ContainsSubstring(x); }
 
@@ -26,6 +30,7 @@ inline auto Contains(const std::string &x) { return Catch::Matchers::ContainsSub
 #include <catch2/catch.hpp>
 
 using Catch::Equals;              // NOLINT(google-global-names-in-headers)
+using Catch::WithinRel;           // NOLINT(google-global-names-in-headers)
 using Catch::Matchers::Contains;  // NOLINT(google-global-names-in-headers)
 
 #endif
diff --git a/packages/CLI11/tests/find_package_tests/CMakeLists.txt b/packages/CLI11/tests/find_package_tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6d5aa2775be761bc67d1b9fb4f0d8f0455817d58
--- /dev/null
+++ b/packages/CLI11/tests/find_package_tests/CMakeLists.txt
@@ -0,0 +1,19 @@
+cmake_minimum_required(VERSION 3.10...3.26)
+
+project(CLI11-find-package-test)
+
+include(CTest)
+
+if(CLI11_DIR)
+  set(CMAKE_PREFIX_PATH ${CLI11_DIR})
+endif()
+
+# Test the CLI11 CMake package config
+find_package(CLI11 2.0 REQUIRED)
+
+# Test the target
+add_executable(package-test ../../examples/positional_validation.cpp)
+target_link_libraries(package-test CLI11::CLI11)
+
+add_test(NAME package-test1 COMMAND package-test one)
+set_property(TEST package-test1 PROPERTY PASS_REGULAR_EXPRESSION "File 1 = one")
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_fail3 b/packages/CLI11/tests/fuzzFail/fuzz_app_fail3
new file mode 100644
index 0000000000000000000000000000000000000000..0c62a1dc8cbfcfc8bfaaa5a39523a56ad0b17253
Binary files /dev/null and b/packages/CLI11/tests/fuzzFail/fuzz_app_fail3 differ
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail1 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail1
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail10 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail10
new file mode 100644
index 0000000000000000000000000000000000000000..dbe19a59cd83ff36e26117f85e283f3d23a87625
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail10
@@ -0,0 +1,3 @@
+-e-vC
+,cοΏ½C
+,cοΏ½οΏ½οΏ½οΏ½
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail11 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail11
new file mode 100644
index 0000000000000000000000000000000000000000..117520b3524704157d1f68593b4e399c6c756c27
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail11
@@ -0,0 +1 @@
+=666666666~5οΏ½5οΏ½--oo?ptvtup@
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail12 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail12
new file mode 100644
index 0000000000000000000000000000000000000000..6b3db8e29da4c4f9bca8a7efb363c44643331ff9
Binary files /dev/null and b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail12 differ
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail13 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail13
new file mode 100644
index 0000000000000000000000000000000000000000..00e1aac7f640d8e71f177f6872a33807d9c7eed4
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail13
@@ -0,0 +1 @@
+``'``'######################
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail14 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail14
new file mode 100644
index 0000000000000000000000000000000000000000..431f0b47edfab2b377d8758ed1a08326ae27cc55
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail14
@@ -0,0 +1,4 @@
+--vB
+s
+ '
+sub
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail15 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail15
new file mode 100644
index 0000000000000000000000000000000000000000..51a5dcf3ab158459da276f87e511bf94d39b28ca
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail15
@@ -0,0 +1 @@
+												""KοΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½"οΏ½οΏ½
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail16 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail16
new file mode 100644
index 0000000000000000000000000000000000000000..97b1a80166264019db2d66d51416c88326f7276d
Binary files /dev/null and b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail16 differ
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail17 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail17
new file mode 100644
index 0000000000000000000000000000000000000000..0edbeabdbf4bbe6607b519e37ec001e57114bc27
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail17
@@ -0,0 +1,9 @@
+
+--vE
+
+οΏ½οΏ½οΏ½οΏ½οΏ½
+
+
+
+
+#
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail18 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail18
new file mode 100644
index 0000000000000000000000000000000000000000..812e10c64ee11b87471e7b23b9089ee2cf5ee677
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail18
@@ -0,0 +1 @@
+--vD	\οΏ½οΏ½οΏ½οΏ½	\
\ No newline at end of file
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail19 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail19
new file mode 100644
index 0000000000000000000000000000000000000000..b4a8701d61c8f2e363b826e3749b0e469193147b
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail19
@@ -0,0 +1 @@
+1-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!ceeecae
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail2 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail2
new file mode 100644
index 0000000000000000000000000000000000000000..b71bde1cb6f6b46ba6ed947d176c2167eafa5760
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail2
@@ -0,0 +1 @@
+-c
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail20 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail20
new file mode 100644
index 0000000000000000000000000000000000000000..b2c3fb7b019f6933e1da5dff0f340fb33f378afb
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail20
@@ -0,0 +1 @@
+οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½-οΏ½#e,cecb
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail21 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail21
new file mode 100644
index 0000000000000000000000000000000000000000..df52d7d946872509d899fced875c8d0dcfe608dc
Binary files /dev/null and b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail21 differ
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail22 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail22
new file mode 100644
index 0000000000000000000000000000000000000000..555d0df16a5de2b8f7d2b8957fea8984cd7032e4
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail22
@@ -0,0 +1 @@
+dwrap'a
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail23 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail23
new file mode 100644
index 0000000000000000000000000000000000000000..d0f7d9e6986d17e21bec05d3f8c3d76599f7f095
Binary files /dev/null and b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail23 differ
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail24 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail24
new file mode 100644
index 0000000000000000000000000000000000000000..347f5a3987b58dc8bfe733cd0e3a06f959e01ce0
Binary files /dev/null and b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail24 differ
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail25 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail25
new file mode 100644
index 0000000000000000000000000000000000000000..2964f0403ec85d5a5f377d9954a8eebd6f26d27a
Binary files /dev/null and b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail25 differ
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail26 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail26
new file mode 100644
index 0000000000000000000000000000000000000000..0e159868fd9a6e40f4c43350fe78256f99299a3c
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail26
@@ -0,0 +1 @@
+--vC	opCB	(3tp"o3#
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail27 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail27
new file mode 100644
index 0000000000000000000000000000000000000000..dec270759ccd49e12011c2ed56105a52451838ec
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail27
@@ -0,0 +1,2 @@
+--vDοΏ½  `
+-5
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail28 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail28
new file mode 100644
index 0000000000000000000000000000000000000000..02c19c7ab65ab10246e8284e9b278bc5d7f1296b
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail28
@@ -0,0 +1,5 @@
+
+
+--vB
+--vB
+οΏ½οΏ½οΏ½οΏ½,οΏ½οΏ½οΏ½οΏ½-vC
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail29 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail29
new file mode 100644
index 0000000000000000000000000000000000000000..67208475433e6d1018700e3dd73e9bb9de623ea4
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail29
@@ -0,0 +1,4 @@
+
+--vE
+-3vE
+0)-bb=`',,l
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail3 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail3
new file mode 100644
index 0000000000000000000000000000000000000000..466a70736348a836c4aef0138440b7a7e4605801
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail3
@@ -0,0 +1 @@
+`--vM```-````-c`
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail30 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail30
new file mode 100644
index 0000000000000000000000000000000000000000..254c424e65c74cd53f010737cf098d1569c35838
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail30
@@ -0,0 +1 @@
+[οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½appt1"wrappt1""\","""\""\","
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail31 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail31
new file mode 100644
index 0000000000000000000000000000000000000000..7e22d167510f5d691aad3a8c25c7c5f06265f30f
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail31
@@ -0,0 +1 @@
+--$,,,,,,,,,,,,,,,,,,,,A,,,,,,,,,,,-$,,,,,,,,,,,,,,,,,,,,A,,,,,,,,,,,,,,,,,,,,,;--svopt2#,,,,-sC
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail32 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail32
new file mode 100644
index 0000000000000000000000000000000000000000..d898def9b942ad2a5edd93a156e5d6ecdc0ae53a
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail32
@@ -0,0 +1 @@
+-οΏ½,,,,,,,,,,,,,,,,,,,opt1οΏ½a
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail33 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail33
new file mode 100644
index 0000000000000000000000000000000000000000..18e61dff62168999c0df845c816dc5ff9b0602a9
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail33
@@ -0,0 +1,2 @@
+'''-$οΏ½
+$
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail34 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail34
new file mode 100644
index 0000000000000000000000000000000000000000..297cbdccd2d634339dfa165834489853ffcf5c07
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail34
@@ -0,0 +1 @@
+" (\\\,"οΏ½οΏ½οΏ½
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail35 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail35
new file mode 100644
index 0000000000000000000000000000000000000000..d9b5aa7c4b02cedab5a529128adc3e150b98d201
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail35
@@ -0,0 +1 @@
+'^^^^^^^\^^^^^^''''''@''iοΏ½
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail36 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail36
new file mode 100644
index 0000000000000000000000000000000000000000..ddd11facc3062bf1987bd905f21e099bff32b293
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail36
@@ -0,0 +1 @@
+"\"
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail37 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail37
new file mode 100644
index 0000000000000000000000000000000000000000..25d8567d672327f411369b18ec2ff3e5d03bd787
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail37
@@ -0,0 +1 @@
+"οΏ½-t2οΏ½οΏ½οΏ½οΏ½p'--vopt1'οΏ½''e#οΏ½οΏ½'οΏ½''e
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail38 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail38
new file mode 100644
index 0000000000000000000000000000000000000000..9812202972ccd4445e9879ed59dca5b7811adc08
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail38
@@ -0,0 +1 @@
+ParseErrorEF''					--vo-d{}
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail39 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail39
new file mode 100644
index 0000000000000000000000000000000000000000..991c5c3bdba4bf6d768d6163b650df10f603736c
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail39
@@ -0,0 +1 @@
+[--'
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail4 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail4
new file mode 100644
index 0000000000000000000000000000000000000000..7f6475c6c00849b2124fa84caee54476bcd14af8
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail4
@@ -0,0 +1 @@
+-ccaaaa
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail5 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail5
new file mode 100644
index 0000000000000000000000000000000000000000..280646d579c46db99397e54d8d85ea154ac22204
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail5
@@ -0,0 +1,6 @@
+οΏ½
+atd
+οΏ½VVV-baοΏ½οΏ½=
+οΏ½
+οΏ½
+.-' -
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail6 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail6
new file mode 100644
index 0000000000000000000000000000000000000000..5b8406c6748b9eaaecda2cf3170862ceaa67b9a0
Binary files /dev/null and b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail6 differ
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail7 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail7
new file mode 100644
index 0000000000000000000000000000000000000000..2362ab3750f721d274a52a35fe5ab5ddacba960f
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail7
@@ -0,0 +1,3 @@
+
+
+.br-bN3CLI10ParseErrorEa5
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail8 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail8
new file mode 100644
index 0000000000000000000000000000000000000000..991c5011807fbb2fc5a5644b7c5f6fcd480342cb
Binary files /dev/null and b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail8 differ
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail9 b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail9
new file mode 100644
index 0000000000000000000000000000000000000000..dc0d66e209d14137e491efb8296b875242f666c7
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_app_file_fail9
@@ -0,0 +1 @@
+=oοΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½p--2vοΏ½οΏ½t'οΏ½-οΏ½-
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_file_fail1 b/packages/CLI11/tests/fuzzFail/fuzz_file_fail1
new file mode 100644
index 0000000000000000000000000000000000000000..06b1c382f04fb5c816c1c68c371dc5eabf68ccf9
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_file_fail1
@@ -0,0 +1 @@
+nflag2=555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555"="
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_file_fail2 b/packages/CLI11/tests/fuzzFail/fuzz_file_fail2
new file mode 100644
index 0000000000000000000000000000000000000000..db657443de14bf98b9fa034becc9821b4b237679
Binary files /dev/null and b/packages/CLI11/tests/fuzzFail/fuzz_file_fail2 differ
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_file_fail3 b/packages/CLI11/tests/fuzzFail/fuzz_file_fail3
new file mode 100644
index 0000000000000000000000000000000000000000..607bce903b12ed8cff59f6d01d7eebe713fe9216
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_file_fail3
@@ -0,0 +1 @@
+"\οΏ½"
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_file_fail4 b/packages/CLI11/tests/fuzzFail/fuzz_file_fail4
new file mode 100644
index 0000000000000000000000000000000000000000..e7aac1a293fdc63c4795d076711f6f495f3e311d
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_file_fail4
@@ -0,0 +1 @@
+""\"
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_file_fail5 b/packages/CLI11/tests/fuzzFail/fuzz_file_fail5
new file mode 100644
index 0000000000000000000000000000000000000000..2acfd3cbac138b8d364aa127336f613abd63dabd
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_file_fail5
@@ -0,0 +1 @@
+"\uasdwrapοΏ½οΏ½οΏ½-"οΏ½οΏ½-"--confiοΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½.οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½οΏ½g
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_file_fail6 b/packages/CLI11/tests/fuzzFail/fuzz_file_fail6
new file mode 100644
index 0000000000000000000000000000000000000000..450895cff5fd576c18ca9ed2aae08f742747549d
Binary files /dev/null and b/packages/CLI11/tests/fuzzFail/fuzz_file_fail6 differ
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_file_fail7 b/packages/CLI11/tests/fuzzFail/fuzz_file_fail7
new file mode 100644
index 0000000000000000000000000000000000000000..1714e4cbda7bebab78fa0cfe55358fd41620d4c9
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_file_fail7
@@ -0,0 +1 @@
+--vdtr5=[|
diff --git a/packages/CLI11/tests/fuzzFail/fuzz_file_fail8 b/packages/CLI11/tests/fuzzFail/fuzz_file_fail8
new file mode 100644
index 0000000000000000000000000000000000000000..f060d946f2bacef4ad2bd4ada46ebbf4a80607bb
--- /dev/null
+++ b/packages/CLI11/tests/fuzzFail/fuzz_file_fail8
@@ -0,0 +1 @@
+[οΏ½qοΏ½q[]1."οΏ½"\".saopt1[[]1."οΏ½"\".saopt1[]
diff --git a/packages/CLI11/tests/informational.cpp b/packages/CLI11/tests/informational.cpp
index 4f7f27b52b89a1742df5fd1d2fd7fa01f698b8c0..ae221ea7ac0d360d93ffd256bc60fa4f010cfefb 100644
--- a/packages/CLI11/tests/informational.cpp
+++ b/packages/CLI11/tests/informational.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
@@ -52,5 +52,5 @@ int main() {
     std::cout << "  boost::optional support active\n";
 #endif
 
-    std::cout << std::endl;
+    std::cout << '\n';
 }
diff --git a/packages/CLI11/tests/link_test_1.cpp b/packages/CLI11/tests/link_test_1.cpp
index ba1b2d83767f4e66825529312512aaeb59fb09cc..677261fb979b85585c6941d3d80f5362eb2e00a5 100644
--- a/packages/CLI11/tests/link_test_1.cpp
+++ b/packages/CLI11/tests/link_test_1.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/tests/link_test_2.cpp b/packages/CLI11/tests/link_test_2.cpp
index 46d77be26308976e07dd8025451f3d24bc620b05..ae3fa0af59d7afedef581ef776a6cc0569c5b96f 100644
--- a/packages/CLI11/tests/link_test_2.cpp
+++ b/packages/CLI11/tests/link_test_2.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/tests/main.cpp b/packages/CLI11/tests/main.cpp
index 451f65038588121bdb253ac2071bc3f09e0fe93a..f8d148c35a92607e0262d67963739f505d5ece37 100644
--- a/packages/CLI11/tests/main.cpp
+++ b/packages/CLI11/tests/main.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/tests/meson.build b/packages/CLI11/tests/meson.build
index 4847985550397356655366953240b526a5c3d768..2f1c0ad2789787c468dc84b7c475df632cb0471a 100644
--- a/packages/CLI11/tests/meson.build
+++ b/packages/CLI11/tests/meson.build
@@ -65,7 +65,9 @@ testnames = [
 ]
 
 dependent_applications = [
-    'system_args'
+    'system_args',
+    'ensure_utf8',
+    'ensure_utf8_twice',
 ]
 dependent_applications_definitions = []
 #dependent_applications_targets = []
diff --git a/packages/CLI11/tests/mesonTest/main.cpp b/packages/CLI11/tests/mesonTest/main.cpp
index 39bb7845cc3fcfab96731000dcda7611402e5441..94fb63811fac301044a59cc78f704bd50f6f9405 100644
--- a/packages/CLI11/tests/mesonTest/main.cpp
+++ b/packages/CLI11/tests/mesonTest/main.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
+// Copyright (c) 2017-2024, University of Cincinnati, developed by Henry Schreiner
 // under NSF AWARD 1414736 and by the respective contributors.
 // All rights reserved.
 //
diff --git a/packages/CLI11/tests/package_config_tests/CMakeLists.txt b/packages/CLI11/tests/package_config_tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a775e8cfeb36902ab99098cb9d4bd11c2e98268a
--- /dev/null
+++ b/packages/CLI11/tests/package_config_tests/CMakeLists.txt
@@ -0,0 +1,21 @@
+cmake_minimum_required(VERSION 3.10...3.26)
+
+project(CLI11-package-config-test)
+
+include(CTest)
+
+find_package(PkgConfig)
+
+if(CLI11_DIR)
+  set(CMAKE_PREFIX_PATH ${CLI11_DIR} ${CLI11_DIR}/lib)
+endif()
+
+message(STATUS "${CLI11_DIR}-- ${CMAKE_PREFIX_PATH}")
+pkg_check_modules(CLI11 REQUIRED IMPORTED_TARGET CLI11)
+
+# Test the target
+add_executable(package-config-test ../../examples/positional_validation.cpp)
+target_link_libraries(package-config-test PkgConfig::CLI11)
+
+add_test(NAME package-config-test1 COMMAND package-config-test one)
+set_property(TEST package-config-test1 PROPERTY PASS_REGULAR_EXPRESSION "File 1 = one")
diff --git a/packages/Catch2/.bazelrc b/packages/Catch2/.bazelrc
index c01cb39f1d38a40ecaa7351113286de1da321aa7..9cb0aa1b8dfef2d1cd72510a4133c42ae285c47b 100644
--- a/packages/Catch2/.bazelrc
+++ b/packages/Catch2/.bazelrc
@@ -8,3 +8,4 @@ build:vs2022 --cxxopt=/std:c++17
 
 build:windows --config=vs2022
 build:linux --config=gcc11
+build:macos --cxxopt=-std=c++2b
diff --git a/packages/Catch2/.github/workflows/linux-bazel-builds.yml b/packages/Catch2/.github/workflows/linux-bazel-builds.yml
index 9006652e2c2874cd026725710eb061d77dd81ea2..dc826ac0d9043f72362096e84a858c6bc1dac31c 100644
--- a/packages/Catch2/.github/workflows/linux-bazel-builds.yml
+++ b/packages/Catch2/.github/workflows/linux-bazel-builds.yml
@@ -11,7 +11,7 @@ jobs:
         compilation_mode: [fastbuild, dbg, opt]
 
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
 
     - name: Mount bazel cache
       uses: actions/cache@v3
diff --git a/packages/Catch2/.github/workflows/linux-meson-builds.yml b/packages/Catch2/.github/workflows/linux-meson-builds.yml
index dec701b61b22217856030ebb1eb8773db0b1ccfc..4a6cfd5bbb628e7ea9167bbb7b0a548eab62df5d 100644
--- a/packages/Catch2/.github/workflows/linux-meson-builds.yml
+++ b/packages/Catch2/.github/workflows/linux-meson-builds.yml
@@ -18,10 +18,12 @@ jobs:
             other_pkgs: clang-11
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
 
     - name: Prepare environment
-      run: sudo apt-get install -y meson ninja-build ${{matrix.other_pkgs}}
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y meson ninja-build ${{matrix.other_pkgs}}
 
     - name: Configure build
       env:
@@ -38,6 +40,5 @@ jobs:
 
     - name: Run tests
       working-directory: ${{runner.workspace}}/meson-build
-      # Hardcode 2 cores we know are there
       run: |
         meson test --verbose
diff --git a/packages/Catch2/.github/workflows/linux-other-builds.yml b/packages/Catch2/.github/workflows/linux-other-builds.yml
index cf4e2c06b755e6bbc0fdc7443f70d98e4420f29f..9afd231af521dae022e6e2cee5a6e2b7e8893a20 100644
--- a/packages/Catch2/.github/workflows/linux-other-builds.yml
+++ b/packages/Catch2/.github/workflows/linux-other-builds.yml
@@ -29,13 +29,13 @@ jobs:
             build_type: Debug
             std: 14
             other_pkgs: g++-7
-            cmake_configurations: -DCATCH_BUILD_EXTRA_TESTS=ON -DCATCH_BUILD_EXAMPLES=ON
+            cmake_configurations: -DCATCH_BUILD_EXTRA_TESTS=ON -DCATCH_BUILD_EXAMPLES=ON -DCATCH_ENABLE_CMAKE_HELPER_TESTS=ON
           - cxx: g++-7
             build_description: Extras + Examples
             build_type: Release
             std: 14
             other_pkgs: g++-7
-            cmake_configurations: -DCATCH_BUILD_EXTRA_TESTS=ON -DCATCH_BUILD_EXAMPLES=ON
+            cmake_configurations: -DCATCH_BUILD_EXTRA_TESTS=ON -DCATCH_BUILD_EXAMPLES=ON -DCATCH_ENABLE_CMAKE_HELPER_TESTS=ON
 
           # Extras and examples with Clang-10
           - cxx: clang++-10
@@ -43,13 +43,13 @@ jobs:
             build_type: Debug
             std: 17
             other_pkgs: clang-10
-            cmake_configurations: -DCATCH_BUILD_EXTRA_TESTS=ON -DCATCH_BUILD_EXAMPLES=ON
+            cmake_configurations: -DCATCH_BUILD_EXTRA_TESTS=ON -DCATCH_BUILD_EXAMPLES=ON -DCATCH_ENABLE_CMAKE_HELPER_TESTS=ON
           - cxx: clang++-10
             build_description: Extras + Examples
             build_type: Release
             std: 17
             other_pkgs: clang-10
-            cmake_configurations: -DCATCH_BUILD_EXTRA_TESTS=ON -DCATCH_BUILD_EXAMPLES=ON
+            cmake_configurations: -DCATCH_BUILD_EXTRA_TESTS=ON -DCATCH_BUILD_EXAMPLES=ON -DCATCH_ENABLE_CMAKE_HELPER_TESTS=ON
 
           # Configure tests with Clang-10
           - cxx: clang++-10
@@ -70,10 +70,12 @@ jobs:
 
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
 
     - name: Prepare environment
-      run: sudo apt-get install -y ninja-build ${{matrix.other_pkgs}}
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y ninja-build ${{matrix.other_pkgs}}
 
     - name: Configure build
       working-directory: ${{runner.workspace}}
@@ -100,5 +102,4 @@ jobs:
       env:
           CTEST_OUTPUT_ON_FAILURE: 1
       working-directory: ${{runner.workspace}}/build
-      # Hardcode 2 cores we know are there
-      run: ctest -C ${{matrix.build_type}} -j 2 ${{matrix.other_ctest_args}}
+      run: ctest -C ${{matrix.build_type}} -j `nproc` ${{matrix.other_ctest_args}}
diff --git a/packages/Catch2/.github/workflows/linux-simple-builds.yml b/packages/Catch2/.github/workflows/linux-simple-builds.yml
index 989c4942e221c4cdf8241f573f2f02a8d1f32546..4cca31619e9a8e27d1baf487e3a18aa6e4542706 100644
--- a/packages/Catch2/.github/workflows/linux-simple-builds.yml
+++ b/packages/Catch2/.github/workflows/linux-simple-builds.yml
@@ -83,7 +83,7 @@ jobs:
             other_pkgs: g++-10
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
 
     - name: Add repositories for older GCC
       run: |
@@ -92,7 +92,9 @@ jobs:
       if: ${{ matrix.cxx == 'g++-5' || matrix.cxx == 'g++-6' }}
 
     - name: Prepare environment
-      run: sudo apt-get install -y ninja-build ${{matrix.other_pkgs}}
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y ninja-build ${{matrix.other_pkgs}}
 
     - name: Configure build
       working-directory: ${{runner.workspace}}
@@ -118,5 +120,4 @@ jobs:
       env:
           CTEST_OUTPUT_ON_FAILURE: 1
       working-directory: ${{runner.workspace}}/build
-      # Hardcode 2 cores we know are there
-      run: ctest -C ${{matrix.build_type}} -j 2
+      run: ctest -C ${{matrix.build_type}} -j `nproc`
diff --git a/packages/Catch2/.github/workflows/mac-builds.yml b/packages/Catch2/.github/workflows/mac-builds.yml
index 955b81fcc6ea8514d84498ddd8123f2202561608..259d8b367b9e29babf449cdf4d5a4b7c96dd7fff 100644
--- a/packages/Catch2/.github/workflows/mac-builds.yml
+++ b/packages/Catch2/.github/workflows/mac-builds.yml
@@ -22,7 +22,7 @@ jobs:
             extra_tests: ON
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
 
     - name: Configure build
       working-directory: ${{runner.workspace}}
@@ -42,11 +42,10 @@ jobs:
 
     - name: Build tests + lib
       working-directory: ${{runner.workspace}}/build
-      run: make -j 2
+      run: make -j `sysctl -n hw.ncpu`
 
     - name: Run tests
       env:
           CTEST_OUTPUT_ON_FAILURE: 1
       working-directory: ${{runner.workspace}}/build
-      # Hardcode 2 cores we know are there
-      run: ctest -C ${{matrix.build_type}} -j 2
+      run: ctest -C ${{matrix.build_type}} -j `sysctl -n hw.ncpu`
diff --git a/packages/Catch2/.github/workflows/validate-header-guards.yml b/packages/Catch2/.github/workflows/validate-header-guards.yml
index c02b5d49e2dcf57ebcb589085c4b0b1d3a513f35..fa9d1574ba2ed9d7bd949ed77bfde2c4c8e0638a 100644
--- a/packages/Catch2/.github/workflows/validate-header-guards.yml
+++ b/packages/Catch2/.github/workflows/validate-header-guards.yml
@@ -9,7 +9,7 @@ jobs:
     steps:
 
       - name: Checkout source code
-        uses: actions/checkout@v2
+        uses: actions/checkout@v4
 
       - name: Setup Dependencies
         uses: actions/setup-python@v2
diff --git a/packages/Catch2/.github/workflows/windows-simple-builds.yml b/packages/Catch2/.github/workflows/windows-simple-builds.yml
index 197fa219e3b2ac81d6734b2097b3eba8f69c5e87..5fb7b8fe799053e4146590219a3b9458ee0f1a16 100644
--- a/packages/Catch2/.github/workflows/windows-simple-builds.yml
+++ b/packages/Catch2/.github/workflows/windows-simple-builds.yml
@@ -13,7 +13,7 @@ jobs:
         build_type: [Debug, Release]
         std: [14, 17]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
 
       - name: Configure build
         working-directory: ${{runner.workspace}}
diff --git a/packages/Catch2/.gitrepo b/packages/Catch2/.gitrepo
index 0b35a79b5e86abab214ceed2e6f119ed2eab73fc..c3d079d0b3c2899c19424e064e159f84dbbc3f8e 100644
--- a/packages/Catch2/.gitrepo
+++ b/packages/Catch2/.gitrepo
@@ -6,7 +6,7 @@
 [subrepo]
 	remote = git@github.com:catchorg/Catch2.git
 	branch = devel
-	commit = 0631b607ee2bbc07c7c238f0b15b23ef21926960
-	parent = 446a6502fc29873ba84cd5964ea2e0787dc48e49
+	commit = 1078e7e95b3a06d4dadc75188de48bc4afffb955
+	parent = a3b573d7e88e70c258be089529fd58ad41a10a64
 	method = merge
 	cmdver = 0.4.6
diff --git a/packages/Catch2/BUILD.bazel b/packages/Catch2/BUILD.bazel
index 02ec92265648334d09c039877202b2686609397e..c51bf57e70dd840a51478c4667ae02a9240a3a1a 100644
--- a/packages/Catch2/BUILD.bazel
+++ b/packages/Catch2/BUILD.bazel
@@ -49,6 +49,7 @@ expand_template(
         "#cmakedefine CATCH_CONFIG_NOSTDOUT": "",
         "#cmakedefine CATCH_CONFIG_POSIX_SIGNALS": "",
         "#cmakedefine CATCH_CONFIG_PREFIX_ALL": "",
+        "#cmakedefine CATCH_CONFIG_PREFIX_MESSAGES": "",
         "#cmakedefine CATCH_CONFIG_SHARED_LIBRARY": "",
         "#cmakedefine CATCH_CONFIG_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT": "",
         "#cmakedefine CATCH_CONFIG_USE_ASYNC": "",
diff --git a/packages/Catch2/CMake/CatchConfigOptions.cmake b/packages/Catch2/CMake/CatchConfigOptions.cmake
index 067739dc9962a9349211dd735e7c93ea42964591..6eae220dfcb112bf26d9e8456a52518013dcf4dd 100644
--- a/packages/Catch2/CMake/CatchConfigOptions.cmake
+++ b/packages/Catch2/CMake/CatchConfigOptions.cmake
@@ -18,10 +18,12 @@
 macro(AddOverridableConfigOption OptionBaseName)
   option(CATCH_CONFIG_${OptionBaseName} "Read docs/configuration.md for details" OFF)
   option(CATCH_CONFIG_NO_${OptionBaseName} "Read docs/configuration.md for details" OFF)
+  mark_as_advanced(CATCH_CONFIG_${OptionBaseName} CATCH_CONFIG_NO_${OptionBaseName})
 endmacro()
 
 macro(AddConfigOption OptionBaseName)
   option(CATCH_CONFIG_${OptionBaseName} "Read docs/configuration.md for details" OFF)
+  mark_as_advanced(CATCH_CONFIG_${OptionBaseName})
 endmacro()
 
 set(_OverridableOptions
@@ -62,6 +64,7 @@ set(_OtherConfigOptions
   "FAST_COMPILE"
   "NOSTDOUT"
   "PREFIX_ALL"
+  "PREFIX_MESSAGES"
   "WINDOWS_CRTDBG"
 )
 
@@ -78,6 +81,8 @@ endif()
 set(CATCH_CONFIG_DEFAULT_REPORTER "console" CACHE STRING "Read docs/configuration.md for details. The name of the reporter should be without quotes.")
 set(CATCH_CONFIG_CONSOLE_WIDTH "80" CACHE STRING "Read docs/configuration.md for details. Must form a valid integer literal.")
 
+mark_as_advanced(CATCH_CONFIG_SHARED_LIBRARY CATCH_CONFIG_DEFAULT_REPORTER CATCH_CONFIG_CONSOLE_WIDTH)
+
 # There is no good way to both turn this into a CMake cache variable,
 # and keep reasonable default semantics inside the project. Thus we do
 # not define it and users have to provide it as an outside variable.
diff --git a/packages/Catch2/CMake/CatchMiscFunctions.cmake b/packages/Catch2/CMake/CatchMiscFunctions.cmake
index 44c875007ff0717a1fea908d41c67dc8b22082f8..84bd7cc79f692e96faa36803ca12faf22ea558ee 100644
--- a/packages/Catch2/CMake/CatchMiscFunctions.cmake
+++ b/packages/Catch2/CMake/CatchMiscFunctions.cmake
@@ -46,7 +46,6 @@ function(add_warnings_to_targets targets)
         set(CHECKED_WARNING_FLAGS
           "-Wabsolute-value"
           "-Wall"
-          "-Wc++20-compat"
           "-Wcall-to-pure-virtual-from-ctor-dtor"
           "-Wcast-align"
           "-Wcatch-value"
@@ -79,6 +78,7 @@ function(add_warnings_to_targets targets)
           "-Wreturn-std-move"
           "-Wshadow"
           "-Wstrict-aliasing"
+          "-Wsubobject-linkage"
           "-Wsuggest-destructor-override"
           "-Wsuggest-override"
           "-Wundef"
diff --git a/packages/Catch2/CMakeLists.txt b/packages/Catch2/CMakeLists.txt
index b3e811538b11edc5f639f2f26abea21bc5e3a1bf..78ac4c8ad8e4be5c6fc716f6f00624990031d9b1 100644
--- a/packages/Catch2/CMakeLists.txt
+++ b/packages/Catch2/CMakeLists.txt
@@ -11,6 +11,7 @@ endif()
 option(CATCH_INSTALL_DOCS "Install documentation alongside library" ON)
 option(CATCH_INSTALL_EXTRAS "Install extras (CMake scripts, debugger helpers) alongside library" ON)
 option(CATCH_DEVELOPMENT_BUILD "Build tests, enable warnings, enable Werror, etc" OFF)
+option(CATCH_ENABLE_REPRODUCIBLE_BUILD "Add compiler flags for improving build reproducibility" ON)
 
 include(CMakeDependentOption)
 cmake_dependent_option(CATCH_BUILD_TESTING "Build the SelfTest project" ON "CATCH_DEVELOPMENT_BUILD" OFF)
@@ -21,6 +22,7 @@ cmake_dependent_option(CATCH_ENABLE_COVERAGE "Generate coverage for codecov.io"
 cmake_dependent_option(CATCH_ENABLE_WERROR "Enables Werror during build" ON "CATCH_DEVELOPMENT_BUILD" OFF)
 cmake_dependent_option(CATCH_BUILD_SURROGATES "Enable generating and building surrogate TUs for the main headers" OFF "CATCH_DEVELOPMENT_BUILD" OFF)
 cmake_dependent_option(CATCH_ENABLE_CONFIGURE_TESTS "Enable CMake configuration tests. WARNING: VERY EXPENSIVE" OFF "CATCH_DEVELOPMENT_BUILD" OFF)
+cmake_dependent_option(CATCH_ENABLE_CMAKE_HELPER_TESTS "Enable CMake helper tests. WARNING: VERY EXPENSIVE" OFF "CATCH_DEVELOPMENT_BUILD" OFF)
 
 
 # Catch2's build breaks if done in-tree. You probably should not build
@@ -31,7 +33,7 @@ if (CMAKE_BINARY_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
 endif()
 
 project(Catch2
-  VERSION 3.3.2 # CML version placeholder, don't delete
+  VERSION 3.5.2 # CML version placeholder, don't delete
   LANGUAGES CXX
   # HOMEPAGE_URL is not supported until CMake version 3.12, which
   # we do not target yet.
diff --git a/packages/Catch2/CMakePresets.json b/packages/Catch2/CMakePresets.json
index 00f3a6d3afb1a29761a4f1034af68c5ad60be316..885412850d633135ec9f91a9f5fdcefcd610c1d5 100644
--- a/packages/Catch2/CMakePresets.json
+++ b/packages/Catch2/CMakePresets.json
@@ -18,7 +18,8 @@
                 "CATCH_BUILD_EXAMPLES": "ON",
                 "CATCH_BUILD_EXTRA_TESTS": "ON",
                 "CATCH_BUILD_SURROGATES": "ON",
-                "CATCH_ENABLE_CONFIGURE_TESTS": "ON"
+                "CATCH_ENABLE_CONFIGURE_TESTS": "ON",
+                "CATCH_ENABLE_CMAKE_HELPER_TESTS": "ON"
             }
         }
     ]   
diff --git a/packages/Catch2/Doxyfile b/packages/Catch2/Doxyfile
index 07b385ec10afb1a58a9647d1f4fb7882a3d0bf36..914e598481782c52fe82a0a93573bd04afe81146 100644
--- a/packages/Catch2/Doxyfile
+++ b/packages/Catch2/Doxyfile
@@ -1,4 +1,4 @@
-# Doxyfile 1.8.16
+# Doxyfile 1.9.1
 
 # This file describes the settings to be used by the documentation system
 # doxygen (www.doxygen.org) for a project.
@@ -32,7 +32,7 @@ DOXYFILE_ENCODING      = UTF-8
 # title of most generated pages and in a few other places.
 # The default value is: My Project.
 
-PROJECT_NAME           = "Catch2"
+PROJECT_NAME           = Catch2
 
 # The PROJECT_NUMBER tag can be used to enter a project or revision number. This
 # could be handy for archiving the generated documentation or if some version
@@ -51,6 +51,7 @@ PROJECT_BRIEF          = "Popular C++ unit testing framework"
 # pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
 # the logo to the output directory.
 
+PROJECT_LOGO           =
 
 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
 # into which the generated documentation will be written. If a relative path is
@@ -216,6 +217,14 @@ QT_AUTOBRIEF           = YES
 
 MULTILINE_CPP_IS_BRIEF = NO
 
+# By default Python docstrings are displayed as preformatted text and doxygen's
+# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the
+# doxygen's special commands can be used and the contents of the docstring
+# documentation blocks is shown as doxygen documentation.
+# The default value is: YES.
+
+PYTHON_DOCSTRING       = YES
+
 # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
 # documentation from any documented member that it re-implements.
 # The default value is: YES.
@@ -251,13 +260,7 @@ TAB_SIZE               = 4
 # a double escape (\\{ and \\})
 
 ALIASES                = "complexity=@par Complexity:" \
-                         "noexcept=**Noexcept**"
-
-# This tag can be used to specify a number of word-keyword mappings (TCL only).
-# A mapping has the form "name=value". For example adding "class=itcl::class"
-# will allow you to use the command class in the itcl::class meaning.
-
-TCL_SUBST              =
+                         noexcept=**Noexcept**
 
 # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
 # only. Doxygen will then generate output that is more tailored for C. For
@@ -299,19 +302,22 @@ OPTIMIZE_OUTPUT_SLICE  = NO
 # parses. With this tag you can assign which parser to use for a given
 # extension. Doxygen has a built-in mapping, but you can override or extend it
 # using this tag. The format is ext=language, where ext is a file extension, and
-# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
-# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice,
+# language is one of the parsers supported by doxygen: IDL, Java, JavaScript,
+# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, VHDL,
 # Fortran (fixed format Fortran: FortranFixed, free formatted Fortran:
 # FortranFree, unknown formatted Fortran: Fortran. In the later case the parser
 # tries to guess whether the code is fixed or free formatted code, this is the
-# default for Fortran type files), VHDL, tcl. For instance to make doxygen treat
-# .inc files as Fortran files (default is PHP), and .f files as C (default is
-# Fortran), use: inc=Fortran f=C.
+# default for Fortran type files). For instance to make doxygen treat .inc files
+# as Fortran files (default is PHP), and .f files as C (default is Fortran),
+# use: inc=Fortran f=C.
 #
 # Note: For files without extension you can use no_extension as a placeholder.
 #
 # Note that for custom extensions you also need to set FILE_PATTERNS otherwise
-# the files are not read by doxygen.
+# the files are not read by doxygen. When specifying no_extension you should add
+# * to the FILE_PATTERNS.
+#
+# Note see also the list of default file extension mappings.
 
 EXTENSION_MAPPING      =
 
@@ -445,6 +451,19 @@ TYPEDEF_HIDES_STRUCT   = NO
 
 LOOKUP_CACHE_SIZE      = 0
 
+# The NUM_PROC_THREADS specifies the number threads doxygen is allowed to use
+# during processing. When set to 0 doxygen will based this on the number of
+# cores available in the system. You can set it explicitly to a value larger
+# than 0 to get more control over the balance between CPU load and processing
+# speed. At this moment only the input processing can be done using multiple
+# threads. Since this is still an experimental feature the default is set to 1,
+# which efficively disables parallel processing. Please report any issues you
+# encounter. Generating dot graphs in parallel is controlled by the
+# DOT_NUM_THREADS setting.
+# Minimum value: 0, maximum value: 32, default value: 1.
+
+NUM_PROC_THREADS       = 1
+
 #---------------------------------------------------------------------------
 # Build related configuration options
 #---------------------------------------------------------------------------
@@ -508,6 +527,13 @@ EXTRACT_LOCAL_METHODS  = NO
 
 EXTRACT_ANON_NSPACES   = NO
 
+# If this flag is set to YES, the name of an unnamed parameter in a declaration
+# will be determined by the corresponding definition. By default unnamed
+# parameters remain unnamed in the output.
+# The default value is: YES.
+
+RESOLVE_UNNAMED_PARAMS = YES
+
 # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
 # undocumented members inside documented classes or files. If set to NO these
 # members will be included in the various overviews, but no documentation
@@ -525,8 +551,8 @@ HIDE_UNDOC_MEMBERS     = NO
 HIDE_UNDOC_CLASSES     = NO
 
 # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
-# (class|struct|union) declarations. If set to NO, these declarations will be
-# included in the documentation.
+# declarations. If set to NO, these declarations will be included in the
+# documentation.
 # The default value is: NO.
 
 HIDE_FRIEND_COMPOUNDS  = NO
@@ -545,11 +571,18 @@ HIDE_IN_BODY_DOCS      = NO
 
 INTERNAL_DOCS          = NO
 
-# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
-# names in lower-case letters. If set to YES, upper-case letters are also
-# allowed. This is useful if you have classes or files whose names only differ
-# in case and if your file system supports case sensitive file names. Windows
-# (including Cygwin) ands Mac users are advised to set this option to NO.
+# With the correct setting of option CASE_SENSE_NAMES doxygen will better be
+# able to match the capabilities of the underlying filesystem. In case the
+# filesystem is case sensitive (i.e. it supports files in the same directory
+# whose names only differ in casing), the option must be set to YES to properly
+# deal with such files in case they appear in the input. For filesystems that
+# are not case sensitive the option should be be set to NO to properly deal with
+# output files written for symbols that only differ in casing, such as for two
+# classes, one named CLASS and the other named Class, and to also support
+# references to files without having to specify the exact matching casing. On
+# Windows (including Cygwin) and MacOS, users should typically set this option
+# to NO, whereas on Linux or other Unix flavors it should typically be set to
+# YES.
 # The default value is: system dependent.
 
 CASE_SENSE_NAMES       = NO
@@ -788,7 +821,10 @@ WARN_IF_DOC_ERROR      = YES
 WARN_NO_PARAMDOC       = YES
 
 # If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
-# a warning is encountered.
+# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS
+# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but
+# at the end of the doxygen process doxygen will return with a non-zero status.
+# Possible values are: NO, YES and FAIL_ON_WARNINGS.
 # The default value is: NO.
 
 WARN_AS_ERROR          = NO
@@ -819,13 +855,13 @@ WARN_LOGFILE           = doxygen.errors
 # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
 # Note: If this tag is empty the current directory is searched.
 
-INPUT  = "src/catch2"
+INPUT                  = src/catch2
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
 # libiconv (or the iconv built into libc) for the transcoding. See the libiconv
-# documentation (see: https://www.gnu.org/software/libiconv/) for the list of
-# possible encodings.
+# documentation (see:
+# https://www.gnu.org/software/libiconv/) for the list of possible encodings.
 # The default value is: UTF-8.
 
 INPUT_ENCODING         = UTF-8
@@ -838,13 +874,61 @@ INPUT_ENCODING         = UTF-8
 # need to set EXTENSION_MAPPING for the extension otherwise the files are not
 # read by doxygen.
 #
+# Note the list of default checked file patterns might differ from the list of
+# default file extension mappings.
+#
 # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,
 # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
 # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
-# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08,
-# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf, *.qsf and *.ice.
-
-# FILE_PATTERNS          =
+# *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment),
+# *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, *.vhdl,
+# *.ucf, *.qsf and *.ice.
+
+FILE_PATTERNS          = *.c \
+                         *.cc \
+                         *.cxx \
+                         *.cpp \
+                         *.c++ \
+                         *.java \
+                         *.ii \
+                         *.ixx \
+                         *.ipp \
+                         *.i++ \
+                         *.inl \
+                         *.idl \
+                         *.ddl \
+                         *.odl \
+                         *.h \
+                         *.hh \
+                         *.hxx \
+                         *.hpp \
+                         *.h++ \
+                         *.cs \
+                         *.d \
+                         *.php \
+                         *.php4 \
+                         *.php5 \
+                         *.phtml \
+                         *.inc \
+                         *.m \
+                         *.markdown \
+                         *.md \
+                         *.mm \
+                         *.dox \
+                         *.py \
+                         *.pyw \
+                         *.f90 \
+                         *.f95 \
+                         *.f03 \
+                         *.f08 \
+                         *.f18 \
+                         *.f \
+                         *.for \
+                         *.vhd \
+                         *.vhdl \
+                         *.ucf \
+                         *.qsf \
+                         *.ice
 
 # The RECURSIVE tag can be used to specify whether or not subdirectories should
 # be searched for input files as well.
@@ -968,6 +1052,7 @@ FILTER_SOURCE_PATTERNS =
 # (index.html). This can be useful if you have a project on for instance GitHub
 # and want to reuse the introduction page also for the doxygen output.
 
+USE_MDFILE_AS_MAINPAGE =
 
 #---------------------------------------------------------------------------
 # Configuration options related to source browsing
@@ -1055,6 +1140,44 @@ USE_HTAGS              = NO
 
 VERBATIM_HEADERS       = YES
 
+# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the
+# clang parser (see:
+# http://clang.llvm.org/) for more accurate parsing at the cost of reduced
+# performance. This can be particularly helpful with template rich C++ code for
+# which doxygen's built-in parser lacks the necessary type information.
+# Note: The availability of this option depends on whether or not doxygen was
+# generated with the -Duse_libclang=ON option for CMake.
+# The default value is: NO.
+
+CLANG_ASSISTED_PARSING = NO
+
+# If clang assisted parsing is enabled and the CLANG_ADD_INC_PATHS tag is set to
+# YES then doxygen will add the directory of each input to the include path.
+# The default value is: YES.
+
+CLANG_ADD_INC_PATHS    = YES
+
+# If clang assisted parsing is enabled you can provide the compiler with command
+# line options that you would normally use when invoking the compiler. Note that
+# the include paths will already be set by doxygen for the files and directories
+# specified with INPUT and INCLUDE_PATH.
+# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
+
+CLANG_OPTIONS          =
+
+# If clang assisted parsing is enabled you can provide the clang parser with the
+# path to the directory containing a file called compile_commands.json. This
+# file is the compilation database (see:
+# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) containing the
+# options used when the source files were built. This is equivalent to
+# specifying the -p option to a clang tool, such as clang-check. These options
+# will then be passed to the parser. Any options specified with CLANG_OPTIONS
+# will be added as well.
+# Note: The availability of this option depends on whether or not doxygen was
+# generated with the -Duse_libclang=ON option for CMake.
+
+CLANG_DATABASE_PATH    =
+
 #---------------------------------------------------------------------------
 # Configuration options related to the alphabetical class index
 #---------------------------------------------------------------------------
@@ -1066,13 +1189,6 @@ VERBATIM_HEADERS       = YES
 
 ALPHABETICAL_INDEX     = YES
 
-# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
-# which the alphabetical index list will be split.
-# Minimum value: 1, maximum value: 20, default value: 5.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-COLS_IN_ALPHA_INDEX    = 5
-
 # In case all classes in a project start with a common prefix, all classes will
 # be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
 # can be used to specify a prefix (or a list of prefixes) that should be ignored
@@ -1211,9 +1327,9 @@ HTML_TIMESTAMP         = NO
 
 # If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
 # documentation will contain a main index with vertical navigation menus that
-# are dynamically created via Javascript. If disabled, the navigation index will
+# are dynamically created via JavaScript. If disabled, the navigation index will
 # consists of multiple levels of tabs that are statically embedded in every HTML
-# page. Disable this option to support browsers that do not have Javascript,
+# page. Disable this option to support browsers that do not have JavaScript,
 # like the Qt help browser.
 # The default value is: YES.
 # This tag requires that the tag GENERATE_HTML is set to YES.
@@ -1243,10 +1359,11 @@ HTML_INDEX_NUM_ENTRIES = 100
 
 # If the GENERATE_DOCSET tag is set to YES, additional index files will be
 # generated that can be used as input for Apple's Xcode 3 integrated development
-# environment (see: https://developer.apple.com/xcode/), introduced with OSX
-# 10.5 (Leopard). To create a documentation set, doxygen will generate a
-# Makefile in the HTML output directory. Running make will produce the docset in
-# that directory and running make install will install the docset in
+# environment (see:
+# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To
+# create a documentation set, doxygen will generate a Makefile in the HTML
+# output directory. Running make will produce the docset in that directory and
+# running make install will install the docset in
 # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
 # startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy
 # genXcode/_index.html for more information.
@@ -1288,8 +1405,8 @@ DOCSET_PUBLISHER_NAME  = Publisher
 # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
 # additional HTML index files: index.hhp, index.hhc, and index.hhk. The
 # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
-# (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on
-# Windows.
+# (see:
+# https://www.microsoft.com/en-us/download/details.aspx?id=21138) on Windows.
 #
 # The HTML Help Workshop contains a compiler that can convert all HTML output
 # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
@@ -1364,7 +1481,8 @@ QCH_FILE               =
 
 # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
 # Project output. For more information please see Qt Help Project / Namespace
-# (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
+# (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
 # The default value is: org.doxygen.Project.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
@@ -1372,8 +1490,8 @@ QHP_NAMESPACE          = org.doxygen.Project
 
 # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
 # Help Project output. For more information please see Qt Help Project / Virtual
-# Folders (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-
-# folders).
+# Folders (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders).
 # The default value is: doc.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
@@ -1381,16 +1499,16 @@ QHP_VIRTUAL_FOLDER     = doc
 
 # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
 # filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-
-# filters).
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_CUST_FILTER_NAME   =
 
 # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
 # custom filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-
-# filters).
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_CUST_FILTER_ATTRS  =
@@ -1402,9 +1520,9 @@ QHP_CUST_FILTER_ATTRS  =
 
 QHP_SECT_FILTER_ATTRS  =
 
-# The QHG_LOCATION tag can be used to specify the location of Qt's
-# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
-# generated .qhp file.
+# The QHG_LOCATION tag can be used to specify the location (absolute path
+# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to
+# run qhelpgenerator on the generated .qhp file.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHG_LOCATION           =
@@ -1481,6 +1599,17 @@ TREEVIEW_WIDTH         = 250
 
 EXT_LINKS_IN_WINDOW    = NO
 
+# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg
+# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see
+# https://inkscape.org) to generate formulas as SVG images instead of PNGs for
+# the HTML output. These images will generally look nicer at scaled resolutions.
+# Possible values are: png (the default) and svg (looks nicer but requires the
+# pdf2svg or inkscape tool).
+# The default value is: png.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FORMULA_FORMAT    = png
+
 # Use this tag to change the font size of LaTeX formulas included as images in
 # the HTML documentation. When you change the font size after a successful
 # doxygen run you need to manually remove any form_*.png images from the HTML
@@ -1501,8 +1630,14 @@ FORMULA_FONTSIZE       = 10
 
 FORMULA_TRANSPARENT    = YES
 
+# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands
+# to create new LaTeX commands to be used in formulas as building blocks. See
+# the section "Including formulas" for details.
+
+FORMULA_MACROFILE      =
+
 # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
-# https://www.mathjax.org) which uses client side Javascript for the rendering
+# https://www.mathjax.org) which uses client side JavaScript for the rendering
 # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
 # installed or if you want to formulas look prettier in the HTML output. When
 # enabled you may also need to install MathJax separately and configure the path
@@ -1514,7 +1649,7 @@ USE_MATHJAX            = YES
 
 # When MathJax is enabled you can set the default output format to be used for
 # the MathJax output. See the MathJax site (see:
-# http://docs.mathjax.org/en/latest/output.html) for more details.
+# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details.
 # Possible values are: HTML-CSS (which is slower, but has the best
 # compatibility), NativeMML (i.e. MathML) and SVG.
 # The default value is: HTML-CSS.
@@ -1530,7 +1665,7 @@ MATHJAX_FORMAT         = HTML-CSS
 # Content Delivery Network so you can quickly see the result without installing
 # MathJax. However, it is strongly recommended to install a local copy of
 # MathJax from https://www.mathjax.org before deployment.
-# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/.
+# The default value is: https://cdn.jsdelivr.net/npm/mathjax@2.
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
 MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
@@ -1545,7 +1680,8 @@ MATHJAX_EXTENSIONS     = TeX/AMSmath \
 
 # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
 # of code that will be used on startup of the MathJax code. See the MathJax site
-# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# (see:
+# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an
 # example see the documentation.
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
@@ -1573,7 +1709,7 @@ MATHJAX_CODEFILE       =
 SEARCHENGINE           = YES
 
 # When the SERVER_BASED_SEARCH tag is enabled the search engine will be
-# implemented using a web server instead of a web client using Javascript. There
+# implemented using a web server instead of a web client using JavaScript. There
 # are two flavors of web server based searching depending on the EXTERNAL_SEARCH
 # setting. When disabled, doxygen will generate a PHP script for searching and
 # an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
@@ -1592,7 +1728,8 @@ SERVER_BASED_SEARCH    = NO
 #
 # Doxygen ships with an example indexer (doxyindexer) and search engine
 # (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: https://xapian.org/).
+# Xapian (see:
+# https://xapian.org/).
 #
 # See the section "External Indexing and Searching" for details.
 # The default value is: NO.
@@ -1605,8 +1742,9 @@ EXTERNAL_SEARCH        = NO
 #
 # Doxygen ships with an example indexer (doxyindexer) and search engine
 # (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: https://xapian.org/). See the section "External Indexing and
-# Searching" for details.
+# Xapian (see:
+# https://xapian.org/). See the section "External Indexing and Searching" for
+# details.
 # This tag requires that the tag SEARCHENGINE is set to YES.
 
 SEARCHENGINE_URL       =
@@ -1770,9 +1908,11 @@ LATEX_EXTRA_FILES      =
 
 PDF_HYPERLINKS         = YES
 
-# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
-# the PDF file directly from the LaTeX files. Set this option to YES, to get a
-# higher quality PDF documentation.
+# If the USE_PDFLATEX tag is set to YES, doxygen will use the engine as
+# specified with LATEX_CMD_NAME to generate the PDF file directly from the LaTeX
+# files. Set this option to YES, to get a higher quality PDF documentation.
+#
+# See also section LATEX_CMD_NAME for selecting the engine.
 # The default value is: YES.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
@@ -2204,7 +2344,7 @@ HIDE_UNDOC_RELATIONS   = YES
 # http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
 # Bell Labs. The other options in this section have no effect if this option is
 # set to NO
-# The default value is: NO.
+# The default value is: YES.
 
 HAVE_DOT               = YES
 
@@ -2283,10 +2423,32 @@ UML_LOOK               = NO
 # but if the number exceeds 15, the total amount of fields shown is limited to
 # 10.
 # Minimum value: 0, maximum value: 100, default value: 10.
-# This tag requires that the tag HAVE_DOT is set to YES.
+# This tag requires that the tag UML_LOOK is set to YES.
 
 UML_LIMIT_NUM_FIELDS   = 10
 
+# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and
+# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS
+# tag is set to YES, doxygen will add type and arguments for attributes and
+# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen
+# will not generate fields with class member information in the UML graphs. The
+# class diagrams will look similar to the default class diagrams but using UML
+# notation for the relationships.
+# Possible values are: NO, YES and NONE.
+# The default value is: NO.
+# This tag requires that the tag UML_LOOK is set to YES.
+
+DOT_UML_DETAILS        = NO
+
+# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters
+# to display on a single line. If the actual line length exceeds this threshold
+# significantly it will wrapped across multiple lines. Some heuristics are apply
+# to avoid ugly line breaks.
+# Minimum value: 0, maximum value: 1000, default value: 17.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_WRAP_THRESHOLD     = 17
+
 # If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
 # collaboration graphs will show the relations between templates and their
 # instances.
@@ -2360,7 +2522,9 @@ DIRECTORY_GRAPH        = NO
 # Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
 # to make the SVG files visible in IE 9+ (other browsers do not have this
 # requirement).
-# Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo,
+# Possible values are: png, png:cairo, png:cairo:cairo, png:cairo:gd, png:gd,
+# png:gd:gd, jpg, jpg:cairo, jpg:cairo:gd, jpg:gd, jpg:gd:gd, gif, gif:cairo,
+# gif:cairo:gd, gif:gd, gif:gd:gd, svg, png:gd, png:gd:gd, png:cairo,
 # png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
 # png:gdiplus:gdiplus.
 # The default value is: png.
@@ -2476,9 +2640,11 @@ DOT_MULTI_TARGETS      = YES
 
 GENERATE_LEGEND        = YES
 
-# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
+# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate
 # files that are used to generate the various graphs.
+#
+# Note: This setting is not only used for dot files but also for msc and
+# plantuml temporary files.
 # The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
 
 DOT_CLEANUP            = YES
diff --git a/packages/Catch2/MODULE.bazel b/packages/Catch2/MODULE.bazel
new file mode 100644
index 0000000000000000000000000000000000000000..a7846cd60a25354d7721dcf24f141ad9b2fa323d
--- /dev/null
+++ b/packages/Catch2/MODULE.bazel
@@ -0,0 +1,3 @@
+module(name = "catch2")
+
+bazel_dep(name = "bazel_skylib", version = "1.5.0")
diff --git a/packages/Catch2/WORKSPACE.bazel b/packages/Catch2/WORKSPACE.bazel
index d962a99543e93cb4b43b46e7b051c02034eb4d43..357e6f94411e75e52e43583a8d252cdf252e3088 100644
--- a/packages/Catch2/WORKSPACE.bazel
+++ b/packages/Catch2/WORKSPACE.bazel
@@ -4,10 +4,10 @@ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 
 http_archive(
     name = "bazel_skylib",
-    sha256 = "b8a1527901774180afc798aeb28c4634bdccf19c4d98e7bdd1ce79d1fe9aaad7",
+    sha256 = "cd55a062e763b9349921f0f5db8c3933288dc8ba4f76dd9416aac68acee3cb94",
     urls = [
-        "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.4.1/bazel-skylib-1.4.1.tar.gz",
-        "https://github.com/bazelbuild/bazel-skylib/releases/download/1.4.1/bazel-skylib-1.4.1.tar.gz",
+        "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.5.0/bazel-skylib-1.5.0.tar.gz",
+        "https://github.com/bazelbuild/bazel-skylib/releases/download/1.5.0/bazel-skylib-1.5.0.tar.gz",
     ],
 )
 
diff --git a/packages/Catch2/appveyor.yml b/packages/Catch2/appveyor.yml
index 3b6580d8bd7ab55ef9bbad9dda1bb0454c443677..7a0ad83ffdd1e90f143cdbe6c7b97d0f0511af31 100644
--- a/packages/Catch2/appveyor.yml
+++ b/packages/Catch2/appveyor.yml
@@ -70,14 +70,3 @@ environment:
       additional_flags: "/permissive- /std:c++latest"
       platform: x64
       configuration: Debug
-
-    - FLAVOR: VS 2017 x64 Debug
-      APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017
-      platform: x64
-      configuration: Debug
-
-    - FLAVOR: VS 2017 x64 Release Coverage
-      APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017
-      coverage: 1
-      platform: x64
-      configuration: Debug
\ No newline at end of file
diff --git a/packages/Catch2/docs/benchmarks.md b/packages/Catch2/docs/benchmarks.md
index 548913c76fb24653a51556ebba67d37d076a93c7..9edbb93c7fb5ad90e6a53f22b8e632d4904c149c 100644
--- a/packages/Catch2/docs/benchmarks.md
+++ b/packages/Catch2/docs/benchmarks.md
@@ -93,7 +93,7 @@ Fibonacci
 -------------------------------------------------------------------------------
 C:\path\to\Catch2\Benchmark.tests.cpp(10)
 ...............................................................................
-benchmark name                                  samples       iterations    estimated
+benchmark name                                  samples       iterations    est run time
                                                 mean          low mean      high mean
                                                 std dev       low std dev   high std dev
 -------------------------------------------------------------------------------
diff --git a/packages/Catch2/docs/ci-and-misc.md b/packages/Catch2/docs/ci-and-misc.md
index c07da29f056a8ab3df96863475436f757e814e66..49bbd98910d1774cbb388f5956fb921b4e798943 100644
--- a/packages/Catch2/docs/ci-and-misc.md
+++ b/packages/Catch2/docs/ci-and-misc.md
@@ -82,7 +82,7 @@ variable set to "1".
 
 ### CodeCoverage module (GCOV, LCOV...)
 
-If you are using GCOV tool to get testing coverage of your code, and are not sure how to integrate it with CMake and Catch, there should be an external example over at https://github.com/fkromer/catch_cmake_coverage
+If you are using GCOV tool to get testing coverage of your code, and are not sure how to integrate it with CMake and Catch, there should be an external example over at https://github.com/claremacrae/catch_cmake_coverage
 
 
 ### pkg-config
diff --git a/packages/Catch2/docs/cmake-integration.md b/packages/Catch2/docs/cmake-integration.md
index e38d5c2f40d28a46aa6b65c22a6c9c6ce0e32bea..86666efe2b181bdc8318c7e52339da80e4865d6f 100644
--- a/packages/Catch2/docs/cmake-integration.md
+++ b/packages/Catch2/docs/cmake-integration.md
@@ -51,7 +51,7 @@ Include(FetchContent)
 FetchContent_Declare(
   Catch2
   GIT_REPOSITORY https://github.com/catchorg/Catch2.git
-  GIT_TAG        v3.0.1 # or a later release
+  GIT_TAG        v3.4.0 # or a later release
 )
 
 FetchContent_MakeAvailable(Catch2)
@@ -203,7 +203,7 @@ the output file name e.g. ".xml".
 
 If specified allows control over when test discovery is performed.
 For a value of `POST_BUILD` (default) test discovery is performed at build time.
-For a a value of `PRE_TEST` test discovery is delayed until just prior to test
+For a value of `PRE_TEST` test discovery is delayed until just prior to test
 execution (useful e.g. in cross-compilation environments).
 ``DISCOVERY_MODE`` defaults to the value of the
 ``CMAKE_CATCH_DISCOVER_TESTS_DISCOVERY_MODE`` variable if it is not passed when
diff --git a/packages/Catch2/docs/command-line.md b/packages/Catch2/docs/command-line.md
index a15a213147b3b4ed8a34a8402fd287f0d82ef5cd..bb483959d5d5ce3cfb3c377a5df8ee098d78aa5f 100644
--- a/packages/Catch2/docs/command-line.md
+++ b/packages/Catch2/docs/command-line.md
@@ -85,43 +85,102 @@ Click one of the following links to take you straight to that option - or scroll
 
 <pre>&lt;test-spec> ...</pre>
 
-Test cases, wildcarded test cases, tags and tag expressions are all passed directly as arguments. Tags are distinguished by being enclosed in square brackets.
+By providing a test spec, you filter which tests will be run. If you call
+Catch2 without any test spec, then it will run all non-hidden test
+cases. A test case is hidden if it has the `[!benchmark]` tag, any tag
+with a dot at the start, e.g. `[.]` or `[.foo]`.
 
-If no test specs are supplied then all test cases, except "hidden" tests, are run.
-A test is hidden by giving it any tag starting with (or just) a period (```.```) - or, in the deprecated case, tagged ```[hide]``` or given name starting with `'./'`. To specify hidden tests from the command line ```[.]``` or ```[hide]``` can be used *regardless of how they were declared*.
+There are three basic test specs that can then be combined into more
+complex specs:
 
-Specs must be enclosed in quotes if they contain spaces. If they do not contain spaces the quotes are optional.
+  * Full test name, e.g. `"Test 1"`.
 
-Wildcards consist of the `*` character at the beginning and/or end of test case names and can substitute for any number of any characters (including none).
+    This allows only test cases whose name is "Test 1".
 
-Test specs are case insensitive.
+  * Wildcarded test name, e.g. `"*Test"`, or `"Test*"`, or `"*Test*"`.
 
-If a spec is prefixed with `exclude:` or the `~` character then the pattern matches an exclusion. This means that tests matching the pattern are excluded from the set - even if a prior inclusion spec included them. Subsequent inclusion specs will take precedence, however.
-Inclusions and exclusions are evaluated in left-to-right order.
+    This allows any test case whose name ends with, starts with, or contains
+    in the middle the string "Test". Note that the wildcard can only be at
+    the start or end.
 
-Test case examples:
+  * Tag name, e.g. `[some-tag]`.
 
+    This allows any test case tagged with "[some-tag]". Remember that some
+    tags are special, e.g. those that start with "." or with "!".
+
+
+You can also combine the basic test specs to create more complex test
+specs. You can:
+
+  * Concatenate specs to apply all of them, e.g. `[some-tag][other-tag]`.
+
+    This allows test cases that are tagged with **both** "[some-tag]" **and**
+    "[other-tag]". A test case with just "[some-tag]" will not pass the filter,
+    nor will test case with just "[other-tag]".
+
+  * Comma-join specs to apply any of them, e.g. `[some-tag],[other-tag]`.
+
+    This allows test cases that are tagged with **either** "[some-tag]" **or**
+    "[other-tag]". A test case with both will obviously also pass the filter.
+
+    Note that commas take precendence over simple concatenation. This means
+    that `[a][b],[c]` accepts tests that are tagged with either both "[a]" and
+    "[b]", or tests that are tagged with just "[c]".
+
+  * Negate the spec by prepending it with `~`, e.g. `~[some-tag]`.
+
+    This rejects any test case that is tagged with "[some-tag]". Note that
+    rejection takes precedence over other filters.
+
+    Note that negations always binds to the following _basic_ test spec.
+    This means that `~[foo][bar]` negates only the "[foo]" tag and not the
+    "[bar]" tag.
+
+Note that when Catch2 is deciding whether to include a test, first it
+checks whether the test matches any negative filters. If it does,
+the test is rejected. After that, the behaviour depends on whether there
+are positive filters as well. If there are no positive filters, all
+remaining non-hidden tests are included. If there are positive filters,
+only tests that match the positive filters are included.
+
+You can also match test names with special characters by escaping them
+with a backslash (`"\"`), e.g. a test named `"Do A, then B"` is matched
+by "Do A\, then B" test spec. Backslash also escapes itself.
+
+
+### Examples
+
+Given these TEST_CASEs,
 ```
-thisTestOnly            Matches the test case called, 'thisTestOnly'
-"this test only"        Matches the test case called, 'this test only'
-these*                  Matches all cases starting with 'these'
-exclude:notThis         Matches all tests except, 'notThis'
-~notThis                Matches all tests except, 'notThis'
-~*private*              Matches all tests except those that contain 'private'
-a* ~ab* abc             Matches all tests that start with 'a', except those that
-                        start with 'ab', except 'abc', which is included
-~[tag1]                 Matches all tests except those tagged with '[tag1]'
--# [#somefile]          Matches all tests from the file 'somefile.cpp'
+TEST_CASE("Test 1") {}
+
+TEST_CASE("Test 2", "[.foo]") {}
+
+TEST_CASE("Test 3", "[.bar]") {}
+
+TEST_CASE("Test 4", "[.][foo][bar]") {}
 ```
 
-Names within square brackets are interpreted as tags.
-A series of tags form an AND expression whereas a comma-separated sequence forms an OR expression. e.g.:
+this is the result of these filters
+```
+./tests                      # Selects only the first test, others are hidden
+./tests "Test 1"             # Selects only the first test, other do not match
+./tests ~"Test 1"            # Selects no tests. Test 1 is rejected, other tests are hidden
+./tests "Test *"             # Selects all tests.
+./tests [bar]                # Selects tests 3 and 4. Other tests are not tagged [bar]
+./tests ~[foo]               # Selects test 1, because it is the only non-hidden test without [foo] tag
+./tests [foo][bar]           # Selects test 4.
+./tests [foo],[bar]          # Selects tests 2, 3, 4.
+./tests ~[foo][bar]          # Selects test 3. 2 and 4 are rejected due to having [foo] tag
+./tests ~"Test 2"[foo]       # Selects test 4, because test 2 is explicitly rejected
+./tests [foo][bar],"Test 1"  # Selects tests 1 and 4.
+./tests "Test 1*"            # Selects test 1, wildcard can match zero characters
+```
 
-<pre>[one][two],[three]</pre>
-This matches all tests tagged `[one]` and `[two]`, as well as all tests tagged `[three]`
+_Note: Using plain asterisk on a command line can cause issues with shell
+expansion. Make sure that the asterisk is passed to Catch2 and is not
+interpreted by the shell._
 
-Test names containing special characters, such as `,` or `[` can specify them on the command line using `\`.
-`\` also escapes itself.
 
 <a id="choosing-a-reporter-to-use"></a>
 ## Choosing a reporter to use
diff --git a/packages/Catch2/docs/configuration.md b/packages/Catch2/docs/configuration.md
index d6e159e529e159eaf8a736aff0dcf8732d691c34..8a3ddfab5c9e9c5cef1415ecffaf5f94d46d5681 100644
--- a/packages/Catch2/docs/configuration.md
+++ b/packages/Catch2/docs/configuration.md
@@ -26,7 +26,8 @@ with the same name.
 
 ## Prefixing Catch macros
 
-    CATCH_CONFIG_PREFIX_ALL
+    CATCH_CONFIG_PREFIX_ALL       // Prefix all macros with CATCH_
+    CATCH_CONFIG_PREFIX_MESSAGES  // Prefix only INFO, UNSCOPED_INFO, WARN and CAPTURE
 
 To keep test code clean and uncluttered Catch uses short macro names (e.g. ```TEST_CASE``` and ```REQUIRE```). Occasionally these may conflict with identifiers from platform headers or the system under test. In this case the above identifier can be defined. This will cause all the Catch user macros to be prefixed with ```CATCH_``` (e.g. ```CATCH_TEST_CASE``` and ```CATCH_REQUIRE```).
 
@@ -267,7 +268,7 @@ must compile and must break into debugger.
 
 ## Static analysis support
 
-> Introduced in Catch2 X.Y.Z.
+> Introduced in Catch2 3.4.0.
 
 Some parts of Catch2, e.g. `SECTION`s, can be hard for static analysis
 tools to reason about. Catch2 can change its internals to help static
diff --git a/packages/Catch2/docs/faq.md b/packages/Catch2/docs/faq.md
index a7d0455a35c94e8dbb7e0f4eb37858747ed4b854..80923d26e8b8d1a769720fb464ec99b5fa2e8115 100644
--- a/packages/Catch2/docs/faq.md
+++ b/packages/Catch2/docs/faq.md
@@ -10,6 +10,7 @@
 [Does Catch2 support running tests in parallel?](#does-catch2-support-running-tests-in-parallel)<br>
 [Can I compile Catch2 into a dynamic library?](#can-i-compile-catch2-into-a-dynamic-library)<br>
 [What repeatability guarantees does Catch2 provide?](#what-repeatability-guarantees-does-catch2-provide)<br>
+[My build cannot find `catch2/catch_user_config.hpp`, how can I fix it?](#my-build-cannot-find-catch2catch_user_confighpp-how-can-i-fix-it)<br>
 
 
 ## How do I run global setup/teardown only if tests will be run?
@@ -83,12 +84,30 @@ and it is also generally repeatable across versions, but we might break
 it from time to time. E.g. we broke repeatability with previous versions
 in v2.13.4 so that test cases with similar names are shuffled better.
 
-Random generators currently rely on platform's stdlib, specifically
-the distributions from `<random>`. We thus provide no extra guarantee
-above what your platform does. **Important: `<random>`'s distributions
+Since Catch2 3.5.0 the random generators use custom distributions,
+that should be repeatable across different platforms, with few caveats.
+For details see the section on random generators in the [Generator
+documentation](generators.md#random-number-generators-details).
+
+Before this version, random generators relied on distributions from
+platform's stdlib. We thus can provide no extra guarantee on top of the
+ones given by your platform. **Important: `<random>`'s distributions
 are not specified to be repeatable across different platforms.**
 
 
+## My build cannot find `catch2/catch_user_config.hpp`, how can I fix it?
+
+`catch2/catch_user_config.hpp` is a generated header that contains user
+compile time configuration. It is generated by CMake/Meson/Bazel during
+build. If you are not using either of these, your three options are to
+
+1) Build Catch2 separately using build tool that will generate the header
+2) Use the amalgamated files to build Catch2
+3) Use CMake to configure a build. This will generate the header and you
+   can copy it into your own checkout of Catch2.
+
+
+
 ---
 
 [Home](Readme.md#top)
diff --git a/packages/Catch2/docs/generators.md b/packages/Catch2/docs/generators.md
index 097997521c09705cc549db61ad42705429bf42e9..8bca54c7510399b02392cf7cacf401cf03b4d3b8 100644
--- a/packages/Catch2/docs/generators.md
+++ b/packages/Catch2/docs/generators.md
@@ -189,6 +189,31 @@ TEST_CASE("type conversion", "[generators]") {
 }
 ```
 
+
+### Random number generators: details
+
+> This section applies from Catch2 3.5.0. Before that, random generators
+> were a thin wrapper around distributions from `<random>`.
+
+All of the `random(a, b)` generators in Catch2 currently generate uniformly
+distributed number in closed interval \[a; b\]. This  is different from
+`std::uniform_real_distribution`, which should return numbers in interval
+\[a; b) (but due to rounding can end up returning b anyway), but the
+difference is intentional, so that `random(a, a)` makes sense. If there is
+enough interest from users, we can provide API to pick any of CC, CO, OC,
+or OO ranges.
+
+Unlike `std::uniform_int_distribution`, Catch2's generators also support
+various single-byte integral types, such as `char` or `bool`.
+
+Given the same seed, the output from the integral generators is
+reproducible across different platforms. For floating point generators,
+we only promise reproducibility on platforms that obey the IEEE 754
+standard, and where `float` is 4 bytes and `double` is 8 bytes. We provide
+no guarantees for `long double`, as the internals of `long double` can
+vary wildly across different platforms.
+
+
 ## Generator interface
 
 You can also implement your own generators, by deriving from the
diff --git a/packages/Catch2/docs/limitations.md b/packages/Catch2/docs/limitations.md
index cc0ed05d1370d2c910c439da74944f228ce0af51..099dd82a51834f64f5a5561ebc946c4387e82c81 100644
--- a/packages/Catch2/docs/limitations.md
+++ b/packages/Catch2/docs/limitations.md
@@ -173,13 +173,3 @@ TEST_CASE("b") {
 
 If you are seeing a problem like this, i.e. weird test paths that trigger only under Clang with `libc++`, or only under very specific version of `libstdc++`, it is very likely you are seeing this. The only known workaround is to use a fixed version of your standard library.
 
-
-### libstdc++, `_GLIBCXX_DEBUG` macro and random ordering of tests
-
-Running a Catch2 binary compiled against libstdc++ with `_GLIBCXX_DEBUG`
-macro defined with `--order rand` will cause a debug check to trigger and
-abort the run due to self-assignment.
-[This is a known bug inside libstdc++](https://stackoverflow.com/questions/22915325/avoiding-self-assignment-in-stdshuffle/23691322)
-
-Workaround: Don't use `--order rand` when compiling against debug-enabled
-libstdc++.
diff --git a/packages/Catch2/docs/matchers.md b/packages/Catch2/docs/matchers.md
index 14c1589821d3ae82b145e67568e3655ebf35b45a..d5be1f5a39992a5f57051aa5a246c0f27a3e243c 100644
--- a/packages/Catch2/docs/matchers.md
+++ b/packages/Catch2/docs/matchers.md
@@ -50,25 +50,43 @@ Both of the string matchers used in the examples above live in the
 `catch_matchers_string.hpp` header, so to compile the code above also
 requires `#include <catch2/matchers/catch_matchers_string.hpp>`.
 
+### Combining operators and lifetimes
+
 **IMPORTANT**: The combining operators do not take ownership of the
-matcher objects being combined. This means that if you store combined
-matcher object, you have to ensure that the matchers being combined
-outlive its last use. What this means is that the following code leads
-to a use-after-free (UAF):
+matcher objects being combined.
+
+This means that if you store combined matcher object, you have to ensure
+that the individual matchers being combined outlive the combined matcher.
+Note that the negation matcher from `!` also counts as combining matcher
+for this.
 
+Explained on an example, this is fine
 ```cpp
-#include <catch2/catch_test_macros.hpp>
-#include <catch2/matchers/catch_matchers_string.hpp>
+CHECK_THAT(value, WithinAbs(0, 2e-2) && !WithinULP(0., 1));
+```
 
-TEST_CASE("Bugs, bugs, bugs", "[Bug]"){
-    std::string str = "Bugs as a service";
+and so is this
+```cpp
+auto is_close_to_zero = WithinAbs(0, 2e-2);
+auto is_zero          = WithinULP(0., 1);
 
-    auto match_expression = Catch::Matchers::EndsWith( "as a service" ) ||
-        (Catch::Matchers::StartsWith( "Big data" ) && !Catch::Matchers::ContainsSubstring( "web scale" ) );
-    REQUIRE_THAT(str, match_expression);
-}
+CHECK_THAT(value, is_close_to_zero && !is_zero);
 ```
 
+but this is not
+```cpp
+auto is_close_to_zero = WithinAbs(0, 2e-2);
+auto is_zero          = WithinULP(0., 1);
+auto is_close_to_but_not_zero = is_close_to_zero && !is_zero;
+
+CHECK_THAT(a_value, is_close_to_but_not_zero); // UAF
+```
+
+because `!is_zero` creates a temporary instance of Negation matcher,
+which the `is_close_to_but_not_zero` refers to. After the line ends,
+the temporary is destroyed and the combined `is_close_to_but_not_zero`
+matcher now refers to non-existent object, so using it causes use-after-free.
+
 
 ## Built-in matchers
 
@@ -286,7 +304,7 @@ comparable. (e.g. you may compare `std::vector<int>` to `std::array<char>`).
 `UnorderedRangeEquals` is similar to `RangeEquals`, but the order
 does not matter. For example "1, 2, 3" would match "3, 2, 1", but not
 "1, 1, 2, 3" As with `RangeEquals`, `UnorderedRangeEquals` compares
-the individual elements using using `operator==` by default.
+the individual elements using `operator==` by default.
 
 Both `RangeEquals` and `UnorderedRangeEquals` optionally accept a
 predicate which can be used to compare the containers element-wise.
diff --git a/packages/Catch2/docs/release-notes.md b/packages/Catch2/docs/release-notes.md
index 8b413b1560fbe2670a90cfb6cc29380e94009c8f..ac78866f60454c7176ee6835196f7caa4d5cb1d9 100644
--- a/packages/Catch2/docs/release-notes.md
+++ b/packages/Catch2/docs/release-notes.md
@@ -2,6 +2,10 @@
 
 # Release notes
 **Contents**<br>
+[3.5.2](#352)<br>
+[3.5.1](#351)<br>
+[3.5.0](#350)<br>
+[3.4.0](#340)<br>
 [3.3.2](#332)<br>
 [3.3.1](#331)<br>
 [3.3.0](#330)<br>
@@ -56,6 +60,87 @@
 [Even Older versions](#even-older-versions)<br>
 
 
+## 3.5.2
+
+### Fixes
+* Fixed `-Wsubobject-linkage` in the Console reporter (#2794)
+* Fixed adding new CLI Options to lvalue parser using `|` (#2787)
+
+
+## 3.5.1
+
+### Improvements
+* Significantly improved performance of the CLI parsing.
+  * This includes the cost of preparing the CLI parser, so Catch2's binaries start much faster.
+
+### Miscellaneous
+* Added support for Bazel modules (#2781)
+* Added CMake option to disable the build reproducibility settings (#2785)
+* Added `log` library linking to the Meson build (#2784)
+
+
+## 3.5.0
+
+### Improvements
+* Introduced `CATCH_CONFIG_PREFIX_MESSAGES` to prefix only logging macros (#2544)
+  * This means `INFO`, `UNSCOPED_INFO`, `WARN` and `CAPTURE`.
+* Section hints in static analysis mode are now `const`
+  * This prevents Clang-Tidy from complaining about `misc-const-correctness`.
+* `from_range` generator supports C arrays and ranges that require ADL (#2737)
+* Stringification support for `std::optional` now also includes `std::nullopt` (#2740)
+* The Console reporter flushes output after writing benchmark runtime estimate.
+  * This means that you can immediately see for how long the benchmark is expected to run.
+* Added workaround to enable compilation with ICC 19.1 (#2551, #2766)
+* Compiling Catch2 for XBox should work out of the box (#2772)
+  * Catch2 should automatically disable getenv when compiled for XBox.
+* Compiling Catch2 with exceptions disabled no longer triggers `Wunused-function` (#2726)
+* **`random` Generators for integral types are now reproducible across different platforms**
+  * Unlike `<random>`, Catch2's generators also support 1 byte integral types (`char`, `bool`, ...)
+* **`random` Generators for `float` and `double` are now reproducible across different platforms**
+  * `long double` varies across different platforms too much to be reproducible
+  * This guarantee applies only to platforms with IEEE 754 floats.
+
+### Fixes
+* UDL declaration inside Catch2 are now strictly conforming to the standard
+  * `operator "" _a` is UB, `operator ""_a` is fine. Seriously.
+* Fixed `CAPTURE` tests failing to compile in C++23 mode (#2744)
+* Fixed missing include in `catch_message.hpp` (#2758)
+* Fixed `CHECK_ELSE` suppressing failure from uncaught exceptions(#2723)
+
+### Miscellaneous
+* The documentation for specifying which tests to run through commandline has been completely rewritten (#2738)
+* Fixed installation when building Catch2 with meson (#2722, #2742)
+* Fixed `catch_discover_tests` when using custom reporter and `PRE_TEST` discovery mode (#2747)
+* `catch_discover_tests` supports multi-config CMake generator in `PRE_TEST` discovery mode (#2739, #2746)
+
+
+## 3.4.0
+
+### Improvements
+* `VectorEquals` supports elements that provide only `==` and not `!=` (#2648)
+* Catch2 supports compiling with IAR compiler (#2651)
+* Various small internal performance improvements
+* Various small internal compilation time improvements
+* XMLReporter now reports location info for INFO and WARN (#1251)
+  * This bumps up the xml format version to 3
+* Documented that `SKIP` in generator constructor can be used to handle empty  generator (#1593)
+* Added experimental static analysis support to `TEST_CASE` and `SECTION` macros (#2681)
+  * The two macros are redefined in a way that helps the SA tools reason about the possible paths through a test case with sections.
+  * The support is controlled by the `CATCH_CONFIG_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT` option and autodetects clang-tidy and Coverity.
+* `*_THROWS`, `*_THROWS_AS`, etc now suppress warning coming from `__attribute__((warn_unused_result))` on GCC  (#2691)
+  * Unlike plain `[[nodiscard]]`, this warning is not silenced by void cast. WTF GCC?
+
+### Fixes
+* Fixed `assertionStarting` events being sent after the expr is evaluated (#2678)
+* Errors in `TEST_CASE` tags are now reported nicely (#2650)
+
+### Miscellaneous
+* Bunch of improvements to `catch_discover_tests`
+  * Added DISCOVERY_MODE option, so the discovery can happen either post build or pre-run.
+  * Fixed handling of semicolons and backslashes in test names (#2674, #2676)
+* meson build can disable building tests (#2693)
+* meson build properly sets meson version 0.54.1 as the minimal supported version (#2688)
+
 
 ## 3.3.2
 
@@ -352,7 +437,7 @@ v3 releases.
 * Added `STATIC_CHECK` macro, similar to `STATIC_REQUIRE` (#2318)
   * When deferred tu runtime, it behaves like `CHECK`, and not like `REQUIRE`.
 * You can have multiple tests with the same name, as long as other parts of the test identity differ (#1915, #1999, #2175)
-  * Test identity includes test's name, test's tags and and test's class name if applicable.
+  * Test identity includes test's name, test's tags and test's class name if applicable.
 * Added new warning, `UnmatchedTestSpec`, to error on test specs with no matching tests
 * The `-w`, `--warn` warning flags can now be provided multiple times to enable multiple warnings
 * The case-insensitive handling of tags is now more reliable and takes up less memory
diff --git a/packages/Catch2/docs/reporters.md b/packages/Catch2/docs/reporters.md
index 496c61a925a8185109cb16432144df1f1fa8c942..e2abfe34d063e6c7497d7b6b3194033153baff77 100644
--- a/packages/Catch2/docs/reporters.md
+++ b/packages/Catch2/docs/reporters.md
@@ -52,7 +52,7 @@ its machine-readable XML output to file `result-junit.xml`, and the
 uses ANSI colour codes for colouring the output.
 
 Using multiple reporters (or one reporter and one-or-more [event
-listeners](event-listener.md#top)) can have surprisingly complex semantics
+listeners](event-listeners.md#top)) can have surprisingly complex semantics
 when using customization points provided to reporters by Catch2, namely
 capturing stdout/stderr from test cases.
 
diff --git a/packages/Catch2/docs/skipping-passing-failing.md b/packages/Catch2/docs/skipping-passing-failing.md
index d866b418d1b572756e914e6c9d7da8ea12e6c0c1..52bb18f764c2b24ba8057f6b299d56cbf9350073 100644
--- a/packages/Catch2/docs/skipping-passing-failing.md
+++ b/packages/Catch2/docs/skipping-passing-failing.md
@@ -9,7 +9,7 @@ In some situations it may not be possible to meaningfully execute a test case,
 for example when the system under test is missing certain hardware capabilities.
 If the required conditions can only be determined at runtime, it often
 doesn't make sense to consider such a test case as either passed or failed,
-because it simply can not run at all.
+because it simply cannot run at all.
 
 To properly express such scenarios, Catch2 provides a way to explicitly
 _skip_ test cases, using the `SKIP` macro:
diff --git a/packages/Catch2/docs/test-cases-and-sections.md b/packages/Catch2/docs/test-cases-and-sections.md
index acebcc51d76419413b2e7a1fc6f873562d158585..01c898bb64973d3883dc5ff133bee42de1cd13cd 100644
--- a/packages/Catch2/docs/test-cases-and-sections.md
+++ b/packages/Catch2/docs/test-cases-and-sections.md
@@ -231,7 +231,7 @@ TEMPLATE_TEST_CASE( "vectors can be sized and resized", "[vector][template]", in
 
 > [Introduced](https://github.com/catchorg/Catch2/issues/1468) in Catch2 2.6.0.
 
-_template-type1_ through _template-typen_ is list of template template
+_template-type1_ through _template-typen_ is list of template
 types which should be combined with each of _template-arg1_ through
  _template-argm_, resulting in _n * m_ test cases. Inside the test case,
 the resulting type is available under the name of `TestType`.
diff --git a/packages/Catch2/docs/tostring.md b/packages/Catch2/docs/tostring.md
index adce3cc764feda479a26389fba74f64733058da3..b99b6742604f2190220cc5b0cacfb603ec6a265f 100644
--- a/packages/Catch2/docs/tostring.md
+++ b/packages/Catch2/docs/tostring.md
@@ -75,7 +75,7 @@ CATCH_TRANSLATE_EXCEPTION( MyType const& ex ) {
 
 Enums that already have a `<<` overload for `std::ostream` will convert to strings as expected.
 If you only need to convert enums to strings for test reporting purposes you can provide a `StringMaker` specialisations as any other type.
-However, as a convenience, Catch provides the `REGISTER_ENUM` helper macro that will generate the `StringMaker` specialiation for you with minimal code.
+However, as a convenience, Catch provides the `REGISTER_ENUM` helper macro that will generate the `StringMaker` specialisation for you with minimal code.
 Simply provide it the (qualified) enum name, followed by all the enum values, and you're done!
 
 E.g.
diff --git a/packages/Catch2/docs/tutorial.md b/packages/Catch2/docs/tutorial.md
index 342c7381812820a1b8a32955e83369a6f7f7ce96..dfccac888dbd1257766709bc8d47936c60b39397 100644
--- a/packages/Catch2/docs/tutorial.md
+++ b/packages/Catch2/docs/tutorial.md
@@ -119,7 +119,7 @@ This is best explained through an example ([code](../examples/100-Fix-Section.cp
 
 ```c++
 TEST_CASE( "vectors can be sized and resized", "[vector]" ) {
-
+    // This setup will be done 4 times in total, once for each section
     std::vector<int> v( 5 );
 
     REQUIRE( v.size() == 5 );
@@ -152,11 +152,12 @@ TEST_CASE( "vectors can be sized and resized", "[vector]" ) {
 }
 ```
 
-For each `SECTION` the `TEST_CASE` is executed from the start. This means
+For each `SECTION` the `TEST_CASE` is **executed from the start**. This means
 that each section is entered with a freshly constructed vector `v`, that
 we know has size 5 and capacity at least 5, because the two assertions
-are also checked before the section is entered. Each run through a test
-case will execute one, and only one, leaf section.
+are also checked before the section is entered. This behaviour may not be
+ideal for tests where setup is expensive. Each run through a test case will
+execute one, and only one, leaf section.
 
 Section can also be nested, in which case the parent section can be
 entered multiple times, once for each leaf section. Nested sections are
diff --git a/packages/Catch2/docs/why-catch.md b/packages/Catch2/docs/why-catch.md
index 2c0178ca5decacbc7bd5214a70e6e8bcc435d121..b7367496b8481896f60aece65f017ef7a888254f 100644
--- a/packages/Catch2/docs/why-catch.md
+++ b/packages/Catch2/docs/why-catch.md
@@ -30,7 +30,7 @@ So what does Catch2 bring to the party that differentiates it from these? Apart
 * Output is through modular reporter objects. Basic textual and XML reporters are included. Custom reporters can easily be added.
 * JUnit xml output is supported for integration with third-party tools, such as CI servers.
 * A default main() function is provided, but you can supply your own for complete control (e.g. integration into your own test runner GUI).
-* A command line parser is provided and can still be used if you choose to provided your own main() function.
+* A command line parser is provided and can still be used if you choose to provide your own main() function.
 * Alternative assertion macro(s) report failures but don't abort the test case
 * Good set of facilities for floating point comparisons (`Catch::Approx` and full set of matchers)
 * Internal and friendly macros are isolated so name clashes can be managed
@@ -41,8 +41,8 @@ So what does Catch2 bring to the party that differentiates it from these? Apart
 
 ## Who else is using Catch2?
 
-A whole lot of people. According to the 2021 JetBrains C++ ecosystem survey,
-about 11% of C++ programmers use Catch2 for unit testing, making it the
+A whole lot of people. According to [the 2022 JetBrains C++ ecosystem survey](https://www.jetbrains.com/lp/devecosystem-2022/cpp/#Which-unit-testing-frameworks-do-you-regularly-use),
+about 12% of C++ programmers use Catch2 for unit testing, making it the
 second most popular unit testing framework.
 
 You can also take a look at the (incomplete) list of [open source projects](opensource-users.md#top)
diff --git a/packages/Catch2/examples/010-TestCase.cpp b/packages/Catch2/examples/010-TestCase.cpp
index 7ec208d5f095b0b22ca7824f145618ef68d41ac6..9e5cd8cd31a1aea231998277ca0bff10a434d6f3 100644
--- a/packages/Catch2/examples/010-TestCase.cpp
+++ b/packages/Catch2/examples/010-TestCase.cpp
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 // 010-TestCase.cpp
 // And write tests in the same file:
 #include <catch2/catch_test_macros.hpp>
diff --git a/packages/Catch2/examples/020-TestCase-1.cpp b/packages/Catch2/examples/020-TestCase-1.cpp
index cec55799ac932ea4e06180e1a71032033ede764d..a9d87dbcbd69d75b6c09ce13f9c8ff5231b1e7c0 100644
--- a/packages/Catch2/examples/020-TestCase-1.cpp
+++ b/packages/Catch2/examples/020-TestCase-1.cpp
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 // 020-TestCase-1.cpp
 
 #include <catch2/catch_test_macros.hpp>
diff --git a/packages/Catch2/examples/020-TestCase-2.cpp b/packages/Catch2/examples/020-TestCase-2.cpp
index 3f5767b3409411a485bc654bfd6fa98073b9e980..72dd0ffb6ad9305b71b6483a1fc8a5fbd3712c87 100644
--- a/packages/Catch2/examples/020-TestCase-2.cpp
+++ b/packages/Catch2/examples/020-TestCase-2.cpp
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 // 020-TestCase-2.cpp
 
 // main() provided by Catch in file 020-TestCase-1.cpp.
diff --git a/packages/Catch2/examples/030-Asn-Require-Check.cpp b/packages/Catch2/examples/030-Asn-Require-Check.cpp
index 0d027ca93ea2149b6c800b41e0fc0c7186ebc233..62cd3cfc4ad884f8b41ce44b4c7bb3d81514d1f7 100644
--- a/packages/Catch2/examples/030-Asn-Require-Check.cpp
+++ b/packages/Catch2/examples/030-Asn-Require-Check.cpp
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 // 030-Asn-Require-Check.cpp
 
 // Catch has two natural expression assertion macro's:
diff --git a/packages/Catch2/examples/100-Fix-Section.cpp b/packages/Catch2/examples/100-Fix-Section.cpp
index cfbfa79f999b5301cd28639db9509956961b0ab4..7c8d8aa86cd82e26a743b49b3702b6f5dcc54420 100644
--- a/packages/Catch2/examples/100-Fix-Section.cpp
+++ b/packages/Catch2/examples/100-Fix-Section.cpp
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 // 100-Fix-Section.cpp
 
 // Catch has two ways to express fixtures:
diff --git a/packages/Catch2/examples/110-Fix-ClassFixture.cpp b/packages/Catch2/examples/110-Fix-ClassFixture.cpp
index 75c10da62a9f4ff840ca1151f221627dedc5af1c..614c37979cf53f9d66c499c4a720d01b8441f684 100644
--- a/packages/Catch2/examples/110-Fix-ClassFixture.cpp
+++ b/packages/Catch2/examples/110-Fix-ClassFixture.cpp
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 // 110-Fix-ClassFixture.cpp
 
 // Catch has two ways to express fixtures:
diff --git a/packages/Catch2/examples/120-Bdd-ScenarioGivenWhenThen.cpp b/packages/Catch2/examples/120-Bdd-ScenarioGivenWhenThen.cpp
index 99cdf9ab99a4eee2edad868c5bf9d64351b1cb67..345d53c387aa0ba116c8e0e59f100de1307411d3 100644
--- a/packages/Catch2/examples/120-Bdd-ScenarioGivenWhenThen.cpp
+++ b/packages/Catch2/examples/120-Bdd-ScenarioGivenWhenThen.cpp
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 // 120-Bdd-ScenarioGivenWhenThen.cpp
 
 // main() provided by linkage with Catch2WithMain
diff --git a/packages/Catch2/examples/210-Evt-EventListeners.cpp b/packages/Catch2/examples/210-Evt-EventListeners.cpp
index 6cedb885c448cb0f8a78bde3ab5b3cade1c905ca..56b050d4111e0b91f2a25ca799ff793fd014cd0f 100644
--- a/packages/Catch2/examples/210-Evt-EventListeners.cpp
+++ b/packages/Catch2/examples/210-Evt-EventListeners.cpp
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 // 210-Evt-EventListeners.cpp
 
 // Contents:
diff --git a/packages/Catch2/examples/231-Cfg-OutputStreams.cpp b/packages/Catch2/examples/231-Cfg-OutputStreams.cpp
index b77c127354b97dd34823697d08d41923a5e98d19..da1713cf8ca26c37db399aaf45622d61f4fa4485 100644
--- a/packages/Catch2/examples/231-Cfg-OutputStreams.cpp
+++ b/packages/Catch2/examples/231-Cfg-OutputStreams.cpp
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 // 231-Cfg-OutputStreams.cpp
 // Show how to replace the streams with a simple custom made streambuf.
 
diff --git a/packages/Catch2/examples/232-Cfg-CustomMain.cpp b/packages/Catch2/examples/232-Cfg-CustomMain.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..69fba7f16e06e0442046adb46b2ebb3d6c0544ab
--- /dev/null
+++ b/packages/Catch2/examples/232-Cfg-CustomMain.cpp
@@ -0,0 +1,41 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
+// 232-Cfg-CustomMain.cpp
+// Show how to use custom main and add a custom option to the CLI parser
+
+#include <catch2/catch_session.hpp>
+
+#include <iostream>
+
+int main(int argc, char** argv) {
+  Catch::Session session; // There must be exactly one instance
+
+  int height = 0; // Some user variable you want to be able to set
+
+  // Build a new parser on top of Catch2's
+  using namespace Catch::Clara;
+  auto cli
+    = session.cli()           // Get Catch2's command line parser
+    | Opt( height, "height" ) // bind variable to a new option, with a hint string
+         ["--height"]         // the option names it will respond to
+         ("how high?");       // description string for the help output
+
+  // Now pass the new composite back to Catch2 so it uses that
+  session.cli( cli );
+
+  // Let Catch2 (using Clara) parse the command line
+  int returnCode = session.applyCommandLine( argc, argv );
+  if( returnCode != 0 ) // Indicates a command line error
+      return returnCode;
+
+  // if set on the command line then 'height' is now set at this point
+  std::cout << "height: " << height << std::endl;
+
+  return session.run();
+}
diff --git a/packages/Catch2/examples/300-Gen-OwnGenerator.cpp b/packages/Catch2/examples/300-Gen-OwnGenerator.cpp
index 09643d6f78c47986117805943dd7c4df59c9adf3..b5d951ac474725098b7d2ed1858c7a802e582318 100644
--- a/packages/Catch2/examples/300-Gen-OwnGenerator.cpp
+++ b/packages/Catch2/examples/300-Gen-OwnGenerator.cpp
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 // 300-Gen-OwnGenerator.cpp
 // Shows how to define a custom generator.
 
diff --git a/packages/Catch2/examples/301-Gen-MapTypeConversion.cpp b/packages/Catch2/examples/301-Gen-MapTypeConversion.cpp
index ba55f65f156cf5dfaf344198a1d583c57113b5c0..a065d87ae7a110e647d354ff1f67aeaba81615af 100644
--- a/packages/Catch2/examples/301-Gen-MapTypeConversion.cpp
+++ b/packages/Catch2/examples/301-Gen-MapTypeConversion.cpp
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 // 301-Gen-MapTypeConversion.cpp
 // Shows how to use map to modify generator's return type.
 
diff --git a/packages/Catch2/examples/302-Gen-Table.cpp b/packages/Catch2/examples/302-Gen-Table.cpp
index 97809889ce1d979fce6e817329c1967c5671b384..3cdb1430136cc6d0ad0825b999c1a5e9792d9754 100644
--- a/packages/Catch2/examples/302-Gen-Table.cpp
+++ b/packages/Catch2/examples/302-Gen-Table.cpp
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 // 302-Gen-Table.cpp
 // Shows how to use table to run a test many times with different inputs. Lifted from examples on
 // issue #850.
diff --git a/packages/Catch2/examples/310-Gen-VariablesInGenerators.cpp b/packages/Catch2/examples/310-Gen-VariablesInGenerators.cpp
index 0339c5f18a55ccefa821ebbdf2db4baf78fae4bb..5d24d45a1760e2366d176acf1295aba118694292 100644
--- a/packages/Catch2/examples/310-Gen-VariablesInGenerators.cpp
+++ b/packages/Catch2/examples/310-Gen-VariablesInGenerators.cpp
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 // 310-Gen-VariablesInGenerator.cpp
 // Shows how to use variables when creating generators.
 
diff --git a/packages/Catch2/examples/311-Gen-CustomCapture.cpp b/packages/Catch2/examples/311-Gen-CustomCapture.cpp
index d12ee70901261137ba0e98d222f504379beeeded..ee310383557cbe8e754423af53199a007aa9321a 100644
--- a/packages/Catch2/examples/311-Gen-CustomCapture.cpp
+++ b/packages/Catch2/examples/311-Gen-CustomCapture.cpp
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 // 311-Gen-CustomCapture.cpp
 // Shows how to provide custom capture list to the generator expression
 
diff --git a/packages/Catch2/examples/CMakeLists.txt b/packages/Catch2/examples/CMakeLists.txt
index f993334180452305fb8c0e4f3356f1e1d07a3e33..82734adab07827475c564a2d0342320abcbc69cb 100644
--- a/packages/Catch2/examples/CMakeLists.txt
+++ b/packages/Catch2/examples/CMakeLists.txt
@@ -30,6 +30,7 @@ set( SOURCES_IDIOMATIC_EXAMPLES
     110-Fix-ClassFixture.cpp
     120-Bdd-ScenarioGivenWhenThen.cpp
     210-Evt-EventListeners.cpp
+    232-Cfg-CustomMain.cpp
     300-Gen-OwnGenerator.cpp
     301-Gen-MapTypeConversion.cpp
     302-Gen-Table.cpp
@@ -53,7 +54,7 @@ set(ALL_EXAMPLE_TARGETS
 )
 
 foreach( name ${ALL_EXAMPLE_TARGETS} )
-    target_link_libraries( ${name} Catch2 Catch2WithMain )
+    target_link_libraries( ${name} Catch2WithMain )
 endforeach()
 
 
diff --git a/packages/Catch2/extras/Catch.cmake b/packages/Catch2/extras/Catch.cmake
index b37b0bf6db6c64d1114b2b507dc0950f9bfa92af..8f30688c52afbc817dc3eccefb66dd120adc3e81 100644
--- a/packages/Catch2/extras/Catch.cmake
+++ b/packages/Catch2/extras/Catch.cmake
@@ -176,8 +176,10 @@ function(catch_discover_tests TARGET)
   string(SUBSTRING ${args_hash} 0 7 args_hash)
 
   # Define rule to generate test list for aforementioned test executable
-  set(ctest_include_file "${CMAKE_CURRENT_BINARY_DIR}/${TARGET}_include-${args_hash}.cmake")
-  set(ctest_tests_file "${CMAKE_CURRENT_BINARY_DIR}/${TARGET}_tests-${args_hash}.cmake")
+  set(ctest_file_base "${CMAKE_CURRENT_BINARY_DIR}/${TARGET}-${args_hash}")
+  set(ctest_include_file "${ctest_file_base}_include.cmake")
+  set(ctest_tests_file "${ctest_file_base}_tests.cmake")
+
   get_property(crosscompiling_emulator
     TARGET ${TARGET}
     PROPERTY CROSSCOMPILING_EMULATOR
@@ -218,6 +220,14 @@ function(catch_discover_tests TARGET)
 
   elseif(_DISCOVERY_MODE STREQUAL "PRE_TEST")
 
+    get_property(GENERATOR_IS_MULTI_CONFIG GLOBAL
+        PROPERTY GENERATOR_IS_MULTI_CONFIG
+    )
+
+    if(GENERATOR_IS_MULTI_CONFIG)
+      set(ctest_tests_file "${ctest_file_base}_tests-$<CONFIG>.cmake")
+    endif()
+
     string(CONCAT ctest_include_content
       "if(EXISTS \"$<TARGET_FILE:${TARGET}>\")"                                    "\n"
       "  if(NOT EXISTS \"${ctest_tests_file}\" OR"                                 "\n"
@@ -249,7 +259,22 @@ function(catch_discover_tests TARGET)
       "endif()"                                                                    "\n"
     )
 
-    file(GENERATE OUTPUT "${ctest_include_file}" CONTENT "${ctest_include_content}")
+    if(GENERATOR_IS_MULTI_CONFIG)
+      foreach(_config ${CMAKE_CONFIGURATION_TYPES})
+        file(GENERATE OUTPUT "${ctest_file_base}_include-${_config}.cmake" CONTENT "${ctest_include_content}" CONDITION $<CONFIG:${_config}>)
+      endforeach()
+      string(CONCAT ctest_include_multi_content
+        "if(NOT CTEST_CONFIGURATION_TYPE)"                                              "\n"
+        "  message(\"No configuration for testing specified, use '-C <cfg>'.\")"        "\n"
+        "else()"                                                                        "\n"
+        "  include(\"${ctest_file_base}_include-\${CTEST_CONFIGURATION_TYPE}.cmake\")"  "\n"
+        "endif()"                                                                       "\n"
+      )
+      file(GENERATE OUTPUT "${ctest_include_file}" CONTENT "${ctest_include_multi_content}")
+    else()
+      file(GENERATE OUTPUT "${ctest_file_base}_include.cmake" CONTENT "${ctest_include_content}")
+      file(WRITE "${ctest_include_file}" "include(\"${ctest_file_base}_include.cmake\")")
+    endif()
   endif()
 
   if(NOT ${CMAKE_VERSION} VERSION_LESS "3.10.0")
diff --git a/packages/Catch2/extras/CatchAddTests.cmake b/packages/Catch2/extras/CatchAddTests.cmake
index 91f79f3c5ba2ec2d9bad8eede892642dc110687f..692e340566fc4c63def7a3a5b60a67e0ccb1a538 100644
--- a/packages/Catch2/extras/CatchAddTests.cmake
+++ b/packages/Catch2/extras/CatchAddTests.cmake
@@ -74,6 +74,10 @@ function(catch_discover_tests_impl)
     )
   endif()
 
+  # Make sure to escape ; (semicolons) in test names first, because
+  # that'd break the foreach loop for "Parse output" later and create
+  # wrongly splitted and thus failing test cases (false positives)
+  string(REPLACE ";" "\;" output "${output}")
   string(REPLACE "\n" ";" output "${output}")
 
   # Prepare reporter
@@ -84,10 +88,10 @@ function(catch_discover_tests_impl)
     # note that the use of --list-reporters is not the important part,
     # we only want to check whether the execution succeeds with ${reporter_arg}
     execute_process(
-      COMMAND ${TEST_EXECUTOR} "${TEST_EXECUTABLE}" ${spec} ${reporter_arg} --list-reporters
+      COMMAND ${_TEST_EXECUTOR} "${_TEST_EXECUTABLE}" ${spec} ${reporter_arg} --list-reporters
       OUTPUT_VARIABLE reporter_check_output
       RESULT_VARIABLE reporter_check_result
-      WORKING_DIRECTORY "${TEST_WORKING_DIR}"
+      WORKING_DIRECTORY "${_TEST_WORKING_DIR}"
     )
     if(${reporter_check_result} EQUAL 255)
       message(FATAL_ERROR
@@ -95,7 +99,7 @@ function(catch_discover_tests_impl)
       )
     elseif(NOT ${reporter_check_result} EQUAL 0)
       message(FATAL_ERROR
-        "Error running test executable '${TEST_EXECUTABLE}':\n"
+        "Error running test executable '${_TEST_EXECUTABLE}':\n"
         "  Result: ${reporter_check_result}\n"
         "  Output: ${reporter_check_output}\n"
       )
@@ -119,15 +123,16 @@ function(catch_discover_tests_impl)
 
   # Parse output
   foreach(line ${output})
-    set(test ${line})
+    set(test "${line}")
     # Escape characters in test case names that would be parsed by Catch2
-    set(test_name ${test})
-    foreach(char , [ ])
-      string(REPLACE ${char} "\\${char}" test_name ${test_name})
+    # Note that the \ escaping must happen FIRST! Do not change the order.
+    set(test_name "${test}")
+    foreach(char \\ , [ ])
+      string(REPLACE ${char} "\\${char}" test_name "${test_name}")
     endforeach(char)
     # ...add output dir
     if(output_dir)
-      string(REGEX REPLACE "[^A-Za-z0-9_]" "_" test_name_clean ${test_name})
+      string(REGEX REPLACE "[^A-Za-z0-9_]" "_" test_name_clean "${test_name}")
       set(output_dir_arg "--out ${output_dir}/${output_prefix}${test_name_clean}${output_suffix}")
     endif()
 
diff --git a/packages/Catch2/extras/catch_amalgamated.cpp b/packages/Catch2/extras/catch_amalgamated.cpp
index a81b1b6ae5f852305d8d7cfefee26317e1486357..f68c9005ba3940ee9bb2f4eda104bd1db6f29f69 100644
--- a/packages/Catch2/extras/catch_amalgamated.cpp
+++ b/packages/Catch2/extras/catch_amalgamated.cpp
@@ -1,3 +1,4 @@
+
 //              Copyright Catch2 Authors
 // Distributed under the Boost Software License, Version 1.0.
 //   (See accompanying file LICENSE.txt or copy at
@@ -5,8 +6,8 @@
 
 // SPDX-License-Identifier: BSL-1.0
 
-//  Catch v3.3.2
-//  Generated: 2023-02-26 10:28:48.270752
+//  Catch v3.5.2
+//  Generated: 2024-01-15 14:06:36.675713
 //  ----------------------------------------------------------
 //  This file is an amalgamation of multiple different files.
 //  You probably shouldn't edit it directly.
@@ -48,6 +49,80 @@ namespace Catch {
 } // namespace Catch
 
 
+// Adapted from donated nonius code.
+
+
+#include <vector>
+
+namespace Catch {
+    namespace Benchmark {
+        namespace Detail {
+            SampleAnalysis analyse(const IConfig &cfg, FDuration* first, FDuration* last) {
+                if (!cfg.benchmarkNoAnalysis()) {
+                    std::vector<double> samples;
+                    samples.reserve(static_cast<size_t>(last - first));
+                    for (auto current = first; current != last; ++current) {
+                        samples.push_back( current->count() );
+                    }
+
+                    auto analysis = Catch::Benchmark::Detail::analyse_samples(
+                        cfg.benchmarkConfidenceInterval(),
+                        cfg.benchmarkResamples(),
+                        samples.data(),
+                        samples.data() + samples.size() );
+                    auto outliers = Catch::Benchmark::Detail::classify_outliers(
+                        samples.data(), samples.data() + samples.size() );
+
+                    auto wrap_estimate = [](Estimate<double> e) {
+                        return Estimate<FDuration> {
+                            FDuration(e.point),
+                                FDuration(e.lower_bound),
+                                FDuration(e.upper_bound),
+                                e.confidence_interval,
+                        };
+                    };
+                    std::vector<FDuration> samples2;
+                    samples2.reserve(samples.size());
+                    for (auto s : samples) {
+                        samples2.push_back( FDuration( s ) );
+                    }
+
+                    return {
+                        CATCH_MOVE(samples2),
+                        wrap_estimate(analysis.mean),
+                        wrap_estimate(analysis.standard_deviation),
+                        outliers,
+                        analysis.outlier_variance,
+                    };
+                } else {
+                    std::vector<FDuration> samples;
+                    samples.reserve(static_cast<size_t>(last - first));
+
+                    FDuration mean = FDuration(0);
+                    int i = 0;
+                    for (auto it = first; it < last; ++it, ++i) {
+                        samples.push_back(FDuration(*it));
+                        mean += FDuration(*it);
+                    }
+                    mean /= i;
+
+                    return SampleAnalysis{
+                        CATCH_MOVE(samples),
+                        Estimate<FDuration>{ mean, mean, mean, 0.0 },
+                        Estimate<FDuration>{ FDuration( 0 ),
+                                             FDuration( 0 ),
+                                             FDuration( 0 ),
+                                             0.0 },
+                        OutlierClassification{},
+                        0.0
+                    };
+                }
+            }
+        } // namespace Detail
+    } // namespace Benchmark
+} // namespace Catch
+
+
 
 
 namespace Catch {
@@ -60,6 +135,7 @@ namespace Catch {
 
 
 
+
 #include <exception>
 
 namespace Catch {
@@ -86,9 +162,11 @@ namespace Catch {
 
 
 
+#include <algorithm>
 #include <cassert>
+#include <cmath>
 #include <cstddef>
-#include <iterator>
+#include <numeric>
 #include <random>
 
 
@@ -96,139 +174,199 @@ namespace Catch {
 #include <future>
 #endif
 
-namespace {
+namespace Catch {
+    namespace Benchmark {
+        namespace Detail {
+            namespace {
+
+                template <typename URng, typename Estimator>
+                static sample
+                resample( URng& rng,
+                          unsigned int resamples,
+                          double const* first,
+                          double const* last,
+                          Estimator& estimator ) {
+                    auto n = static_cast<size_t>( last - first );
+                    std::uniform_int_distribution<size_t> dist( 0, n - 1 );
+
+                    sample out;
+                    out.reserve( resamples );
+                    std::vector<double> resampled;
+                    resampled.reserve( n );
+                    for ( size_t i = 0; i < resamples; ++i ) {
+                        resampled.clear();
+                        for ( size_t s = 0; s < n; ++s ) {
+                            resampled.push_back( first[dist( rng )] );
+                        }
+                        const auto estimate =
+                            estimator( resampled.data(), resampled.data() + resampled.size() );
+                        out.push_back( estimate );
+                    }
+                    std::sort( out.begin(), out.end() );
+                    return out;
+                }
 
-using Catch::Benchmark::Detail::sample;
-
-     template <typename URng, typename Estimator>
-     sample resample(URng& rng, unsigned int resamples, std::vector<double>::iterator first, std::vector<double>::iterator last, Estimator& estimator) {
-         auto n = static_cast<size_t>(last - first);
-         std::uniform_int_distribution<decltype(n)> dist(0, n - 1);
-
-         sample out;
-         out.reserve(resamples);
-         std::generate_n(std::back_inserter(out), resamples, [n, first, &estimator, &dist, &rng] {
-             std::vector<double> resampled;
-             resampled.reserve(n);
-             std::generate_n(std::back_inserter(resampled), n, [first, &dist, &rng] { return first[static_cast<std::ptrdiff_t>(dist(rng))]; });
-             return estimator(resampled.begin(), resampled.end());
-         });
-         std::sort(out.begin(), out.end());
-         return out;
-     }
-
-
-    double erf_inv(double x) {
-        // Code accompanying the article "Approximating the erfinv function" in GPU Computing Gems, Volume 2
-        double w, p;
-
-        w = -log((1.0 - x) * (1.0 + x));
-
-        if (w < 6.250000) {
-            w = w - 3.125000;
-            p = -3.6444120640178196996e-21;
-            p = -1.685059138182016589e-19 + p * w;
-            p = 1.2858480715256400167e-18 + p * w;
-            p = 1.115787767802518096e-17 + p * w;
-            p = -1.333171662854620906e-16 + p * w;
-            p = 2.0972767875968561637e-17 + p * w;
-            p = 6.6376381343583238325e-15 + p * w;
-            p = -4.0545662729752068639e-14 + p * w;
-            p = -8.1519341976054721522e-14 + p * w;
-            p = 2.6335093153082322977e-12 + p * w;
-            p = -1.2975133253453532498e-11 + p * w;
-            p = -5.4154120542946279317e-11 + p * w;
-            p = 1.051212273321532285e-09 + p * w;
-            p = -4.1126339803469836976e-09 + p * w;
-            p = -2.9070369957882005086e-08 + p * w;
-            p = 4.2347877827932403518e-07 + p * w;
-            p = -1.3654692000834678645e-06 + p * w;
-            p = -1.3882523362786468719e-05 + p * w;
-            p = 0.0001867342080340571352 + p * w;
-            p = -0.00074070253416626697512 + p * w;
-            p = -0.0060336708714301490533 + p * w;
-            p = 0.24015818242558961693 + p * w;
-            p = 1.6536545626831027356 + p * w;
-        } else if (w < 16.000000) {
-            w = sqrt(w) - 3.250000;
-            p = 2.2137376921775787049e-09;
-            p = 9.0756561938885390979e-08 + p * w;
-            p = -2.7517406297064545428e-07 + p * w;
-            p = 1.8239629214389227755e-08 + p * w;
-            p = 1.5027403968909827627e-06 + p * w;
-            p = -4.013867526981545969e-06 + p * w;
-            p = 2.9234449089955446044e-06 + p * w;
-            p = 1.2475304481671778723e-05 + p * w;
-            p = -4.7318229009055733981e-05 + p * w;
-            p = 6.8284851459573175448e-05 + p * w;
-            p = 2.4031110387097893999e-05 + p * w;
-            p = -0.0003550375203628474796 + p * w;
-            p = 0.00095328937973738049703 + p * w;
-            p = -0.0016882755560235047313 + p * w;
-            p = 0.0024914420961078508066 + p * w;
-            p = -0.0037512085075692412107 + p * w;
-            p = 0.005370914553590063617 + p * w;
-            p = 1.0052589676941592334 + p * w;
-            p = 3.0838856104922207635 + p * w;
-        } else {
-            w = sqrt(w) - 5.000000;
-            p = -2.7109920616438573243e-11;
-            p = -2.5556418169965252055e-10 + p * w;
-            p = 1.5076572693500548083e-09 + p * w;
-            p = -3.7894654401267369937e-09 + p * w;
-            p = 7.6157012080783393804e-09 + p * w;
-            p = -1.4960026627149240478e-08 + p * w;
-            p = 2.9147953450901080826e-08 + p * w;
-            p = -6.7711997758452339498e-08 + p * w;
-            p = 2.2900482228026654717e-07 + p * w;
-            p = -9.9298272942317002539e-07 + p * w;
-            p = 4.5260625972231537039e-06 + p * w;
-            p = -1.9681778105531670567e-05 + p * w;
-            p = 7.5995277030017761139e-05 + p * w;
-            p = -0.00021503011930044477347 + p * w;
-            p = -0.00013871931833623122026 + p * w;
-            p = 1.0103004648645343977 + p * w;
-            p = 4.8499064014085844221 + p * w;
-        }
-        return p * x;
-    }
-
-    double standard_deviation(std::vector<double>::iterator first, std::vector<double>::iterator last) {
-        auto m = Catch::Benchmark::Detail::mean(first, last);
-        double variance = std::accumulate( first,
-                                           last,
-                                           0.,
-                                           [m]( double a, double b ) {
-                                               double diff = b - m;
-                                               return a + diff * diff;
-                                           } ) /
-                          ( last - first );
-        return std::sqrt( variance );
-    }
+                static double outlier_variance( Estimate<double> mean,
+                                                Estimate<double> stddev,
+                                                int n ) {
+                    double sb = stddev.point;
+                    double mn = mean.point / n;
+                    double mg_min = mn / 2.;
+                    double sg = (std::min)( mg_min / 4., sb / std::sqrt( n ) );
+                    double sg2 = sg * sg;
+                    double sb2 = sb * sb;
+
+                    auto c_max = [n, mn, sb2, sg2]( double x ) -> double {
+                        double k = mn - x;
+                        double d = k * k;
+                        double nd = n * d;
+                        double k0 = -n * nd;
+                        double k1 = sb2 - n * sg2 + nd;
+                        double det = k1 * k1 - 4 * sg2 * k0;
+                        return static_cast<int>( -2. * k0 /
+                                                 ( k1 + std::sqrt( det ) ) );
+                    };
+
+                    auto var_out = [n, sb2, sg2]( double c ) {
+                        double nc = n - c;
+                        return ( nc / n ) * ( sb2 - nc * sg2 );
+                    };
+
+                    return (std::min)( var_out( 1 ),
+                                       var_out(
+                                           (std::min)( c_max( 0. ),
+                                                       c_max( mg_min ) ) ) ) /
+                           sb2;
+                }
 
-}
+                static double erf_inv( double x ) {
+                    // Code accompanying the article "Approximating the erfinv
+                    // function" in GPU Computing Gems, Volume 2
+                    double w, p;
+
+                    w = -log( ( 1.0 - x ) * ( 1.0 + x ) );
+
+                    if ( w < 6.250000 ) {
+                        w = w - 3.125000;
+                        p = -3.6444120640178196996e-21;
+                        p = -1.685059138182016589e-19 + p * w;
+                        p = 1.2858480715256400167e-18 + p * w;
+                        p = 1.115787767802518096e-17 + p * w;
+                        p = -1.333171662854620906e-16 + p * w;
+                        p = 2.0972767875968561637e-17 + p * w;
+                        p = 6.6376381343583238325e-15 + p * w;
+                        p = -4.0545662729752068639e-14 + p * w;
+                        p = -8.1519341976054721522e-14 + p * w;
+                        p = 2.6335093153082322977e-12 + p * w;
+                        p = -1.2975133253453532498e-11 + p * w;
+                        p = -5.4154120542946279317e-11 + p * w;
+                        p = 1.051212273321532285e-09 + p * w;
+                        p = -4.1126339803469836976e-09 + p * w;
+                        p = -2.9070369957882005086e-08 + p * w;
+                        p = 4.2347877827932403518e-07 + p * w;
+                        p = -1.3654692000834678645e-06 + p * w;
+                        p = -1.3882523362786468719e-05 + p * w;
+                        p = 0.0001867342080340571352 + p * w;
+                        p = -0.00074070253416626697512 + p * w;
+                        p = -0.0060336708714301490533 + p * w;
+                        p = 0.24015818242558961693 + p * w;
+                        p = 1.6536545626831027356 + p * w;
+                    } else if ( w < 16.000000 ) {
+                        w = sqrt( w ) - 3.250000;
+                        p = 2.2137376921775787049e-09;
+                        p = 9.0756561938885390979e-08 + p * w;
+                        p = -2.7517406297064545428e-07 + p * w;
+                        p = 1.8239629214389227755e-08 + p * w;
+                        p = 1.5027403968909827627e-06 + p * w;
+                        p = -4.013867526981545969e-06 + p * w;
+                        p = 2.9234449089955446044e-06 + p * w;
+                        p = 1.2475304481671778723e-05 + p * w;
+                        p = -4.7318229009055733981e-05 + p * w;
+                        p = 6.8284851459573175448e-05 + p * w;
+                        p = 2.4031110387097893999e-05 + p * w;
+                        p = -0.0003550375203628474796 + p * w;
+                        p = 0.00095328937973738049703 + p * w;
+                        p = -0.0016882755560235047313 + p * w;
+                        p = 0.0024914420961078508066 + p * w;
+                        p = -0.0037512085075692412107 + p * w;
+                        p = 0.005370914553590063617 + p * w;
+                        p = 1.0052589676941592334 + p * w;
+                        p = 3.0838856104922207635 + p * w;
+                    } else {
+                        w = sqrt( w ) - 5.000000;
+                        p = -2.7109920616438573243e-11;
+                        p = -2.5556418169965252055e-10 + p * w;
+                        p = 1.5076572693500548083e-09 + p * w;
+                        p = -3.7894654401267369937e-09 + p * w;
+                        p = 7.6157012080783393804e-09 + p * w;
+                        p = -1.4960026627149240478e-08 + p * w;
+                        p = 2.9147953450901080826e-08 + p * w;
+                        p = -6.7711997758452339498e-08 + p * w;
+                        p = 2.2900482228026654717e-07 + p * w;
+                        p = -9.9298272942317002539e-07 + p * w;
+                        p = 4.5260625972231537039e-06 + p * w;
+                        p = -1.9681778105531670567e-05 + p * w;
+                        p = 7.5995277030017761139e-05 + p * w;
+                        p = -0.00021503011930044477347 + p * w;
+                        p = -0.00013871931833623122026 + p * w;
+                        p = 1.0103004648645343977 + p * w;
+                        p = 4.8499064014085844221 + p * w;
+                    }
+                    return p * x;
+                }
+
+                static double
+                standard_deviation( double const* first, double const* last ) {
+                    auto m = Catch::Benchmark::Detail::mean( first, last );
+                    double variance =
+                        std::accumulate( first,
+                                         last,
+                                         0.,
+                                         [m]( double a, double b ) {
+                                             double diff = b - m;
+                                             return a + diff * diff;
+                                         } ) /
+                        ( last - first );
+                    return std::sqrt( variance );
+                }
+
+                static sample jackknife( double ( *estimator )( double const*,
+                                                                double const* ),
+                                         double* first,
+                                         double* last ) {
+                    const auto second = first + 1;
+                    sample results;
+                    results.reserve( static_cast<size_t>( last - first ) );
+
+                    for ( auto it = first; it != last; ++it ) {
+                        std::iter_swap( it, first );
+                        results.push_back( estimator( second, last ) );
+                    }
+
+                    return results;
+                }
+
+
+            } // namespace
+        }     // namespace Detail
+    }         // namespace Benchmark
+} // namespace Catch
 
 namespace Catch {
     namespace Benchmark {
         namespace Detail {
 
-#if defined( __GNUC__ ) || defined( __clang__ )
-#    pragma GCC diagnostic push
-#    pragma GCC diagnostic ignored "-Wfloat-equal"
-#endif
-            bool directCompare( double lhs, double rhs ) { return lhs == rhs; }
-#if defined( __GNUC__ ) || defined( __clang__ )
-#    pragma GCC diagnostic pop
-#endif
-
-            double weighted_average_quantile(int k, int q, std::vector<double>::iterator first, std::vector<double>::iterator last) {
+            double weighted_average_quantile( int k,
+                                              int q,
+                                              double* first,
+                                              double* last ) {
                 auto count = last - first;
                 double idx = (count - 1) * k / static_cast<double>(q);
                 int j = static_cast<int>(idx);
                 double g = idx - j;
                 std::nth_element(first, first + j, last);
                 auto xj = first[j];
-                if ( directCompare( g, 0 ) ) {
+                if ( Catch::Detail::directCompare( g, 0 ) ) {
                     return xj;
                 }
 
@@ -236,6 +374,48 @@ namespace Catch {
                 return xj + g * (xj1 - xj);
             }
 
+            OutlierClassification
+            classify_outliers( double const* first, double const* last ) {
+                std::vector<double> copy( first, last );
+
+                auto q1 = weighted_average_quantile( 1, 4, copy.data(), copy.data() + copy.size() );
+                auto q3 = weighted_average_quantile( 3, 4, copy.data(), copy.data() + copy.size() );
+                auto iqr = q3 - q1;
+                auto los = q1 - ( iqr * 3. );
+                auto lom = q1 - ( iqr * 1.5 );
+                auto him = q3 + ( iqr * 1.5 );
+                auto his = q3 + ( iqr * 3. );
+
+                OutlierClassification o;
+                for ( ; first != last; ++first ) {
+                    const double t = *first;
+                    if ( t < los ) {
+                        ++o.low_severe;
+                    } else if ( t < lom ) {
+                        ++o.low_mild;
+                    } else if ( t > his ) {
+                        ++o.high_severe;
+                    } else if ( t > him ) {
+                        ++o.high_mild;
+                    }
+                    ++o.samples_seen;
+                }
+                return o;
+            }
+
+            double mean( double const* first, double const* last ) {
+                auto count = last - first;
+                double sum = 0.;
+                while (first != last) {
+                    sum += *first;
+                    ++first;
+                }
+                return sum / static_cast<double>(count);
+            }
+
+            double normal_cdf( double x ) {
+                return std::erfc( -x / std::sqrt( 2.0 ) ) / 2.0;
+            }
 
             double erfc_inv(double x) {
                 return erf_inv(1.0 - x);
@@ -257,50 +437,77 @@ namespace Catch {
                 return result;
             }
 
+            Estimate<double>
+            bootstrap( double confidence_level,
+                       double* first,
+                       double* last,
+                       sample const& resample,
+                       double ( *estimator )( double const*, double const* ) ) {
+                auto n_samples = last - first;
+
+                double point = estimator( first, last );
+                // Degenerate case with a single sample
+                if ( n_samples == 1 )
+                    return { point, point, point, confidence_level };
+
+                sample jack = jackknife( estimator, first, last );
+                double jack_mean =
+                    mean( jack.data(), jack.data() + jack.size() );
+                double sum_squares = 0, sum_cubes = 0;
+                for ( double x : jack ) {
+                    auto difference = jack_mean - x;
+                    auto square = difference * difference;
+                    auto cube = square * difference;
+                    sum_squares += square;
+                    sum_cubes += cube;
+                }
 
-            double outlier_variance(Estimate<double> mean, Estimate<double> stddev, int n) {
-                double sb = stddev.point;
-                double mn = mean.point / n;
-                double mg_min = mn / 2.;
-                double sg = (std::min)(mg_min / 4., sb / std::sqrt(n));
-                double sg2 = sg * sg;
-                double sb2 = sb * sb;
+                double accel = sum_cubes / ( 6 * std::pow( sum_squares, 1.5 ) );
+                long n = static_cast<long>( resample.size() );
+                double prob_n =
+                    std::count_if( resample.begin(),
+                                   resample.end(),
+                                   [point]( double x ) { return x < point; } ) /
+                    static_cast<double>( n );
+                // degenerate case with uniform samples
+                if ( Catch::Detail::directCompare( prob_n, 0. ) ) {
+                    return { point, point, point, confidence_level };
+                }
 
-                auto c_max = [n, mn, sb2, sg2](double x) -> double {
-                    double k = mn - x;
-                    double d = k * k;
-                    double nd = n * d;
-                    double k0 = -n * nd;
-                    double k1 = sb2 - n * sg2 + nd;
-                    double det = k1 * k1 - 4 * sg2 * k0;
-                    return static_cast<int>(-2. * k0 / (k1 + std::sqrt(det)));
-                };
+                double bias = normal_quantile( prob_n );
+                double z1 = normal_quantile( ( 1. - confidence_level ) / 2. );
 
-                auto var_out = [n, sb2, sg2](double c) {
-                    double nc = n - c;
-                    return (nc / n) * (sb2 - nc * sg2);
+                auto cumn = [n]( double x ) -> long {
+                    return std::lround( normal_cdf( x ) *
+                                        static_cast<double>( n ) );
                 };
-
-                return (std::min)(var_out(1), var_out((std::min)(c_max(0.), c_max(mg_min)))) / sb2;
-            }
-
-
-            bootstrap_analysis analyse_samples(double confidence_level, unsigned int n_resamples, std::vector<double>::iterator first, std::vector<double>::iterator last) {
-                CATCH_INTERNAL_START_WARNINGS_SUPPRESSION
-                CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS
-                static std::random_device entropy;
-                CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
-
-                auto n = static_cast<int>(last - first); // seriously, one can't use integral types without hell in C++
-
-                auto mean = &Detail::mean<std::vector<double>::iterator>;
+                auto a = [bias, accel]( double b ) {
+                    return bias + b / ( 1. - accel * b );
+                };
+                double b1 = bias + z1;
+                double b2 = bias - z1;
+                double a1 = a( b1 );
+                double a2 = a( b2 );
+                auto lo = static_cast<size_t>( (std::max)( cumn( a1 ), 0l ) );
+                auto hi =
+                    static_cast<size_t>( (std::min)( cumn( a2 ), n - 1 ) );
+
+                return { point, resample[lo], resample[hi], confidence_level };
+            }
+
+            bootstrap_analysis analyse_samples(double confidence_level,
+                                               unsigned int n_resamples,
+                                               double* first,
+                                               double* last) {
+                auto mean = &Detail::mean;
                 auto stddev = &standard_deviation;
 
 #if defined(CATCH_CONFIG_USE_ASYNC)
-                auto Estimate = [=](double(*f)(std::vector<double>::iterator, std::vector<double>::iterator)) {
-                    auto seed = entropy();
+                auto Estimate = [=](double(*f)(double const*, double const*)) {
+                    std::random_device rd;
+                    auto seed = rd();
                     return std::async(std::launch::async, [=] {
-                        std::mt19937 rng(seed);
+                        SimplePcg32 rng( seed );
                         auto resampled = resample(rng, n_resamples, first, last, f);
                         return bootstrap(confidence_level, first, last, resampled, f);
                     });
@@ -312,9 +519,10 @@ namespace Catch {
                 auto mean_estimate = mean_future.get();
                 auto stddev_estimate = stddev_future.get();
 #else
-                auto Estimate = [=](double(*f)(std::vector<double>::iterator, std::vector<double>::iterator)) {
-                    auto seed = entropy();
-                    std::mt19937 rng(seed);
+                auto Estimate = [=](double(*f)(double const* , double const*)) {
+                    std::random_device rd;
+                    auto seed = rd();
+                    SimplePcg32 rng( seed );
                     auto resampled = resample(rng, n_resamples, first, last, f);
                     return bootstrap(confidence_level, first, last, resampled, f);
                 };
@@ -323,6 +531,7 @@ namespace Catch {
                 auto stddev_estimate = Estimate(stddev);
 #endif // CATCH_USE_ASYNC
 
+                auto n = static_cast<int>(last - first); // seriously, one can't use integral types without hell in C++
                 double outlier_variance = Detail::outlier_variance(mean_estimate, stddev_estimate, n);
 
                 return { mean_estimate, stddev_estimate, outlier_variance };
@@ -394,10 +603,10 @@ namespace Catch {
     }
 
 namespace literals {
-    Approx operator "" _a(long double val) {
+    Approx operator ""_a(long double val) {
         return Approx(val);
     }
-    Approx operator "" _a(unsigned long long val) {
+    Approx operator ""_a(unsigned long long val) {
         return Approx(val);
     }
 } // end namespace literals
@@ -596,7 +805,7 @@ namespace Catch {
             elem = trim(elem);
         }
 
-        // Insert the default reporter if user hasn't asked for a specfic one
+        // Insert the default reporter if user hasn't asked for a specific one
         if ( m_data.reporterSpecifications.empty() ) {
             m_data.reporterSpecifications.push_back( {
 #if defined( CATCH_CONFIG_DEFAULT_REPORTER )
@@ -775,7 +984,11 @@ namespace Catch {
     }
 
 
-    Capturer::Capturer( StringRef macroName, SourceLineInfo const& lineInfo, ResultWas::OfType resultType, StringRef names ) {
+    Capturer::Capturer( StringRef macroName,
+                        SourceLineInfo const& lineInfo,
+                        ResultWas::OfType resultType,
+                        StringRef names ):
+        m_resultCapture( getResultCapture() ) {
         auto trimmed = [&] (size_t start, size_t end) {
             while (names[start] == ',' || isspace(static_cast<unsigned char>(names[start]))) {
                 ++start;
@@ -852,6 +1065,8 @@ namespace Catch {
 
 
 
+#include <exception>
+
 namespace Catch {
 
     namespace {
@@ -862,7 +1077,7 @@ namespace Catch {
 
         public: // IRegistryHub
             RegistryHub() = default;
-            IReporterRegistry const& getReporterRegistry() const override {
+            ReporterRegistry const& getReporterRegistry() const override {
                 return m_reporterRegistry;
             }
             ITestCaseRegistry const& getTestCaseRegistry() const override {
@@ -938,6 +1153,7 @@ namespace Catch {
 
 #include <algorithm>
 #include <cassert>
+#include <exception>
 #include <iomanip>
 #include <set>
 
@@ -1420,12 +1636,20 @@ namespace Catch {
         for (size_t idx = 0; idx < originalTags.size(); ++idx) {
             auto c = originalTags[idx];
             if (c == '[') {
-                assert(!inTag);
+                CATCH_ENFORCE(
+                    !inTag,
+                    "Found '[' inside a tag while registering test case '"
+                        << _nameAndTags.name << "' at " << _lineInfo );
+
                 inTag = true;
                 tagStart = idx;
             }
             if (c == ']') {
-                assert(inTag);
+                CATCH_ENFORCE(
+                    inTag,
+                    "Found unmatched ']' while registering test case '"
+                        << _nameAndTags.name << "' at " << _lineInfo );
+
                 inTag = false;
                 tagEnd = idx;
                 assert(tagStart < tagEnd);
@@ -1434,7 +1658,11 @@ namespace Catch {
                 // it over to backing storage and actually reference the
                 // backing storage in the saved tags
                 StringRef tagStr = originalTags.substr(tagStart+1, tagEnd - tagStart - 1);
-                CATCH_ENFORCE(!tagStr.empty(), "Empty tags are not allowed");
+                CATCH_ENFORCE( !tagStr.empty(),
+                               "Found an empty tag while registering test case '"
+                                   << _nameAndTags.name << "' at "
+                                   << _lineInfo );
+
                 enforceNotReservedTag(tagStr, lineInfo);
                 properties |= parseSpecialTag(tagStr);
                 // When copying a tag to the backing storage, we need to
@@ -1448,8 +1676,12 @@ namespace Catch {
                 // the tags.
                 internalAppendTag(tagStr);
             }
-            (void)inTag; // Silence "set-but-unused" warning in release mode.
         }
+        CATCH_ENFORCE( !inTag,
+                       "Found an unclosed tag while registering test case '"
+                           << _nameAndTags.name << "' at " << _lineInfo );
+
+
         // Add [.] if relevant
         if (isHidden()) {
             internalAppendTag("."_sr);
@@ -1625,16 +1857,18 @@ namespace Catch {
         return std::any_of( m_filters.begin(), m_filters.end(), [&]( Filter const& f ){ return f.matches( testCase ); } );
     }
 
-    TestSpec::Matches TestSpec::matchesByFilter( std::vector<TestCaseHandle> const& testCases, IConfig const& config ) const
-    {
-        Matches matches( m_filters.size() );
-        std::transform( m_filters.begin(), m_filters.end(), matches.begin(), [&]( Filter const& filter ){
+    TestSpec::Matches TestSpec::matchesByFilter( std::vector<TestCaseHandle> const& testCases, IConfig const& config ) const {
+        Matches matches;
+        matches.reserve( m_filters.size() );
+        for ( auto const& filter : m_filters ) {
             std::vector<TestCaseHandle const*> currentMatches;
-            for( auto const& test : testCases )
-                if( isThrowSafe( test, config ) && filter.matches( test.getTestCaseInfo() ) )
+            for ( auto const& test : testCases )
+                if ( isThrowSafe( test, config ) &&
+                     filter.matches( test.getTestCaseInfo() ) )
                     currentMatches.emplace_back( &test );
-            return FilterMatch{ extractFilterName(filter), currentMatches };
-        } );
+            matches.push_back(
+                FilterMatch{ extractFilterName( filter ), currentMatches } );
+        }
         return matches;
     }
 
@@ -1991,6 +2225,19 @@ namespace Catch {
 }
 
 
+
+
+namespace Catch {
+    namespace Detail {
+        void registerTranslatorImpl(
+            Detail::unique_ptr<IExceptionTranslator>&& translator ) {
+            getMutableRegistryHub().registerTranslator(
+                CATCH_MOVE( translator ) );
+        }
+    } // namespace Detail
+} // namespace Catch
+
+
 #include <ostream>
 
 namespace Catch {
@@ -2021,7 +2268,7 @@ namespace Catch {
     }
 
     Version const& libraryVersion() {
-        static Version version( 3, 3, 2, "", 0 );
+        static Version version( 3, 5, 2, "", 0 );
         return version;
     }
 
@@ -2074,8 +2321,36 @@ namespace Detail {
 
 
 
+#include <random>
 
-std::uint32_t Catch::Generators::Detail::getSeed() { return sharedRng()(); }
+namespace Catch {
+    namespace Generators {
+        namespace Detail {
+            std::uint32_t getSeed() { return sharedRng()(); }
+        } // namespace Detail
+
+        struct RandomFloatingGenerator<long double>::PImpl {
+            PImpl( long double a, long double b, uint32_t seed ):
+                rng( seed ), dist( a, b ) {}
+
+            Catch::SimplePcg32 rng;
+            std::uniform_real_distribution<long double> dist;
+        };
+
+        RandomFloatingGenerator<long double>::RandomFloatingGenerator(
+            long double a, long double b, std::uint32_t seed) :
+            m_pimpl(Catch::Detail::make_unique<PImpl>(a, b, seed)) {
+            static_cast<void>( next() );
+        }
+
+        RandomFloatingGenerator<long double>::~RandomFloatingGenerator() =
+            default;
+        bool RandomFloatingGenerator<long double>::next() {
+            m_current_number = m_pimpl->dist( m_pimpl->rng );
+            return true;
+        }
+    } // namespace Generators
+} // namespace Catch
 
 
 
@@ -2135,9 +2410,7 @@ namespace Catch {
 
 
 
-#include <algorithm>
 #include <cassert>
-#include <iomanip>
 
 namespace Catch {
 
@@ -2172,8 +2445,6 @@ namespace Catch {
         infoMessages( _infoMessages ),
         totals( _totals )
     {
-        assertionResult.m_resultData.lazyExpression.m_transientExpression = _assertionResult.m_resultData.lazyExpression.m_transientExpression;
-
         if( assertionResult.hasMessage() ) {
             // Copy message into messages list.
             // !TBD This should have been done earlier, somewhere
@@ -2232,14 +2503,6 @@ namespace Catch {
 
 
 namespace Catch {
-    IReporterRegistry::~IReporterRegistry() = default;
-}
-
-
-
-
-namespace Catch {
-    ITestInvoker::~ITestInvoker() = default;
     ITestCaseRegistry::~ITestCaseRegistry() = default;
 }
 
@@ -2254,7 +2517,9 @@ namespace Catch {
             ResultDisposition::Flags resultDisposition )
     :   m_assertionInfo{ macroName, lineInfo, capturedExpression, resultDisposition },
         m_resultCapture( getResultCapture() )
-    {}
+    {
+        m_resultCapture.notifyAssertionStarted( m_assertionInfo );
+    }
 
     void AssertionHandler::handleExpr( ITransientExpression const& expr ) {
         m_resultCapture.handleExpr( m_assertionInfo, expr, m_reaction );
@@ -2268,7 +2533,7 @@ namespace Catch {
     }
 
     void AssertionHandler::complete() {
-        setCompleted();
+        m_completed = true;
         if( m_reaction.shouldDebugBreak ) {
 
             // If you find your debugger stopping you here then go one level up on the
@@ -2281,16 +2546,9 @@ namespace Catch {
             throw_test_failure_exception();
         }
         if ( m_reaction.shouldSkip ) {
-#if !defined( CATCH_CONFIG_DISABLE_EXCEPTIONS )
-            throw Catch::TestSkipException();
-#else
-            CATCH_ERROR( "Explicitly skipping tests during runtime requires exceptions" );
-#endif
+            throw_test_skip_exception();
         }
     }
-    void AssertionHandler::setCompleted() {
-        m_completed = true;
-    }
 
     void AssertionHandler::handleUnexpectedInflightException() {
         m_resultCapture.handleUnexpectedInflightException( m_assertionInfo, Catch::translateActiveException(), m_reaction );
@@ -2362,13 +2620,29 @@ namespace {
             ;
     }
 
-    std::string normaliseOpt( std::string const& optName ) {
-#ifdef CATCH_PLATFORM_WINDOWS
-        if ( optName[0] == '/' )
-            return "-" + optName.substr( 1 );
-        else
+    Catch::StringRef normaliseOpt( Catch::StringRef optName ) {
+        if ( optName[0] == '-'
+#if defined(CATCH_PLATFORM_WINDOWS)
+             || optName[0] == '/'
 #endif
-            return optName;
+        ) {
+            return optName.substr( 1, optName.size() );
+        }
+
+        return optName;
+    }
+
+    static size_t find_first_separator(Catch::StringRef sr) {
+        auto is_separator = []( char c ) {
+            return c == ' ' || c == ':' || c == '=';
+        };
+        size_t pos = 0;
+        while (pos < sr.size()) {
+            if (is_separator(sr[pos])) { return pos; }
+            ++pos;
+        }
+
+        return Catch::StringRef::npos;
     }
 
 } // namespace
@@ -2386,23 +2660,23 @@ namespace Catch {
                 }
 
                 if ( it != itEnd ) {
-                    auto const& next = *it;
+                    StringRef next = *it;
                     if ( isOptPrefix( next[0] ) ) {
-                        auto delimiterPos = next.find_first_of( " :=" );
-                        if ( delimiterPos != std::string::npos ) {
+                        auto delimiterPos = find_first_separator(next);
+                        if ( delimiterPos != StringRef::npos ) {
                             m_tokenBuffer.push_back(
                                 { TokenType::Option,
                                   next.substr( 0, delimiterPos ) } );
                             m_tokenBuffer.push_back(
                                 { TokenType::Argument,
-                                  next.substr( delimiterPos + 1 ) } );
+                                  next.substr( delimiterPos + 1, next.size() ) } );
                         } else {
                             if ( next[1] != '-' && next.size() > 2 ) {
-                                std::string opt = "- ";
+                                // Combined short args, e.g. "-ab" for "-a -b"
                                 for ( size_t i = 1; i < next.size(); ++i ) {
-                                    opt[1] = next[i];
                                     m_tokenBuffer.push_back(
-                                        { TokenType::Option, opt } );
+                                        { TokenType::Option,
+                                          next.substr( i, 1 ) } );
                                 }
                             } else {
                                 m_tokenBuffer.push_back(
@@ -2462,12 +2736,12 @@ namespace Catch {
             size_t ParserBase::cardinality() const { return 1; }
 
             InternalParseResult ParserBase::parse( Args const& args ) const {
-                return parse( args.exeName(), TokenStream( args ) );
+                return parse( static_cast<std::string>(args.exeName()), TokenStream( args ) );
             }
 
             ParseState::ParseState( ParseResultType type,
-                                    TokenStream const& remainingTokens ):
-                m_type( type ), m_remainingTokens( remainingTokens ) {}
+                                    TokenStream remainingTokens ):
+                m_type( type ), m_remainingTokens( CATCH_MOVE(remainingTokens) ) {}
 
             ParserResult BoundFlagRef::setFlag( bool flag ) {
                 m_ref = flag;
@@ -2485,34 +2759,34 @@ namespace Catch {
 } // namespace Detail
 
         Detail::InternalParseResult Arg::parse(std::string const&,
-                                               Detail::TokenStream const& tokens) const {
+                                               Detail::TokenStream tokens) const {
             auto validationResult = validate();
             if (!validationResult)
                 return Detail::InternalParseResult(validationResult);
 
-            auto remainingTokens = tokens;
-            auto const& token = *remainingTokens;
+            auto token = *tokens;
             if (token.type != Detail::TokenType::Argument)
                 return Detail::InternalParseResult::ok(Detail::ParseState(
-                    ParseResultType::NoMatch, remainingTokens));
+                    ParseResultType::NoMatch, CATCH_MOVE(tokens)));
 
             assert(!m_ref->isFlag());
             auto valueRef =
                 static_cast<Detail::BoundValueRefBase*>(m_ref.get());
 
-            auto result = valueRef->setValue(remainingTokens->token);
-            if (!result)
-                return Detail::InternalParseResult(result);
+            auto result = valueRef->setValue(static_cast<std::string>(token.token));
+            if ( !result )
+                return Detail::InternalParseResult( result );
             else
-                return Detail::InternalParseResult::ok(Detail::ParseState(
-                    ParseResultType::Matched, ++remainingTokens));
+                return Detail::InternalParseResult::ok(
+                    Detail::ParseState( ParseResultType::Matched,
+                                        CATCH_MOVE( ++tokens ) ) );
         }
 
         Opt::Opt(bool& ref) :
             ParserRefImpl(std::make_shared<Detail::BoundFlagRef>(ref)) {}
 
-        std::vector<Detail::HelpColumns> Opt::getHelpColumns() const {
-            std::ostringstream oss;
+        Detail::HelpColumns Opt::getHelpColumns() const {
+            ReusableStringStream oss;
             bool first = true;
             for (auto const& opt : m_optNames) {
                 if (first)
@@ -2523,10 +2797,10 @@ namespace Catch {
             }
             if (!m_hint.empty())
                 oss << " <" << m_hint << '>';
-            return { { oss.str(), m_description } };
+            return { oss.str(), m_description };
         }
 
-        bool Opt::isMatch(std::string const& optToken) const {
+        bool Opt::isMatch(StringRef optToken) const {
             auto normalisedToken = normaliseOpt(optToken);
             for (auto const& name : m_optNames) {
                 if (normaliseOpt(name) == normalisedToken)
@@ -2536,15 +2810,14 @@ namespace Catch {
         }
 
         Detail::InternalParseResult Opt::parse(std::string const&,
-                                       Detail::TokenStream const& tokens) const {
+                                       Detail::TokenStream tokens) const {
             auto validationResult = validate();
             if (!validationResult)
                 return Detail::InternalParseResult(validationResult);
 
-            auto remainingTokens = tokens;
-            if (remainingTokens &&
-                remainingTokens->type == Detail::TokenType::Option) {
-                auto const& token = *remainingTokens;
+            if (tokens &&
+                tokens->type == Detail::TokenType::Option) {
+                auto const& token = *tokens;
                 if (isMatch(token.token)) {
                     if (m_ref->isFlag()) {
                         auto flagRef =
@@ -2556,35 +2829,35 @@ namespace Catch {
                         if (result.value() ==
                             ParseResultType::ShortCircuitAll)
                             return Detail::InternalParseResult::ok(Detail::ParseState(
-                                result.value(), remainingTokens));
+                                result.value(), CATCH_MOVE(tokens)));
                     } else {
                         auto valueRef =
                             static_cast<Detail::BoundValueRefBase*>(
                                 m_ref.get());
-                        ++remainingTokens;
-                        if (!remainingTokens)
+                        ++tokens;
+                        if (!tokens)
                             return Detail::InternalParseResult::runtimeError(
                                 "Expected argument following " +
                                 token.token);
-                        auto const& argToken = *remainingTokens;
+                        auto const& argToken = *tokens;
                         if (argToken.type != Detail::TokenType::Argument)
                             return Detail::InternalParseResult::runtimeError(
                                 "Expected argument following " +
                                 token.token);
-                        const auto result = valueRef->setValue(argToken.token);
+                        const auto result = valueRef->setValue(static_cast<std::string>(argToken.token));
                         if (!result)
                             return Detail::InternalParseResult(result);
                         if (result.value() ==
                             ParseResultType::ShortCircuitAll)
                             return Detail::InternalParseResult::ok(Detail::ParseState(
-                                result.value(), remainingTokens));
+                                result.value(), CATCH_MOVE(tokens)));
                     }
                     return Detail::InternalParseResult::ok(Detail::ParseState(
-                        ParseResultType::Matched, ++remainingTokens));
+                        ParseResultType::Matched, CATCH_MOVE(++tokens)));
                 }
             }
             return Detail::InternalParseResult::ok(
-                Detail::ParseState(ParseResultType::NoMatch, remainingTokens));
+                Detail::ParseState(ParseResultType::NoMatch, CATCH_MOVE(tokens)));
         }
 
         Detail::Result Opt::validate() const {
@@ -2616,9 +2889,9 @@ namespace Catch {
 
         Detail::InternalParseResult
             ExeName::parse(std::string const&,
-                           Detail::TokenStream const& tokens) const {
+                           Detail::TokenStream tokens) const {
             return Detail::InternalParseResult::ok(
-                Detail::ParseState(ParseResultType::NoMatch, tokens));
+                Detail::ParseState(ParseResultType::NoMatch, CATCH_MOVE(tokens)));
         }
 
         ParserResult ExeName::set(std::string const& newName) {
@@ -2648,9 +2921,9 @@ namespace Catch {
 
         std::vector<Detail::HelpColumns> Parser::getHelpColumns() const {
             std::vector<Detail::HelpColumns> cols;
+            cols.reserve( m_options.size() );
             for ( auto const& o : m_options ) {
-                auto childCols = o.getHelpColumns();
-                cols.insert( cols.end(), childCols.begin(), childCols.end() );
+                cols.push_back(o.getHelpColumns());
             }
             return cols;
         }
@@ -2688,12 +2961,12 @@ namespace Catch {
 
             optWidth = ( std::min )( optWidth, consoleWidth / 2 );
 
-            for ( auto const& cols : rows ) {
-                auto row = TextFlow::Column( cols.left )
+            for ( auto& cols : rows ) {
+                auto row = TextFlow::Column( CATCH_MOVE(cols.left) )
                                .width( optWidth )
                                .indent( 2 ) +
                            TextFlow::Spacer( 4 ) +
-                           TextFlow::Column( cols.right )
+                           TextFlow::Column( static_cast<std::string>(cols.descriptions) )
                                .width( consoleWidth - 7 - optWidth );
                 os << row << '\n';
             }
@@ -2715,7 +2988,7 @@ namespace Catch {
 
         Detail::InternalParseResult
         Parser::parse( std::string const& exeName,
-                       Detail::TokenStream const& tokens ) const {
+                       Detail::TokenStream tokens ) const {
 
             struct ParserInfo {
                 ParserBase const* parser = nullptr;
@@ -2733,7 +3006,7 @@ namespace Catch {
             m_exeName.set( exeName );
 
             auto result = Detail::InternalParseResult::ok(
-                Detail::ParseState( ParseResultType::NoMatch, tokens ) );
+                Detail::ParseState( ParseResultType::NoMatch, CATCH_MOVE(tokens) ) );
             while ( result.value().remainingTokens() ) {
                 bool tokenParsed = false;
 
@@ -2741,7 +3014,7 @@ namespace Catch {
                     if ( parseInfo.parser->cardinality() == 0 ||
                          parseInfo.count < parseInfo.parser->cardinality() ) {
                         result = parseInfo.parser->parse(
-                            exeName, result.value().remainingTokens() );
+                            exeName, CATCH_MOVE(result).value().remainingTokens() );
                         if ( !result )
                             return result;
                         if ( result.value().type() !=
@@ -2767,7 +3040,7 @@ namespace Catch {
         Args::Args(int argc, char const* const* argv) :
             m_exeName(argv[0]), m_args(argv + 1, argv + argc) {}
 
-        Args::Args(std::initializer_list<std::string> args) :
+        Args::Args(std::initializer_list<StringRef> args) :
             m_exeName(*args.begin()),
             m_args(args.begin() + 1, args.end()) {}
 
@@ -2917,7 +3190,7 @@ namespace Catch {
 
             auto const& reporterSpec = *parsed;
 
-            IReporterRegistry::FactoryMap const& factories =
+            auto const& factories =
                 getRegistryHub().getReporterRegistry().getFactories();
             auto result = factories.find( reporterSpec.name() );
 
@@ -3073,8 +3346,8 @@ namespace Catch {
                 ( "split the tests to execute into this many groups" )
             | Opt( setShardIndex, "shard index" )
                 ["--shard-index"]
-                ( "index of the group of tests to execute (see --shard-count)" ) |
-            Opt( config.allowZeroTests )
+                ( "index of the group of tests to execute (see --shard-count)" )
+            | Opt( config.allowZeroTests )
                 ["--allow-running-no-tests"]
                 ( "Treat 'No tests run' as a success" )
             | Arg( config.testsOrTags, "test name|pattern|tags" )
@@ -3155,7 +3428,7 @@ namespace Catch {
     namespace {
         //! A do-nothing implementation of colour, used as fallback for unknown
         //! platforms, and when the user asks to deactivate all colours.
-        class NoColourImpl : public ColourImpl {
+        class NoColourImpl final : public ColourImpl {
         public:
             NoColourImpl( IStream* stream ): ColourImpl( stream ) {}
 
@@ -3173,7 +3446,7 @@ namespace Catch {
 namespace Catch {
 namespace {
 
-    class Win32ColourImpl : public ColourImpl {
+    class Win32ColourImpl final : public ColourImpl {
     public:
         Win32ColourImpl(IStream* stream):
             ColourImpl(stream) {
@@ -3239,7 +3512,7 @@ namespace {
 namespace Catch {
 namespace {
 
-    class ANSIColourImpl : public ColourImpl {
+    class ANSIColourImpl final : public ColourImpl {
     public:
         ANSIColourImpl( IStream* stream ): ColourImpl( stream ) {}
 
@@ -3355,49 +3628,27 @@ namespace Catch {
 
 namespace Catch {
 
-    class Context : public IMutableContext, private Detail::NonCopyable {
-
-    public: // IContext
-        IResultCapture* getResultCapture() override {
-            return m_resultCapture;
-        }
-
-        IConfig const* getConfig() const override {
-            return m_config;
-        }
-
-        ~Context() override;
-
-    public: // IMutableContext
-        void setResultCapture( IResultCapture* resultCapture ) override {
-            m_resultCapture = resultCapture;
-        }
-        void setConfig( IConfig const* config ) override {
-            m_config = config;
-        }
+    Context* Context::currentContext = nullptr;
 
-        friend IMutableContext& getCurrentMutableContext();
-
-    private:
-        IConfig const* m_config = nullptr;
-        IResultCapture* m_resultCapture = nullptr;
-    };
-
-    IMutableContext *IMutableContext::currentContext = nullptr;
-
-    void IMutableContext::createContext()
-    {
+    void cleanUpContext() {
+        delete Context::currentContext;
+        Context::currentContext = nullptr;
+    }
+    void Context::createContext() {
         currentContext = new Context();
     }
 
-    void cleanUpContext() {
-        delete IMutableContext::currentContext;
-        IMutableContext::currentContext = nullptr;
+    Context& getCurrentMutableContext() {
+        if ( !Context::currentContext ) { Context::createContext(); }
+        // NOLINTNEXTLINE(clang-analyzer-core.uninitialized.UndefReturn)
+        return *Context::currentContext;
     }
-    IContext::~IContext() = default;
-    IMutableContext::~IMutableContext() = default;
-    Context::~Context() = default;
 
+    void Context::setResultCapture( IResultCapture* resultCapture ) {
+        m_resultCapture = resultCapture;
+    }
+
+    void Context::setConfig( IConfig const* config ) { m_config = config; }
 
     SimplePcg32& sharedRng() {
         static SimplePcg32 s_rng;
@@ -3635,7 +3886,7 @@ namespace Catch {
             return parsed;
         }
 
-        EnumInfo::~EnumInfo() {}
+        EnumInfo::~EnumInfo() = default;
 
         StringRef EnumInfo::lookup( int value ) const {
             for( auto const& valueToName : m_values ) {
@@ -3680,10 +3931,27 @@ namespace Catch {
 
 
 
+#include <exception>
+
 namespace Catch {
 
-    ExceptionTranslatorRegistry::~ExceptionTranslatorRegistry() {
+#if !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
+    namespace {
+        static std::string tryTranslators(
+            std::vector<
+                Detail::unique_ptr<IExceptionTranslator const>> const& translators ) {
+            if ( translators.empty() ) {
+                std::rethrow_exception( std::current_exception() );
+            } else {
+                return translators[0]->translate( translators.begin() + 1,
+                                                  translators.end() );
+            }
+        }
+
     }
+#endif //!defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
+
+    ExceptionTranslatorRegistry::~ExceptionTranslatorRegistry() = default;
 
     void ExceptionTranslatorRegistry::registerTranslator( Detail::unique_ptr<IExceptionTranslator>&& translator ) {
         m_translators.push_back( CATCH_MOVE( translator ) );
@@ -3706,7 +3974,7 @@ namespace Catch {
         // First we try user-registered translators. If none of them can
         // handle the exception, it will be rethrown handled by our defaults.
         try {
-            return tryTranslators();
+            return tryTranslators(m_translators);
         }
         // To avoid having to handle TFE explicitly everywhere, we just
         // rethrow it so that it goes back up the caller.
@@ -3730,25 +3998,12 @@ namespace Catch {
         }
     }
 
-    std::string ExceptionTranslatorRegistry::tryTranslators() const {
-        if (m_translators.empty()) {
-            std::rethrow_exception(std::current_exception());
-        } else {
-            return m_translators[0]->translate(m_translators.begin() + 1, m_translators.end());
-        }
-    }
-
 #else // ^^ Exceptions are enabled // Exceptions are disabled vv
     std::string ExceptionTranslatorRegistry::translateActiveException() const {
         CATCH_INTERNAL_ERROR("Attempted to translate active exception under CATCH_CONFIG_DISABLE_EXCEPTIONS!");
     }
-
-    std::string ExceptionTranslatorRegistry::tryTranslators() const {
-        CATCH_INTERNAL_ERROR("Attempted to use exception translators under CATCH_CONFIG_DISABLE_EXCEPTIONS!");
-    }
 #endif
 
-
 }
 
 
@@ -4005,6 +4260,17 @@ namespace Catch {
             return i;
         }
 
+#if defined( __GNUC__ ) || defined( __clang__ )
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wfloat-equal"
+#endif
+        bool directCompare( float lhs, float rhs ) { return lhs == rhs; }
+        bool directCompare( double lhs, double rhs ) { return lhs == rhs; }
+#if defined( __GNUC__ ) || defined( __clang__ )
+#    pragma GCC diagnostic pop
+#endif
+
+
     } // end namespace Detail
 } // end namespace Catch
 
@@ -4053,7 +4319,7 @@ namespace Catch {
 namespace Detail {
     namespace {
         template<typename WriterF, std::size_t bufferSize=256>
-        class StreamBufImpl : public std::streambuf {
+        class StreamBufImpl final : public std::streambuf {
             char data[bufferSize];
             WriterF m_writer;
 
@@ -4101,7 +4367,7 @@ namespace Detail {
 
         ///////////////////////////////////////////////////////////////////////////
 
-        class FileStream : public IStream {
+        class FileStream final : public IStream {
             std::ofstream m_ofs;
         public:
             FileStream( std::string const& filename ) {
@@ -4109,7 +4375,6 @@ namespace Detail {
                 CATCH_ENFORCE( !m_ofs.fail(), "Unable to open file: '" << filename << '\'' );
                 m_ofs << std::unitbuf;
             }
-            ~FileStream() override = default;
         public: // IStream
             std::ostream& stream() override {
                 return m_ofs;
@@ -4118,13 +4383,12 @@ namespace Detail {
 
         ///////////////////////////////////////////////////////////////////////////
 
-        class CoutStream : public IStream {
+        class CoutStream final : public IStream {
             std::ostream m_os;
         public:
             // Store the streambuf from cout up-front because
             // cout may get redirected when running tests
             CoutStream() : m_os( Catch::cout().rdbuf() ) {}
-            ~CoutStream() override = default;
 
         public: // IStream
             std::ostream& stream() override { return m_os; }
@@ -4138,7 +4402,6 @@ namespace Detail {
             // Store the streambuf from cerr up-front because
             // cout may get redirected when running tests
             CerrStream(): m_os( Catch::cerr().rdbuf() ) {}
-            ~CerrStream() override = default;
 
         public: // IStream
             std::ostream& stream() override { return m_os; }
@@ -4147,7 +4410,7 @@ namespace Detail {
 
         ///////////////////////////////////////////////////////////////////////////
 
-        class DebugOutStream : public IStream {
+        class DebugOutStream final : public IStream {
             Detail::unique_ptr<StreamBufImpl<OutputDebugWriter>> m_streamBuf;
             std::ostream m_os;
         public:
@@ -4156,8 +4419,6 @@ namespace Detail {
                 m_os( m_streamBuf.get() )
             {}
 
-            ~DebugOutStream() override = default;
-
         public: // IStream
             std::ostream& stream() override { return m_os; }
         };
@@ -4189,6 +4450,147 @@ namespace Detail {
 
 
 
+namespace Catch {
+    void JsonUtils::indent( std::ostream& os, std::uint64_t level ) {
+        for ( std::uint64_t i = 0; i < level; ++i ) {
+            os << "  ";
+        }
+    }
+    void JsonUtils::appendCommaNewline( std::ostream& os,
+                                        bool& should_comma,
+                                        std::uint64_t level ) {
+        if ( should_comma ) { os << ','; }
+        should_comma = true;
+        os << '\n';
+        indent( os, level );
+    }
+
+    JsonObjectWriter::JsonObjectWriter( std::ostream& os ):
+        JsonObjectWriter{ os, 0 } {}
+
+    JsonObjectWriter::JsonObjectWriter( std::ostream& os,
+                                        std::uint64_t indent_level ):
+        m_os{ os }, m_indent_level{ indent_level } {
+        m_os << '{';
+    }
+    JsonObjectWriter::JsonObjectWriter( JsonObjectWriter&& source ):
+        m_os{ source.m_os },
+        m_indent_level{ source.m_indent_level },
+        m_should_comma{ source.m_should_comma },
+        m_active{ source.m_active } {
+        source.m_active = false;
+    }
+
+    JsonObjectWriter::~JsonObjectWriter() {
+        if ( !m_active ) { return; }
+
+        m_os << '\n';
+        JsonUtils::indent( m_os, m_indent_level );
+        m_os << '}';
+    }
+
+    JsonValueWriter JsonObjectWriter::write( StringRef key ) {
+        JsonUtils::appendCommaNewline(
+            m_os, m_should_comma, m_indent_level + 1 );
+
+        m_os << '"' << key << "\": ";
+        return JsonValueWriter{ m_os, m_indent_level + 1 };
+    }
+
+    JsonArrayWriter::JsonArrayWriter( std::ostream& os ):
+        JsonArrayWriter{ os, 0 } {}
+    JsonArrayWriter::JsonArrayWriter( std::ostream& os,
+                                      std::uint64_t indent_level ):
+        m_os{ os }, m_indent_level{ indent_level } {
+        m_os << '[';
+    }
+    JsonArrayWriter::JsonArrayWriter( JsonArrayWriter&& source ):
+        m_os{ source.m_os },
+        m_indent_level{ source.m_indent_level },
+        m_should_comma{ source.m_should_comma },
+        m_active{ source.m_active } {
+        source.m_active = false;
+    }
+    JsonArrayWriter::~JsonArrayWriter() {
+        if ( !m_active ) { return; }
+
+        m_os << '\n';
+        JsonUtils::indent( m_os, m_indent_level );
+        m_os << ']';
+    }
+
+    JsonObjectWriter JsonArrayWriter::writeObject() {
+        JsonUtils::appendCommaNewline(
+            m_os, m_should_comma, m_indent_level + 1 );
+        return JsonObjectWriter{ m_os, m_indent_level + 1 };
+    }
+
+    JsonArrayWriter JsonArrayWriter::writeArray() {
+        JsonUtils::appendCommaNewline(
+            m_os, m_should_comma, m_indent_level + 1 );
+        return JsonArrayWriter{ m_os, m_indent_level + 1 };
+    }
+
+    JsonArrayWriter& JsonArrayWriter::write( bool value ) {
+        return writeImpl( value );
+    }
+
+    JsonValueWriter::JsonValueWriter( std::ostream& os ):
+        JsonValueWriter{ os, 0 } {}
+
+    JsonValueWriter::JsonValueWriter( std::ostream& os,
+                                      std::uint64_t indent_level ):
+        m_os{ os }, m_indent_level{ indent_level } {}
+
+    JsonObjectWriter JsonValueWriter::writeObject() && {
+        return JsonObjectWriter{ m_os, m_indent_level };
+    }
+
+    JsonArrayWriter JsonValueWriter::writeArray() && {
+        return JsonArrayWriter{ m_os, m_indent_level };
+    }
+
+    void JsonValueWriter::write( Catch::StringRef value ) && {
+        writeImpl( value, true );
+    }
+
+    void JsonValueWriter::write( bool value ) && {
+        writeImpl( value ? "true"_sr : "false"_sr, false );
+    }
+
+    void JsonValueWriter::writeImpl( Catch::StringRef value, bool quote ) {
+        if ( quote ) { m_os << '"'; }
+        for (char c : value) {
+            // Escape list taken from https://www.json.org/json-en.html,
+            // string definition.
+            // Note that while forward slash _can_ be escaped, it does
+            // not have to be, if JSON is not further embedded somewhere
+            // where forward slash is meaningful.
+            if ( c == '"' ) {
+                m_os << "\\\"";
+            } else if ( c == '\\' ) {
+                m_os << "\\\\";
+            } else if ( c == '\b' ) {
+                m_os << "\\b";
+            } else if ( c == '\f' ) {
+                m_os << "\\f";
+            } else if ( c == '\n' ) {
+                m_os << "\\n";
+            } else if ( c == '\r' ) {
+                m_os << "\\r";
+            } else if ( c == '\t' ) {
+                m_os << "\\t";
+            } else {
+                m_os << c;
+            }
+        }
+        if ( quote ) { m_os << '"'; }
+    }
+
+} // namespace Catch
+
+
+
 
 namespace Catch {
 
@@ -4231,7 +4633,7 @@ namespace Catch {
 
 #else // ^^ Windows crt debug heap enabled // Windows crt debug heap disabled vv
 
-    Catch::LeakDetector::LeakDetector() {}
+    Catch::LeakDetector::LeakDetector() = default;
 
 #endif // CATCH_CONFIG_WINDOWS_CRTDBG
 
@@ -4242,7 +4644,6 @@ Catch::LeakDetector::~LeakDetector() {
 
 
 
-
 namespace Catch {
     namespace {
 
@@ -4277,7 +4678,7 @@ namespace Catch {
         void listReporters(IEventListener& reporter) {
             std::vector<ReporterDescription> descriptions;
 
-            IReporterRegistry::FactoryMap const& factories = getRegistryHub().getReporterRegistry().getFactories();
+            auto const& factories = getRegistryHub().getReporterRegistry().getFactories();
             descriptions.reserve(factories.size());
             for (auto const& fac : factories) {
                 descriptions.push_back({ fac.first, fac.second->getDescription() });
@@ -4599,6 +5000,14 @@ namespace Catch {
     }
 #endif
 
+#if !defined( CATCH_CONFIG_GLOBAL_NEXTAFTER )
+    float nextafter( float x, float y ) { return std::nextafter( x, y ); }
+    double nextafter( double x, double y ) { return std::nextafter( x, y ); }
+#else
+    float nextafter( float x, float y ) { return ::nextafterf( x, y ); }
+    double nextafter( double x, double y ) { return ::nextafter( x, y ); }
+#endif
+
 } // end namespace Catch
 
 
@@ -4680,10 +5089,10 @@ namespace Catch {
             return static_cast<std::uint32_t>( std::time( nullptr ) );
 
         case GenerateFrom::Default:
-        case GenerateFrom::RandomDevice:
-            // In theory, a platform could have random_device that returns just
-            // 16 bits. That is still some randomness, so we don't care too much
-            return static_cast<std::uint32_t>( std::random_device{}() );
+        case GenerateFrom::RandomDevice: {
+            std::random_device rd;
+            return Detail::fillBitsFrom<std::uint32_t>( rd );
+        }
 
         default:
             CATCH_ERROR("Unknown generation method");
@@ -4696,49 +5105,73 @@ namespace Catch {
 
 
 namespace Catch {
+    struct ReporterRegistry::ReporterRegistryImpl {
+        std::vector<Detail::unique_ptr<EventListenerFactory>> listeners;
+        std::map<std::string, IReporterFactoryPtr, Detail::CaseInsensitiveLess>
+            factories;
+    };
 
-    ReporterRegistry::ReporterRegistry() {
+    ReporterRegistry::ReporterRegistry():
+        m_impl( Detail::make_unique<ReporterRegistryImpl>() ) {
         // Because it is impossible to move out of initializer list,
         // we have to add the elements manually
-        m_factories["Automake"] = Detail::make_unique<ReporterFactory<AutomakeReporter>>();
-        m_factories["compact"] = Detail::make_unique<ReporterFactory<CompactReporter>>();
-        m_factories["console"] = Detail::make_unique<ReporterFactory<ConsoleReporter>>();
-        m_factories["JUnit"] = Detail::make_unique<ReporterFactory<JunitReporter>>();
-        m_factories["SonarQube"] = Detail::make_unique<ReporterFactory<SonarQubeReporter>>();
-        m_factories["TAP"] = Detail::make_unique<ReporterFactory<TAPReporter>>();
-        m_factories["TeamCity"] = Detail::make_unique<ReporterFactory<TeamCityReporter>>();
-        m_factories["XML"] = Detail::make_unique<ReporterFactory<XmlReporter>>();
+        m_impl->factories["Automake"] =
+            Detail::make_unique<ReporterFactory<AutomakeReporter>>();
+        m_impl->factories["compact"] =
+            Detail::make_unique<ReporterFactory<CompactReporter>>();
+        m_impl->factories["console"] =
+            Detail::make_unique<ReporterFactory<ConsoleReporter>>();
+        m_impl->factories["JUnit"] =
+            Detail::make_unique<ReporterFactory<JunitReporter>>();
+        m_impl->factories["SonarQube"] =
+            Detail::make_unique<ReporterFactory<SonarQubeReporter>>();
+        m_impl->factories["TAP"] =
+            Detail::make_unique<ReporterFactory<TAPReporter>>();
+        m_impl->factories["TeamCity"] =
+            Detail::make_unique<ReporterFactory<TeamCityReporter>>();
+        m_impl->factories["XML"] =
+            Detail::make_unique<ReporterFactory<XmlReporter>>();
+        m_impl->factories["JSON"] =
+            Detail::make_unique<ReporterFactory<JsonReporter>>();
     }
 
     ReporterRegistry::~ReporterRegistry() = default;
 
-
-    IEventListenerPtr ReporterRegistry::create( std::string const& name, ReporterConfig&& config ) const {
-        auto it =  m_factories.find( name );
-        if( it == m_factories.end() )
-            return nullptr;
-        return it->second->create( CATCH_MOVE(config) );
+    IEventListenerPtr
+    ReporterRegistry::create( std::string const& name,
+                              ReporterConfig&& config ) const {
+        auto it = m_impl->factories.find( name );
+        if ( it == m_impl->factories.end() ) return nullptr;
+        return it->second->create( CATCH_MOVE( config ) );
     }
 
-    void ReporterRegistry::registerReporter( std::string const& name, IReporterFactoryPtr factory ) {
+    void ReporterRegistry::registerReporter( std::string const& name,
+                                             IReporterFactoryPtr factory ) {
         CATCH_ENFORCE( name.find( "::" ) == name.npos,
-                       "'::' is not allowed in reporter name: '" + name + '\'' );
-        auto ret = m_factories.emplace(name, CATCH_MOVE(factory));
-        CATCH_ENFORCE( ret.second, "reporter using '" + name + "' as name was already registered" );
+                       "'::' is not allowed in reporter name: '" + name +
+                           '\'' );
+        auto ret = m_impl->factories.emplace( name, CATCH_MOVE( factory ) );
+        CATCH_ENFORCE( ret.second,
+                       "reporter using '" + name +
+                           "' as name was already registered" );
     }
     void ReporterRegistry::registerListener(
         Detail::unique_ptr<EventListenerFactory> factory ) {
-        m_listeners.push_back( CATCH_MOVE(factory) );
+        m_impl->listeners.push_back( CATCH_MOVE( factory ) );
     }
 
-    IReporterRegistry::FactoryMap const& ReporterRegistry::getFactories() const {
-        return m_factories;
-    }
-    IReporterRegistry::Listeners const& ReporterRegistry::getListeners() const {
-        return m_listeners;
+    std::map<std::string,
+             IReporterFactoryPtr,
+             Detail::CaseInsensitiveLess> const&
+    ReporterRegistry::getFactories() const {
+        return m_impl->factories;
     }
 
-}
+    std::vector<Detail::unique_ptr<EventListenerFactory>> const&
+    ReporterRegistry::getListeners() const {
+        return m_impl->listeners;
+    }
+} // namespace Catch
 
 
 
@@ -4754,9 +5187,9 @@ namespace Catch {
         };
 
         kvPair splitKVPair(StringRef kvString) {
-            auto splitPos = static_cast<size_t>( std::distance(
-                kvString.begin(),
-                std::find( kvString.begin(), kvString.end(), '=' ) ) );
+            auto splitPos = static_cast<size_t>(
+                std::find( kvString.begin(), kvString.end(), '=' ) -
+                kvString.begin() );
 
             return { kvString.substr( 0, splitPos ),
                      kvString.substr( splitPos + 1, kvString.size() ) };
@@ -4988,146 +5421,151 @@ namespace Catch {
 namespace Catch {
 
     namespace Generators {
-        struct GeneratorTracker : TestCaseTracking::TrackerBase, IGeneratorTracker {
-            GeneratorBasePtr m_generator;
+        namespace {
+            struct GeneratorTracker final : TestCaseTracking::TrackerBase,
+                                      IGeneratorTracker {
+                GeneratorBasePtr m_generator;
+
+                GeneratorTracker(
+                    TestCaseTracking::NameAndLocation&& nameAndLocation,
+                    TrackerContext& ctx,
+                    ITracker* parent ):
+                    TrackerBase( CATCH_MOVE( nameAndLocation ), ctx, parent ) {}
+
+                static GeneratorTracker*
+                acquire( TrackerContext& ctx,
+                         TestCaseTracking::NameAndLocationRef const&
+                             nameAndLocation ) {
+                    GeneratorTracker* tracker;
+
+                    ITracker& currentTracker = ctx.currentTracker();
+                    // Under specific circumstances, the generator we want
+                    // to acquire is also the current tracker. If this is
+                    // the case, we have to avoid looking through current
+                    // tracker's children, and instead return the current
+                    // tracker.
+                    // A case where this check is important is e.g.
+                    //     for (int i = 0; i < 5; ++i) {
+                    //         int n = GENERATE(1, 2);
+                    //     }
+                    //
+                    // without it, the code above creates 5 nested generators.
+                    if ( currentTracker.nameAndLocation() == nameAndLocation ) {
+                        auto thisTracker = currentTracker.parent()->findChild(
+                            nameAndLocation );
+                        assert( thisTracker );
+                        assert( thisTracker->isGeneratorTracker() );
+                        tracker = static_cast<GeneratorTracker*>( thisTracker );
+                    } else if ( ITracker* childTracker =
+                                    currentTracker.findChild(
+                                        nameAndLocation ) ) {
+                        assert( childTracker );
+                        assert( childTracker->isGeneratorTracker() );
+                        tracker =
+                            static_cast<GeneratorTracker*>( childTracker );
+                    } else {
+                        return nullptr;
+                    }
 
-            GeneratorTracker( TestCaseTracking::NameAndLocation&& nameAndLocation, TrackerContext& ctx, ITracker* parent )
-            :   TrackerBase( CATCH_MOVE(nameAndLocation), ctx, parent )
-            {}
-            ~GeneratorTracker() override;
-
-            static GeneratorTracker* acquire( TrackerContext& ctx, TestCaseTracking::NameAndLocationRef const& nameAndLocation ) {
-                GeneratorTracker* tracker;
-
-                ITracker& currentTracker = ctx.currentTracker();
-                // Under specific circumstances, the generator we want
-                // to acquire is also the current tracker. If this is
-                // the case, we have to avoid looking through current
-                // tracker's children, and instead return the current
-                // tracker.
-                // A case where this check is important is e.g.
-                //     for (int i = 0; i < 5; ++i) {
-                //         int n = GENERATE(1, 2);
-                //     }
-                //
-                // without it, the code above creates 5 nested generators.
-                if ( currentTracker.nameAndLocation() == nameAndLocation ) {
-                    auto thisTracker =
-                        currentTracker.parent()->findChild( nameAndLocation );
-                    assert( thisTracker );
-                    assert( thisTracker->isGeneratorTracker() );
-                    tracker = static_cast<GeneratorTracker*>( thisTracker );
-                } else if ( ITracker* childTracker =
-                                currentTracker.findChild( nameAndLocation ) ) {
-                    assert( childTracker );
-                    assert( childTracker->isGeneratorTracker() );
-                    tracker = static_cast<GeneratorTracker*>( childTracker );
-                } else {
-                    return nullptr;
-                }
+                    if ( !tracker->isComplete() ) { tracker->open(); }
 
-                if( !tracker->isComplete() ) {
-                    tracker->open();
+                    return tracker;
                 }
 
-                return tracker;
-            }
-
-            // TrackerBase interface
-            bool isGeneratorTracker() const override { return true; }
-            auto hasGenerator() const -> bool override {
-                return !!m_generator;
-            }
-            void close() override {
-                TrackerBase::close();
-                // If a generator has a child (it is followed by a section)
-                // and none of its children have started, then we must wait
-                // until later to start consuming its values.
-                // This catches cases where `GENERATE` is placed between two
-                // `SECTION`s.
-                // **The check for m_children.empty cannot be removed**.
-                // doing so would break `GENERATE` _not_ followed by `SECTION`s.
-                const bool should_wait_for_child = [&]() {
-                    // No children -> nobody to wait for
-                    if ( m_children.empty() ) {
-                        return false;
-                    }
-                    // If at least one child started executing, don't wait
-                    if ( std::find_if(
-                             m_children.begin(),
-                             m_children.end(),
-                             []( TestCaseTracking::ITrackerPtr const& tracker ) {
-                                 return tracker->hasStarted();
-                             } ) != m_children.end() ) {
-                        return false;
-                    }
-
-                    // No children have started. We need to check if they _can_
-                    // start, and thus we should wait for them, or they cannot
-                    // start (due to filters), and we shouldn't wait for them
-                    ITracker* parent = m_parent;
-                    // This is safe: there is always at least one section
-                    // tracker in a test case tracking tree
-                    while ( !parent->isSectionTracker() ) {
-                        parent = parent->parent();
-                    }
-                    assert( parent &&
-                            "Missing root (test case) level section" );
-
-                    auto const& parentSection =
-                        static_cast<SectionTracker const&>( *parent );
-                    auto const& filters = parentSection.getFilters();
-                    // No filters -> no restrictions on running sections
-                    if ( filters.empty() ) {
-                        return true;
-                    }
+                // TrackerBase interface
+                bool isGeneratorTracker() const override { return true; }
+                auto hasGenerator() const -> bool override {
+                    return !!m_generator;
+                }
+                void close() override {
+                    TrackerBase::close();
+                    // If a generator has a child (it is followed by a section)
+                    // and none of its children have started, then we must wait
+                    // until later to start consuming its values.
+                    // This catches cases where `GENERATE` is placed between two
+                    // `SECTION`s.
+                    // **The check for m_children.empty cannot be removed**.
+                    // doing so would break `GENERATE` _not_ followed by
+                    // `SECTION`s.
+                    const bool should_wait_for_child = [&]() {
+                        // No children -> nobody to wait for
+                        if ( m_children.empty() ) { return false; }
+                        // If at least one child started executing, don't wait
+                        if ( std::find_if(
+                                 m_children.begin(),
+                                 m_children.end(),
+                                 []( TestCaseTracking::ITrackerPtr const&
+                                         tracker ) {
+                                     return tracker->hasStarted();
+                                 } ) != m_children.end() ) {
+                            return false;
+                        }
 
-                    for ( auto const& child : m_children ) {
-                        if ( child->isSectionTracker() &&
-                             std::find(
-                                 filters.begin(),
-                                 filters.end(),
-                                 static_cast<SectionTracker const&>( *child )
-                                     .trimmedName() ) != filters.end() ) {
-                            return true;
+                        // No children have started. We need to check if they
+                        // _can_ start, and thus we should wait for them, or
+                        // they cannot start (due to filters), and we shouldn't
+                        // wait for them
+                        ITracker* parent = m_parent;
+                        // This is safe: there is always at least one section
+                        // tracker in a test case tracking tree
+                        while ( !parent->isSectionTracker() ) {
+                            parent = parent->parent();
                         }
+                        assert( parent &&
+                                "Missing root (test case) level section" );
+
+                        auto const& parentSection =
+                            static_cast<SectionTracker const&>( *parent );
+                        auto const& filters = parentSection.getFilters();
+                        // No filters -> no restrictions on running sections
+                        if ( filters.empty() ) { return true; }
+
+                        for ( auto const& child : m_children ) {
+                            if ( child->isSectionTracker() &&
+                                 std::find( filters.begin(),
+                                            filters.end(),
+                                            static_cast<SectionTracker const&>(
+                                                *child )
+                                                .trimmedName() ) !=
+                                     filters.end() ) {
+                                return true;
+                            }
+                        }
+                        return false;
+                    }();
+
+                    // This check is a bit tricky, because m_generator->next()
+                    // has a side-effect, where it consumes generator's current
+                    // value, but we do not want to invoke the side-effect if
+                    // this generator is still waiting for any child to start.
+                    assert( m_generator && "Tracker without generator" );
+                    if ( should_wait_for_child ||
+                         ( m_runState == CompletedSuccessfully &&
+                           m_generator->countedNext() ) ) {
+                        m_children.clear();
+                        m_runState = Executing;
                     }
-                    return false;
-                }();
-
-                // This check is a bit tricky, because m_generator->next()
-                // has a side-effect, where it consumes generator's current
-                // value, but we do not want to invoke the side-effect if
-                // this generator is still waiting for any child to start.
-                assert( m_generator && "Tracker without generator" );
-                if ( should_wait_for_child ||
-                     ( m_runState == CompletedSuccessfully &&
-                       m_generator->countedNext() ) ) {
-                    m_children.clear();
-                    m_runState = Executing;
                 }
-            }
 
-            // IGeneratorTracker interface
-            auto getGenerator() const -> GeneratorBasePtr const& override {
-                return m_generator;
-            }
-            void setGenerator( GeneratorBasePtr&& generator ) override {
-                m_generator = CATCH_MOVE( generator );
-            }
-        };
-        GeneratorTracker::~GeneratorTracker() = default;
+                // IGeneratorTracker interface
+                auto getGenerator() const -> GeneratorBasePtr const& override {
+                    return m_generator;
+                }
+                void setGenerator( GeneratorBasePtr&& generator ) override {
+                    m_generator = CATCH_MOVE( generator );
+                }
+            };
+        } // namespace
     }
 
     RunContext::RunContext(IConfig const* _config, IEventListenerPtr&& reporter)
     :   m_runInfo(_config->name()),
-        m_context(getCurrentMutableContext()),
         m_config(_config),
         m_reporter(CATCH_MOVE(reporter)),
         m_lastAssertionInfo{ StringRef(), SourceLineInfo("",0), StringRef(), ResultDisposition::Normal },
         m_includeSuccessfulResults( m_config->includeSuccessfulResults() || m_reporter->getPreferences().shouldReportAllAssertions )
     {
-        m_context.setResultCapture(this);
+        getCurrentMutableContext().setResultCapture( this );
         m_reporter->testRunStarting(m_runInfo);
     }
 
@@ -5222,7 +5660,7 @@ namespace Catch {
     }
 
 
-    void RunContext::assertionEnded(AssertionResult const & result) {
+    void RunContext::assertionEnded(AssertionResult&& result) {
         if (result.getResultType() == ResultWas::Ok) {
             m_totals.assertions.passed++;
             m_lastAssertionPassed = true;
@@ -5244,19 +5682,27 @@ namespace Catch {
 
         m_reporter->assertionEnded(AssertionStats(result, m_messages, m_totals));
 
-        if (result.getResultType() != ResultWas::Warning)
+        if ( result.getResultType() != ResultWas::Warning ) {
             m_messageScopes.clear();
+        }
 
-        // Reset working state
-        resetAssertionInfo();
-        m_lastResult = result;
+        // Reset working state. assertion info will be reset after
+        // populateReaction is run if it is needed
+        m_lastResult = CATCH_MOVE( result );
     }
     void RunContext::resetAssertionInfo() {
         m_lastAssertionInfo.macroName = StringRef();
         m_lastAssertionInfo.capturedExpression = "{Unknown expression after the reported line}"_sr;
+        m_lastAssertionInfo.resultDisposition = ResultDisposition::Normal;
+    }
+
+    void RunContext::notifyAssertionStarted( AssertionInfo const& info ) {
+        m_reporter->assertionStarting( info );
     }
 
-    bool RunContext::sectionStarted(StringRef sectionName, SourceLineInfo const& sectionLineInfo, Counts & assertions) {
+    bool RunContext::sectionStarted( StringRef sectionName,
+                                     SourceLineInfo const& sectionLineInfo,
+                                     Counts& assertions ) {
         ITracker& sectionTracker =
             SectionTracker::acquire( m_trackerContext,
                                      TestCaseTracking::NameAndLocationRef(
@@ -5394,7 +5840,8 @@ namespace Catch {
         tempResult.message = static_cast<std::string>(message);
         AssertionResult result(m_lastAssertionInfo, CATCH_MOVE(tempResult));
 
-        assertionEnded(result);
+        assertionEnded(CATCH_MOVE(result) );
+        resetAssertionInfo();
 
         handleUnfinishedSections();
 
@@ -5516,8 +5963,6 @@ namespace Catch {
         ITransientExpression const& expr,
         AssertionReaction& reaction
     ) {
-        m_reporter->assertionStarting( info );
-
         bool negated = isFalseTest( info.resultDisposition );
         bool result = expr.getResult() != negated;
 
@@ -5533,6 +5978,7 @@ namespace Catch {
             reportExpr(info, ResultWas::ExpressionFailed, &expr, negated );
             populateReaction( reaction );
         }
+        resetAssertionInfo();
     }
     void RunContext::reportExpr(
             AssertionInfo const &info,
@@ -5546,7 +5992,7 @@ namespace Catch {
         AssertionResult assertionResult{ info, CATCH_MOVE( data ) };
         assertionResult.m_resultData.lazyExpression.m_transientExpression = expr;
 
-        assertionEnded( assertionResult );
+        assertionEnded( CATCH_MOVE(assertionResult) );
     }
 
     void RunContext::handleMessage(
@@ -5555,22 +6001,23 @@ namespace Catch {
             StringRef message,
             AssertionReaction& reaction
     ) {
-        m_reporter->assertionStarting( info );
-
         m_lastAssertionInfo = info;
 
         AssertionResultData data( resultType, LazyExpression( false ) );
         data.message = static_cast<std::string>(message);
         AssertionResult assertionResult{ m_lastAssertionInfo,
                                          CATCH_MOVE( data ) };
-        assertionEnded( assertionResult );
-        if ( !assertionResult.isOk() ) {
+
+        const auto isOk = assertionResult.isOk();
+        assertionEnded( CATCH_MOVE(assertionResult) );
+        if ( !isOk ) {
             populateReaction( reaction );
         } else if ( resultType == ResultWas::ExplicitSkip ) {
             // TODO: Need to handle this explicitly, as ExplicitSkip is
             // considered "OK"
             reaction.shouldSkip = true;
         }
+        resetAssertionInfo();
     }
     void RunContext::handleUnexpectedExceptionNotThrown(
             AssertionInfo const& info,
@@ -5581,16 +6028,17 @@ namespace Catch {
 
     void RunContext::handleUnexpectedInflightException(
             AssertionInfo const& info,
-            std::string const& message,
+            std::string&& message,
             AssertionReaction& reaction
     ) {
         m_lastAssertionInfo = info;
 
         AssertionResultData data( ResultWas::ThrewException, LazyExpression( false ) );
-        data.message = message;
+        data.message = CATCH_MOVE(message);
         AssertionResult assertionResult{ info, CATCH_MOVE(data) };
-        assertionEnded( assertionResult );
+        assertionEnded( CATCH_MOVE(assertionResult) );
         populateReaction( reaction );
+        resetAssertionInfo();
     }
 
     void RunContext::populateReaction( AssertionReaction& reaction ) {
@@ -5607,7 +6055,8 @@ namespace Catch {
         AssertionResultData data( ResultWas::ThrewException, LazyExpression( false ) );
         data.message = "Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE"s;
         AssertionResult assertionResult{ info, CATCH_MOVE( data ) };
-        assertionEnded( assertionResult );
+        assertionEnded( CATCH_MOVE(assertionResult) );
+        resetAssertionInfo();
     }
     void RunContext::handleNonExpr(
             AssertionInfo const &info,
@@ -5618,10 +6067,11 @@ namespace Catch {
 
         AssertionResultData data( resultType, LazyExpression( false ) );
         AssertionResult assertionResult{ info, CATCH_MOVE( data ) };
-        assertionEnded( assertionResult );
 
-        if( !assertionResult.isOk() )
-            populateReaction( reaction );
+        const auto isOk = assertionResult.isOk();
+        assertionEnded( CATCH_MOVE(assertionResult) );
+        if ( !isOk ) { populateReaction( reaction ); }
+        resetAssertionInfo();
     }
 
 
@@ -5790,7 +6240,6 @@ namespace Catch {
 
 
 
-#include <algorithm>
 #include <ostream>
 #include <cstring>
 #include <cctype>
@@ -5814,9 +6263,9 @@ namespace Catch {
         return s.find( infix ) != std::string::npos;
     }
     void toLowerInPlace( std::string& s ) {
-        std::transform( s.begin(), s.end(), s.begin(), []( char c ) {
-            return toLower( c );
-        } );
+        for ( char& c : s ) {
+            c = toLower( c );
+        }
     }
     std::string toLower( std::string const& s ) {
         std::string lc = s;
@@ -5949,7 +6398,7 @@ namespace Catch {
 
 namespace Catch {
 
-    TagAliasRegistry::~TagAliasRegistry() {}
+    TagAliasRegistry::~TagAliasRegistry() = default;
 
     TagAlias const* TagAliasRegistry::find( std::string const& alias ) const {
         auto it = m_registry.find( alias );
@@ -6030,6 +6479,38 @@ namespace Catch {
 
 namespace Catch {
 
+    namespace {
+        static void enforceNoDuplicateTestCases(
+            std::vector<TestCaseHandle> const& tests ) {
+            auto testInfoCmp = []( TestCaseInfo const* lhs,
+                                   TestCaseInfo const* rhs ) {
+                return *lhs < *rhs;
+            };
+            std::set<TestCaseInfo const*, decltype( testInfoCmp )&> seenTests(
+                testInfoCmp );
+            for ( auto const& test : tests ) {
+                const auto infoPtr = &test.getTestCaseInfo();
+                const auto prev = seenTests.insert( infoPtr );
+                CATCH_ENFORCE( prev.second,
+                               "error: test case \""
+                                   << infoPtr->name << "\", with tags \""
+                                   << infoPtr->tagsAsString()
+                                   << "\" already defined.\n"
+                                   << "\tFirst seen at "
+                                   << ( *prev.first )->lineInfo << "\n"
+                                   << "\tRedefined at " << infoPtr->lineInfo );
+            }
+        }
+
+        static bool matchTest( TestCaseHandle const& testCase,
+                               TestSpec const& testSpec,
+                               IConfig const& config ) {
+            return testSpec.matches( testCase.getTestCaseInfo() ) &&
+                   isThrowSafe( testCase, config );
+        }
+
+    } // end unnamed namespace
+
     std::vector<TestCaseHandle> sortTests( IConfig const& config, std::vector<TestCaseHandle> const& unsortedTestCases ) {
         switch (config.runOrder()) {
         case TestRunOrder::Declared:
@@ -6047,7 +6528,6 @@ namespace Catch {
             return sorted;
         }
         case TestRunOrder::Randomized: {
-            seedRng(config);
             using TestWithHash = std::pair<TestCaseInfoHasher::hash_t, TestCaseHandle>;
 
             TestCaseInfoHasher h{ config.rngSeed() };
@@ -6086,29 +6566,6 @@ namespace Catch {
         return !testCase.getTestCaseInfo().throws() || config.allowThrows();
     }
 
-    bool matchTest( TestCaseHandle const& testCase, TestSpec const& testSpec, IConfig const& config ) {
-        return testSpec.matches( testCase.getTestCaseInfo() ) && isThrowSafe( testCase, config );
-    }
-
-    void
-    enforceNoDuplicateTestCases( std::vector<TestCaseHandle> const& tests ) {
-        auto testInfoCmp = []( TestCaseInfo const* lhs,
-                               TestCaseInfo const* rhs ) {
-            return *lhs < *rhs;
-        };
-        std::set<TestCaseInfo const*, decltype(testInfoCmp) &> seenTests(testInfoCmp);
-        for ( auto const& test : tests ) {
-            const auto infoPtr = &test.getTestCaseInfo();
-            const auto prev = seenTests.insert( infoPtr );
-            CATCH_ENFORCE(
-                prev.second,
-                "error: test case \"" << infoPtr->name << "\", with tags \""
-                    << infoPtr->tagsAsString() << "\" already defined.\n"
-                    << "\tFirst seen at " << ( *prev.first )->lineInfo << "\n"
-                    << "\tRedefined at " << infoPtr->lineInfo );
-        }
-    }
-
     std::vector<TestCaseHandle> filterTests( std::vector<TestCaseHandle> const& testCases, TestSpec const& testSpec, IConfig const& config ) {
         std::vector<TestCaseHandle> filtered;
         filtered.reserve( testCases.size() );
@@ -6149,13 +6606,6 @@ namespace Catch {
         return m_sortedFunctions;
     }
 
-
-
-    ///////////////////////////////////////////////////////////////////////////
-    void TestInvokerAsFunction::invoke() const {
-        m_testAsFunction();
-    }
-
 } // end namespace Catch
 
 
@@ -6401,6 +6851,14 @@ namespace Catch {
 #endif
     }
 
+    void throw_test_skip_exception() {
+#if !defined( CATCH_CONFIG_DISABLE_EXCEPTIONS )
+        throw Catch::TestSkipException();
+#else
+        CATCH_ERROR( "Explicitly skipping tests during runtime requires exceptions" );
+#endif
+    }
+
 } // namespace Catch
 
 
@@ -6409,9 +6867,10 @@ namespace Catch {
 #include <iterator>
 
 namespace Catch {
+    ITestInvoker::~ITestInvoker() = default;
 
     namespace {
-        StringRef extractClassName( StringRef classOrMethodName ) {
+        static StringRef extractClassName( StringRef classOrMethodName ) {
             if ( !startsWith( classOrMethodName, '&' ) ) {
                 return classOrMethodName;
             }
@@ -6438,6 +6897,18 @@ namespace Catch {
                 static_cast<std::size_t>( startIdx ),
                 static_cast<std::size_t>( classNameSize ) );
         }
+
+        class TestInvokerAsFunction final : public ITestInvoker {
+            using TestType = void ( * )();
+            TestType m_testAsFunction;
+
+        public:
+            TestInvokerAsFunction( TestType testAsFunction ) noexcept:
+                m_testAsFunction( testAsFunction ) {}
+
+            void invoke() const override { m_testAsFunction(); }
+        };
+
     } // namespace
 
     Detail::unique_ptr<ITestInvoker> makeTestInvoker( void(*testAsFunction)() ) {
@@ -6919,23 +7390,36 @@ namespace Catch {
             return os;
         }
 
-        Columns Column::operator+( Column const& other ) {
+        Columns operator+(Column const& lhs, Column const& rhs) {
             Columns cols;
-            cols += *this;
-            cols += other;
+            cols += lhs;
+            cols += rhs;
             return cols;
         }
-
-        Columns& Columns::operator+=( Column const& col ) {
-            m_columns.push_back( col );
-            return *this;
+        Columns operator+(Column&& lhs, Column&& rhs) {
+            Columns cols;
+            cols += CATCH_MOVE( lhs );
+            cols += CATCH_MOVE( rhs );
+            return cols;
         }
 
-        Columns Columns::operator+( Column const& col ) {
-            Columns combined = *this;
-            combined += col;
+        Columns& operator+=(Columns& lhs, Column const& rhs) {
+            lhs.m_columns.push_back( rhs );
+            return lhs;
+        }
+        Columns& operator+=(Columns& lhs, Column&& rhs) {
+            lhs.m_columns.push_back( CATCH_MOVE(rhs) );
+            return lhs;
+        }
+        Columns operator+( Columns const& lhs, Column const& rhs ) {
+            auto combined( lhs );
+            combined += rhs;
             return combined;
         }
+        Columns operator+( Columns&& lhs, Column&& rhs ) {
+            lhs += CATCH_MOVE( rhs );
+            return CATCH_MOVE( lhs );
+        }
 
     } // namespace TextFlow
 } // namespace Catch
@@ -7431,26 +7915,11 @@ namespace {
         return ulpDist <= maxUlpDiff;
     }
 
-#if defined(CATCH_CONFIG_GLOBAL_NEXTAFTER)
-
-    float nextafter(float x, float y) {
-        return ::nextafterf(x, y);
-    }
-
-    double nextafter(double x, double y) {
-        return ::nextafter(x, y);
-    }
-
-#endif // ^^^ CATCH_CONFIG_GLOBAL_NEXTAFTER ^^^
 
 template <typename FP>
 FP step(FP start, FP direction, uint64_t steps) {
     for (uint64_t i = 0; i < steps; ++i) {
-#if defined(CATCH_CONFIG_GLOBAL_NEXTAFTER)
         start = Catch::nextafter(start, direction);
-#else
-        start = std::nextafter(start, direction);
-#endif
     }
     return start;
 }
@@ -7824,7 +8293,7 @@ namespace Catch {
 
 namespace Catch {
 
-    AutomakeReporter::~AutomakeReporter() {}
+    AutomakeReporter::~AutomakeReporter() = default;
 
     void AutomakeReporter::testCaseEnded(TestCaseStats const& _testCaseStats) {
         // Possible values to emit are PASS, XFAIL, SKIP, FAIL, XPASS and ERROR.
@@ -8046,7 +8515,7 @@ private:
             return;
 
         const auto itEnd = messages.cend();
-        const auto N = static_cast<std::size_t>(std::distance(itMessage, itEnd));
+        const auto N = static_cast<std::size_t>(itEnd - itMessage);
 
         stream << colourImpl->guardColour( colour ) << " with "
                << pluralise( N, "message"_sr ) << ':';
@@ -8124,7 +8593,7 @@ private:
             StreamingReporterBase::testRunEnded( _testRunStats );
         }
 
-        CompactReporter::~CompactReporter() {}
+        CompactReporter::~CompactReporter() = default;
 
 } // end namespace Catch
 
@@ -8319,15 +8788,9 @@ findMax( std::size_t& i, std::size_t& j, std::size_t& k, std::size_t& l ) {
         return l;
 }
 
-enum class Justification { Left, Right };
-
-struct ColumnInfo {
-    std::string name;
-    std::size_t width;
-    Justification justification;
-};
 struct ColumnBreak {};
 struct RowBreak {};
+struct OutputFlush {};
 
 class Duration {
     enum class Unit {
@@ -8402,6 +8865,14 @@ public:
 };
 } // end anon namespace
 
+enum class Justification { Left, Right };
+
+struct ColumnInfo {
+    std::string name;
+    std::size_t width;
+    Justification justification;
+};
+
 class TablePrinter {
     std::ostream& m_os;
     std::vector<ColumnInfo> m_columnInfos;
@@ -8424,11 +8895,10 @@ public:
             *this << RowBreak();
 
 			TextFlow::Columns headerCols;
-			auto spacer = TextFlow::Spacer(2);
 			for (auto const& info : m_columnInfos) {
                 assert(info.width > 2);
 				headerCols += TextFlow::Column(info.name).width(info.width - 2);
-				headerCols += spacer;
+                headerCols += TextFlow::Spacer( 2 );
 			}
 			m_os << headerCols << '\n';
 
@@ -8444,12 +8914,12 @@ public:
     }
 
     template<typename T>
-    friend TablePrinter& operator << (TablePrinter& tp, T const& value) {
+    friend TablePrinter& operator<< (TablePrinter& tp, T const& value) {
         tp.m_oss << value;
         return tp;
     }
 
-    friend TablePrinter& operator << (TablePrinter& tp, ColumnBreak) {
+    friend TablePrinter& operator<< (TablePrinter& tp, ColumnBreak) {
         auto colStr = tp.m_oss.str();
         const auto strSize = colStr.size();
         tp.m_oss.str("");
@@ -8471,13 +8941,18 @@ public:
         return tp;
     }
 
-    friend TablePrinter& operator << (TablePrinter& tp, RowBreak) {
+    friend TablePrinter& operator<< (TablePrinter& tp, RowBreak) {
         if (tp.m_currentColumn > 0) {
             tp.m_os << '\n';
             tp.m_currentColumn = -1;
         }
         return tp;
     }
+
+    friend TablePrinter& operator<<(TablePrinter& tp, OutputFlush) {
+        tp.m_os << std::flush;
+        return tp;
+    }
 };
 
 ConsoleReporter::ConsoleReporter(ReporterConfig&& config):
@@ -8499,7 +8974,7 @@ ConsoleReporter::ConsoleReporter(ReporterConfig&& config):
                 { "benchmark name", CATCH_CONFIG_CONSOLE_WIDTH - 43, Justification::Left },
                 { "samples      mean       std dev", 14, Justification::Right },
                 { "iterations   low mean   low std dev", 14, Justification::Right },
-                { "estimated    high mean  high std dev", 14, Justification::Right }
+                { "est run time high mean  high std dev", 14, Justification::Right }
             };
         }
     }())) {}
@@ -8583,8 +9058,11 @@ void ConsoleReporter::benchmarkPreparing( StringRef name ) {
 void ConsoleReporter::benchmarkStarting(BenchmarkInfo const& info) {
     (*m_tablePrinter) << info.samples << ColumnBreak()
         << info.iterations << ColumnBreak();
-    if (!m_config->benchmarkNoAnalysis())
-        (*m_tablePrinter) << Duration(info.estimatedDuration) << ColumnBreak();
+    if ( !m_config->benchmarkNoAnalysis() ) {
+        ( *m_tablePrinter )
+            << Duration( info.estimatedDuration ) << ColumnBreak();
+    }
+    ( *m_tablePrinter ) << OutputFlush{};
 }
 void ConsoleReporter::benchmarkEnded(BenchmarkStats<> const& stats) {
     if (m_config->benchmarkNoAnalysis())
@@ -9280,6 +9758,366 @@ namespace Catch {
 } // namespace Catch
 
 
+//
+
+namespace Catch {
+    namespace {
+        void writeSourceInfo( JsonObjectWriter& writer,
+                              SourceLineInfo const& sourceInfo ) {
+            auto source_location_writer =
+                writer.write( "source-location"_sr ).writeObject();
+            source_location_writer.write( "filename"_sr )
+                .write( sourceInfo.file );
+            source_location_writer.write( "line"_sr ).write( sourceInfo.line );
+        }
+
+        void writeTags( JsonArrayWriter writer, std::vector<Tag> const& tags ) {
+            for ( auto const& tag : tags ) {
+                writer.write( tag.original );
+            }
+        }
+
+        void writeProperties( JsonArrayWriter writer,
+                              TestCaseInfo const& info ) {
+            if ( info.isHidden() ) { writer.write( "is-hidden"_sr ); }
+            if ( info.okToFail() ) { writer.write( "ok-to-fail"_sr ); }
+            if ( info.expectedToFail() ) {
+                writer.write( "expected-to-fail"_sr );
+            }
+            if ( info.throws() ) { writer.write( "throws"_sr ); }
+        }
+
+    } // namespace
+
+    JsonReporter::JsonReporter( ReporterConfig&& config ):
+        StreamingReporterBase{ CATCH_MOVE( config ) } {
+
+        m_preferences.shouldRedirectStdOut = true;
+        // TBD: Do we want to report all assertions? XML reporter does
+        //      not, but for machine-parseable reporters I think the answer
+        //      should be yes.
+        m_preferences.shouldReportAllAssertions = true;
+
+        m_objectWriters.emplace( m_stream );
+        m_writers.emplace( Writer::Object );
+        auto& writer = m_objectWriters.top();
+
+        writer.write( "version"_sr ).write( 1 );
+
+        {
+            auto metadata_writer = writer.write( "metadata"_sr ).writeObject();
+            metadata_writer.write( "name"_sr ).write( m_config->name() );
+            metadata_writer.write( "rng-seed"_sr ).write( m_config->rngSeed() );
+            metadata_writer.write( "catch2-version"_sr )
+                .write( libraryVersion() );
+            if ( m_config->testSpec().hasFilters() ) {
+                metadata_writer.write( "filters"_sr )
+                    .write( m_config->testSpec() );
+            }
+        }
+    }
+
+    JsonReporter::~JsonReporter() {
+        endListing();
+        // TODO: Ensure this closes the top level object, add asserts
+        assert( m_writers.size() == 1 && "Only the top level object should be open" );
+        assert( m_writers.top() == Writer::Object );
+        endObject();
+        m_stream << '\n' << std::flush;
+        assert( m_writers.empty() );
+    }
+
+    JsonArrayWriter& JsonReporter::startArray() {
+        m_arrayWriters.emplace( m_arrayWriters.top().writeArray() );
+        m_writers.emplace( Writer::Array );
+        return m_arrayWriters.top();
+    }
+    JsonArrayWriter& JsonReporter::startArray( StringRef key ) {
+        m_arrayWriters.emplace(
+            m_objectWriters.top().write( key ).writeArray() );
+        m_writers.emplace( Writer::Array );
+        return m_arrayWriters.top();
+    }
+
+    JsonObjectWriter& JsonReporter::startObject() {
+        m_objectWriters.emplace( m_arrayWriters.top().writeObject() );
+        m_writers.emplace( Writer::Object );
+        return m_objectWriters.top();
+    }
+    JsonObjectWriter& JsonReporter::startObject( StringRef key ) {
+        m_objectWriters.emplace(
+            m_objectWriters.top().write( key ).writeObject() );
+        m_writers.emplace( Writer::Object );
+        return m_objectWriters.top();
+    }
+
+    void JsonReporter::endObject() {
+        assert( isInside( Writer::Object ) );
+        m_objectWriters.pop();
+        m_writers.pop();
+    }
+    void JsonReporter::endArray() {
+        assert( isInside( Writer::Array ) );
+        m_arrayWriters.pop();
+        m_writers.pop();
+    }
+
+    bool JsonReporter::isInside( Writer writer ) {
+        return !m_writers.empty() && m_writers.top() == writer;
+    }
+
+    void JsonReporter::startListing() {
+        if ( !m_startedListing ) { startObject( "listings"_sr ); }
+        m_startedListing = true;
+    }
+    void JsonReporter::endListing() {
+        if ( m_startedListing ) { endObject(); }
+        m_startedListing = false;
+    }
+
+    std::string JsonReporter::getDescription() {
+        return "Outputs listings as JSON. Test listing is Work-in-Progress!";
+    }
+
+    void JsonReporter::testRunStarting( TestRunInfo const& testInfo ) {
+        StreamingReporterBase::testRunStarting( testInfo );
+        endListing();
+
+        assert( isInside( Writer::Object ) );
+        startObject( "test-run"_sr );
+        startArray( "test-cases"_sr );
+    }
+
+     static void writeCounts( JsonObjectWriter&& writer, Counts const& counts ) {
+        writer.write( "passed"_sr ).write( counts.passed );
+        writer.write( "failed"_sr ).write( counts.failed );
+        writer.write( "fail-but-ok"_sr ).write( counts.failedButOk );
+        writer.write( "skipped"_sr ).write( counts.skipped );
+    }
+
+    void JsonReporter::testRunEnded(TestRunStats const& runStats) {
+        assert( isInside( Writer::Array ) );
+        // End "test-cases"
+        endArray();
+
+        {
+            auto totals =
+                m_objectWriters.top().write( "totals"_sr ).writeObject();
+            writeCounts( totals.write( "assertions"_sr ).writeObject(),
+                         runStats.totals.assertions );
+            writeCounts( totals.write( "test-cases"_sr ).writeObject(),
+                         runStats.totals.testCases );
+        }
+
+        // End the "test-run" object
+        endObject();
+    }
+
+    void JsonReporter::testCaseStarting( TestCaseInfo const& tcInfo ) {
+        StreamingReporterBase::testCaseStarting( tcInfo );
+
+        assert( isInside( Writer::Array ) &&
+                "We should be in the 'test-cases' array" );
+        startObject();
+        // "test-info" prelude
+        {
+            auto testInfo =
+                m_objectWriters.top().write( "test-info"_sr ).writeObject();
+            // TODO: handle testName vs className!!
+            testInfo.write( "name"_sr ).write( tcInfo.name );
+            writeSourceInfo(testInfo, tcInfo.lineInfo);
+            writeTags( testInfo.write( "tags"_sr ).writeArray(), tcInfo.tags );
+            writeProperties( testInfo.write( "properties"_sr ).writeArray(),
+                             tcInfo );
+        }
+
+
+        // Start the array for individual test runs (testCasePartial pairs)
+        startArray( "runs"_sr );
+    }
+
+    void JsonReporter::testCaseEnded( TestCaseStats const& tcStats ) {
+        StreamingReporterBase::testCaseEnded( tcStats );
+
+        // We need to close the 'runs' array before finishing the test case
+        assert( isInside( Writer::Array ) );
+        endArray();
+
+        {
+            auto totals =
+                m_objectWriters.top().write( "totals"_sr ).writeObject();
+            writeCounts( totals.write( "assertions"_sr ).writeObject(),
+                         tcStats.totals.assertions );
+            // We do not write the test case totals, because there will always be just one test case here.
+            // TODO: overall "result" -> success, skip, fail here? Or in partial result?
+        }
+        // We do not write out stderr/stdout, because we instead wrote those out in partial runs
+
+        // TODO: aborting?
+
+        // And we also close this test case's object
+        assert( isInside( Writer::Object ) );
+        endObject();
+    }
+
+    void JsonReporter::testCasePartialStarting( TestCaseInfo const& /*tcInfo*/,
+                                                uint64_t index ) {
+        startObject();
+        m_objectWriters.top().write( "run-idx"_sr ).write( index );
+        startArray( "path"_sr );
+        // TODO: we want to delay most of the printing to the 'root' section
+        // TODO: childSection key name?
+    }
+
+    void JsonReporter::testCasePartialEnded( TestCaseStats const& tcStats,
+                                             uint64_t /*index*/ ) {
+        // Fixme: the top level section handles this.
+        //// path object
+        endArray();
+        if ( !tcStats.stdOut.empty() ) {
+            m_objectWriters.top()
+                .write( "captured-stdout"_sr )
+                .write( tcStats.stdOut );
+        }
+        if ( !tcStats.stdErr.empty() ) {
+            m_objectWriters.top()
+                .write( "captured-stderr"_sr )
+                .write( tcStats.stdErr );
+        }
+        {
+            auto totals =
+                m_objectWriters.top().write( "totals"_sr ).writeObject();
+            writeCounts( totals.write( "assertions"_sr ).writeObject(),
+                         tcStats.totals.assertions );
+            // We do not write the test case totals, because there will
+            // always be just one test case here.
+            // TODO: overall "result" -> success, skip, fail here? Or in
+            // partial result?
+        }
+        // TODO: aborting?
+        // run object
+        endObject();
+    }
+
+    void JsonReporter::sectionStarting( SectionInfo const& sectionInfo ) {
+        assert( isInside( Writer::Array ) &&
+                "Section should always start inside an object" );
+        // We want to nest top level sections, even though it shares name
+        // and source loc with the TEST_CASE
+        auto& sectionObject = startObject();
+        sectionObject.write( "kind"_sr ).write( "section"_sr );
+        sectionObject.write( "name"_sr ).write( sectionInfo.name );
+        writeSourceInfo( m_objectWriters.top(), sectionInfo.lineInfo );
+
+
+        // TBD: Do we want to create this event lazily? It would become
+        //      rather complex, but we could do it, and it would look
+        //      better for empty sections. OTOH, empty sections should
+        //      be rare.
+        startArray( "path"_sr );
+    }
+    void JsonReporter::sectionEnded( SectionStats const& /*sectionStats */) {
+        // End the subpath array
+        endArray();
+        // TODO: metadata
+        // TODO: what info do we have here?
+
+        // End the section object
+        endObject();
+    }
+
+    void JsonReporter::assertionStarting( AssertionInfo const& /*assertionInfo*/ ) {}
+    void JsonReporter::assertionEnded( AssertionStats const& assertionStats ) {
+        // TODO: There is lot of different things to handle here, but
+        //       we can fill it in later, after we show that the basic
+        //       outline and streaming reporter impl works well enough.
+        //if ( !m_config->includeSuccessfulResults()
+        //    && assertionStats.assertionResult.isOk() ) {
+        //    return;
+        //}
+        assert( isInside( Writer::Array ) );
+        auto assertionObject = m_arrayWriters.top().writeObject();
+
+        assertionObject.write( "kind"_sr ).write( "assertion"_sr );
+        writeSourceInfo( assertionObject,
+                         assertionStats.assertionResult.getSourceInfo() );
+        assertionObject.write( "status"_sr )
+            .write( assertionStats.assertionResult.isOk() );
+        // TODO: handling of result.
+        // TODO: messages
+        // TODO: totals?
+    }
+
+
+    void JsonReporter::benchmarkPreparing( StringRef name ) { (void)name; }
+    void JsonReporter::benchmarkStarting( BenchmarkInfo const& ) {}
+    void JsonReporter::benchmarkEnded( BenchmarkStats<> const& ) {}
+    void JsonReporter::benchmarkFailed( StringRef error ) { (void)error; }
+
+    void JsonReporter::listReporters(
+        std::vector<ReporterDescription> const& descriptions ) {
+        startListing();
+
+        auto writer =
+            m_objectWriters.top().write( "reporters"_sr ).writeArray();
+        for ( auto const& desc : descriptions ) {
+            auto desc_writer = writer.writeObject();
+            desc_writer.write( "name"_sr ).write( desc.name );
+            desc_writer.write( "description"_sr ).write( desc.description );
+        }
+    }
+    void JsonReporter::listListeners(
+        std::vector<ListenerDescription> const& descriptions ) {
+        startListing();
+
+        auto writer =
+            m_objectWriters.top().write( "listeners"_sr ).writeArray();
+
+        for ( auto const& desc : descriptions ) {
+            auto desc_writer = writer.writeObject();
+            desc_writer.write( "name"_sr ).write( desc.name );
+            desc_writer.write( "description"_sr ).write( desc.description );
+        }
+    }
+    void JsonReporter::listTests( std::vector<TestCaseHandle> const& tests ) {
+        startListing();
+
+        auto writer = m_objectWriters.top().write( "tests"_sr ).writeArray();
+
+        for ( auto const& test : tests ) {
+            auto desc_writer = writer.writeObject();
+            auto const& info = test.getTestCaseInfo();
+
+            desc_writer.write( "name"_sr ).write( info.name );
+            desc_writer.write( "class-name"_sr ).write( info.className );
+            {
+                auto tag_writer = desc_writer.write( "tags"_sr ).writeArray();
+                for ( auto const& tag : info.tags ) {
+                    tag_writer.write( tag.original );
+                }
+            }
+            writeSourceInfo( desc_writer, info.lineInfo );
+        }
+    }
+    void JsonReporter::listTags( std::vector<TagInfo> const& tags ) {
+        startListing();
+
+        auto writer = m_objectWriters.top().write( "tags"_sr ).writeArray();
+        for ( auto const& tag : tags ) {
+            auto tag_writer = writer.writeObject();
+            {
+                auto aliases_writer =
+                    tag_writer.write( "aliases"_sr ).writeArray();
+                for ( auto alias : tag.spellings ) {
+                    aliases_writer.write( alias );
+                }
+            }
+            tag_writer.write( "count"_sr ).write( tag.count );
+        }
+    }
+} // namespace Catch
+
+
 
 
 #include <cassert>
@@ -9299,6 +10137,8 @@ namespace Catch {
             gmtime_s(&timeInfo, &rawtime);
 #elif defined (CATCH_PLATFORM_PLAYSTATION)
             gmtime_s(&rawtime, &timeInfo);
+#elif defined (__IAR_SYSTEMS_ICC__)
+            timeInfo = *std::gmtime(&rawtime);
 #else
             gmtime_r(&rawtime, &timeInfo);
 #endif
@@ -9559,7 +10399,7 @@ namespace Catch {
                 }
             }
 
-            if( !result.getMessage().empty() )
+            if( result.hasMessage() )
                 rss << result.getMessage() << '\n';
             for( auto const& msg : stats.infoMessages )
                 if( msg.type == ResultWas::Info )
@@ -9678,7 +10518,6 @@ namespace Catch {
         }
     }
 
-    // The return value indicates if the messages buffer should be cleared:
     void MultiReporter::assertionEnded( AssertionStats const& assertionStats ) {
         const bool reportByDefault =
             assertionStats.assertionResult.getResultType() != ResultWas::Ok ||
@@ -9781,6 +10620,11 @@ namespace Catch {
             }
         }
 
+        void registerListenerImpl( Detail::unique_ptr<EventListenerFactory> listenerFactory ) {
+            getMutableRegistryHub().registerListener( CATCH_MOVE(listenerFactory) );
+        }
+
+
     } // namespace Detail
 } // namespace Catch
 
@@ -9920,7 +10764,7 @@ namespace Catch {
                 }
             }
 
-            if (!result.getMessage().empty())
+            if (result.hasMessage())
                 textRss << result.getMessage() << '\n';
 
             for (auto const& msg : stats.infoMessages)
@@ -9954,7 +10798,6 @@ namespace Catch {
 
 
 #include <algorithm>
-#include <iterator>
 #include <ostream>
 
 namespace Catch {
@@ -10105,7 +10948,7 @@ namespace Catch {
 
                 // using messages.end() directly (or auto) yields compilation error:
                 std::vector<MessageInfo>::const_iterator itEnd = messages.end();
-                const std::size_t N = static_cast<std::size_t>(std::distance(itMessage, itEnd));
+                const std::size_t N = static_cast<std::size_t>(itEnd - itMessage);
 
                 stream << colourImpl->guardColour( colour ) << " with "
                        << pluralise( N, "message"_sr ) << ':';
@@ -10203,7 +11046,7 @@ namespace Catch {
     } // end anonymous namespace
 
 
-    TeamCityReporter::~TeamCityReporter() {}
+    TeamCityReporter::~TeamCityReporter() = default;
 
     void TeamCityReporter::testRunStarting( TestRunInfo const& runInfo ) {
         m_stream << "##teamcity[testSuiteStarted name='" << escape( runInfo.name )
@@ -10377,7 +11220,7 @@ namespace Catch {
         m_xml.startElement("Catch2TestRun")
              .writeAttribute("name"_sr, m_config->name())
              .writeAttribute("rng-seed"_sr, m_config->rngSeed())
-             .writeAttribute("xml-format-version"_sr, 2)
+             .writeAttribute("xml-format-version"_sr, 3)
              .writeAttribute("catch2-version"_sr, libraryVersion());
         if ( m_config->testSpec().hasFilters() ) {
             m_xml.writeAttribute( "filters"_sr, m_config->testSpec() );
@@ -10419,11 +11262,13 @@ namespace Catch {
             // Print any info messages in <Info> tags.
             for( auto const& msg : assertionStats.infoMessages ) {
                 if( msg.type == ResultWas::Info && includeResults ) {
-                    m_xml.scopedElement( "Info" )
-                            .writeText( msg.message );
+                    auto t = m_xml.scopedElement( "Info" );
+                    writeSourceInfo( msg.lineInfo );
+                    t.writeText( msg.message );
                 } else if ( msg.type == ResultWas::Warning ) {
-                    m_xml.scopedElement( "Warning" )
-                            .writeText( msg.message );
+                    auto t = m_xml.scopedElement( "Warning" );
+                    writeSourceInfo( msg.lineInfo );
+                    t.writeText( msg.message );
                 }
             }
         }
@@ -10553,26 +11398,23 @@ namespace Catch {
     }
 
     void XmlReporter::benchmarkEnded(BenchmarkStats<> const& benchmarkStats) {
-        m_xml.startElement("mean")
+        m_xml.scopedElement("mean")
             .writeAttribute("value"_sr, benchmarkStats.mean.point.count())
             .writeAttribute("lowerBound"_sr, benchmarkStats.mean.lower_bound.count())
             .writeAttribute("upperBound"_sr, benchmarkStats.mean.upper_bound.count())
             .writeAttribute("ci"_sr, benchmarkStats.mean.confidence_interval);
-        m_xml.endElement();
-        m_xml.startElement("standardDeviation")
+        m_xml.scopedElement("standardDeviation")
             .writeAttribute("value"_sr, benchmarkStats.standardDeviation.point.count())
             .writeAttribute("lowerBound"_sr, benchmarkStats.standardDeviation.lower_bound.count())
             .writeAttribute("upperBound"_sr, benchmarkStats.standardDeviation.upper_bound.count())
             .writeAttribute("ci"_sr, benchmarkStats.standardDeviation.confidence_interval);
-        m_xml.endElement();
-        m_xml.startElement("outliers")
+        m_xml.scopedElement("outliers")
             .writeAttribute("variance"_sr, benchmarkStats.outlierVariance)
             .writeAttribute("lowMild"_sr, benchmarkStats.outliers.low_mild)
             .writeAttribute("lowSevere"_sr, benchmarkStats.outliers.low_severe)
             .writeAttribute("highMild"_sr, benchmarkStats.outliers.high_mild)
             .writeAttribute("highSevere"_sr, benchmarkStats.outliers.high_severe);
         m_xml.endElement();
-        m_xml.endElement();
     }
 
     void XmlReporter::benchmarkFailed(StringRef error) {
diff --git a/packages/Catch2/extras/catch_amalgamated.hpp b/packages/Catch2/extras/catch_amalgamated.hpp
index 321cec5dac0647cc2065db5983ddd4ad9898438f..fdba759a785ed36eb320db3b94569e34b1ebc390 100644
--- a/packages/Catch2/extras/catch_amalgamated.hpp
+++ b/packages/Catch2/extras/catch_amalgamated.hpp
@@ -1,3 +1,4 @@
+
 //              Copyright Catch2 Authors
 // Distributed under the Boost Software License, Version 1.0.
 //   (See accompanying file LICENSE.txt or copy at
@@ -5,8 +6,8 @@
 
 // SPDX-License-Identifier: BSL-1.0
 
-//  Catch v3.3.2
-//  Generated: 2023-02-26 10:28:46.785908
+//  Catch v3.5.2
+//  Generated: 2024-01-15 14:06:34.036475
 //  ----------------------------------------------------------
 //  This file is an amalgamation of multiple different files.
 //  You probably shouldn't edit it directly.
@@ -59,238 +60,6 @@
 
 
 
-#ifndef CATCH_INTERFACES_CONFIG_HPP_INCLUDED
-#define CATCH_INTERFACES_CONFIG_HPP_INCLUDED
-
-
-
-#ifndef CATCH_NONCOPYABLE_HPP_INCLUDED
-#define CATCH_NONCOPYABLE_HPP_INCLUDED
-
-namespace Catch {
-    namespace Detail {
-
-        //! Deriving classes become noncopyable and nonmovable
-        class NonCopyable {
-            NonCopyable( NonCopyable const& ) = delete;
-            NonCopyable( NonCopyable&& ) = delete;
-            NonCopyable& operator=( NonCopyable const& ) = delete;
-            NonCopyable& operator=( NonCopyable&& ) = delete;
-
-        protected:
-            NonCopyable() noexcept = default;
-        };
-
-    } // namespace Detail
-} // namespace Catch
-
-#endif // CATCH_NONCOPYABLE_HPP_INCLUDED
-
-
-#ifndef CATCH_STRINGREF_HPP_INCLUDED
-#define CATCH_STRINGREF_HPP_INCLUDED
-
-#include <cstddef>
-#include <string>
-#include <iosfwd>
-#include <cassert>
-
-#include <cstring>
-
-namespace Catch {
-
-    /// A non-owning string class (similar to the forthcoming std::string_view)
-    /// Note that, because a StringRef may be a substring of another string,
-    /// it may not be null terminated.
-    class StringRef {
-    public:
-        using size_type = std::size_t;
-        using const_iterator = const char*;
-
-    private:
-        static constexpr char const* const s_empty = "";
-
-        char const* m_start = s_empty;
-        size_type m_size = 0;
-
-    public: // construction
-        constexpr StringRef() noexcept = default;
-
-        StringRef( char const* rawChars ) noexcept;
-
-        constexpr StringRef( char const* rawChars, size_type size ) noexcept
-        :   m_start( rawChars ),
-            m_size( size )
-        {}
-
-        StringRef( std::string const& stdString ) noexcept
-        :   m_start( stdString.c_str() ),
-            m_size( stdString.size() )
-        {}
-
-        explicit operator std::string() const {
-            return std::string(m_start, m_size);
-        }
-
-    public: // operators
-        auto operator == ( StringRef other ) const noexcept -> bool {
-            return m_size == other.m_size
-                && (std::memcmp( m_start, other.m_start, m_size ) == 0);
-        }
-        auto operator != (StringRef other) const noexcept -> bool {
-            return !(*this == other);
-        }
-
-        constexpr auto operator[] ( size_type index ) const noexcept -> char {
-            assert(index < m_size);
-            return m_start[index];
-        }
-
-        bool operator<(StringRef rhs) const noexcept;
-
-    public: // named queries
-        constexpr auto empty() const noexcept -> bool {
-            return m_size == 0;
-        }
-        constexpr auto size() const noexcept -> size_type {
-            return m_size;
-        }
-
-        // Returns a substring of [start, start + length).
-        // If start + length > size(), then the substring is [start, start + size()).
-        // If start > size(), then the substring is empty.
-        constexpr StringRef substr(size_type start, size_type length) const noexcept {
-            if (start < m_size) {
-                const auto shortened_size = m_size - start;
-                return StringRef(m_start + start, (shortened_size < length) ? shortened_size : length);
-            } else {
-                return StringRef();
-            }
-        }
-
-        // Returns the current start pointer. May not be null-terminated.
-        constexpr char const* data() const noexcept {
-            return m_start;
-        }
-
-        constexpr const_iterator begin() const { return m_start; }
-        constexpr const_iterator end() const { return m_start + m_size; }
-
-
-        friend std::string& operator += (std::string& lhs, StringRef sr);
-        friend std::ostream& operator << (std::ostream& os, StringRef sr);
-        friend std::string operator+(StringRef lhs, StringRef rhs);
-
-        /**
-         * Provides a three-way comparison with rhs
-         *
-         * Returns negative number if lhs < rhs, 0 if lhs == rhs, and a positive
-         * number if lhs > rhs
-         */
-        int compare( StringRef rhs ) const;
-    };
-
-
-    constexpr auto operator ""_sr( char const* rawChars, std::size_t size ) noexcept -> StringRef {
-        return StringRef( rawChars, size );
-    }
-} // namespace Catch
-
-constexpr auto operator ""_catch_sr( char const* rawChars, std::size_t size ) noexcept -> Catch::StringRef {
-    return Catch::StringRef( rawChars, size );
-}
-
-#endif // CATCH_STRINGREF_HPP_INCLUDED
-
-#include <chrono>
-#include <iosfwd>
-#include <string>
-#include <vector>
-
-namespace Catch {
-
-    enum class Verbosity {
-        Quiet = 0,
-        Normal,
-        High
-    };
-
-    struct WarnAbout { enum What {
-        Nothing = 0x00,
-        //! A test case or leaf section did not run any assertions
-        NoAssertions = 0x01,
-        //! A command line test spec matched no test cases
-        UnmatchedTestSpec = 0x02,
-    }; };
-
-    enum class ShowDurations {
-        DefaultForReporter,
-        Always,
-        Never
-    };
-    enum class TestRunOrder {
-        Declared,
-        LexicographicallySorted,
-        Randomized
-    };
-    enum class ColourMode : std::uint8_t {
-        //! Let Catch2 pick implementation based on platform detection
-        PlatformDefault,
-        //! Use ANSI colour code escapes
-        ANSI,
-        //! Use Win32 console colour API
-        Win32,
-        //! Don't use any colour
-        None
-    };
-    struct WaitForKeypress { enum When {
-        Never,
-        BeforeStart = 1,
-        BeforeExit = 2,
-        BeforeStartAndExit = BeforeStart | BeforeExit
-    }; };
-
-    class TestSpec;
-    class IStream;
-
-    class IConfig : public Detail::NonCopyable {
-    public:
-        virtual ~IConfig();
-
-        virtual bool allowThrows() const = 0;
-        virtual StringRef name() const = 0;
-        virtual bool includeSuccessfulResults() const = 0;
-        virtual bool shouldDebugBreak() const = 0;
-        virtual bool warnAboutMissingAssertions() const = 0;
-        virtual bool warnAboutUnmatchedTestSpecs() const = 0;
-        virtual bool zeroTestsCountAsSuccess() const = 0;
-        virtual int abortAfter() const = 0;
-        virtual bool showInvisibles() const = 0;
-        virtual ShowDurations showDurations() const = 0;
-        virtual double minDuration() const = 0;
-        virtual TestSpec const& testSpec() const = 0;
-        virtual bool hasTestFilters() const = 0;
-        virtual std::vector<std::string> const& getTestsOrTags() const = 0;
-        virtual TestRunOrder runOrder() const = 0;
-        virtual uint32_t rngSeed() const = 0;
-        virtual unsigned int shardCount() const = 0;
-        virtual unsigned int shardIndex() const = 0;
-        virtual ColourMode defaultColourMode() const = 0;
-        virtual std::vector<std::string> const& getSectionsToRun() const = 0;
-        virtual Verbosity verbosity() const = 0;
-
-        virtual bool skipBenchmarks() const = 0;
-        virtual bool benchmarkNoAnalysis() const = 0;
-        virtual unsigned int benchmarkSamples() const = 0;
-        virtual double benchmarkConfidenceInterval() const = 0;
-        virtual unsigned int benchmarkResamples() const = 0;
-        virtual std::chrono::milliseconds benchmarkWarmupTime() const = 0;
-    };
-}
-
-#endif // CATCH_INTERFACES_CONFIG_HPP_INCLUDED
-
-
 #ifndef CATCH_COMPILER_CAPABILITIES_HPP_INCLUDED
 #define CATCH_COMPILER_CAPABILITIES_HPP_INCLUDED
 
@@ -366,12 +135,18 @@ namespace Catch {
 #    define CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS \
          _Pragma( "GCC diagnostic ignored \"-Wparentheses\"" )
 
+#    define CATCH_INTERNAL_SUPPRESS_UNUSED_RESULT \
+         _Pragma( "GCC diagnostic ignored \"-Wunused-result\"" )
+
 #    define CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS \
          _Pragma( "GCC diagnostic ignored \"-Wunused-variable\"" )
 
 #    define CATCH_INTERNAL_SUPPRESS_USELESS_CAST_WARNINGS \
          _Pragma( "GCC diagnostic ignored \"-Wuseless-cast\"" )
 
+#    define CATCH_INTERNAL_SUPPRESS_SHADOW_WARNINGS \
+         _Pragma( "GCC diagnostic ignored \"-Wshadow\"" )
+
 #    define CATCH_INTERNAL_IGNORE_BUT_WARN(...) (void)__builtin_constant_p(__VA_ARGS__)
 
 #endif
@@ -444,6 +219,9 @@ namespace Catch {
 #    define CATCH_INTERNAL_SUPPRESS_COMMA_WARNINGS \
         _Pragma( "clang diagnostic ignored \"-Wcomma\"" )
 
+#    define CATCH_INTERNAL_SUPPRESS_SHADOW_WARNINGS \
+        _Pragma( "clang diagnostic ignored \"-Wshadow\"" )
+
 #endif // __clang__
 
 
@@ -463,7 +241,9 @@ namespace Catch {
 
 ////////////////////////////////////////////////////////////////////////////////
 // Assume that some platforms do not support getenv.
-#if defined(CATCH_PLATFORM_WINDOWS_UWP) || defined(CATCH_PLATFORM_PLAYSTATION)
+#if defined( CATCH_PLATFORM_WINDOWS_UWP ) ||                                   \
+    defined( CATCH_PLATFORM_PLAYSTATION ) ||                                   \
+    defined( _GAMING_XBOX )
 #    define CATCH_INTERNAL_CONFIG_NO_GETENV
 #else
 #    define CATCH_INTERNAL_CONFIG_GETENV
@@ -681,6 +461,9 @@ namespace Catch {
 #if !defined(CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS)
 #   define CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS
 #endif
+#if !defined(CATCH_INTERNAL_SUPPRESS_UNUSED_RESULT)
+#   define CATCH_INTERNAL_SUPPRESS_UNUSED_RESULT
+#endif
 #if !defined(CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS)
 #   define CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS
 #endif
@@ -690,6 +473,16 @@ namespace Catch {
 #if !defined(CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS)
 #   define CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS
 #endif
+#if !defined( CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS )
+#    define CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS
+#endif
+#if !defined( CATCH_INTERNAL_SUPPRESS_COMMA_WARNINGS )
+#    define CATCH_INTERNAL_SUPPRESS_COMMA_WARNINGS
+#endif
+#if !defined( CATCH_INTERNAL_SUPPRESS_SHADOW_WARNINGS )
+#    define CATCH_INTERNAL_SUPPRESS_SHADOW_WARNINGS
+#endif
+
 
 // The goal of this macro is to avoid evaluation of the arguments, but
 // still have the compiler warn on problems inside...
@@ -703,13 +496,6 @@ namespace Catch {
 #   undef CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS
 #endif
 
-#if !defined(CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS)
-#   define CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS
-#endif
-
-#if !defined(CATCH_INTERNAL_SUPPRESS_COMMA_WARNINGS)
-#   define CATCH_INTERNAL_SUPPRESS_COMMA_WARNINGS
-#endif
 
 #if defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
 #define CATCH_TRY if ((true))
@@ -755,38 +541,31 @@ namespace Catch {
     class IResultCapture;
     class IConfig;
 
-    class IContext {
-    public:
-        virtual ~IContext(); // = default
+    class Context {
+        IConfig const* m_config = nullptr;
+        IResultCapture* m_resultCapture = nullptr;
 
-        virtual IResultCapture* getResultCapture() = 0;
-        virtual IConfig const* getConfig() const = 0;
-    };
+        CATCH_EXPORT static Context* currentContext;
+        friend Context& getCurrentMutableContext();
+        friend Context const& getCurrentContext();
+        static void createContext();
+        friend void cleanUpContext();
 
-    class IMutableContext : public IContext {
     public:
-        ~IMutableContext() override; // = default
-        virtual void setResultCapture( IResultCapture* resultCapture ) = 0;
-        virtual void setConfig( IConfig const* config ) = 0;
-
-    private:
-        CATCH_EXPORT static IMutableContext* currentContext;
-        friend IMutableContext& getCurrentMutableContext();
-        friend void cleanUpContext();
-        static void createContext();
+        IResultCapture* getResultCapture() const { return m_resultCapture; }
+        IConfig const* getConfig() const { return m_config; }
+        void setResultCapture( IResultCapture* resultCapture );
+        void setConfig( IConfig const* config );
     };
 
-    inline IMutableContext& getCurrentMutableContext()
-    {
-        if( !IMutableContext::currentContext )
-            IMutableContext::createContext();
+    Context& getCurrentMutableContext();
+
+    inline Context const& getCurrentContext() {
+        // We duplicate the logic from `getCurrentMutableContext` here,
+        // to avoid paying the call overhead in debug mode.
+        if ( !Context::currentContext ) { Context::createContext(); }
         // NOLINTNEXTLINE(clang-analyzer-core.uninitialized.UndefReturn)
-        return *IMutableContext::currentContext;
-    }
-
-    inline IContext& getCurrentContext()
-    {
-        return getCurrentMutableContext();
+        return *Context::currentContext;
     }
 
     void cleanUpContext();
@@ -798,16 +577,6 @@ namespace Catch {
 #endif // CATCH_CONTEXT_HPP_INCLUDED
 
 
-#ifndef CATCH_INTERFACES_REPORTER_HPP_INCLUDED
-#define CATCH_INTERFACES_REPORTER_HPP_INCLUDED
-
-
-
-#ifndef CATCH_SECTION_INFO_HPP_INCLUDED
-#define CATCH_SECTION_INFO_HPP_INCLUDED
-
-
-
 #ifndef CATCH_MOVE_AND_FORWARD_HPP_INCLUDED
 #define CATCH_MOVE_AND_FORWARD_HPP_INCLUDED
 
@@ -822,110 +591,201 @@ namespace Catch {
 #endif // CATCH_MOVE_AND_FORWARD_HPP_INCLUDED
 
 
-#ifndef CATCH_SOURCE_LINE_INFO_HPP_INCLUDED
-#define CATCH_SOURCE_LINE_INFO_HPP_INCLUDED
-
-#include <cstddef>
-#include <iosfwd>
+#ifndef CATCH_TEST_FAILURE_EXCEPTION_HPP_INCLUDED
+#define CATCH_TEST_FAILURE_EXCEPTION_HPP_INCLUDED
 
 namespace Catch {
 
-    struct SourceLineInfo {
+    //! Used to signal that an assertion macro failed
+    struct TestFailureException{};
+    //! Used to signal that the remainder of a test should be skipped
+    struct TestSkipException {};
 
-        SourceLineInfo() = delete;
-        constexpr SourceLineInfo( char const* _file, std::size_t _line ) noexcept:
-            file( _file ),
-            line( _line )
-        {}
+    /**
+     * Outlines throwing of `TestFailureException` into a single TU
+     *
+     * Also handles `CATCH_CONFIG_DISABLE_EXCEPTIONS` for callers.
+     */
+    [[noreturn]] void throw_test_failure_exception();
 
-        bool operator == ( SourceLineInfo const& other ) const noexcept;
-        bool operator < ( SourceLineInfo const& other ) const noexcept;
+    /**
+     * Outlines throwing of `TestSkipException` into a single TU
+     *
+     * Also handles `CATCH_CONFIG_DISABLE_EXCEPTIONS` for callers.
+     */
+    [[noreturn]] void throw_test_skip_exception();
 
-        char const* file;
-        std::size_t line;
+} // namespace Catch
 
-        friend std::ostream& operator << (std::ostream& os, SourceLineInfo const& info);
-    };
-}
+#endif // CATCH_TEST_FAILURE_EXCEPTION_HPP_INCLUDED
 
-#define CATCH_INTERNAL_LINEINFO \
-    ::Catch::SourceLineInfo( __FILE__, static_cast<std::size_t>( __LINE__ ) )
 
-#endif // CATCH_SOURCE_LINE_INFO_HPP_INCLUDED
+#ifndef CATCH_UNIQUE_NAME_HPP_INCLUDED
+#define CATCH_UNIQUE_NAME_HPP_INCLUDED
 
 
-#ifndef CATCH_TOTALS_HPP_INCLUDED
-#define CATCH_TOTALS_HPP_INCLUDED
 
-#include <cstdint>
 
-namespace Catch {
+/** \file
+ * Wrapper for the CONFIG configuration option
+ *
+ * When generating internal unique names, there are two options. Either
+ * we mix in the current line number, or mix in an incrementing number.
+ * We prefer the latter, using `__COUNTER__`, but users might want to
+ * use the former.
+ */
 
-    struct Counts {
-        Counts operator - ( Counts const& other ) const;
-        Counts& operator += ( Counts const& other );
+#ifndef CATCH_CONFIG_COUNTER_HPP_INCLUDED
+#define CATCH_CONFIG_COUNTER_HPP_INCLUDED
 
-        std::uint64_t total() const;
-        bool allPassed() const;
-        bool allOk() const;
 
-        std::uint64_t passed = 0;
-        std::uint64_t failed = 0;
-        std::uint64_t failedButOk = 0;
-        std::uint64_t skipped = 0;
-    };
+#if ( !defined(__JETBRAINS_IDE__) || __JETBRAINS_IDE__ >= 20170300L )
+    #define CATCH_INTERNAL_CONFIG_COUNTER
+#endif
 
-    struct Totals {
+#if defined( CATCH_INTERNAL_CONFIG_COUNTER ) && \
+    !defined( CATCH_CONFIG_NO_COUNTER ) && \
+    !defined( CATCH_CONFIG_COUNTER )
+#    define CATCH_CONFIG_COUNTER
+#endif
 
-        Totals operator - ( Totals const& other ) const;
-        Totals& operator += ( Totals const& other );
 
-        Totals delta( Totals const& prevTotals ) const;
+#endif // CATCH_CONFIG_COUNTER_HPP_INCLUDED
+#define INTERNAL_CATCH_UNIQUE_NAME_LINE2( name, line ) name##line
+#define INTERNAL_CATCH_UNIQUE_NAME_LINE( name, line ) INTERNAL_CATCH_UNIQUE_NAME_LINE2( name, line )
+#ifdef CATCH_CONFIG_COUNTER
+#  define INTERNAL_CATCH_UNIQUE_NAME( name ) INTERNAL_CATCH_UNIQUE_NAME_LINE( name, __COUNTER__ )
+#else
+#  define INTERNAL_CATCH_UNIQUE_NAME( name ) INTERNAL_CATCH_UNIQUE_NAME_LINE( name, __LINE__ )
+#endif
 
-        Counts assertions;
-        Counts testCases;
-    };
-}
+#endif // CATCH_UNIQUE_NAME_HPP_INCLUDED
 
-#endif // CATCH_TOTALS_HPP_INCLUDED
 
+#ifndef CATCH_INTERFACES_CAPTURE_HPP_INCLUDED
+#define CATCH_INTERFACES_CAPTURE_HPP_INCLUDED
+
+#include <string>
+#include <chrono>
+
+
+
+#ifndef CATCH_STRINGREF_HPP_INCLUDED
+#define CATCH_STRINGREF_HPP_INCLUDED
+
+#include <cstddef>
 #include <string>
+#include <iosfwd>
+#include <cassert>
+
+#include <cstring>
 
 namespace Catch {
 
-    struct SectionInfo {
-        // The last argument is ignored, so that people can write
-        // SECTION("ShortName", "Proper description that is long") and
-        // still use the `-c` flag comfortably.
-        SectionInfo( SourceLineInfo const& _lineInfo, std::string _name,
-                    const char* const = nullptr ):
-            name(CATCH_MOVE(_name)),
-            lineInfo(_lineInfo)
-            {}
+    /// A non-owning string class (similar to the forthcoming std::string_view)
+    /// Note that, because a StringRef may be a substring of another string,
+    /// it may not be null terminated.
+    class StringRef {
+    public:
+        using size_type = std::size_t;
+        using const_iterator = const char*;
 
-        std::string name;
-        SourceLineInfo lineInfo;
-    };
+        static constexpr size_type npos{ static_cast<size_type>( -1 ) };
 
-    struct SectionEndInfo {
-        SectionInfo sectionInfo;
-        Counts prevAssertions;
-        double durationInSeconds;
-    };
+    private:
+        static constexpr char const* const s_empty = "";
 
-} // end namespace Catch
+        char const* m_start = s_empty;
+        size_type m_size = 0;
 
-#endif // CATCH_SECTION_INFO_HPP_INCLUDED
+    public: // construction
+        constexpr StringRef() noexcept = default;
 
+        StringRef( char const* rawChars ) noexcept;
 
-#ifndef CATCH_ASSERTION_RESULT_HPP_INCLUDED
-#define CATCH_ASSERTION_RESULT_HPP_INCLUDED
+        constexpr StringRef( char const* rawChars, size_type size ) noexcept
+        :   m_start( rawChars ),
+            m_size( size )
+        {}
+
+        StringRef( std::string const& stdString ) noexcept
+        :   m_start( stdString.c_str() ),
+            m_size( stdString.size() )
+        {}
 
+        explicit operator std::string() const {
+            return std::string(m_start, m_size);
+        }
 
+    public: // operators
+        auto operator == ( StringRef other ) const noexcept -> bool {
+            return m_size == other.m_size
+                && (std::memcmp( m_start, other.m_start, m_size ) == 0);
+        }
+        auto operator != (StringRef other) const noexcept -> bool {
+            return !(*this == other);
+        }
 
-#ifndef CATCH_ASSERTION_INFO_HPP_INCLUDED
-#define CATCH_ASSERTION_INFO_HPP_INCLUDED
+        constexpr auto operator[] ( size_type index ) const noexcept -> char {
+            assert(index < m_size);
+            return m_start[index];
+        }
+
+        bool operator<(StringRef rhs) const noexcept;
+
+    public: // named queries
+        constexpr auto empty() const noexcept -> bool {
+            return m_size == 0;
+        }
+        constexpr auto size() const noexcept -> size_type {
+            return m_size;
+        }
+
+        // Returns a substring of [start, start + length).
+        // If start + length > size(), then the substring is [start, size()).
+        // If start > size(), then the substring is empty.
+        constexpr StringRef substr(size_type start, size_type length) const noexcept {
+            if (start < m_size) {
+                const auto shortened_size = m_size - start;
+                return StringRef(m_start + start, (shortened_size < length) ? shortened_size : length);
+            } else {
+                return StringRef();
+            }
+        }
+
+        // Returns the current start pointer. May not be null-terminated.
+        constexpr char const* data() const noexcept {
+            return m_start;
+        }
+
+        constexpr const_iterator begin() const { return m_start; }
+        constexpr const_iterator end() const { return m_start + m_size; }
+
+
+        friend std::string& operator += (std::string& lhs, StringRef sr);
+        friend std::ostream& operator << (std::ostream& os, StringRef sr);
+        friend std::string operator+(StringRef lhs, StringRef rhs);
+
+        /**
+         * Provides a three-way comparison with rhs
+         *
+         * Returns negative number if lhs < rhs, 0 if lhs == rhs, and a positive
+         * number if lhs > rhs
+         */
+        int compare( StringRef rhs ) const;
+    };
+
+
+    constexpr auto operator ""_sr( char const* rawChars, std::size_t size ) noexcept -> StringRef {
+        return StringRef( rawChars, size );
+    }
+} // namespace Catch
+
+constexpr auto operator ""_catch_sr( char const* rawChars, std::size_t size ) noexcept -> Catch::StringRef {
+    return Catch::StringRef( rawChars, size );
+}
 
+#endif // CATCH_STRINGREF_HPP_INCLUDED
 
 
 #ifndef CATCH_RESULT_TYPE_HPP_INCLUDED
@@ -979,120 +839,12 @@ namespace Catch {
 
 #endif // CATCH_RESULT_TYPE_HPP_INCLUDED
 
-namespace Catch {
-
-    struct AssertionInfo {
-        // AssertionInfo() = delete;
-
-        StringRef macroName;
-        SourceLineInfo lineInfo;
-        StringRef capturedExpression;
-        ResultDisposition::Flags resultDisposition;
-    };
-
-} // end namespace Catch
-
-#endif // CATCH_ASSERTION_INFO_HPP_INCLUDED
 
+#ifndef CATCH_UNIQUE_PTR_HPP_INCLUDED
+#define CATCH_UNIQUE_PTR_HPP_INCLUDED
 
-#ifndef CATCH_LAZY_EXPR_HPP_INCLUDED
-#define CATCH_LAZY_EXPR_HPP_INCLUDED
-
-#include <iosfwd>
-
-namespace Catch {
-
-    class ITransientExpression;
-
-    class LazyExpression {
-        friend class AssertionHandler;
-        friend struct AssertionStats;
-        friend class RunContext;
-
-        ITransientExpression const* m_transientExpression = nullptr;
-        bool m_isNegated;
-    public:
-        LazyExpression( bool isNegated ):
-            m_isNegated(isNegated)
-        {}
-        LazyExpression(LazyExpression const& other) = default;
-        LazyExpression& operator = ( LazyExpression const& ) = delete;
-
-        explicit operator bool() const {
-            return m_transientExpression != nullptr;
-        }
-
-        friend auto operator << ( std::ostream& os, LazyExpression const& lazyExpr ) -> std::ostream&;
-    };
-
-} // namespace Catch
-
-#endif // CATCH_LAZY_EXPR_HPP_INCLUDED
-
-#include <string>
-
-namespace Catch {
-
-    struct AssertionResultData
-    {
-        AssertionResultData() = delete;
-
-        AssertionResultData( ResultWas::OfType _resultType, LazyExpression const& _lazyExpression );
-
-        std::string message;
-        mutable std::string reconstructedExpression;
-        LazyExpression lazyExpression;
-        ResultWas::OfType resultType;
-
-        std::string reconstructExpression() const;
-    };
-
-    class AssertionResult {
-    public:
-        AssertionResult() = delete;
-        AssertionResult( AssertionInfo const& info, AssertionResultData&& data );
-
-        bool isOk() const;
-        bool succeeded() const;
-        ResultWas::OfType getResultType() const;
-        bool hasExpression() const;
-        bool hasMessage() const;
-        std::string getExpression() const;
-        std::string getExpressionInMacro() const;
-        bool hasExpandedExpression() const;
-        std::string getExpandedExpression() const;
-        StringRef getMessage() const;
-        SourceLineInfo getSourceInfo() const;
-        StringRef getTestMacroName() const;
-
-    //protected:
-        AssertionInfo m_info;
-        AssertionResultData m_resultData;
-    };
-
-} // end namespace Catch
-
-#endif // CATCH_ASSERTION_RESULT_HPP_INCLUDED
-
-
-#ifndef CATCH_MESSAGE_INFO_HPP_INCLUDED
-#define CATCH_MESSAGE_INFO_HPP_INCLUDED
-
-
-
-#ifndef CATCH_INTERFACES_CAPTURE_HPP_INCLUDED
-#define CATCH_INTERFACES_CAPTURE_HPP_INCLUDED
-
-#include <string>
-#include <chrono>
-
-
-
-#ifndef CATCH_UNIQUE_PTR_HPP_INCLUDED
-#define CATCH_UNIQUE_PTR_HPP_INCLUDED
-
-#include <cassert>
-#include <type_traits>
+#include <cassert>
+#include <type_traits>
 
 
 namespace Catch {
@@ -1199,6 +951,45 @@ namespace Detail {
 
 #endif // CATCH_UNIQUE_PTR_HPP_INCLUDED
 
+
+#ifndef CATCH_BENCHMARK_STATS_FWD_HPP_INCLUDED
+#define CATCH_BENCHMARK_STATS_FWD_HPP_INCLUDED
+
+
+
+// Adapted from donated nonius code.
+
+#ifndef CATCH_CLOCK_HPP_INCLUDED
+#define CATCH_CLOCK_HPP_INCLUDED
+
+#include <chrono>
+
+namespace Catch {
+    namespace Benchmark {
+        using IDuration = std::chrono::nanoseconds;
+        using FDuration = std::chrono::duration<double, std::nano>;
+
+        template <typename Clock>
+        using TimePoint = typename Clock::time_point;
+
+        using default_clock = std::chrono::steady_clock;
+    } // namespace Benchmark
+} // namespace Catch
+
+#endif // CATCH_CLOCK_HPP_INCLUDED
+
+namespace Catch {
+
+    // We cannot forward declare the type with default template argument
+    // multiple times, so it is split out into a separate header so that
+    // we can prevent multiple declarations in dependees
+    template <typename Duration = Benchmark::FDuration>
+    struct BenchmarkStats;
+
+} // end namespace Catch
+
+#endif // CATCH_BENCHMARK_STATS_FWD_HPP_INCLUDED
+
 namespace Catch {
 
     class AssertionResult;
@@ -1215,8 +1006,6 @@ namespace Catch {
     class IGeneratorTracker;
 
     struct BenchmarkInfo;
-    template <typename Duration = std::chrono::duration<double, std::nano>>
-    struct BenchmarkStats;
 
     namespace Generators {
         class GeneratorUntypedBase;
@@ -1228,6 +1017,7 @@ namespace Catch {
     public:
         virtual ~IResultCapture();
 
+        virtual void notifyAssertionStarted( AssertionInfo const& info ) = 0;
         virtual bool sectionStarted( StringRef sectionName,
                                      SourceLineInfo const& sectionLineInfo,
                                      Counts& assertions ) = 0;
@@ -1268,7 +1058,7 @@ namespace Catch {
                     AssertionReaction& reaction ) = 0;
         virtual void handleUnexpectedInflightException
                 (   AssertionInfo const& info,
-                    std::string const& message,
+                    std::string&& message,
                     AssertionReaction& reaction ) = 0;
         virtual void handleIncomplete
                 (   AssertionInfo const& info ) = 0;
@@ -1293,415 +1083,308 @@ namespace Catch {
 
 #endif // CATCH_INTERFACES_CAPTURE_HPP_INCLUDED
 
-#include <string>
-
-namespace Catch {
-
-    struct MessageInfo {
-        MessageInfo(    StringRef _macroName,
-                        SourceLineInfo const& _lineInfo,
-                        ResultWas::OfType _type );
-
-        StringRef macroName;
-        std::string message;
-        SourceLineInfo lineInfo;
-        ResultWas::OfType type;
-        unsigned int sequence;
-
-        bool operator == (MessageInfo const& other) const {
-            return sequence == other.sequence;
-        }
-        bool operator < (MessageInfo const& other) const {
-            return sequence < other.sequence;
-        }
-    private:
-        static unsigned int globalCount;
-    };
 
-} // end namespace Catch
-
-#endif // CATCH_MESSAGE_INFO_HPP_INCLUDED
+#ifndef CATCH_INTERFACES_CONFIG_HPP_INCLUDED
+#define CATCH_INTERFACES_CONFIG_HPP_INCLUDED
 
 
-// Adapted from donated nonius code.
 
-#ifndef CATCH_ESTIMATE_HPP_INCLUDED
-#define CATCH_ESTIMATE_HPP_INCLUDED
+#ifndef CATCH_NONCOPYABLE_HPP_INCLUDED
+#define CATCH_NONCOPYABLE_HPP_INCLUDED
 
 namespace Catch {
-    namespace Benchmark {
-        template <typename Duration>
-        struct Estimate {
-            Duration point;
-            Duration lower_bound;
-            Duration upper_bound;
-            double confidence_interval;
-
-            template <typename Duration2>
-            operator Estimate<Duration2>() const {
-                return { point, lower_bound, upper_bound, confidence_interval };
-            }
-        };
-    } // namespace Benchmark
-} // namespace Catch
-
-#endif // CATCH_ESTIMATE_HPP_INCLUDED
-
-
-// Adapted from donated nonius code.
-
-#ifndef CATCH_OUTLIER_CLASSIFICATION_HPP_INCLUDED
-#define CATCH_OUTLIER_CLASSIFICATION_HPP_INCLUDED
+    namespace Detail {
 
-namespace Catch {
-    namespace Benchmark {
-        struct OutlierClassification {
-            int samples_seen = 0;
-            int low_severe = 0;     // more than 3 times IQR below Q1
-            int low_mild = 0;       // 1.5 to 3 times IQR below Q1
-            int high_mild = 0;      // 1.5 to 3 times IQR above Q3
-            int high_severe = 0;    // more than 3 times IQR above Q3
+        //! Deriving classes become noncopyable and nonmovable
+        class NonCopyable {
+            NonCopyable( NonCopyable const& ) = delete;
+            NonCopyable( NonCopyable&& ) = delete;
+            NonCopyable& operator=( NonCopyable const& ) = delete;
+            NonCopyable& operator=( NonCopyable&& ) = delete;
 
-            int total() const {
-                return low_severe + low_mild + high_mild + high_severe;
-            }
+        protected:
+            NonCopyable() noexcept = default;
         };
-    } // namespace Benchmark
-} // namespace Catch
 
-#endif // CATCH_OUTLIERS_CLASSIFICATION_HPP_INCLUDED
+    } // namespace Detail
+} // namespace Catch
 
+#endif // CATCH_NONCOPYABLE_HPP_INCLUDED
 
-#include <map>
+#include <chrono>
+#include <iosfwd>
 #include <string>
 #include <vector>
-#include <iosfwd>
 
 namespace Catch {
 
-    struct ReporterDescription;
-    struct ListenerDescription;
-    struct TagInfo;
-    struct TestCaseInfo;
-    class TestCaseHandle;
-    class IConfig;
-    class IStream;
-    enum class ColourMode : std::uint8_t;
-
-    struct ReporterConfig {
-        ReporterConfig( IConfig const* _fullConfig,
-                        Detail::unique_ptr<IStream> _stream,
-                        ColourMode colourMode,
-                        std::map<std::string, std::string> customOptions );
-
-        ReporterConfig( ReporterConfig&& ) = default;
-        ReporterConfig& operator=( ReporterConfig&& ) = default;
-        ~ReporterConfig(); // = default
+    enum class Verbosity {
+        Quiet = 0,
+        Normal,
+        High
+    };
 
-        Detail::unique_ptr<IStream> takeStream() &&;
-        IConfig const* fullConfig() const;
-        ColourMode colourMode() const;
-        std::map<std::string, std::string> const& customOptions() const;
+    struct WarnAbout { enum What {
+        Nothing = 0x00,
+        //! A test case or leaf section did not run any assertions
+        NoAssertions = 0x01,
+        //! A command line test spec matched no test cases
+        UnmatchedTestSpec = 0x02,
+    }; };
 
-    private:
-        Detail::unique_ptr<IStream> m_stream;
-        IConfig const* m_fullConfig;
-        ColourMode m_colourMode;
-        std::map<std::string, std::string> m_customOptions;
+    enum class ShowDurations {
+        DefaultForReporter,
+        Always,
+        Never
     };
-
-    struct TestRunInfo {
-        constexpr TestRunInfo(StringRef _name) : name(_name) {}
-        StringRef name;
+    enum class TestRunOrder {
+        Declared,
+        LexicographicallySorted,
+        Randomized
     };
-
-    struct AssertionStats {
-        AssertionStats( AssertionResult const& _assertionResult,
-                        std::vector<MessageInfo> const& _infoMessages,
-                        Totals const& _totals );
-
-        AssertionStats( AssertionStats const& )              = default;
-        AssertionStats( AssertionStats && )                  = default;
-        AssertionStats& operator = ( AssertionStats const& ) = delete;
-        AssertionStats& operator = ( AssertionStats && )     = delete;
-
-        AssertionResult assertionResult;
-        std::vector<MessageInfo> infoMessages;
-        Totals totals;
+    enum class ColourMode : std::uint8_t {
+        //! Let Catch2 pick implementation based on platform detection
+        PlatformDefault,
+        //! Use ANSI colour code escapes
+        ANSI,
+        //! Use Win32 console colour API
+        Win32,
+        //! Don't use any colour
+        None
     };
+    struct WaitForKeypress { enum When {
+        Never,
+        BeforeStart = 1,
+        BeforeExit = 2,
+        BeforeStartAndExit = BeforeStart | BeforeExit
+    }; };
 
-    struct SectionStats {
-        SectionStats(   SectionInfo&& _sectionInfo,
-                        Counts const& _assertions,
-                        double _durationInSeconds,
-                        bool _missingAssertions );
+    class TestSpec;
+    class IStream;
 
-        SectionInfo sectionInfo;
-        Counts assertions;
-        double durationInSeconds;
-        bool missingAssertions;
-    };
+    class IConfig : public Detail::NonCopyable {
+    public:
+        virtual ~IConfig();
 
-    struct TestCaseStats {
-        TestCaseStats(  TestCaseInfo const& _testInfo,
-                        Totals const& _totals,
-                        std::string&& _stdOut,
-                        std::string&& _stdErr,
-                        bool _aborting );
+        virtual bool allowThrows() const = 0;
+        virtual StringRef name() const = 0;
+        virtual bool includeSuccessfulResults() const = 0;
+        virtual bool shouldDebugBreak() const = 0;
+        virtual bool warnAboutMissingAssertions() const = 0;
+        virtual bool warnAboutUnmatchedTestSpecs() const = 0;
+        virtual bool zeroTestsCountAsSuccess() const = 0;
+        virtual int abortAfter() const = 0;
+        virtual bool showInvisibles() const = 0;
+        virtual ShowDurations showDurations() const = 0;
+        virtual double minDuration() const = 0;
+        virtual TestSpec const& testSpec() const = 0;
+        virtual bool hasTestFilters() const = 0;
+        virtual std::vector<std::string> const& getTestsOrTags() const = 0;
+        virtual TestRunOrder runOrder() const = 0;
+        virtual uint32_t rngSeed() const = 0;
+        virtual unsigned int shardCount() const = 0;
+        virtual unsigned int shardIndex() const = 0;
+        virtual ColourMode defaultColourMode() const = 0;
+        virtual std::vector<std::string> const& getSectionsToRun() const = 0;
+        virtual Verbosity verbosity() const = 0;
 
-        TestCaseInfo const * testInfo;
-        Totals totals;
-        std::string stdOut;
-        std::string stdErr;
-        bool aborting;
+        virtual bool skipBenchmarks() const = 0;
+        virtual bool benchmarkNoAnalysis() const = 0;
+        virtual unsigned int benchmarkSamples() const = 0;
+        virtual double benchmarkConfidenceInterval() const = 0;
+        virtual unsigned int benchmarkResamples() const = 0;
+        virtual std::chrono::milliseconds benchmarkWarmupTime() const = 0;
     };
+}
 
-    struct TestRunStats {
-        TestRunStats(   TestRunInfo const& _runInfo,
-                        Totals const& _totals,
-                        bool _aborting );
+#endif // CATCH_INTERFACES_CONFIG_HPP_INCLUDED
 
-        TestRunInfo runInfo;
-        Totals totals;
-        bool aborting;
-    };
 
+#ifndef CATCH_INTERFACES_REGISTRY_HUB_HPP_INCLUDED
+#define CATCH_INTERFACES_REGISTRY_HUB_HPP_INCLUDED
 
-    struct BenchmarkInfo {
-        std::string name;
-        double estimatedDuration;
-        int iterations;
-        unsigned int samples;
-        unsigned int resamples;
-        double clockResolution;
-        double clockCost;
-    };
 
-    template <class Duration>
-    struct BenchmarkStats {
-        BenchmarkInfo info;
+#include <string>
 
-        std::vector<Duration> samples;
-        Benchmark::Estimate<Duration> mean;
-        Benchmark::Estimate<Duration> standardDeviation;
-        Benchmark::OutlierClassification outliers;
-        double outlierVariance;
+namespace Catch {
 
-        template <typename Duration2>
-        operator BenchmarkStats<Duration2>() const {
-            std::vector<Duration2> samples2;
-            samples2.reserve(samples.size());
-            for (auto const& sample : samples) {
-                samples2.push_back(Duration2(sample));
-            }
-            return {
-                info,
-                CATCH_MOVE(samples2),
-                mean,
-                standardDeviation,
-                outliers,
-                outlierVariance,
-            };
-        }
-    };
+    class TestCaseHandle;
+    struct TestCaseInfo;
+    class ITestCaseRegistry;
+    class IExceptionTranslatorRegistry;
+    class IExceptionTranslator;
+    class ReporterRegistry;
+    class IReporterFactory;
+    class ITagAliasRegistry;
+    class ITestInvoker;
+    class IMutableEnumValuesRegistry;
+    struct SourceLineInfo;
 
-    //! By setting up its preferences, a reporter can modify Catch2's behaviour
-    //! in some regards, e.g. it can request Catch2 to capture writes to
-    //! stdout/stderr during test execution, and pass them to the reporter.
-    struct ReporterPreferences {
-        //! Catch2 should redirect writes to stdout and pass them to the
-        //! reporter
-        bool shouldRedirectStdOut = false;
-        //! Catch2 should call `Reporter::assertionEnded` even for passing
-        //! assertions
-        bool shouldReportAllAssertions = false;
-    };
+    class StartupExceptionRegistry;
+    class EventListenerFactory;
 
-    /**
-     * The common base for all reporters and event listeners
-     *
-     * Implementing classes must also implement:
-     *
-     *     //! User-friendly description of the reporter/listener type
-     *     static std::string getDescription()
-     *
-     * Generally shouldn't be derived from by users of Catch2 directly,
-     * instead they should derive from one of the utility bases that
-     * derive from this class.
-     */
-    class IEventListener {
-    protected:
-        //! Derived classes can set up their preferences here
-        ReporterPreferences m_preferences;
-        //! The test run's config as filled in from CLI and defaults
-        IConfig const* m_config;
+    using IReporterFactoryPtr = Detail::unique_ptr<IReporterFactory>;
 
+    class IRegistryHub {
     public:
-        IEventListener( IConfig const* config ): m_config( config ) {}
+        virtual ~IRegistryHub(); // = default
 
-        virtual ~IEventListener(); // = default;
+        virtual ReporterRegistry const& getReporterRegistry() const = 0;
+        virtual ITestCaseRegistry const& getTestCaseRegistry() const = 0;
+        virtual ITagAliasRegistry const& getTagAliasRegistry() const = 0;
+        virtual IExceptionTranslatorRegistry const& getExceptionTranslatorRegistry() const = 0;
 
-        // Implementing class must also provide the following static methods:
-        // static std::string getDescription();
 
-        ReporterPreferences const& getPreferences() const {
-            return m_preferences;
-        }
+        virtual StartupExceptionRegistry const& getStartupExceptionRegistry() const = 0;
+    };
 
-        //! Called when no test cases match provided test spec
-        virtual void noMatchingTestCases( StringRef unmatchedSpec ) = 0;
-        //! Called for all invalid test specs from the cli
-        virtual void reportInvalidTestSpec( StringRef invalidArgument ) = 0;
+    class IMutableRegistryHub {
+    public:
+        virtual ~IMutableRegistryHub(); // = default
+        virtual void registerReporter( std::string const& name, IReporterFactoryPtr factory ) = 0;
+        virtual void registerListener( Detail::unique_ptr<EventListenerFactory> factory ) = 0;
+        virtual void registerTest(Detail::unique_ptr<TestCaseInfo>&& testInfo, Detail::unique_ptr<ITestInvoker>&& invoker) = 0;
+        virtual void registerTranslator( Detail::unique_ptr<IExceptionTranslator>&& translator ) = 0;
+        virtual void registerTagAlias( std::string const& alias, std::string const& tag, SourceLineInfo const& lineInfo ) = 0;
+        virtual void registerStartupException() noexcept = 0;
+        virtual IMutableEnumValuesRegistry& getMutableEnumValuesRegistry() = 0;
+    };
 
-        /**
-         * Called once in a testing run before tests are started
-         *
-         * Not called if tests won't be run (e.g. only listing will happen)
-         */
-        virtual void testRunStarting( TestRunInfo const& testRunInfo ) = 0;
+    IRegistryHub const& getRegistryHub();
+    IMutableRegistryHub& getMutableRegistryHub();
+    void cleanUp();
+    std::string translateActiveException();
 
-        //! Called _once_ for each TEST_CASE, no matter how many times it is entered
-        virtual void testCaseStarting( TestCaseInfo const& testInfo ) = 0;
-        //! Called _every time_ a TEST_CASE is entered, including repeats (due to sections)
-        virtual void testCasePartialStarting( TestCaseInfo const& testInfo, uint64_t partNumber ) = 0;
-        //! Called when a `SECTION` is being entered. Not called for skipped sections
-        virtual void sectionStarting( SectionInfo const& sectionInfo ) = 0;
+}
 
-        //! Called when user-code is being probed before the actual benchmark runs
-        virtual void benchmarkPreparing( StringRef benchmarkName ) = 0;
-        //! Called after probe but before the user-code is being benchmarked
-        virtual void benchmarkStarting( BenchmarkInfo const& benchmarkInfo ) = 0;
-        //! Called with the benchmark results if benchmark successfully finishes
-        virtual void benchmarkEnded( BenchmarkStats<> const& benchmarkStats ) = 0;
-        //! Called if running the benchmarks fails for any reason
-        virtual void benchmarkFailed( StringRef benchmarkName ) = 0;
+#endif // CATCH_INTERFACES_REGISTRY_HUB_HPP_INCLUDED
 
-        //! Called before assertion success/failure is evaluated
-        virtual void assertionStarting( AssertionInfo const& assertionInfo ) = 0;
 
-        //! Called after assertion was fully evaluated
-        virtual void assertionEnded( AssertionStats const& assertionStats ) = 0;
+#ifndef CATCH_BENCHMARK_STATS_HPP_INCLUDED
+#define CATCH_BENCHMARK_STATS_HPP_INCLUDED
 
-        //! Called after a `SECTION` has finished running
-        virtual void sectionEnded( SectionStats const& sectionStats ) = 0;
-        //! Called _every time_ a TEST_CASE is entered, including repeats (due to sections)
-        virtual void testCasePartialEnded(TestCaseStats const& testCaseStats, uint64_t partNumber ) = 0;
-        //! Called _once_ for each TEST_CASE, no matter how many times it is entered
-        virtual void testCaseEnded( TestCaseStats const& testCaseStats ) = 0;
-        /**
-         * Called once after all tests in a testing run are finished
-         *
-         * Not called if tests weren't run (e.g. only listings happened)
-         */
-        virtual void testRunEnded( TestRunStats const& testRunStats ) = 0;
 
-        /**
-         * Called with test cases that are skipped due to the test run aborting.
-         * NOT called for test cases that are explicitly skipped using the `SKIP` macro.
-         *
-         * Deprecated - will be removed in the next major release.
-         */
-        virtual void skipTest( TestCaseInfo const& testInfo ) = 0;
 
-        //! Called if a fatal error (signal/structured exception) occured
-        virtual void fatalErrorEncountered( StringRef error ) = 0;
+// Adapted from donated nonius code.
 
-        //! Writes out information about provided reporters using reporter-specific format
-        virtual void listReporters(std::vector<ReporterDescription> const& descriptions) = 0;
-        //! Writes out the provided listeners descriptions using reporter-specific format
-        virtual void listListeners(std::vector<ListenerDescription> const& descriptions) = 0;
-        //! Writes out information about provided tests using reporter-specific format
-        virtual void listTests(std::vector<TestCaseHandle> const& tests) = 0;
-        //! Writes out information about the provided tags using reporter-specific format
-        virtual void listTags(std::vector<TagInfo> const& tags) = 0;
-    };
-    using IEventListenerPtr = Detail::unique_ptr<IEventListener>;
+#ifndef CATCH_ESTIMATE_HPP_INCLUDED
+#define CATCH_ESTIMATE_HPP_INCLUDED
 
-} // end namespace Catch
+namespace Catch {
+    namespace Benchmark {
+        template <typename Type>
+        struct Estimate {
+            Type point;
+            Type lower_bound;
+            Type upper_bound;
+            double confidence_interval;
+        };
+    } // namespace Benchmark
+} // namespace Catch
 
-#endif // CATCH_INTERFACES_REPORTER_HPP_INCLUDED
+#endif // CATCH_ESTIMATE_HPP_INCLUDED
 
 
-#ifndef CATCH_UNIQUE_NAME_HPP_INCLUDED
-#define CATCH_UNIQUE_NAME_HPP_INCLUDED
+// Adapted from donated nonius code.
 
+#ifndef CATCH_OUTLIER_CLASSIFICATION_HPP_INCLUDED
+#define CATCH_OUTLIER_CLASSIFICATION_HPP_INCLUDED
 
+namespace Catch {
+    namespace Benchmark {
+        struct OutlierClassification {
+            int samples_seen = 0;
+            int low_severe = 0;     // more than 3 times IQR below Q1
+            int low_mild = 0;       // 1.5 to 3 times IQR below Q1
+            int high_mild = 0;      // 1.5 to 3 times IQR above Q3
+            int high_severe = 0;    // more than 3 times IQR above Q3
 
+            int total() const {
+                return low_severe + low_mild + high_mild + high_severe;
+            }
+        };
+    } // namespace Benchmark
+} // namespace Catch
 
-/** \file
- * Wrapper for the CONFIG configuration option
- *
- * When generating internal unique names, there are two options. Either
- * we mix in the current line number, or mix in an incrementing number.
- * We prefer the latter, using `__COUNTER__`, but users might want to
- * use the former.
- */
+#endif // CATCH_OUTLIERS_CLASSIFICATION_HPP_INCLUDED
+// The fwd decl & default specialization needs to be seen by VS2017 before
+// BenchmarkStats itself, or VS2017 will report compilation error.
 
-#ifndef CATCH_CONFIG_COUNTER_HPP_INCLUDED
-#define CATCH_CONFIG_COUNTER_HPP_INCLUDED
+#include <string>
+#include <vector>
 
-#if ( !defined(__JETBRAINS_IDE__) || __JETBRAINS_IDE__ >= 20170300L )
-    #define CATCH_INTERNAL_CONFIG_COUNTER
-#endif
+namespace Catch {
 
-#if defined( CATCH_INTERNAL_CONFIG_COUNTER ) && \
-    !defined( CATCH_CONFIG_NO_COUNTER ) && \
-    !defined( CATCH_CONFIG_COUNTER )
-#    define CATCH_CONFIG_COUNTER
-#endif
+    struct BenchmarkInfo {
+        std::string name;
+        double estimatedDuration;
+        int iterations;
+        unsigned int samples;
+        unsigned int resamples;
+        double clockResolution;
+        double clockCost;
+    };
 
+    // We need to keep template parameter for backwards compatibility,
+    // but we also do not want to use the template paraneter.
+    template <class Dummy>
+    struct BenchmarkStats {
+        BenchmarkInfo info;
 
-#endif // CATCH_CONFIG_COUNTER_HPP_INCLUDED
-#define INTERNAL_CATCH_UNIQUE_NAME_LINE2( name, line ) name##line
-#define INTERNAL_CATCH_UNIQUE_NAME_LINE( name, line ) INTERNAL_CATCH_UNIQUE_NAME_LINE2( name, line )
-#ifdef CATCH_CONFIG_COUNTER
-#  define INTERNAL_CATCH_UNIQUE_NAME( name ) INTERNAL_CATCH_UNIQUE_NAME_LINE( name, __COUNTER__ )
-#else
-#  define INTERNAL_CATCH_UNIQUE_NAME( name ) INTERNAL_CATCH_UNIQUE_NAME_LINE( name, __LINE__ )
-#endif
+        std::vector<Benchmark::FDuration> samples;
+        Benchmark::Estimate<Benchmark::FDuration> mean;
+        Benchmark::Estimate<Benchmark::FDuration> standardDeviation;
+        Benchmark::OutlierClassification outliers;
+        double outlierVariance;
+    };
 
-#endif // CATCH_UNIQUE_NAME_HPP_INCLUDED
+
+} // end namespace Catch
+
+#endif // CATCH_BENCHMARK_STATS_HPP_INCLUDED
 
 
 // Adapted from donated nonius code.
 
-#ifndef CATCH_CHRONOMETER_HPP_INCLUDED
-#define CATCH_CHRONOMETER_HPP_INCLUDED
+#ifndef CATCH_ENVIRONMENT_HPP_INCLUDED
+#define CATCH_ENVIRONMENT_HPP_INCLUDED
 
 
+namespace Catch {
+    namespace Benchmark {
+        struct EnvironmentEstimate {
+            FDuration mean;
+            OutlierClassification outliers;
+        };
+        struct Environment {
+            EnvironmentEstimate clock_resolution;
+            EnvironmentEstimate clock_cost;
+        };
+    } // namespace Benchmark
+} // namespace Catch
+
+#endif // CATCH_ENVIRONMENT_HPP_INCLUDED
+
 
 // Adapted from donated nonius code.
 
-#ifndef CATCH_CLOCK_HPP_INCLUDED
-#define CATCH_CLOCK_HPP_INCLUDED
+#ifndef CATCH_EXECUTION_PLAN_HPP_INCLUDED
+#define CATCH_EXECUTION_PLAN_HPP_INCLUDED
 
-#include <chrono>
-#include <ratio>
 
-namespace Catch {
-    namespace Benchmark {
-        template <typename Clock>
-        using ClockDuration = typename Clock::duration;
-        template <typename Clock>
-        using FloatDuration = std::chrono::duration<double, typename Clock::period>;
 
-        template <typename Clock>
-        using TimePoint = typename Clock::time_point;
+// Adapted from donated nonius code.
 
-        using default_clock = std::chrono::steady_clock;
+#ifndef CATCH_BENCHMARK_FUNCTION_HPP_INCLUDED
+#define CATCH_BENCHMARK_FUNCTION_HPP_INCLUDED
 
-        template <typename Clock>
-        struct now {
-            TimePoint<Clock> operator()() const {
-                return Clock::now();
-            }
-        };
 
-        using fp_seconds = std::chrono::duration<double, std::ratio<1>>;
-    } // namespace Benchmark
-} // namespace Catch
 
-#endif // CATCH_CLOCK_HPP_INCLUDED
+// Adapted from donated nonius code.
+
+#ifndef CATCH_CHRONOMETER_HPP_INCLUDED
+#define CATCH_CHRONOMETER_HPP_INCLUDED
+
 
 
 // Adapted from donated nonius code.
@@ -1709,7 +1392,7 @@ namespace Catch {
 #ifndef CATCH_OPTIMIZER_HPP_INCLUDED
 #define CATCH_OPTIMIZER_HPP_INCLUDED
 
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) || defined(__IAR_SYSTEMS_ICC__)
 #   include <atomic> // atomic_thread_fence
 #endif
 
@@ -1730,16 +1413,23 @@ namespace Catch {
         namespace Detail {
             inline void optimizer_barrier() { keep_memory(); }
         } // namespace Detail
-#elif defined(_MSC_VER)
+#elif defined(_MSC_VER) || defined(__IAR_SYSTEMS_ICC__)
 
+#if defined(_MSVC_VER)
 #pragma optimize("", off)
+#elif defined(__IAR_SYSTEMS_ICC__)
+// For IAR the pragma only affects the following function
+#pragma optimize=disable
+#endif
         template <typename T>
         inline void keep_memory(T* p) {
             // thanks @milleniumbug
             *reinterpret_cast<char volatile*>(p) = *reinterpret_cast<char const volatile*>(p);
         }
         // TODO equivalent keep_memory()
+#if defined(_MSVC_VER)
 #pragma optimize("", on)
+#endif
 
         namespace Detail {
             inline void optimizer_barrier() {
@@ -1751,52 +1441,22 @@ namespace Catch {
 
         template <typename T>
         inline void deoptimize_value(T&& x) {
-            keep_memory(&x);
-        }
-
-        template <typename Fn, typename... Args>
-        inline auto invoke_deoptimized(Fn&& fn, Args&&... args) -> std::enable_if_t<!std::is_same<void, decltype(fn(args...))>::value> {
-            deoptimize_value(CATCH_FORWARD(fn) (CATCH_FORWARD(args)...));
-        }
-
-        template <typename Fn, typename... Args>
-        inline auto invoke_deoptimized(Fn&& fn, Args&&... args) -> std::enable_if_t<std::is_same<void, decltype(fn(args...))>::value> {
-            CATCH_FORWARD(fn) (CATCH_FORWARD(args)...);
-        }
-    } // namespace Benchmark
-} // namespace Catch
-
-#endif // CATCH_OPTIMIZER_HPP_INCLUDED
-
-
-// Adapted from donated nonius code.
-
-#ifndef CATCH_COMPLETE_INVOKE_HPP_INCLUDED
-#define CATCH_COMPLETE_INVOKE_HPP_INCLUDED
-
-
-
-#ifndef CATCH_TEST_FAILURE_EXCEPTION_HPP_INCLUDED
-#define CATCH_TEST_FAILURE_EXCEPTION_HPP_INCLUDED
-
-namespace Catch {
-
-    //! Used to signal that an assertion macro failed
-    struct TestFailureException{};
-
-    /**
-     * Outlines throwing of `TestFailureException` into a single TU
-     *
-     * Also handles `CATCH_CONFIG_DISABLE_EXCEPTIONS` for callers.
-     */
-    [[noreturn]] void throw_test_failure_exception();
+            keep_memory(&x);
+        }
 
-    //! Used to signal that the remainder of a test should be skipped
-    struct TestSkipException{};
+        template <typename Fn, typename... Args>
+        inline auto invoke_deoptimized(Fn&& fn, Args&&... args) -> std::enable_if_t<!std::is_same<void, decltype(fn(args...))>::value> {
+            deoptimize_value(CATCH_FORWARD(fn) (CATCH_FORWARD(args)...));
+        }
 
+        template <typename Fn, typename... Args>
+        inline auto invoke_deoptimized(Fn&& fn, Args&&... args) -> std::enable_if_t<std::is_same<void, decltype(fn(args...))>::value> {
+            CATCH_FORWARD((fn)) (CATCH_FORWARD(args)...);
+        }
+    } // namespace Benchmark
 } // namespace Catch
 
-#endif // CATCH_TEST_FAILURE_EXCEPTION_HPP_INCLUDED
+#endif // CATCH_OPTIMIZER_HPP_INCLUDED
 
 
 #ifndef CATCH_META_HPP_INCLUDED
@@ -1840,112 +1500,6 @@ namespace mpl_{
 
 #endif // CATCH_META_HPP_INCLUDED
 
-
-#ifndef CATCH_INTERFACES_REGISTRY_HUB_HPP_INCLUDED
-#define CATCH_INTERFACES_REGISTRY_HUB_HPP_INCLUDED
-
-
-#include <string>
-
-namespace Catch {
-
-    class TestCaseHandle;
-    struct TestCaseInfo;
-    class ITestCaseRegistry;
-    class IExceptionTranslatorRegistry;
-    class IExceptionTranslator;
-    class IReporterRegistry;
-    class IReporterFactory;
-    class ITagAliasRegistry;
-    class ITestInvoker;
-    class IMutableEnumValuesRegistry;
-    struct SourceLineInfo;
-
-    class StartupExceptionRegistry;
-    class EventListenerFactory;
-
-    using IReporterFactoryPtr = Detail::unique_ptr<IReporterFactory>;
-
-    class IRegistryHub {
-    public:
-        virtual ~IRegistryHub(); // = default
-
-        virtual IReporterRegistry const& getReporterRegistry() const = 0;
-        virtual ITestCaseRegistry const& getTestCaseRegistry() const = 0;
-        virtual ITagAliasRegistry const& getTagAliasRegistry() const = 0;
-        virtual IExceptionTranslatorRegistry const& getExceptionTranslatorRegistry() const = 0;
-
-
-        virtual StartupExceptionRegistry const& getStartupExceptionRegistry() const = 0;
-    };
-
-    class IMutableRegistryHub {
-    public:
-        virtual ~IMutableRegistryHub(); // = default
-        virtual void registerReporter( std::string const& name, IReporterFactoryPtr factory ) = 0;
-        virtual void registerListener( Detail::unique_ptr<EventListenerFactory> factory ) = 0;
-        virtual void registerTest(Detail::unique_ptr<TestCaseInfo>&& testInfo, Detail::unique_ptr<ITestInvoker>&& invoker) = 0;
-        virtual void registerTranslator( Detail::unique_ptr<IExceptionTranslator>&& translator ) = 0;
-        virtual void registerTagAlias( std::string const& alias, std::string const& tag, SourceLineInfo const& lineInfo ) = 0;
-        virtual void registerStartupException() noexcept = 0;
-        virtual IMutableEnumValuesRegistry& getMutableEnumValuesRegistry() = 0;
-    };
-
-    IRegistryHub const& getRegistryHub();
-    IMutableRegistryHub& getMutableRegistryHub();
-    void cleanUp();
-    std::string translateActiveException();
-
-}
-
-#endif // CATCH_INTERFACES_REGISTRY_HUB_HPP_INCLUDED
-
-#include <type_traits>
-
-namespace Catch {
-    namespace Benchmark {
-        namespace Detail {
-            template <typename T>
-            struct CompleteType { using type = T; };
-            template <>
-            struct CompleteType<void> { struct type {}; };
-
-            template <typename T>
-            using CompleteType_t = typename CompleteType<T>::type;
-
-            template <typename Result>
-            struct CompleteInvoker {
-                template <typename Fun, typename... Args>
-                static Result invoke(Fun&& fun, Args&&... args) {
-                    return CATCH_FORWARD(fun)(CATCH_FORWARD(args)...);
-                }
-            };
-            template <>
-            struct CompleteInvoker<void> {
-                template <typename Fun, typename... Args>
-                static CompleteType_t<void> invoke(Fun&& fun, Args&&... args) {
-                    CATCH_FORWARD(fun)(CATCH_FORWARD(args)...);
-                    return {};
-                }
-            };
-
-            // invoke and not return void :(
-            template <typename Fun, typename... Args>
-            CompleteType_t<FunctionReturnType<Fun, Args...>> complete_invoke(Fun&& fun, Args&&... args) {
-                return CompleteInvoker<FunctionReturnType<Fun, Args...>>::invoke(CATCH_FORWARD(fun), CATCH_FORWARD(args)...);
-            }
-
-        } // namespace Detail
-
-        template <typename Fun>
-        Detail::CompleteType_t<FunctionReturnType<Fun>> user_code(Fun&& fun) {
-            return Detail::complete_invoke(CATCH_FORWARD(fun));
-        }
-    } // namespace Benchmark
-} // namespace Catch
-
-#endif // CATCH_COMPLETE_INVOKE_HPP_INCLUDED
-
 namespace Catch {
     namespace Benchmark {
         namespace Detail {
@@ -1963,7 +1517,10 @@ namespace Catch {
                 void start() override { started = Clock::now(); }
                 void finish() override { finished = Clock::now(); }
 
-                ClockDuration<Clock> elapsed() const { return finished - started; }
+                IDuration elapsed() const {
+                    return std::chrono::duration_cast<std::chrono::nanoseconds>(
+                        finished - started );
+                }
 
                 TimePoint<Clock> started;
                 TimePoint<Clock> finished;
@@ -2004,50 +1561,6 @@ namespace Catch {
 
 #endif // CATCH_CHRONOMETER_HPP_INCLUDED
 
-
-// Adapted from donated nonius code.
-
-#ifndef CATCH_ENVIRONMENT_HPP_INCLUDED
-#define CATCH_ENVIRONMENT_HPP_INCLUDED
-
-
-namespace Catch {
-    namespace Benchmark {
-        template <typename Duration>
-        struct EnvironmentEstimate {
-            Duration mean;
-            OutlierClassification outliers;
-
-            template <typename Duration2>
-            operator EnvironmentEstimate<Duration2>() const {
-                return { mean, outliers };
-            }
-        };
-        template <typename Clock>
-        struct Environment {
-            using clock_type = Clock;
-            EnvironmentEstimate<FloatDuration<Clock>> clock_resolution;
-            EnvironmentEstimate<FloatDuration<Clock>> clock_cost;
-        };
-    } // namespace Benchmark
-} // namespace Catch
-
-#endif // CATCH_ENVIRONMENT_HPP_INCLUDED
-
-
-// Adapted from donated nonius code.
-
-#ifndef CATCH_EXECUTION_PLAN_HPP_INCLUDED
-#define CATCH_EXECUTION_PLAN_HPP_INCLUDED
-
-
-
-// Adapted from donated nonius code.
-
-#ifndef CATCH_BENCHMARK_FUNCTION_HPP_INCLUDED
-#define CATCH_BENCHMARK_FUNCTION_HPP_INCLUDED
-
-
 #include <type_traits>
 
 namespace Catch {
@@ -2184,6 +1697,57 @@ namespace Catch {
 
 
 
+// Adapted from donated nonius code.
+
+#ifndef CATCH_COMPLETE_INVOKE_HPP_INCLUDED
+#define CATCH_COMPLETE_INVOKE_HPP_INCLUDED
+
+
+namespace Catch {
+    namespace Benchmark {
+        namespace Detail {
+            template <typename T>
+            struct CompleteType { using type = T; };
+            template <>
+            struct CompleteType<void> { struct type {}; };
+
+            template <typename T>
+            using CompleteType_t = typename CompleteType<T>::type;
+
+            template <typename Result>
+            struct CompleteInvoker {
+                template <typename Fun, typename... Args>
+                static Result invoke(Fun&& fun, Args&&... args) {
+                    return CATCH_FORWARD(fun)(CATCH_FORWARD(args)...);
+                }
+            };
+            template <>
+            struct CompleteInvoker<void> {
+                template <typename Fun, typename... Args>
+                static CompleteType_t<void> invoke(Fun&& fun, Args&&... args) {
+                    CATCH_FORWARD(fun)(CATCH_FORWARD(args)...);
+                    return {};
+                }
+            };
+
+            // invoke and not return void :(
+            template <typename Fun, typename... Args>
+            CompleteType_t<FunctionReturnType<Fun, Args...>> complete_invoke(Fun&& fun, Args&&... args) {
+                return CompleteInvoker<FunctionReturnType<Fun, Args...>>::invoke(CATCH_FORWARD(fun), CATCH_FORWARD(args)...);
+            }
+
+        } // namespace Detail
+
+        template <typename Fun>
+        Detail::CompleteType_t<FunctionReturnType<Fun>> user_code(Fun&& fun) {
+            return Detail::complete_invoke(CATCH_FORWARD(fun));
+        }
+    } // namespace Benchmark
+} // namespace Catch
+
+#endif // CATCH_COMPLETE_INVOKE_HPP_INCLUDED
+
+
 // Adapted from donated nonius code.
 
 #ifndef CATCH_TIMING_HPP_INCLUDED
@@ -2194,14 +1758,14 @@ namespace Catch {
 
 namespace Catch {
     namespace Benchmark {
-        template <typename Duration, typename Result>
+        template <typename Result>
         struct Timing {
-            Duration elapsed;
+            IDuration elapsed;
             Result result;
             int iterations;
         };
-        template <typename Clock, typename Func, typename... Args>
-        using TimingOf = Timing<ClockDuration<Clock>, Detail::CompleteType_t<FunctionReturnType<Func, Args...>>>;
+        template <typename Func, typename... Args>
+        using TimingOf = Timing<Detail::CompleteType_t<FunctionReturnType<Func, Args...>>>;
     } // namespace Benchmark
 } // namespace Catch
 
@@ -2211,7 +1775,7 @@ namespace Catch {
     namespace Benchmark {
         namespace Detail {
             template <typename Clock, typename Fun, typename... Args>
-            TimingOf<Clock, Fun, Args...> measure(Fun&& fun, Args&&... args) {
+            TimingOf<Fun, Args...> measure(Fun&& fun, Args&&... args) {
                 auto start = Clock::now();
                 auto&& r = Detail::complete_invoke(fun, CATCH_FORWARD(args)...);
                 auto end = Clock::now();
@@ -2230,11 +1794,11 @@ namespace Catch {
     namespace Benchmark {
         namespace Detail {
             template <typename Clock, typename Fun>
-            TimingOf<Clock, Fun, int> measure_one(Fun&& fun, int iters, std::false_type) {
+            TimingOf<Fun, int> measure_one(Fun&& fun, int iters, std::false_type) {
                 return Detail::measure<Clock>(fun, iters);
             }
             template <typename Clock, typename Fun>
-            TimingOf<Clock, Fun, Chronometer> measure_one(Fun&& fun, int iters, std::true_type) {
+            TimingOf<Fun, Chronometer> measure_one(Fun&& fun, int iters, std::true_type) {
                 Detail::ChronometerModel<Clock> meter;
                 auto&& result = Detail::complete_invoke(fun, Chronometer(meter, iters));
 
@@ -2249,8 +1813,8 @@ namespace Catch {
             void throw_optimized_away_error();
 
             template <typename Clock, typename Fun>
-            TimingOf<Clock, Fun, run_for_at_least_argument_t<Clock, Fun>>
-                run_for_at_least(ClockDuration<Clock> how_long,
+            TimingOf<Fun, run_for_at_least_argument_t<Clock, Fun>>
+                run_for_at_least(IDuration how_long,
                                  const int initial_iterations,
                                  Fun&& fun) {
                 auto iters = initial_iterations;
@@ -2270,38 +1834,38 @@ namespace Catch {
 
 #endif // CATCH_RUN_FOR_AT_LEAST_HPP_INCLUDED
 
-#include <algorithm>
-#include <iterator>
+#include <vector>
 
 namespace Catch {
     namespace Benchmark {
-        template <typename Duration>
         struct ExecutionPlan {
             int iterations_per_sample;
-            Duration estimated_duration;
+            FDuration estimated_duration;
             Detail::BenchmarkFunction benchmark;
-            Duration warmup_time;
+            FDuration warmup_time;
             int warmup_iterations;
 
-            template <typename Duration2>
-            operator ExecutionPlan<Duration2>() const {
-                return { iterations_per_sample, estimated_duration, benchmark, warmup_time, warmup_iterations };
-            }
-
             template <typename Clock>
-            std::vector<FloatDuration<Clock>> run(const IConfig &cfg, Environment<FloatDuration<Clock>> env) const {
+            std::vector<FDuration> run(const IConfig &cfg, Environment env) const {
                 // warmup a bit
-                Detail::run_for_at_least<Clock>(std::chrono::duration_cast<ClockDuration<Clock>>(warmup_time), warmup_iterations, Detail::repeat(now<Clock>{}));
+                Detail::run_for_at_least<Clock>(
+                    std::chrono::duration_cast<IDuration>( warmup_time ),
+                    warmup_iterations,
+                    Detail::repeat( []() { return Clock::now(); } )
+                );
 
-                std::vector<FloatDuration<Clock>> times;
-                times.reserve(cfg.benchmarkSamples());
-                std::generate_n(std::back_inserter(times), cfg.benchmarkSamples(), [this, env] {
+                std::vector<FDuration> times;
+                const auto num_samples = cfg.benchmarkSamples();
+                times.reserve( num_samples );
+                for ( size_t i = 0; i < num_samples; ++i ) {
                     Detail::ChronometerModel<Clock> model;
-                    this->benchmark(Chronometer(model, iterations_per_sample));
+                    this->benchmark( Chronometer( model, iterations_per_sample ) );
                     auto sample_time = model.elapsed() - env.clock_cost.mean;
-                    if (sample_time < FloatDuration<Clock>::zero()) sample_time = FloatDuration<Clock>::zero();
-                    return sample_time / iterations_per_sample;
-                });
+                    if ( sample_time < FDuration::zero() ) {
+                        sample_time = FDuration::zero();
+                    }
+                    times.push_back(sample_time / iterations_per_sample);
+                }
                 return times;
             }
         };
@@ -2324,122 +1888,35 @@ namespace Catch {
 #define CATCH_STATS_HPP_INCLUDED
 
 
-#include <algorithm>
 #include <vector>
-#include <numeric>
-#include <tuple>
-#include <cmath>
 
 namespace Catch {
     namespace Benchmark {
         namespace Detail {
             using sample = std::vector<double>;
 
-            // Used when we know we want == comparison of two doubles
-            // to centralize warning suppression
-            bool directCompare( double lhs, double rhs );
-
-            double weighted_average_quantile(int k, int q, std::vector<double>::iterator first, std::vector<double>::iterator last);
-
-            template <typename Iterator>
-            OutlierClassification classify_outliers(Iterator first, Iterator last) {
-                std::vector<double> copy(first, last);
-
-                auto q1 = weighted_average_quantile(1, 4, copy.begin(), copy.end());
-                auto q3 = weighted_average_quantile(3, 4, copy.begin(), copy.end());
-                auto iqr = q3 - q1;
-                auto los = q1 - (iqr * 3.);
-                auto lom = q1 - (iqr * 1.5);
-                auto him = q3 + (iqr * 1.5);
-                auto his = q3 + (iqr * 3.);
-
-                OutlierClassification o;
-                for (; first != last; ++first) {
-                    auto&& t = *first;
-                    if (t < los) ++o.low_severe;
-                    else if (t < lom) ++o.low_mild;
-                    else if (t > his) ++o.high_severe;
-                    else if (t > him) ++o.high_mild;
-                    ++o.samples_seen;
-                }
-                return o;
-            }
-
-            template <typename Iterator>
-            double mean(Iterator first, Iterator last) {
-                auto count = last - first;
-                double sum = std::accumulate(first, last, 0.);
-                return sum / static_cast<double>(count);
-            }
+            double weighted_average_quantile( int k,
+                                              int q,
+                                              double* first,
+                                              double* last );
 
-            template <typename Estimator, typename Iterator>
-            sample jackknife(Estimator&& estimator, Iterator first, Iterator last) {
-                auto n = static_cast<size_t>(last - first);
-                auto second = first;
-                ++second;
-                sample results;
-                results.reserve(n);
+            OutlierClassification
+            classify_outliers( double const* first, double const* last );
 
-                for (auto it = first; it != last; ++it) {
-                    std::iter_swap(it, first);
-                    results.push_back(estimator(second, last));
-                }
-
-                return results;
-            }
+            double mean( double const* first, double const* last );
 
-            inline double normal_cdf(double x) {
-                return std::erfc(-x / std::sqrt(2.0)) / 2.0;
-            }
+            double normal_cdf( double x );
 
             double erfc_inv(double x);
 
             double normal_quantile(double p);
 
-            template <typename Iterator, typename Estimator>
-            Estimate<double> bootstrap(double confidence_level, Iterator first, Iterator last, sample const& resample, Estimator&& estimator) {
-                auto n_samples = last - first;
-
-                double point = estimator(first, last);
-                // Degenerate case with a single sample
-                if (n_samples == 1) return { point, point, point, confidence_level };
-
-                sample jack = jackknife(estimator, first, last);
-                double jack_mean = mean(jack.begin(), jack.end());
-                double sum_squares, sum_cubes;
-                std::tie(sum_squares, sum_cubes) = std::accumulate(jack.begin(), jack.end(), std::make_pair(0., 0.), [jack_mean](std::pair<double, double> sqcb, double x) -> std::pair<double, double> {
-                    auto d = jack_mean - x;
-                    auto d2 = d * d;
-                    auto d3 = d2 * d;
-                    return { sqcb.first + d2, sqcb.second + d3 };
-                });
-
-                double accel = sum_cubes / (6 * std::pow(sum_squares, 1.5));
-                long n = static_cast<long>(resample.size());
-                double prob_n = std::count_if(resample.begin(), resample.end(), [point](double x) { return x < point; }) / static_cast<double>(n);
-                // degenerate case with uniform samples
-                if ( directCompare( prob_n, 0. ) ) {
-                    return { point, point, point, confidence_level };
-                }
-
-                double bias = normal_quantile(prob_n);
-                double z1 = normal_quantile((1. - confidence_level) / 2.);
-
-                auto cumn = [n]( double x ) -> long {
-                    return std::lround( normal_cdf( x ) * static_cast<double>(n) );
-                };
-                auto a = [bias, accel](double b) { return bias + b / (1. - accel * b); };
-                double b1 = bias + z1;
-                double b2 = bias - z1;
-                double a1 = a(b1);
-                double a2 = a(b2);
-                auto lo = static_cast<size_t>((std::max)(cumn(a1), 0l));
-                auto hi = static_cast<size_t>((std::min)(cumn(a2), n - 1));
-
-                return { point, resample[lo], resample[hi], confidence_level };
-            }
-
-            double outlier_variance(Estimate<double> mean, Estimate<double> stddev, int n);
+            Estimate<double>
+            bootstrap( double confidence_level,
+                       double* first,
+                       double* last,
+                       sample const& resample,
+                       double ( *estimator )( double const*, double const* ) );
 
             struct bootstrap_analysis {
                 Estimate<double> mean;
@@ -2447,7 +1924,10 @@ namespace Catch {
                 double outlier_variance;
             };
 
-            bootstrap_analysis analyse_samples(double confidence_level, unsigned int n_resamples, std::vector<double>::iterator first, std::vector<double>::iterator last);
+            bootstrap_analysis analyse_samples(double confidence_level,
+                                               unsigned int n_resamples,
+                                               double* first,
+                                               double* last);
         } // namespace Detail
     } // namespace Benchmark
 } // namespace Catch
@@ -2455,7 +1935,6 @@ namespace Catch {
 #endif // CATCH_STATS_HPP_INCLUDED
 
 #include <algorithm>
-#include <iterator>
 #include <vector>
 #include <cmath>
 
@@ -2466,46 +1945,49 @@ namespace Catch {
             std::vector<double> resolution(int k) {
                 std::vector<TimePoint<Clock>> times;
                 times.reserve(static_cast<size_t>(k + 1));
-                std::generate_n(std::back_inserter(times), k + 1, now<Clock>{});
+                for ( int i = 0; i < k + 1; ++i ) {
+                    times.push_back( Clock::now() );
+                }
 
                 std::vector<double> deltas;
                 deltas.reserve(static_cast<size_t>(k));
-                std::transform(std::next(times.begin()), times.end(), times.begin(),
-                    std::back_inserter(deltas),
-                    [](TimePoint<Clock> a, TimePoint<Clock> b) { return static_cast<double>((a - b).count()); });
+                for ( size_t idx = 1; idx < times.size(); ++idx ) {
+                    deltas.push_back( static_cast<double>(
+                        ( times[idx] - times[idx - 1] ).count() ) );
+                }
 
                 return deltas;
             }
 
-            const auto warmup_iterations = 10000;
-            const auto warmup_time = std::chrono::milliseconds(100);
-            const auto minimum_ticks = 1000;
-            const auto warmup_seed = 10000;
-            const auto clock_resolution_estimation_time = std::chrono::milliseconds(500);
-            const auto clock_cost_estimation_time_limit = std::chrono::seconds(1);
-            const auto clock_cost_estimation_tick_limit = 100000;
-            const auto clock_cost_estimation_time = std::chrono::milliseconds(10);
-            const auto clock_cost_estimation_iterations = 10000;
+            constexpr auto warmup_iterations = 10000;
+            constexpr auto warmup_time = std::chrono::milliseconds(100);
+            constexpr auto minimum_ticks = 1000;
+            constexpr auto warmup_seed = 10000;
+            constexpr auto clock_resolution_estimation_time = std::chrono::milliseconds(500);
+            constexpr auto clock_cost_estimation_time_limit = std::chrono::seconds(1);
+            constexpr auto clock_cost_estimation_tick_limit = 100000;
+            constexpr auto clock_cost_estimation_time = std::chrono::milliseconds(10);
+            constexpr auto clock_cost_estimation_iterations = 10000;
 
             template <typename Clock>
             int warmup() {
-                return run_for_at_least<Clock>(std::chrono::duration_cast<ClockDuration<Clock>>(warmup_time), warmup_seed, &resolution<Clock>)
+                return run_for_at_least<Clock>(warmup_time, warmup_seed, &resolution<Clock>)
                     .iterations;
             }
             template <typename Clock>
-            EnvironmentEstimate<FloatDuration<Clock>> estimate_clock_resolution(int iterations) {
-                auto r = run_for_at_least<Clock>(std::chrono::duration_cast<ClockDuration<Clock>>(clock_resolution_estimation_time), iterations, &resolution<Clock>)
+            EnvironmentEstimate estimate_clock_resolution(int iterations) {
+                auto r = run_for_at_least<Clock>(clock_resolution_estimation_time, iterations, &resolution<Clock>)
                     .result;
                 return {
-                    FloatDuration<Clock>(mean(r.begin(), r.end())),
-                    classify_outliers(r.begin(), r.end()),
+                    FDuration(mean(r.data(), r.data() + r.size())),
+                    classify_outliers(r.data(), r.data() + r.size()),
                 };
             }
             template <typename Clock>
-            EnvironmentEstimate<FloatDuration<Clock>> estimate_clock_cost(FloatDuration<Clock> resolution) {
+            EnvironmentEstimate estimate_clock_cost(FDuration resolution) {
                 auto time_limit = (std::min)(
                     resolution * clock_cost_estimation_tick_limit,
-                    FloatDuration<Clock>(clock_cost_estimation_time_limit));
+                    FDuration(clock_cost_estimation_time_limit));
                 auto time_clock = [](int k) {
                     return Detail::measure<Clock>([k] {
                         for (int i = 0; i < k; ++i) {
@@ -2516,26 +1998,28 @@ namespace Catch {
                 };
                 time_clock(1);
                 int iters = clock_cost_estimation_iterations;
-                auto&& r = run_for_at_least<Clock>(std::chrono::duration_cast<ClockDuration<Clock>>(clock_cost_estimation_time), iters, time_clock);
+                auto&& r = run_for_at_least<Clock>(clock_cost_estimation_time, iters, time_clock);
                 std::vector<double> times;
                 int nsamples = static_cast<int>(std::ceil(time_limit / r.elapsed));
                 times.reserve(static_cast<size_t>(nsamples));
-                std::generate_n(std::back_inserter(times), nsamples, [time_clock, &r] {
-                    return static_cast<double>((time_clock(r.iterations) / r.iterations).count());
-                });
+                for ( int s = 0; s < nsamples; ++s ) {
+                    times.push_back( static_cast<double>(
+                        ( time_clock( r.iterations ) / r.iterations )
+                            .count() ) );
+                }
                 return {
-                    FloatDuration<Clock>(mean(times.begin(), times.end())),
-                    classify_outliers(times.begin(), times.end()),
+                    FDuration(mean(times.data(), times.data() + times.size())),
+                    classify_outliers(times.data(), times.data() + times.size()),
                 };
             }
 
             template <typename Clock>
-            Environment<FloatDuration<Clock>> measure_environment() {
+            Environment measure_environment() {
 #if defined(__clang__)
 #    pragma clang diagnostic push
 #    pragma clang diagnostic ignored "-Wexit-time-destructors"
 #endif
-                static Catch::Detail::unique_ptr<Environment<FloatDuration<Clock>>> env;
+                static Catch::Detail::unique_ptr<Environment> env;
 #if defined(__clang__)
 #    pragma clang diagnostic pop
 #endif
@@ -2547,7 +2031,7 @@ namespace Catch {
                 auto resolution = Detail::estimate_clock_resolution<Clock>(iters);
                 auto cost = Detail::estimate_clock_cost<Clock>(resolution.mean);
 
-                env = Catch::Detail::make_unique<Environment<FloatDuration<Clock>>>( Environment<FloatDuration<Clock>>{resolution, cost} );
+                env = Catch::Detail::make_unique<Environment>( Environment{resolution, cost} );
                 return *env;
             }
         } // namespace Detail
@@ -2570,95 +2054,29 @@ namespace Catch {
 #define CATCH_SAMPLE_ANALYSIS_HPP_INCLUDED
 
 
-#include <algorithm>
 #include <vector>
-#include <iterator>
 
 namespace Catch {
     namespace Benchmark {
-        template <typename Duration>
         struct SampleAnalysis {
-            std::vector<Duration> samples;
-            Estimate<Duration> mean;
-            Estimate<Duration> standard_deviation;
+            std::vector<FDuration> samples;
+            Estimate<FDuration> mean;
+            Estimate<FDuration> standard_deviation;
             OutlierClassification outliers;
             double outlier_variance;
-
-            template <typename Duration2>
-            operator SampleAnalysis<Duration2>() const {
-                std::vector<Duration2> samples2;
-                samples2.reserve(samples.size());
-                std::transform(samples.begin(), samples.end(), std::back_inserter(samples2), [](Duration d) { return Duration2(d); });
-                return {
-                    CATCH_MOVE(samples2),
-                    mean,
-                    standard_deviation,
-                    outliers,
-                    outlier_variance,
-                };
-            }
         };
     } // namespace Benchmark
 } // namespace Catch
 
 #endif // CATCH_SAMPLE_ANALYSIS_HPP_INCLUDED
 
-#include <algorithm>
-#include <iterator>
-#include <vector>
 
 namespace Catch {
+    class IConfig;
+
     namespace Benchmark {
         namespace Detail {
-            template <typename Duration, typename Iterator>
-            SampleAnalysis<Duration> analyse(const IConfig &cfg, Environment<Duration>, Iterator first, Iterator last) {
-                if (!cfg.benchmarkNoAnalysis()) {
-                    std::vector<double> samples;
-                    samples.reserve(static_cast<size_t>(last - first));
-                    std::transform(first, last, std::back_inserter(samples), [](Duration d) { return d.count(); });
-
-                    auto analysis = Catch::Benchmark::Detail::analyse_samples(cfg.benchmarkConfidenceInterval(), cfg.benchmarkResamples(), samples.begin(), samples.end());
-                    auto outliers = Catch::Benchmark::Detail::classify_outliers(samples.begin(), samples.end());
-
-                    auto wrap_estimate = [](Estimate<double> e) {
-                        return Estimate<Duration> {
-                            Duration(e.point),
-                                Duration(e.lower_bound),
-                                Duration(e.upper_bound),
-                                e.confidence_interval,
-                        };
-                    };
-                    std::vector<Duration> samples2;
-                    samples2.reserve(samples.size());
-                    std::transform(samples.begin(), samples.end(), std::back_inserter(samples2), [](double d) { return Duration(d); });
-                    return {
-                        CATCH_MOVE(samples2),
-                        wrap_estimate(analysis.mean),
-                        wrap_estimate(analysis.standard_deviation),
-                        outliers,
-                        analysis.outlier_variance,
-                    };
-                } else {
-                    std::vector<Duration> samples;
-                    samples.reserve(static_cast<size_t>(last - first));
-
-                    Duration mean = Duration(0);
-                    int i = 0;
-                    for (auto it = first; it < last; ++it, ++i) {
-                        samples.push_back(Duration(*it));
-                        mean += Duration(*it);
-                    }
-                    mean /= i;
-
-                    return {
-                        CATCH_MOVE(samples),
-                        Estimate<Duration>{mean, mean, mean, 0.0},
-                        Estimate<Duration>{Duration(0), Duration(0), Duration(0), 0.0},
-                        OutlierClassification{},
-                        0.0
-                    };
-                }
-            }
+            SampleAnalysis analyse(const IConfig &cfg, FDuration* first, FDuration* last);
         } // namespace Detail
     } // namespace Benchmark
 } // namespace Catch
@@ -2666,9 +2084,9 @@ namespace Catch {
 #endif // CATCH_ANALYSE_HPP_INCLUDED
 
 #include <algorithm>
-#include <functional>
+#include <chrono>
+#include <exception>
 #include <string>
-#include <vector>
 #include <cmath>
 
 namespace Catch {
@@ -2682,16 +2100,18 @@ namespace Catch {
                 : fun(CATCH_MOVE(func)), name(CATCH_MOVE(benchmarkName)) {}
 
             template <typename Clock>
-            ExecutionPlan<FloatDuration<Clock>> prepare(const IConfig &cfg, Environment<FloatDuration<Clock>> env) const {
+            ExecutionPlan prepare(const IConfig &cfg, Environment env) const {
                 auto min_time = env.clock_resolution.mean * Detail::minimum_ticks;
                 auto run_time = std::max(min_time, std::chrono::duration_cast<decltype(min_time)>(cfg.benchmarkWarmupTime()));
-                auto&& test = Detail::run_for_at_least<Clock>(std::chrono::duration_cast<ClockDuration<Clock>>(run_time), 1, fun);
+                auto&& test = Detail::run_for_at_least<Clock>(std::chrono::duration_cast<IDuration>(run_time), 1, fun);
                 int new_iters = static_cast<int>(std::ceil(min_time * test.iterations / test.elapsed));
-                return { new_iters, test.elapsed / test.iterations * new_iters * cfg.benchmarkSamples(), fun, std::chrono::duration_cast<FloatDuration<Clock>>(cfg.benchmarkWarmupTime()), Detail::warmup_iterations };
+                return { new_iters, test.elapsed / test.iterations * new_iters * cfg.benchmarkSamples(), fun, std::chrono::duration_cast<FDuration>(cfg.benchmarkWarmupTime()), Detail::warmup_iterations };
             }
 
             template <typename Clock = default_clock>
             void run() {
+                static_assert( Clock::is_steady,
+                               "Benchmarking clock should be steady" );
                 auto const* cfg = getCurrentContext().getConfig();
 
                 auto env = Detail::measure_environment<Clock>();
@@ -2718,10 +2138,10 @@ namespace Catch {
                         return plan.template run<Clock>(*cfg, env);
                     });
 
-                    auto analysis = Detail::analyse(*cfg, env, samples.begin(), samples.end());
-                    BenchmarkStats<FloatDuration<Clock>> stats{ CATCH_MOVE(info), CATCH_MOVE(analysis.samples), analysis.mean, analysis.standard_deviation, analysis.outliers, analysis.outlier_variance };
+                    auto analysis = Detail::analyse(*cfg, samples.data(), samples.data() + samples.size());
+                    BenchmarkStats<> stats{ CATCH_MOVE(info), CATCH_MOVE(analysis.samples), analysis.mean, analysis.standard_deviation, analysis.outliers, analysis.outlier_variance };
                     getResultCapture().benchmarkEnded(stats);
-                } CATCH_CATCH_ANON (TestFailureException) {
+                } CATCH_CATCH_ANON (TestFailureException const&) {
                     getResultCapture().benchmarkFailed("Benchmark failed due to failed assertion"_sr);
                 } CATCH_CATCH_ALL{
                     getResultCapture().benchmarkFailed(translateActiveException());
@@ -2889,6 +2309,7 @@ namespace Catch {
 #ifndef CATCH_CONFIG_WCHAR_HPP_INCLUDED
 #define CATCH_CONFIG_WCHAR_HPP_INCLUDED
 
+
 // We assume that WCHAR should be enabled by default, and only disabled
 // for a shortlist (so far only DJGPP) of compilers.
 
@@ -3112,7 +2533,6 @@ namespace Catch {
     } // namespace Detail
 
 
-    // If we decide for C++14, change these to enable_if_ts
     template <typename T, typename = void>
     struct StringMaker {
         template <typename Fake = T>
@@ -3395,6 +2815,12 @@ namespace Catch {
             }
         }
     };
+    template <>
+    struct StringMaker<std::nullopt_t> {
+        static std::string convert(const std::nullopt_t&) {
+            return "{ }";
+        }
+    };
 }
 #endif // CATCH_CONFIG_ENABLE_OPTIONAL_STRINGMAKER
 
@@ -3781,6 +3207,143 @@ struct StringMaker<Catch::Approx> {
 #endif // CATCH_APPROX_HPP_INCLUDED
 
 
+#ifndef CATCH_ASSERTION_INFO_HPP_INCLUDED
+#define CATCH_ASSERTION_INFO_HPP_INCLUDED
+
+
+
+#ifndef CATCH_SOURCE_LINE_INFO_HPP_INCLUDED
+#define CATCH_SOURCE_LINE_INFO_HPP_INCLUDED
+
+#include <cstddef>
+#include <iosfwd>
+
+namespace Catch {
+
+    struct SourceLineInfo {
+
+        SourceLineInfo() = delete;
+        constexpr SourceLineInfo( char const* _file, std::size_t _line ) noexcept:
+            file( _file ),
+            line( _line )
+        {}
+
+        bool operator == ( SourceLineInfo const& other ) const noexcept;
+        bool operator < ( SourceLineInfo const& other ) const noexcept;
+
+        char const* file;
+        std::size_t line;
+
+        friend std::ostream& operator << (std::ostream& os, SourceLineInfo const& info);
+    };
+}
+
+#define CATCH_INTERNAL_LINEINFO \
+    ::Catch::SourceLineInfo( __FILE__, static_cast<std::size_t>( __LINE__ ) )
+
+#endif // CATCH_SOURCE_LINE_INFO_HPP_INCLUDED
+
+namespace Catch {
+
+    struct AssertionInfo {
+        // AssertionInfo() = delete;
+
+        StringRef macroName;
+        SourceLineInfo lineInfo;
+        StringRef capturedExpression;
+        ResultDisposition::Flags resultDisposition;
+    };
+
+} // end namespace Catch
+
+#endif // CATCH_ASSERTION_INFO_HPP_INCLUDED
+
+
+#ifndef CATCH_ASSERTION_RESULT_HPP_INCLUDED
+#define CATCH_ASSERTION_RESULT_HPP_INCLUDED
+
+
+
+#ifndef CATCH_LAZY_EXPR_HPP_INCLUDED
+#define CATCH_LAZY_EXPR_HPP_INCLUDED
+
+#include <iosfwd>
+
+namespace Catch {
+
+    class ITransientExpression;
+
+    class LazyExpression {
+        friend class AssertionHandler;
+        friend struct AssertionStats;
+        friend class RunContext;
+
+        ITransientExpression const* m_transientExpression = nullptr;
+        bool m_isNegated;
+    public:
+        LazyExpression( bool isNegated ):
+            m_isNegated(isNegated)
+        {}
+        LazyExpression(LazyExpression const& other) = default;
+        LazyExpression& operator = ( LazyExpression const& ) = delete;
+
+        explicit operator bool() const {
+            return m_transientExpression != nullptr;
+        }
+
+        friend auto operator << ( std::ostream& os, LazyExpression const& lazyExpr ) -> std::ostream&;
+    };
+
+} // namespace Catch
+
+#endif // CATCH_LAZY_EXPR_HPP_INCLUDED
+
+#include <string>
+
+namespace Catch {
+
+    struct AssertionResultData
+    {
+        AssertionResultData() = delete;
+
+        AssertionResultData( ResultWas::OfType _resultType, LazyExpression const& _lazyExpression );
+
+        std::string message;
+        mutable std::string reconstructedExpression;
+        LazyExpression lazyExpression;
+        ResultWas::OfType resultType;
+
+        std::string reconstructExpression() const;
+    };
+
+    class AssertionResult {
+    public:
+        AssertionResult() = delete;
+        AssertionResult( AssertionInfo const& info, AssertionResultData&& data );
+
+        bool isOk() const;
+        bool succeeded() const;
+        ResultWas::OfType getResultType() const;
+        bool hasExpression() const;
+        bool hasMessage() const;
+        std::string getExpression() const;
+        std::string getExpressionInMacro() const;
+        bool hasExpandedExpression() const;
+        std::string getExpandedExpression() const;
+        StringRef getMessage() const;
+        SourceLineInfo getSourceInfo() const;
+        StringRef getTestMacroName() const;
+
+    //protected:
+        AssertionInfo m_info;
+        AssertionResultData m_resultData;
+    };
+
+} // end namespace Catch
+
+#endif // CATCH_ASSERTION_RESULT_HPP_INCLUDED
+
+
 #ifndef CATCH_CONFIG_HPP_INCLUDED
 #define CATCH_CONFIG_HPP_INCLUDED
 
@@ -3945,6 +3508,7 @@ namespace Catch {
 #ifndef CATCH_OPTIONAL_HPP_INCLUDED
 #define CATCH_OPTIONAL_HPP_INCLUDED
 
+
 #include <cassert>
 
 namespace Catch {
@@ -3953,35 +3517,50 @@ namespace Catch {
     template<typename T>
     class Optional {
     public:
-        Optional() : nullableValue( nullptr ) {}
-        Optional( T const& _value )
-        : nullableValue( new( storage ) T( _value ) )
-        {}
-        Optional( Optional const& _other )
-        : nullableValue( _other ? new( storage ) T( *_other ) : nullptr )
-        {}
+        Optional(): nullableValue( nullptr ) {}
+        ~Optional() { reset(); }
+
+        Optional( T const& _value ):
+            nullableValue( new ( storage ) T( _value ) ) {}
+        Optional( T&& _value ):
+            nullableValue( new ( storage ) T( CATCH_MOVE( _value ) ) ) {}
 
-        ~Optional() {
+        Optional& operator=( T const& _value ) {
+            reset();
+            nullableValue = new ( storage ) T( _value );
+            return *this;
+        }
+        Optional& operator=( T&& _value ) {
             reset();
+            nullableValue = new ( storage ) T( CATCH_MOVE( _value ) );
+            return *this;
         }
 
-        Optional& operator= ( Optional const& _other ) {
-            if( &_other != this ) {
+        Optional( Optional const& _other ):
+            nullableValue( _other ? new ( storage ) T( *_other ) : nullptr ) {}
+        Optional( Optional&& _other ):
+            nullableValue( _other ? new ( storage ) T( CATCH_MOVE( *_other ) )
+                                  : nullptr ) {}
+
+        Optional& operator=( Optional const& _other ) {
+            if ( &_other != this ) {
                 reset();
-                if( _other )
-                    nullableValue = new( storage ) T( *_other );
+                if ( _other ) { nullableValue = new ( storage ) T( *_other ); }
             }
             return *this;
         }
-        Optional& operator = ( T const& _value ) {
-            reset();
-            nullableValue = new( storage ) T( _value );
+        Optional& operator=( Optional&& _other ) {
+            if ( &_other != this ) {
+                reset();
+                if ( _other ) {
+                    nullableValue = new ( storage ) T( CATCH_MOVE( *_other ) );
+                }
+            }
             return *this;
         }
 
         void reset() {
-            if( nullableValue )
-                nullableValue->~T();
+            if ( nullableValue ) { nullableValue->~T(); }
             nullableValue = nullptr;
         }
 
@@ -4025,177 +3604,42 @@ namespace Catch {
         }
         friend bool operator!=(Optional const& a, Optional const& b) {
             return !( a == b );
-        }
-
-    private:
-        T *nullableValue;
-        alignas(alignof(T)) char storage[sizeof(T)];
-    };
-
-} // end namespace Catch
-
-#endif // CATCH_OPTIONAL_HPP_INCLUDED
-
-
-#ifndef CATCH_RANDOM_SEED_GENERATION_HPP_INCLUDED
-#define CATCH_RANDOM_SEED_GENERATION_HPP_INCLUDED
-
-#include <cstdint>
-
-namespace Catch {
-
-    enum class GenerateFrom {
-        Time,
-        RandomDevice,
-        //! Currently equivalent to RandomDevice, but can change at any point
-        Default
-    };
-
-    std::uint32_t generateRandomSeed(GenerateFrom from);
-
-} // end namespace Catch
-
-#endif // CATCH_RANDOM_SEED_GENERATION_HPP_INCLUDED
-
-
-#ifndef CATCH_REPORTER_SPEC_PARSER_HPP_INCLUDED
-#define CATCH_REPORTER_SPEC_PARSER_HPP_INCLUDED
-
-
-
-#ifndef CATCH_CONSOLE_COLOUR_HPP_INCLUDED
-#define CATCH_CONSOLE_COLOUR_HPP_INCLUDED
-
-
-#include <iosfwd>
-#include <cstdint>
-
-namespace Catch {
-
-    enum class ColourMode : std::uint8_t;
-    class IStream;
-
-    struct Colour {
-        enum Code {
-            None = 0,
-
-            White,
-            Red,
-            Green,
-            Blue,
-            Cyan,
-            Yellow,
-            Grey,
-
-            Bright = 0x10,
-
-            BrightRed = Bright | Red,
-            BrightGreen = Bright | Green,
-            LightGrey = Bright | Grey,
-            BrightWhite = Bright | White,
-            BrightYellow = Bright | Yellow,
-
-            // By intention
-            FileName = LightGrey,
-            Warning = BrightYellow,
-            ResultError = BrightRed,
-            ResultSuccess = BrightGreen,
-            ResultExpectedFailure = Warning,
-
-            Error = BrightRed,
-            Success = Green,
-            Skip = LightGrey,
-
-            OriginalExpression = Cyan,
-            ReconstructedExpression = BrightYellow,
-
-            SecondaryText = LightGrey,
-            Headers = White
-        };
-    };
-
-    class ColourImpl {
-    protected:
-        //! The associated stream of this ColourImpl instance
-        IStream* m_stream;
-    public:
-        ColourImpl( IStream* stream ): m_stream( stream ) {}
-
-        //! RAII wrapper around writing specific colour of text using specific
-        //! colour impl into a stream.
-        class ColourGuard {
-            ColourImpl const* m_colourImpl;
-            Colour::Code m_code;
-            bool m_engaged = false;
+        }
 
-        public:
-            //! Does **not** engage the guard/start the colour
-            ColourGuard( Colour::Code code,
-                         ColourImpl const* colour );
+    private:
+        T* nullableValue;
+        alignas(alignof(T)) char storage[sizeof(T)];
+    };
 
-            ColourGuard( ColourGuard const& rhs ) = delete;
-            ColourGuard& operator=( ColourGuard const& rhs ) = delete;
+} // end namespace Catch
 
-            ColourGuard( ColourGuard&& rhs ) noexcept;
-            ColourGuard& operator=( ColourGuard&& rhs ) noexcept;
+#endif // CATCH_OPTIONAL_HPP_INCLUDED
 
-            //! Removes colour _if_ the guard was engaged
-            ~ColourGuard();
 
-            /**
-             * Explicitly engages colour for given stream.
-             *
-             * The API based on operator<< should be preferred.
-             */
-            ColourGuard& engage( std::ostream& stream ) &;
-            /**
-             * Explicitly engages colour for given stream.
-             *
-             * The API based on operator<< should be preferred.
-             */
-            ColourGuard&& engage( std::ostream& stream ) &&;
+#ifndef CATCH_RANDOM_SEED_GENERATION_HPP_INCLUDED
+#define CATCH_RANDOM_SEED_GENERATION_HPP_INCLUDED
 
-        private:
-            //! Engages the guard and starts using colour
-            friend std::ostream& operator<<( std::ostream& lhs,
-                                             ColourGuard& guard ) {
-                guard.engageImpl( lhs );
-                return lhs;
-            }
-            //! Engages the guard and starts using colour
-            friend std::ostream& operator<<( std::ostream& lhs,
-                                            ColourGuard&& guard) {
-                guard.engageImpl( lhs );
-                return lhs;
-            }
+#include <cstdint>
 
-            void engageImpl( std::ostream& stream );
+namespace Catch {
 
-        };
+    enum class GenerateFrom {
+        Time,
+        RandomDevice,
+        //! Currently equivalent to RandomDevice, but can change at any point
+        Default
+    };
 
-        virtual ~ColourImpl(); // = default
-        /**
-         * Creates a guard object for given colour and this colour impl
-         *
-         * **Important:**
-         * the guard starts disengaged, and has to be engaged explicitly.
-         */
-        ColourGuard guardColour( Colour::Code colourCode );
+    std::uint32_t generateRandomSeed(GenerateFrom from);
 
-    private:
-        virtual void use( Colour::Code colourCode ) const = 0;
-    };
+} // end namespace Catch
 
-    //! Provides ColourImpl based on global config and target compilation platform
-    Detail::unique_ptr<ColourImpl> makeColourImpl( ColourMode colourSelection,
-                                                   IStream* stream );
+#endif // CATCH_RANDOM_SEED_GENERATION_HPP_INCLUDED
 
-    //! Checks if specific colour impl has been compiled into the binary
-    bool isColourImplAvailable( ColourMode colourSelection );
 
-} // end namespace Catch
+#ifndef CATCH_REPORTER_SPEC_PARSER_HPP_INCLUDED
+#define CATCH_REPORTER_SPEC_PARSER_HPP_INCLUDED
 
-#endif // CATCH_CONSOLE_COLOUR_HPP_INCLUDED
 
 #include <map>
 #include <string>
@@ -4322,7 +3766,7 @@ namespace Catch {
         bool benchmarkNoAnalysis = false;
         unsigned int benchmarkSamples = 100;
         double benchmarkConfidenceInterval = 0.95;
-        unsigned int benchmarkResamples = 100000;
+        unsigned int benchmarkResamples = 100'000;
         std::chrono::milliseconds::rep benchmarkWarmupTime = 100;
 
         Verbosity verbosity = Verbosity::Normal;
@@ -4424,6 +3868,29 @@ namespace Catch {
 
 
 
+
+/** \file
+ * Wrapper for the CATCH_CONFIG_PREFIX_MESSAGES configuration option
+ *
+ * CATCH_CONFIG_PREFIX_ALL can be used to avoid clashes with other macros
+ * by prepending CATCH_. This may not be desirable if the only clashes are with
+ * logger macros such as INFO and WARN. In this cases
+ * CATCH_CONFIG_PREFIX_MESSAGES can be used to only prefix a small subset
+ * of relevant macros.
+ *
+ */
+
+#ifndef CATCH_CONFIG_PREFIX_MESSAGES_HPP_INCLUDED
+#define CATCH_CONFIG_PREFIX_MESSAGES_HPP_INCLUDED
+
+
+#if defined(CATCH_CONFIG_PREFIX_ALL) && !defined(CATCH_CONFIG_PREFIX_MESSAGES)
+    #define CATCH_CONFIG_PREFIX_MESSAGES
+#endif
+
+#endif // CATCH_CONFIG_PREFIX_MESSAGES_HPP_INCLUDED
+
+
 #ifndef CATCH_STREAM_END_STOP_HPP_INCLUDED
 #define CATCH_STREAM_END_STOP_HPP_INCLUDED
 
@@ -4435,10 +3902,10 @@ namespace Catch {
     // as well as
     //    << stuff +StreamEndStop
     struct StreamEndStop {
-        StringRef operator+() const { return StringRef(); }
+        constexpr StringRef operator+() const { return StringRef(); }
 
         template <typename T>
-        friend T const& operator+( T const& value, StreamEndStop ) {
+        constexpr friend T const& operator+( T const& value, StreamEndStop ) {
             return value;
         }
     };
@@ -4447,12 +3914,47 @@ namespace Catch {
 
 #endif // CATCH_STREAM_END_STOP_HPP_INCLUDED
 
+
+#ifndef CATCH_MESSAGE_INFO_HPP_INCLUDED
+#define CATCH_MESSAGE_INFO_HPP_INCLUDED
+
+
+#include <string>
+
+namespace Catch {
+
+    struct MessageInfo {
+        MessageInfo(    StringRef _macroName,
+                        SourceLineInfo const& _lineInfo,
+                        ResultWas::OfType _type );
+
+        StringRef macroName;
+        std::string message;
+        SourceLineInfo lineInfo;
+        ResultWas::OfType type;
+        unsigned int sequence;
+
+        bool operator == (MessageInfo const& other) const {
+            return sequence == other.sequence;
+        }
+        bool operator < (MessageInfo const& other) const {
+            return sequence < other.sequence;
+        }
+    private:
+        static unsigned int globalCount;
+    };
+
+} // end namespace Catch
+
+#endif // CATCH_MESSAGE_INFO_HPP_INCLUDED
+
 #include <string>
 #include <vector>
 
 namespace Catch {
 
     struct SourceLineInfo;
+    class IResultCapture;
 
     struct MessageStream {
 
@@ -4493,7 +3995,7 @@ namespace Catch {
 
     class Capturer {
         std::vector<MessageInfo> m_messages;
-        IResultCapture& m_resultCapture = getResultCapture();
+        IResultCapture& m_resultCapture;
         size_t m_captured = 0;
     public:
         Capturer( StringRef macroName, SourceLineInfo const& lineInfo, ResultWas::OfType resultType, StringRef names );
@@ -4544,28 +4046,28 @@ namespace Catch {
     Catch::getResultCapture().emplaceUnscopedMessage( Catch::MessageBuilder( macroName##_catch_sr, CATCH_INTERNAL_LINEINFO, Catch::ResultWas::Info ) << log )
 
 
-#if defined(CATCH_CONFIG_PREFIX_ALL) && !defined(CATCH_CONFIG_DISABLE)
+#if defined(CATCH_CONFIG_PREFIX_MESSAGES) && !defined(CATCH_CONFIG_DISABLE)
 
   #define CATCH_INFO( msg ) INTERNAL_CATCH_INFO( "CATCH_INFO", msg )
   #define CATCH_UNSCOPED_INFO( msg ) INTERNAL_CATCH_UNSCOPED_INFO( "CATCH_UNSCOPED_INFO", msg )
   #define CATCH_WARN( msg ) INTERNAL_CATCH_MSG( "CATCH_WARN", Catch::ResultWas::Warning, Catch::ResultDisposition::ContinueOnFailure, msg )
   #define CATCH_CAPTURE( ... ) INTERNAL_CATCH_CAPTURE( INTERNAL_CATCH_UNIQUE_NAME(capturer), "CATCH_CAPTURE", __VA_ARGS__ )
 
-#elif defined(CATCH_CONFIG_PREFIX_ALL) && defined(CATCH_CONFIG_DISABLE)
+#elif defined(CATCH_CONFIG_PREFIX_MESSAGES) && defined(CATCH_CONFIG_DISABLE)
 
   #define CATCH_INFO( msg )          (void)(0)
   #define CATCH_UNSCOPED_INFO( msg ) (void)(0)
   #define CATCH_WARN( msg )          (void)(0)
   #define CATCH_CAPTURE( ... )       (void)(0)
 
-#elif !defined(CATCH_CONFIG_PREFIX_ALL) && !defined(CATCH_CONFIG_DISABLE)
+#elif !defined(CATCH_CONFIG_PREFIX_MESSAGES) && !defined(CATCH_CONFIG_DISABLE)
 
   #define INFO( msg ) INTERNAL_CATCH_INFO( "INFO", msg )
   #define UNSCOPED_INFO( msg ) INTERNAL_CATCH_UNSCOPED_INFO( "UNSCOPED_INFO", msg )
   #define WARN( msg ) INTERNAL_CATCH_MSG( "WARN", Catch::ResultWas::Warning, Catch::ResultDisposition::ContinueOnFailure, msg )
   #define CAPTURE( ... ) INTERNAL_CATCH_CAPTURE( INTERNAL_CATCH_UNIQUE_NAME(capturer), "CAPTURE", __VA_ARGS__ )
 
-#elif !defined(CATCH_CONFIG_PREFIX_ALL) && defined(CATCH_CONFIG_DISABLE)
+#elif !defined(CATCH_CONFIG_PREFIX_MESSAGES) && defined(CATCH_CONFIG_DISABLE)
 
   #define INFO( msg )          (void)(0)
   #define UNSCOPED_INFO( msg ) (void)(0)
@@ -4580,6 +4082,75 @@ namespace Catch {
 #endif // CATCH_MESSAGE_HPP_INCLUDED
 
 
+#ifndef CATCH_SECTION_INFO_HPP_INCLUDED
+#define CATCH_SECTION_INFO_HPP_INCLUDED
+
+
+
+#ifndef CATCH_TOTALS_HPP_INCLUDED
+#define CATCH_TOTALS_HPP_INCLUDED
+
+#include <cstdint>
+
+namespace Catch {
+
+    struct Counts {
+        Counts operator - ( Counts const& other ) const;
+        Counts& operator += ( Counts const& other );
+
+        std::uint64_t total() const;
+        bool allPassed() const;
+        bool allOk() const;
+
+        std::uint64_t passed = 0;
+        std::uint64_t failed = 0;
+        std::uint64_t failedButOk = 0;
+        std::uint64_t skipped = 0;
+    };
+
+    struct Totals {
+
+        Totals operator - ( Totals const& other ) const;
+        Totals& operator += ( Totals const& other );
+
+        Totals delta( Totals const& prevTotals ) const;
+
+        Counts assertions;
+        Counts testCases;
+    };
+}
+
+#endif // CATCH_TOTALS_HPP_INCLUDED
+
+#include <string>
+
+namespace Catch {
+
+    struct SectionInfo {
+        // The last argument is ignored, so that people can write
+        // SECTION("ShortName", "Proper description that is long") and
+        // still use the `-c` flag comfortably.
+        SectionInfo( SourceLineInfo const& _lineInfo, std::string _name,
+                    const char* const = nullptr ):
+            name(CATCH_MOVE(_name)),
+            lineInfo(_lineInfo)
+            {}
+
+        std::string name;
+        SourceLineInfo lineInfo;
+    };
+
+    struct SectionEndInfo {
+        SectionInfo sectionInfo;
+        Counts prevAssertions;
+        double durationInSeconds;
+    };
+
+} // end namespace Catch
+
+#endif // CATCH_SECTION_INFO_HPP_INCLUDED
+
+
 #ifndef CATCH_SESSION_HPP_INCLUDED
 #define CATCH_SESSION_HPP_INCLUDED
 
@@ -4683,17 +4254,16 @@ namespace Catch {
             enum class TokenType { Option, Argument };
             struct Token {
                 TokenType type;
-                std::string token;
+                StringRef token;
             };
 
             // Abstracts iterators into args as a stream of tokens, with option
             // arguments uniformly handled
             class TokenStream {
-                using Iterator = std::vector<std::string>::const_iterator;
+                using Iterator = std::vector<StringRef>::const_iterator;
                 Iterator it;
                 Iterator itEnd;
                 std::vector<Token> m_tokenBuffer;
-
                 void loadBuffer();
 
             public:
@@ -4745,12 +4315,17 @@ namespace Catch {
                 ResultType m_type;
             };
 
-            template <typename T> class ResultValueBase : public ResultBase {
+            template <typename T>
+            class ResultValueBase : public ResultBase {
             public:
-                auto value() const -> T const& {
+                T const& value() const& {
                     enforceOk();
                     return m_value;
                 }
+                T&& value() && {
+                    enforceOk();
+                    return CATCH_MOVE( m_value );
+                }
 
             protected:
                 ResultValueBase( ResultType type ): ResultBase( type ) {}
@@ -4760,13 +4335,23 @@ namespace Catch {
                     if ( m_type == ResultType::Ok )
                         new ( &m_value ) T( other.m_value );
                 }
+                ResultValueBase( ResultValueBase&& other ):
+                    ResultBase( other ) {
+                    if ( m_type == ResultType::Ok )
+                        new ( &m_value ) T( CATCH_MOVE(other.m_value) );
+                }
 
-                ResultValueBase( ResultType, T const& value ): ResultBase( ResultType::Ok ) {
+
+                ResultValueBase( ResultType, T const& value ):
+                    ResultBase( ResultType::Ok ) {
                     new ( &m_value ) T( value );
                 }
+                ResultValueBase( ResultType, T&& value ):
+                    ResultBase( ResultType::Ok ) {
+                    new ( &m_value ) T( CATCH_MOVE(value) );
+                }
 
-                auto operator=( ResultValueBase const& other )
-                    -> ResultValueBase& {
+                ResultValueBase& operator=( ResultValueBase const& other ) {
                     if ( m_type == ResultType::Ok )
                         m_value.~T();
                     ResultBase::operator=( other );
@@ -4774,6 +4359,14 @@ namespace Catch {
                         new ( &m_value ) T( other.m_value );
                     return *this;
                 }
+                ResultValueBase& operator=( ResultValueBase&& other ) {
+                    if ( m_type == ResultType::Ok ) m_value.~T();
+                    ResultBase::operator=( other );
+                    if ( m_type == ResultType::Ok )
+                        new ( &m_value ) T( CATCH_MOVE(other.m_value) );
+                    return *this;
+                }
+
 
                 ~ResultValueBase() override {
                     if ( m_type == ResultType::Ok )
@@ -4801,8 +4394,8 @@ namespace Catch {
                 }
 
                 template <typename U>
-                static auto ok( U const& value ) -> BasicResult {
-                    return { ResultType::Ok, value };
+                static auto ok( U&& value ) -> BasicResult {
+                    return { ResultType::Ok, CATCH_FORWARD(value) };
                 }
                 static auto ok() -> BasicResult { return { ResultType::Ok }; }
                 static auto logicError( std::string&& message )
@@ -4849,12 +4442,15 @@ namespace Catch {
             class ParseState {
             public:
                 ParseState( ParseResultType type,
-                            TokenStream const& remainingTokens );
+                            TokenStream remainingTokens );
 
                 ParseResultType type() const { return m_type; }
-                TokenStream const& remainingTokens() const {
+                TokenStream const& remainingTokens() const& {
                     return m_remainingTokens;
                 }
+                TokenStream&& remainingTokens() && {
+                    return CATCH_MOVE( m_remainingTokens );
+                }
 
             private:
                 ParseResultType m_type;
@@ -4867,7 +4463,7 @@ namespace Catch {
 
             struct HelpColumns {
                 std::string left;
-                std::string right;
+                StringRef descriptions;
             };
 
             template <typename T>
@@ -5027,7 +4623,7 @@ namespace Catch {
                 virtual ~ParserBase() = default;
                 virtual auto validate() const -> Result { return Result::ok(); }
                 virtual auto parse( std::string const& exeName,
-                                    TokenStream const& tokens ) const
+                                    TokenStream tokens ) const
                     -> InternalParseResult = 0;
                 virtual size_t cardinality() const;
 
@@ -5047,8 +4643,8 @@ namespace Catch {
             protected:
                 Optionality m_optionality = Optionality::Optional;
                 std::shared_ptr<BoundRef> m_ref;
-                std::string m_hint;
-                std::string m_description;
+                StringRef m_hint;
+                StringRef m_description;
 
                 explicit ParserRefImpl( std::shared_ptr<BoundRef> const& ref ):
                     m_ref( ref ) {}
@@ -5057,28 +4653,32 @@ namespace Catch {
                 template <typename LambdaT>
                 ParserRefImpl( accept_many_t,
                                LambdaT const& ref,
-                               std::string const& hint ):
+                               StringRef hint ):
                     m_ref( std::make_shared<BoundManyLambda<LambdaT>>( ref ) ),
                     m_hint( hint ) {}
 
                 template <typename T,
                           typename = typename std::enable_if_t<
                               !Detail::is_unary_function<T>::value>>
-                ParserRefImpl( T& ref, std::string const& hint ):
+                ParserRefImpl( T& ref, StringRef hint ):
                     m_ref( std::make_shared<BoundValueRef<T>>( ref ) ),
                     m_hint( hint ) {}
 
                 template <typename LambdaT,
                           typename = typename std::enable_if_t<
                               Detail::is_unary_function<LambdaT>::value>>
-                ParserRefImpl( LambdaT const& ref, std::string const& hint ):
+                ParserRefImpl( LambdaT const& ref, StringRef hint ):
                     m_ref( std::make_shared<BoundLambda<LambdaT>>( ref ) ),
                     m_hint( hint ) {}
 
-                auto operator()( std::string const& description ) -> DerivedT& {
+                DerivedT& operator()( StringRef description ) & {
                     m_description = description;
                     return static_cast<DerivedT&>( *this );
                 }
+                DerivedT&& operator()( StringRef description ) && {
+                    m_description = description;
+                    return static_cast<DerivedT&&>( *this );
+                }
 
                 auto optional() -> DerivedT& {
                     m_optionality = Optionality::Optional;
@@ -5101,7 +4701,7 @@ namespace Catch {
                         return 1;
                 }
 
-                std::string const& hint() const { return m_hint; }
+                StringRef hint() const { return m_hint; }
             };
 
         } // namespace detail
@@ -5115,13 +4715,13 @@ namespace Catch {
 
             Detail::InternalParseResult
                 parse(std::string const&,
-                      Detail::TokenStream const& tokens) const override;
+                      Detail::TokenStream tokens) const override;
         };
 
         // A parser for options
         class Opt : public Detail::ParserRefImpl<Opt> {
         protected:
-            std::vector<std::string> m_optNames;
+            std::vector<StringRef> m_optNames;
 
         public:
             template <typename LambdaT>
@@ -5134,33 +4734,37 @@ namespace Catch {
             template <typename LambdaT,
                       typename = typename std::enable_if_t<
                           Detail::is_unary_function<LambdaT>::value>>
-            Opt( LambdaT const& ref, std::string const& hint ):
+            Opt( LambdaT const& ref, StringRef hint ):
                 ParserRefImpl( ref, hint ) {}
 
             template <typename LambdaT>
-            Opt( accept_many_t, LambdaT const& ref, std::string const& hint ):
+            Opt( accept_many_t, LambdaT const& ref, StringRef hint ):
                 ParserRefImpl( accept_many, ref, hint ) {}
 
             template <typename T,
                       typename = typename std::enable_if_t<
                           !Detail::is_unary_function<T>::value>>
-            Opt( T& ref, std::string const& hint ):
+            Opt( T& ref, StringRef hint ):
                 ParserRefImpl( ref, hint ) {}
 
-            auto operator[](std::string const& optName) -> Opt& {
+            Opt& operator[]( StringRef optName ) & {
                 m_optNames.push_back(optName);
                 return *this;
             }
+            Opt&& operator[]( StringRef optName ) && {
+                m_optNames.push_back( optName );
+                return CATCH_MOVE(*this);
+            }
 
-            std::vector<Detail::HelpColumns> getHelpColumns() const;
+            Detail::HelpColumns getHelpColumns() const;
 
-            bool isMatch(std::string const& optToken) const;
+            bool isMatch(StringRef optToken) const;
 
             using ParserBase::parse;
 
             Detail::InternalParseResult
                 parse(std::string const&,
-                      Detail::TokenStream const& tokens) const override;
+                      Detail::TokenStream tokens) const override;
 
             Detail::Result validate() const override;
         };
@@ -5183,7 +4787,7 @@ namespace Catch {
             // handled specially
             Detail::InternalParseResult
                 parse(std::string const&,
-                      Detail::TokenStream const& tokens) const override;
+                      Detail::TokenStream tokens) const override;
 
             std::string const& name() const { return *m_name; }
             Detail::ParserResult set(std::string const& newName);
@@ -5208,16 +4812,28 @@ namespace Catch {
                 return *this;
             }
 
-            auto operator|=(Opt const& opt) -> Parser& {
-                m_options.push_back(opt);
-                return *this;
+            friend Parser& operator|=( Parser& p, Opt const& opt ) {
+                p.m_options.push_back( opt );
+                return p;
+            }
+            friend Parser& operator|=( Parser& p, Opt&& opt ) {
+                p.m_options.push_back( CATCH_MOVE(opt) );
+                return p;
             }
 
             Parser& operator|=(Parser const& other);
 
             template <typename T>
-            auto operator|(T const& other) const -> Parser {
-                return Parser(*this) |= other;
+            friend Parser operator|( Parser const& p, T&& rhs ) {
+                Parser temp( p );
+                temp |= rhs;
+                return temp;
+            }
+
+            template <typename T>
+            friend Parser operator|( Parser&& p, T&& rhs ) {
+                p |= CATCH_FORWARD(rhs);
+                return CATCH_MOVE(p);
             }
 
             std::vector<Detail::HelpColumns> getHelpColumns() const;
@@ -5235,21 +4851,23 @@ namespace Catch {
             using ParserBase::parse;
             Detail::InternalParseResult
                 parse(std::string const& exeName,
-                      Detail::TokenStream const& tokens) const override;
+                      Detail::TokenStream tokens) const override;
         };
 
-        // Transport for raw args (copied from main args, or supplied via
-        // init list for testing)
+        /**
+         * Wrapper over argc + argv, assumes that the inputs outlive it
+         */
         class Args {
             friend Detail::TokenStream;
-            std::string m_exeName;
-            std::vector<std::string> m_args;
+            StringRef m_exeName;
+            std::vector<StringRef> m_args;
 
         public:
             Args(int argc, char const* const* argv);
-            Args(std::initializer_list<std::string> args);
+            // Helper constructor for testing
+            Args(std::initializer_list<StringRef> args);
 
-            std::string const& exeName() const { return m_exeName; }
+            StringRef exeName() const { return m_exeName; }
         };
 
 
@@ -5855,8 +5473,6 @@ namespace Catch {
 
 namespace Catch {
 
-    class IResultCapture;
-
     struct AssertionReaction {
         bool shouldDebugBreak = false;
         bool shouldThrow = false;
@@ -5897,7 +5513,6 @@ namespace Catch {
         void handleUnexpectedInflightException();
 
         void complete();
-        void setCompleted();
 
         // query
         auto allowThrows() const -> bool;
@@ -5909,13 +5524,10 @@ namespace Catch {
 
 #endif // CATCH_ASSERTION_HANDLER_HPP_INCLUDED
 
-// We need this suppression to leak, because it took until GCC 10
-// for the front end to handle local suppression via _Pragma properly
-#if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC) && __GNUC__ <= 9
-  #pragma GCC diagnostic ignored "-Wparentheses"
-#endif
 
-#if !defined(CATCH_CONFIG_DISABLE)
+#ifndef CATCH_PREPROCESSOR_INTERNAL_STRINGIFY_HPP_INCLUDED
+#define CATCH_PREPROCESSOR_INTERNAL_STRINGIFY_HPP_INCLUDED
+
 
 #if !defined(CATCH_CONFIG_DISABLE_STRINGIFICATION)
   #define CATCH_INTERNAL_STRINGIFY(...) #__VA_ARGS__##_catch_sr
@@ -5923,6 +5535,16 @@ namespace Catch {
   #define CATCH_INTERNAL_STRINGIFY(...) "Disabled by CATCH_CONFIG_DISABLE_STRINGIFICATION"_catch_sr
 #endif
 
+#endif // CATCH_PREPROCESSOR_INTERNAL_STRINGIFY_HPP_INCLUDED
+
+// We need this suppression to leak, because it took until GCC 10
+// for the front end to handle local suppression via _Pragma properly
+#if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC) && __GNUC__ <= 9
+  #pragma GCC diagnostic ignored "-Wparentheses"
+#endif
+
+#if !defined(CATCH_CONFIG_DISABLE)
+
 #if defined(CATCH_CONFIG_FAST_COMPILE) || defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5934,7 +5556,7 @@ namespace Catch {
 #else // CATCH_CONFIG_FAST_COMPILE
 
 #define INTERNAL_CATCH_TRY try
-#define INTERNAL_CATCH_CATCH( handler ) catch(...) { handler.handleUnexpectedInflightException(); }
+#define INTERNAL_CATCH_CATCH( handler ) catch(...) { (handler).handleUnexpectedInflightException(); }
 
 #endif
 
@@ -5990,6 +5612,7 @@ namespace Catch {
         if( catchAssertionHandler.allowThrows() ) \
             try { \
                 CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
+                CATCH_INTERNAL_SUPPRESS_UNUSED_RESULT \
                 CATCH_INTERNAL_SUPPRESS_USELESS_CAST_WARNINGS \
                 static_cast<void>(__VA_ARGS__); \
                 CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \
@@ -6010,6 +5633,7 @@ namespace Catch {
         if( catchAssertionHandler.allowThrows() ) \
             try { \
                 CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
+                CATCH_INTERNAL_SUPPRESS_UNUSED_RESULT \
                 CATCH_INTERNAL_SUPPRESS_USELESS_CAST_WARNINGS \
                 static_cast<void>(expr); \
                 CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \
@@ -6036,6 +5660,7 @@ namespace Catch {
         if( catchAssertionHandler.allowThrows() ) \
             try { \
                 CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
+                CATCH_INTERNAL_SUPPRESS_UNUSED_RESULT \
                 CATCH_INTERNAL_SUPPRESS_USELESS_CAST_WARNINGS \
                 static_cast<void>(__VA_ARGS__); \
                 CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \
@@ -6051,12 +5676,40 @@ namespace Catch {
 
 #endif // CATCH_CONFIG_DISABLE
 
-#endif // CATCH_TEST_MACRO_IMPL_HPP_INCLUDED
+#endif // CATCH_TEST_MACRO_IMPL_HPP_INCLUDED
+
+
+#ifndef CATCH_SECTION_HPP_INCLUDED
+#define CATCH_SECTION_HPP_INCLUDED
+
+
+
+
+/** \file
+ * Wrapper for the STATIC_ANALYSIS_SUPPORT configuration option
+ *
+ * Some of Catch2's macros can be defined differently to work better with
+ * static analysis tools, like clang-tidy or coverity.
+ * Currently the main use case is to show that `SECTION`s are executed
+ * exclusively, and not all in one run of a `TEST_CASE`.
+ */
+
+#ifndef CATCH_CONFIG_STATIC_ANALYSIS_SUPPORT_HPP_INCLUDED
+#define CATCH_CONFIG_STATIC_ANALYSIS_SUPPORT_HPP_INCLUDED
+
+
+#if defined(__clang_analyzer__) || defined(__COVERITY__)
+    #define CATCH_INTERNAL_CONFIG_STATIC_ANALYSIS_SUPPORT
+#endif
 
+#if defined( CATCH_INTERNAL_CONFIG_STATIC_ANALYSIS_SUPPORT ) && \
+    !defined( CATCH_CONFIG_NO_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT ) && \
+    !defined( CATCH_CONFIG_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT )
+#    define CATCH_CONFIG_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT
+#endif
 
-#ifndef CATCH_SECTION_HPP_INCLUDED
-#define CATCH_SECTION_HPP_INCLUDED
 
+#endif // CATCH_CONFIG_STATIC_ANALYSIS_SUPPORT_HPP_INCLUDED
 
 
 #ifndef CATCH_TIMER_HPP_INCLUDED
@@ -6103,17 +5756,63 @@ namespace Catch {
 
 } // end namespace Catch
 
-#define INTERNAL_CATCH_SECTION( ... ) \
-    CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
-    CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS \
-    if( Catch::Section const& INTERNAL_CATCH_UNIQUE_NAME( catch_internal_Section ) = Catch::Section( CATCH_INTERNAL_LINEINFO, __VA_ARGS__ ) ) \
-    CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+#if !defined(CATCH_CONFIG_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT)
+#    define INTERNAL_CATCH_SECTION( ... )                                 \
+        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                         \
+        CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS                  \
+        if ( Catch::Section const& INTERNAL_CATCH_UNIQUE_NAME(            \
+                 catch_internal_Section ) =                               \
+                 Catch::Section( CATCH_INTERNAL_LINEINFO, __VA_ARGS__ ) ) \
+        CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+
+#    define INTERNAL_CATCH_DYNAMIC_SECTION( ... )                     \
+        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                     \
+        CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS              \
+        if ( Catch::Section const& INTERNAL_CATCH_UNIQUE_NAME(        \
+                 catch_internal_Section ) =                           \
+                 Catch::SectionInfo(                                  \
+                     CATCH_INTERNAL_LINEINFO,                         \
+                     ( Catch::ReusableStringStream() << __VA_ARGS__ ) \
+                         .str() ) )                                   \
+        CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+
+#else
+
+// These section definitions imply that at most one section at one level
+// will be intered (because only one section's __LINE__ can be equal to
+// the dummy `catchInternalSectionHint` variable from `TEST_CASE`).
+
+namespace Catch {
+    namespace Detail {
+        // Intentionally without linkage, as it should only be used as a dummy
+        // symbol for static analysis.
+        int GetNewSectionHint();
+    } // namespace Detail
+} // namespace Catch
+
+
+#    define INTERNAL_CATCH_SECTION( ... )                                   \
+        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                           \
+        CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS                    \
+        CATCH_INTERNAL_SUPPRESS_SHADOW_WARNINGS                             \
+        if ( [[maybe_unused]] const int catchInternalPreviousSectionHint =  \
+                 catchInternalSectionHint,                                  \
+             catchInternalSectionHint = Catch::Detail::GetNewSectionHint(); \
+             catchInternalPreviousSectionHint == __LINE__ )                 \
+        CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+
+#    define INTERNAL_CATCH_DYNAMIC_SECTION( ... )                           \
+        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                           \
+        CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS                    \
+        CATCH_INTERNAL_SUPPRESS_SHADOW_WARNINGS                             \
+        if ( [[maybe_unused]] const int catchInternalPreviousSectionHint =  \
+                 catchInternalSectionHint,                                  \
+             catchInternalSectionHint = Catch::Detail::GetNewSectionHint(); \
+             catchInternalPreviousSectionHint == __LINE__ )                 \
+        CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+
+#endif
 
-#define INTERNAL_CATCH_DYNAMIC_SECTION( ... ) \
-    CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
-    CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS \
-    if( Catch::Section const& INTERNAL_CATCH_UNIQUE_NAME( catch_internal_Section ) = Catch::SectionInfo( CATCH_INTERNAL_LINEINFO, (Catch::ReusableStringStream() << __VA_ARGS__).str() ) ) \
-    CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
 
 #endif // CATCH_SECTION_HPP_INCLUDED
 
@@ -6123,42 +5822,20 @@ namespace Catch {
 
 
 
-#ifndef CATCH_INTERFACES_TESTCASE_HPP_INCLUDED
-#define CATCH_INTERFACES_TESTCASE_HPP_INCLUDED
-
-#include <vector>
+#ifndef CATCH_INTERFACES_TEST_INVOKER_HPP_INCLUDED
+#define CATCH_INTERFACES_TEST_INVOKER_HPP_INCLUDED
 
 namespace Catch {
 
-    class TestSpec;
-    struct TestCaseInfo;
-
     class ITestInvoker {
     public:
-        virtual void invoke () const = 0;
+        virtual void invoke() const = 0;
         virtual ~ITestInvoker(); // = default
     };
 
-    class TestCaseHandle;
-    class IConfig;
-
-    class ITestCaseRegistry {
-    public:
-        virtual ~ITestCaseRegistry(); // = default
-        // TODO: this exists only for adding filenames to test cases -- let's expose this in a saner way later
-        virtual std::vector<TestCaseInfo* > const& getAllInfos() const = 0;
-        virtual std::vector<TestCaseHandle> const& getAllTests() const = 0;
-        virtual std::vector<TestCaseHandle> const& getAllTestsSorted( IConfig const& config ) const = 0;
-    };
-
-    bool isThrowSafe( TestCaseHandle const& testCase, IConfig const& config );
-    bool matchTest( TestCaseHandle const& testCase, TestSpec const& testSpec, IConfig const& config );
-    std::vector<TestCaseHandle> filterTests( std::vector<TestCaseHandle> const& testCases, TestSpec const& testSpec, IConfig const& config );
-    std::vector<TestCaseHandle> const& getAllTestCasesSorted( IConfig const& config );
-
-}
+} // namespace Catch
 
-#endif // CATCH_INTERFACES_TESTCASE_HPP_INCLUDED
+#endif // CATCH_INTERFACES_TEST_INVOKER_HPP_INCLUDED
 
 
 #ifndef CATCH_PREPROCESSOR_REMOVE_PARENS_HPP_INCLUDED
@@ -6230,6 +5907,9 @@ struct AutoReg : Detail::NonCopyable {
         void TestName::test()
 #endif
 
+
+#if !defined(CATCH_CONFIG_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT)
+
     ///////////////////////////////////////////////////////////////////////////////
     #define INTERNAL_CATCH_TESTCASE2( TestName, ... ) \
         static void TestName(); \
@@ -6242,19 +5922,40 @@ struct AutoReg : Detail::NonCopyable {
     #define INTERNAL_CATCH_TESTCASE( ... ) \
         INTERNAL_CATCH_TESTCASE2( INTERNAL_CATCH_UNIQUE_NAME( CATCH2_INTERNAL_TEST_ ), __VA_ARGS__ )
 
-    ///////////////////////////////////////////////////////////////////////////////
-    #define INTERNAL_CATCH_METHOD_AS_TEST_CASE( QualifiedMethod, ... ) \
-        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
-        CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \
-        CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS \
-        namespace {                                                           \
-        const Catch::AutoReg INTERNAL_CATCH_UNIQUE_NAME( autoRegistrar )( \
-            Catch::makeTestInvoker( &QualifiedMethod ),                   \
-            CATCH_INTERNAL_LINEINFO,                                      \
-            "&" #QualifiedMethod##_catch_sr,                              \
-            Catch::NameAndTags{ __VA_ARGS__ } );                          \
-    } /* NOLINT */ \
-        CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+#else  // ^^ !CATCH_CONFIG_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT | vv CATCH_CONFIG_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT
+
+
+// Dummy registrator for the dumy test case macros
+namespace Catch {
+    namespace Detail {
+        struct DummyUse {
+            DummyUse( void ( * )( int ) );
+        };
+    } // namespace Detail
+} // namespace Catch
+
+// Note that both the presence of the argument and its exact name are
+// necessary for the section support.
+
+// We provide a shadowed variable so that a `SECTION` inside non-`TEST_CASE`
+// tests can compile. The redefined `TEST_CASE` shadows this with param.
+static int catchInternalSectionHint = 0;
+
+#    define INTERNAL_CATCH_TESTCASE2( fname )                              \
+        static void fname( int );                                          \
+        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                          \
+        CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS                           \
+        CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS                   \
+        static const Catch::Detail::DummyUse INTERNAL_CATCH_UNIQUE_NAME(   \
+            dummyUser )( &(fname) );                                       \
+        CATCH_INTERNAL_SUPPRESS_SHADOW_WARNINGS                            \
+        static void fname( [[maybe_unused]] int catchInternalSectionHint ) \
+            CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+#    define INTERNAL_CATCH_TESTCASE( ... ) \
+        INTERNAL_CATCH_TESTCASE2( INTERNAL_CATCH_UNIQUE_NAME( dummyFunction ) )
+
+
+#endif // CATCH_CONFIG_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT
 
     ///////////////////////////////////////////////////////////////////////////////
     #define INTERNAL_CATCH_TEST_CASE_METHOD2( TestName, ClassName, ... )\
@@ -6276,6 +5977,22 @@ struct AutoReg : Detail::NonCopyable {
     #define INTERNAL_CATCH_TEST_CASE_METHOD( ClassName, ... ) \
         INTERNAL_CATCH_TEST_CASE_METHOD2( INTERNAL_CATCH_UNIQUE_NAME( CATCH2_INTERNAL_TEST_ ), ClassName, __VA_ARGS__ )
 
+
+    ///////////////////////////////////////////////////////////////////////////////
+    #define INTERNAL_CATCH_METHOD_AS_TEST_CASE( QualifiedMethod, ... ) \
+        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
+        CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \
+        CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS \
+        namespace {                                                           \
+        const Catch::AutoReg INTERNAL_CATCH_UNIQUE_NAME( autoRegistrar )( \
+            Catch::makeTestInvoker( &QualifiedMethod ),                   \
+            CATCH_INTERNAL_LINEINFO,                                      \
+            "&" #QualifiedMethod##_catch_sr,                              \
+            Catch::NameAndTags{ __VA_ARGS__ } );                          \
+    } /* NOLINT */ \
+        CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+
+
     ///////////////////////////////////////////////////////////////////////////////
     #define INTERNAL_CATCH_REGISTER_TESTCASE( Function, ... ) \
         do { \
@@ -7194,6 +6911,7 @@ namespace Catch {
     };
 
     class ITestInvoker;
+    struct NameAndTags;
 
     enum class TestCaseProperties : uint8_t {
         None = 0,
@@ -7318,6 +7036,10 @@ namespace Catch {
 #include <exception>
 
 namespace Catch {
+    namespace Detail {
+        void registerTranslatorImpl(
+            Detail::unique_ptr<IExceptionTranslator>&& translator );
+    }
 
     class ExceptionTranslatorRegistrar {
         template<typename T>
@@ -7351,9 +7073,9 @@ namespace Catch {
     public:
         template<typename T>
         ExceptionTranslatorRegistrar( std::string(*translateFunction)( T const& ) ) {
-            getMutableRegistryHub().registerTranslator(
-                Detail::make_unique<ExceptionTranslator<T>>(translateFunction)
-            );
+            Detail::registerTranslatorImpl(
+                Detail::make_unique<ExceptionTranslator<T>>(
+                    translateFunction ) );
         }
     };
 
@@ -7425,7 +7147,7 @@ namespace Catch {
 #define CATCH_VERSION_MACROS_HPP_INCLUDED
 
 #define CATCH_VERSION_MAJOR 3
-#define CATCH_VERSION_MINOR 3
+#define CATCH_VERSION_MINOR 5
 #define CATCH_VERSION_PATCH 2
 
 #endif // CATCH_VERSION_MACROS_HPP_INCLUDED
@@ -7584,12 +7306,6 @@ namespace Detail {
         }
 
     public:
-        ~IGenerator() override = default;
-        IGenerator() = default;
-        IGenerator(IGenerator const&) = default;
-        IGenerator& operator=(IGenerator const&) = default;
-
-
         // Returns the current element of the generator
         //
         // \Precondition The generator is either freshly constructed,
@@ -8058,37 +7774,578 @@ namespace Catch {
             return static_cast<result_type>(-1);
         }
 
-        // Provide some default initial state for the default constructor
-        SimplePcg32():SimplePcg32(0xed743cc4U) {}
+        // Provide some default initial state for the default constructor
+        SimplePcg32():SimplePcg32(0xed743cc4U) {}
+
+        explicit SimplePcg32(result_type seed_);
+
+        void seed(result_type seed_);
+        void discard(uint64_t skip);
+
+        result_type operator()();
+
+    private:
+        friend bool operator==(SimplePcg32 const& lhs, SimplePcg32 const& rhs);
+        friend bool operator!=(SimplePcg32 const& lhs, SimplePcg32 const& rhs);
+
+        // In theory we also need operator<< and operator>>
+        // In practice we do not use them, so we will skip them for now
+
+
+        std::uint64_t m_state;
+        // This part of the state determines which "stream" of the numbers
+        // is chosen -- we take it as a constant for Catch2, so we only
+        // need to deal with seeding the main state.
+        // Picked by reading 8 bytes from `/dev/random` :-)
+        static const std::uint64_t s_inc = (0x13ed0cc53f939476ULL << 1ULL) | 1ULL;
+    };
+
+} // end namespace Catch
+
+#endif // CATCH_RANDOM_NUMBER_GENERATOR_HPP_INCLUDED
+
+
+
+#ifndef CATCH_UNIFORM_INTEGER_DISTRIBUTION_HPP_INCLUDED
+#define CATCH_UNIFORM_INTEGER_DISTRIBUTION_HPP_INCLUDED
+
+
+
+
+#ifndef CATCH_RANDOM_INTEGER_HELPERS_HPP_INCLUDED
+#define CATCH_RANDOM_INTEGER_HELPERS_HPP_INCLUDED
+
+#include <climits>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+namespace Catch {
+    namespace Detail {
+
+        template <std::size_t>
+        struct SizedUnsignedType;
+#define SizedUnsignedTypeHelper( TYPE )        \
+    template <>                                \
+    struct SizedUnsignedType<sizeof( TYPE )> { \
+        using type = TYPE;                     \
+    }
+
+        SizedUnsignedTypeHelper( std::uint8_t );
+        SizedUnsignedTypeHelper( std::uint16_t );
+        SizedUnsignedTypeHelper( std::uint32_t );
+        SizedUnsignedTypeHelper( std::uint64_t );
+#undef SizedUnsignedTypeHelper
+
+        template <std::size_t sz>
+        using SizedUnsignedType_t = typename SizedUnsignedType<sz>::type;
+
+        template <typename T>
+        using DoubleWidthUnsignedType_t = SizedUnsignedType_t<2 * sizeof( T )>;
+
+        template <typename T>
+        struct ExtendedMultResult {
+            T upper;
+            T lower;
+            friend bool operator==( ExtendedMultResult const& lhs,
+                                    ExtendedMultResult const& rhs ) {
+                return lhs.upper == rhs.upper && lhs.lower == rhs.lower;
+            }
+        };
+
+        // Returns 128 bit result of multiplying lhs and rhs
+        constexpr ExtendedMultResult<std::uint64_t>
+        extendedMult( std::uint64_t lhs, std::uint64_t rhs ) {
+            // We use the simple long multiplication approach for
+            // correctness, we can use platform specific builtins
+            // for performance later.
+
+            // Split the lhs and rhs into two 32bit "digits", so that we can
+            // do 64 bit arithmetic to handle carry bits.
+            //            32b    32b    32b    32b
+            //     lhs                  L1     L2
+            //   * rhs                  R1     R2
+            //            ------------------------
+            //                       |  R2 * L2  |
+            //                 |  R2 * L1  |
+            //                 |  R1 * L2  |
+            //           |  R1 * L1  |
+            //           -------------------------
+            //           |  a  |  b  |  c  |  d  |
+
+#define CarryBits( x ) ( x >> 32 )
+#define Digits( x ) ( x & 0xFF'FF'FF'FF )
+
+            auto r2l2 = Digits( rhs ) * Digits( lhs );
+            auto r2l1 = Digits( rhs ) * CarryBits( lhs );
+            auto r1l2 = CarryBits( rhs ) * Digits( lhs );
+            auto r1l1 = CarryBits( rhs ) * CarryBits( lhs );
+
+            // Sum to columns first
+            auto d = Digits( r2l2 );
+            auto c = CarryBits( r2l2 ) + Digits( r2l1 ) + Digits( r1l2 );
+            auto b = CarryBits( r2l1 ) + CarryBits( r1l2 ) + Digits( r1l1 );
+            auto a = CarryBits( r1l1 );
+
+            // Propagate carries between columns
+            c += CarryBits( d );
+            b += CarryBits( c );
+            a += CarryBits( b );
+
+            // Remove the used carries
+            c = Digits( c );
+            b = Digits( b );
+            a = Digits( a );
+
+#undef CarryBits
+#undef Digits
+
+            return {
+                a << 32 | b, // upper 64 bits
+                c << 32 | d  // lower 64 bits
+            };
+        }
+
+        template <typename UInt>
+        constexpr ExtendedMultResult<UInt> extendedMult( UInt lhs, UInt rhs ) {
+            static_assert( std::is_unsigned<UInt>::value,
+                           "extendedMult can only handle unsigned integers" );
+            static_assert( sizeof( UInt ) < sizeof( std::uint64_t ),
+                           "Generic extendedMult can only handle types smaller "
+                           "than uint64_t" );
+            using WideType = DoubleWidthUnsignedType_t<UInt>;
+
+            auto result = WideType( lhs ) * WideType( rhs );
+            return {
+                static_cast<UInt>( result >> ( CHAR_BIT * sizeof( UInt ) ) ),
+                static_cast<UInt>( result & UInt( -1 ) ) };
+        }
+
+
+        template <typename TargetType,
+                  typename Generator>
+            std::enable_if_t<sizeof(typename Generator::result_type) >= sizeof(TargetType),
+            TargetType> fillBitsFrom(Generator& gen) {
+            using gresult_type = typename Generator::result_type;
+            static_assert( std::is_unsigned<TargetType>::value, "Only unsigned integers are supported" );
+            static_assert( Generator::min() == 0 &&
+                           Generator::max() == static_cast<gresult_type>( -1 ),
+                           "Generator must be able to output all numbers in its result type (effectively it must be a random bit generator)" );
+
+            // We want to return the top bits from a generator, as they are
+            // usually considered higher quality.
+            constexpr auto generated_bits = sizeof( gresult_type ) * CHAR_BIT;
+            constexpr auto return_bits = sizeof( TargetType ) * CHAR_BIT;
+
+            return static_cast<TargetType>( gen() >>
+                                            ( generated_bits - return_bits) );
+        }
+
+        template <typename TargetType,
+                  typename Generator>
+            std::enable_if_t<sizeof(typename Generator::result_type) < sizeof(TargetType),
+            TargetType> fillBitsFrom(Generator& gen) {
+            using gresult_type = typename Generator::result_type;
+            static_assert( std::is_unsigned<TargetType>::value,
+                           "Only unsigned integers are supported" );
+            static_assert( Generator::min() == 0 &&
+                           Generator::max() == static_cast<gresult_type>( -1 ),
+                           "Generator must be able to output all numbers in its result type (effectively it must be a random bit generator)" );
+
+            constexpr auto generated_bits = sizeof( gresult_type ) * CHAR_BIT;
+            constexpr auto return_bits = sizeof( TargetType ) * CHAR_BIT;
+            std::size_t filled_bits = 0;
+            TargetType ret = 0;
+            do {
+                ret <<= generated_bits;
+                ret |= gen();
+                filled_bits += generated_bits;
+            } while ( filled_bits < return_bits );
+
+            return ret;
+        }
+
+        /*
+         * Transposes numbers into unsigned type while keeping their ordering
+         *
+         * This means that signed types are changed so that the ordering is
+         * [INT_MIN, ..., -1, 0, ..., INT_MAX], rather than order we would
+         * get by simple casting ([0, ..., INT_MAX, INT_MIN, ..., -1])
+         */
+        template <typename OriginalType, typename UnsignedType>
+        std::enable_if_t<std::is_signed<OriginalType>::value, UnsignedType>
+        transposeToNaturalOrder( UnsignedType in ) {
+            static_assert(
+                sizeof( OriginalType ) == sizeof( UnsignedType ),
+                "reordering requires the same sized types on both sides" );
+            static_assert( std::is_unsigned<UnsignedType>::value,
+                           "Input type must be unsigned" );
+            // Assuming 2s complement (standardized in current C++), the
+            // positive and negative numbers are already internally ordered,
+            // and their difference is in the top bit. Swapping it orders
+            // them the desired way.
+            constexpr auto highest_bit =
+                UnsignedType( 1 ) << ( sizeof( UnsignedType ) * CHAR_BIT - 1 );
+            return static_cast<UnsignedType>( in ^ highest_bit );
+        }
+
+
+
+        template <typename OriginalType,
+                  typename UnsignedType>
+        std::enable_if_t<std::is_unsigned<OriginalType>::value, UnsignedType>
+            transposeToNaturalOrder(UnsignedType in) {
+            static_assert(
+                sizeof( OriginalType ) == sizeof( UnsignedType ),
+                "reordering requires the same sized types on both sides" );
+            static_assert( std::is_unsigned<UnsignedType>::value, "Input type must be unsigned" );
+            // No reordering is needed for unsigned -> unsigned
+            return in;
+        }
+    } // namespace Detail
+} // namespace Catch
+
+#endif // CATCH_RANDOM_INTEGER_HELPERS_HPP_INCLUDED
+
+namespace Catch {
+
+    namespace Detail {
+        // Indirection to enable make_unsigned<bool> behaviour.
+        template <typename T>
+        struct make_unsigned {
+            using type = std::make_unsigned_t<T>;
+        };
+
+        template <>
+        struct make_unsigned<bool> {
+            using type = uint8_t;
+        };
+
+        template <typename T>
+        using make_unsigned_t = typename make_unsigned<T>::type;
+    }
+
+/**
+ * Implementation of uniform distribution on integers.
+ *
+ * Unlike `std::uniform_int_distribution`, this implementation supports
+ * various 1 byte integral types, including bool (but you should not
+ * actually use it for bools).
+ *
+ * The underlying algorithm is based on the one described in "Fast Random
+ * Integer Generation in an Interval" by Daniel Lemire, but has been
+ * optimized under the assumption of reuse of the same distribution object.
+ */
+template <typename IntegerType>
+class uniform_integer_distribution {
+    static_assert(std::is_integral<IntegerType>::value, "...");
+
+    using UnsignedIntegerType = Detail::make_unsigned_t<IntegerType>;
+
+    // Only the left bound is stored, and we store it converted to its
+    // unsigned image. This avoids having to do the conversions inside
+    // the operator(), at the cost of having to do the conversion in
+    // the a() getter. The right bound is only needed in the b() getter,
+    // so we recompute it there from other stored data.
+    UnsignedIntegerType m_a;
+
+    // How many different values are there in [a, b]. a == b => 1, can be 0 for distribution over all values in the type.
+    UnsignedIntegerType m_ab_distance;
+
+    // We hoisted this out of the main generation function. Technically,
+    // this means that using this distribution will be slower than Lemire's
+    // algorithm if this distribution instance will be used only few times,
+    // but it will be faster if it is used many times. Since Catch2 uses
+    // distributions only to implement random generators, we assume that each
+    // distribution will be reused many times and this is an optimization.
+    UnsignedIntegerType m_rejection_threshold = 0;
+
+    UnsignedIntegerType computeDistance(IntegerType a, IntegerType b) const {
+        // This overflows and returns 0 if a == 0 and b == TYPE_MAX.
+        // We handle that later when generating the number.
+        return transposeTo(b) - transposeTo(a) + 1;
+    }
+
+    static UnsignedIntegerType computeRejectionThreshold(UnsignedIntegerType ab_distance) {
+        // distance == 0 means that we will return all possible values from
+        // the type's range, and that we shouldn't reject anything.
+        if ( ab_distance == 0 ) { return 0; }
+        return ( ~ab_distance + 1 ) % ab_distance;
+    }
+
+    static UnsignedIntegerType transposeTo(IntegerType in) {
+        return Detail::transposeToNaturalOrder<IntegerType>(
+            static_cast<UnsignedIntegerType>( in ) );
+    }
+    static IntegerType transposeBack(UnsignedIntegerType in) {
+        return static_cast<IntegerType>(
+            Detail::transposeToNaturalOrder<IntegerType>(in) );
+    }
+
+public:
+    using result_type = IntegerType;
+
+    uniform_integer_distribution( IntegerType a, IntegerType b ):
+        m_a( transposeTo(a) ),
+        m_ab_distance( computeDistance(a, b) ),
+        m_rejection_threshold( computeRejectionThreshold(m_ab_distance) ) {
+        assert( a <= b );
+    }
+
+    template <typename Generator>
+    result_type operator()( Generator& g ) {
+        // All possible values of result_type are valid.
+        if ( m_ab_distance == 0 ) {
+            return transposeBack( Detail::fillBitsFrom<UnsignedIntegerType>( g ) );
+        }
+
+        auto random_number = Detail::fillBitsFrom<UnsignedIntegerType>( g );
+        auto emul = Detail::extendedMult( random_number, m_ab_distance );
+        // Unlike Lemire's algorithm we skip the ab_distance check, since
+        // we precomputed the rejection threshold, which is always tighter.
+        while (emul.lower < m_rejection_threshold) {
+            random_number = Detail::fillBitsFrom<UnsignedIntegerType>( g );
+            emul = Detail::extendedMult( random_number, m_ab_distance );
+        }
+
+        return transposeBack(m_a + emul.upper);
+    }
+
+    result_type a() const { return transposeBack(m_a); }
+    result_type b() const { return transposeBack(m_ab_distance + m_a - 1); }
+};
+
+} // end namespace Catch
+
+#endif // CATCH_UNIFORM_INTEGER_DISTRIBUTION_HPP_INCLUDED
+
+
+
+#ifndef CATCH_UNIFORM_FLOATING_POINT_DISTRIBUTION_HPP_INCLUDED
+#define CATCH_UNIFORM_FLOATING_POINT_DISTRIBUTION_HPP_INCLUDED
+
+
+
+
+#ifndef CATCH_RANDOM_FLOATING_POINT_HELPERS_HPP_INCLUDED
+#define CATCH_RANDOM_FLOATING_POINT_HELPERS_HPP_INCLUDED
+
+
+
+#ifndef CATCH_POLYFILLS_HPP_INCLUDED
+#define CATCH_POLYFILLS_HPP_INCLUDED
+
+namespace Catch {
+
+    bool isnan(float f);
+    bool isnan(double d);
+
+    float nextafter(float x, float y);
+    double nextafter(double x, double y);
+
+}
+
+#endif // CATCH_POLYFILLS_HPP_INCLUDED
+
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+
+namespace Catch {
+
+    namespace Detail {
+        /**
+         * Returns the largest magnitude of 1-ULP distance inside the [a, b] range.
+         *
+         * Assumes `a < b`.
+         */
+        template <typename FloatType>
+        FloatType gamma(FloatType a, FloatType b) {
+            static_assert( std::is_floating_point<FloatType>::value,
+                           "gamma returns the largest ULP magnitude within "
+                           "floating point range [a, b]. This only makes sense "
+                           "for floating point types" );
+            assert( a <= b );
+
+            const auto gamma_up = Catch::nextafter( a, std::numeric_limits<FloatType>::infinity() ) - a;
+            const auto gamma_down = b - Catch::nextafter( b, -std::numeric_limits<FloatType>::infinity() );
+
+            return gamma_up < gamma_down ? gamma_down : gamma_up;
+        }
+
+        template <typename FloatingPoint>
+        struct DistanceTypePicker;
+        template <>
+        struct DistanceTypePicker<float> {
+            using type = std::uint32_t;
+        };
+        template <>
+        struct DistanceTypePicker<double> {
+            using type = std::uint64_t;
+        };
+
+        template <typename T>
+        using DistanceType = typename DistanceTypePicker<T>::type;
+
+#if defined( __GNUC__ ) || defined( __clang__ )
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wfloat-equal"
+#endif
+        /**
+         * Computes the number of equi-distant floats in [a, b]
+         *
+         * Since not every range can be split into equidistant floats
+         * exactly, we actually compute ceil(b/distance - a/distance),
+         * because in those cases we want to overcount.
+         *
+         * Uses modified Dekker's FastTwoSum algorithm to handle rounding.
+         */
+        template <typename FloatType>
+        DistanceType<FloatType>
+        count_equidistant_floats( FloatType a, FloatType b, FloatType distance ) {
+            assert( a <= b );
+            // We get distance as gamma for our uniform float distribution,
+            // so this will round perfectly.
+            const auto ag = a / distance;
+            const auto bg = b / distance;
+
+            const auto s = bg - ag;
+            const auto err = ( std::fabs( a ) <= std::fabs( b ) )
+                                 ? -ag - ( s - bg )
+                                 : bg - ( s + ag );
+            const auto ceil_s = static_cast<DistanceType<FloatType>>( std::ceil( s ) );
+
+            return ( ceil_s != s ) ? ceil_s : ceil_s + ( err > 0 );
+        }
+#if defined( __GNUC__ ) || defined( __clang__ )
+#    pragma GCC diagnostic pop
+#endif
+
+    }
 
-        explicit SimplePcg32(result_type seed_);
+} // end namespace Catch
 
-        void seed(result_type seed_);
-        void discard(uint64_t skip);
+#endif // CATCH_RANDOM_FLOATING_POINT_HELPERS_HPP_INCLUDED
 
-        result_type operator()();
+#include <cmath>
+#include <type_traits>
 
-    private:
-        friend bool operator==(SimplePcg32 const& lhs, SimplePcg32 const& rhs);
-        friend bool operator!=(SimplePcg32 const& lhs, SimplePcg32 const& rhs);
+namespace Catch {
 
-        // In theory we also need operator<< and operator>>
-        // In practice we do not use them, so we will skip them for now
+    namespace Detail {
+#if defined( __GNUC__ ) || defined( __clang__ )
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wfloat-equal"
+#endif
+        // The issue with overflow only happens with maximal ULP and HUGE
+        // distance, e.g. when generating numbers in [-inf, inf] for given
+        // type. So we only check for the largest possible ULP in the
+        // type, and return something that does not overflow to inf in 1 mult.
+        constexpr std::uint64_t calculate_max_steps_in_one_go(double gamma) {
+            if ( gamma == 1.99584030953472e+292 ) { return 9007199254740991; }
+            return static_cast<std::uint64_t>( -1 );
+        }
+        constexpr std::uint32_t calculate_max_steps_in_one_go(float gamma) {
+            if ( gamma == 2.028241e+31f ) { return 16777215; }
+            return static_cast<std::uint32_t>( -1 );
+        }
+#if defined( __GNUC__ ) || defined( __clang__ )
+#    pragma GCC diagnostic pop
+#endif
+    }
 
+/**
+ * Implementation of uniform distribution on floating point numbers.
+ *
+ * Note that we support only `float` and `double` types, because these
+ * usually mean the same thing across different platform. `long double`
+ * varies wildly by platform and thus we cannot provide reproducible
+ * implementation. Also note that we don't implement all parts of
+ * distribution per standard: this distribution is not serializable, nor
+ * can the range be arbitrarily reset.
+ *
+ * The implementation also uses different approach than the one taken by
+ * `std::uniform_real_distribution`, where instead of generating a number
+ * between [0, 1) and then multiplying the range bounds with it, we first
+ * split the [a, b] range into a set of equidistributed floating point
+ * numbers, and then use uniform int distribution to pick which one to
+ * return.
+ *
+ * This has the advantage of guaranteeing uniformity (the multiplication
+ * method loses uniformity due to rounding when multiplying floats), except
+ * for small non-uniformity at one side of the interval, where we have
+ * to deal with the fact that not every interval is splittable into
+ * equidistributed floats.
+ *
+ * Based on "Drawing random floating-point numbers from an interval" by
+ * Frederic Goualard.
+ */
+template <typename FloatType>
+class uniform_floating_point_distribution {
+    static_assert(std::is_floating_point<FloatType>::value, "...");
+    static_assert(!std::is_same<FloatType, long double>::value,
+                  "We do not support long double due to inconsistent behaviour between platforms");
+
+    using WidthType = Detail::DistanceType<FloatType>;
+
+    FloatType m_a, m_b;
+    FloatType m_ulp_magnitude;
+    WidthType m_floats_in_range;
+    uniform_integer_distribution<WidthType> m_int_dist;
+
+    // In specific cases, we can overflow into `inf` when computing the
+    // `steps * g` offset. To avoid this, we don't offset by more than this
+    // in one multiply + addition.
+    WidthType m_max_steps_in_one_go;
+    // We don't want to do the magnitude check every call to `operator()`
+    bool m_a_has_leq_magnitude;
 
-        std::uint64_t m_state;
-        // This part of the state determines which "stream" of the numbers
-        // is chosen -- we take it as a constant for Catch2, so we only
-        // need to deal with seeding the main state.
-        // Picked by reading 8 bytes from `/dev/random` :-)
-        static const std::uint64_t s_inc = (0x13ed0cc53f939476ULL << 1ULL) | 1ULL;
-    };
+public:
+    using result_type = FloatType;
+
+    uniform_floating_point_distribution( FloatType a, FloatType b ):
+        m_a( a ),
+        m_b( b ),
+        m_ulp_magnitude( Detail::gamma( m_a, m_b ) ),
+        m_floats_in_range( Detail::count_equidistant_floats( m_a, m_b, m_ulp_magnitude ) ),
+        m_int_dist(0, m_floats_in_range),
+        m_max_steps_in_one_go( Detail::calculate_max_steps_in_one_go(m_ulp_magnitude)),
+        m_a_has_leq_magnitude(std::fabs(m_a) <= std::fabs(m_b))
+    {
+        assert( a <= b );
+    }
 
-} // end namespace Catch
+    template <typename Generator>
+    result_type operator()( Generator& g ) {
+        WidthType steps = m_int_dist( g );
+        if ( m_a_has_leq_magnitude ) {
+            if ( steps == m_floats_in_range ) { return m_a; }
+            auto b = m_b;
+            while (steps > m_max_steps_in_one_go) {
+                b -= m_max_steps_in_one_go * m_ulp_magnitude;
+                steps -= m_max_steps_in_one_go;
+            }
+            return b - steps * m_ulp_magnitude;
+        } else {
+            if ( steps == m_floats_in_range ) { return m_b; }
+            auto a = m_a;
+            while (steps > m_max_steps_in_one_go) {
+                a += m_max_steps_in_one_go * m_ulp_magnitude;
+                steps -= m_max_steps_in_one_go;
+            }
+            return a + steps * m_ulp_magnitude;
+        }
+    }
 
-#endif // CATCH_RANDOM_NUMBER_GENERATOR_HPP_INCLUDED
+    result_type a() const { return m_a; }
+    result_type b() const { return m_b; }
+};
+
+} // end namespace Catch
 
-#include <random>
+#endif // CATCH_UNIFORM_FLOATING_POINT_DISTRIBUTION_HPP_INCLUDED
 
 namespace Catch {
 namespace Generators {
@@ -8102,7 +8359,7 @@ namespace Detail {
 template <typename Float>
 class RandomFloatingGenerator final : public IGenerator<Float> {
     Catch::SimplePcg32 m_rng;
-    std::uniform_real_distribution<Float> m_dist;
+    Catch::uniform_floating_point_distribution<Float> m_dist;
     Float m_current_number;
 public:
     RandomFloatingGenerator( Float a, Float b, std::uint32_t seed ):
@@ -8120,10 +8377,27 @@ public:
     }
 };
 
+template <>
+class RandomFloatingGenerator<long double> final : public IGenerator<long double> {
+    // We still rely on <random> for this specialization, but we don't
+    // want to drag it into the header.
+    struct PImpl;
+    Catch::Detail::unique_ptr<PImpl> m_pimpl;
+    long double m_current_number;
+
+public:
+    RandomFloatingGenerator( long double a, long double b, std::uint32_t seed );
+
+    long double const& get() const override { return m_current_number; }
+    bool next() override;
+
+    ~RandomFloatingGenerator() override; // = default
+};
+
 template <typename Integer>
 class RandomIntegerGenerator final : public IGenerator<Integer> {
     Catch::SimplePcg32 m_rng;
-    std::uniform_int_distribution<Integer> m_dist;
+    Catch::uniform_integer_distribution<Integer> m_dist;
     Integer m_current_number;
 public:
     RandomIntegerGenerator( Integer a, Integer b, std::uint32_t seed ):
@@ -8144,14 +8418,6 @@ public:
 template <typename T>
 std::enable_if_t<std::is_integral<T>::value, GeneratorWrapper<T>>
 random(T a, T b) {
-    static_assert(
-        !std::is_same<T, char>::value &&
-        !std::is_same<T, int8_t>::value &&
-        !std::is_same<T, uint8_t>::value &&
-        !std::is_same<T, signed char>::value &&
-        !std::is_same<T, unsigned char>::value &&
-        !std::is_same<T, bool>::value,
-        "The requested type is not supported by the underlying random distributions from std" );
     return GeneratorWrapper<T>(
         Catch::Detail::make_unique<RandomIntegerGenerator<T>>(a, b, Detail::getSeed())
     );
@@ -8264,39 +8530,266 @@ GeneratorWrapper<ResultType> from_range(InputIterator from, InputSentinel to) {
     return GeneratorWrapper<ResultType>(Catch::Detail::make_unique<IteratorGenerator<ResultType>>(from, to));
 }
 
-template <typename Container,
-          typename ResultType = typename Container::value_type>
-GeneratorWrapper<ResultType> from_range(Container const& cnt) {
-    return GeneratorWrapper<ResultType>(Catch::Detail::make_unique<IteratorGenerator<ResultType>>(cnt.begin(), cnt.end()));
+template <typename Container>
+auto from_range(Container const& cnt) {
+    using std::begin;
+    using std::end;
+    return from_range( begin( cnt ), end( cnt ) );
 }
 
 
-} // namespace Generators
-} // namespace Catch
+} // namespace Generators
+} // namespace Catch
+
+
+#endif // CATCH_GENERATORS_RANGE_HPP_INCLUDED
+
+#endif // CATCH_GENERATORS_ALL_HPP_INCLUDED
+
+
+/** \file
+ * This is a convenience header for Catch2's interfaces. It includes
+ * **all** of Catch2 headers related to interfaces.
+ *
+ * Generally the Catch2 users should use specific includes they need,
+ * but this header can be used instead for ease-of-experimentation, or
+ * just plain convenience, at the cost of somewhat increased compilation
+ * times.
+ *
+ * When a new header is added to either the `interfaces` folder, or to
+ * the corresponding internal subfolder, it should be added here.
+ */
+
+
+#ifndef CATCH_INTERFACES_ALL_HPP_INCLUDED
+#define CATCH_INTERFACES_ALL_HPP_INCLUDED
+
+
+
+#ifndef CATCH_INTERFACES_REPORTER_HPP_INCLUDED
+#define CATCH_INTERFACES_REPORTER_HPP_INCLUDED
+
+
+
+#ifndef CATCH_TEST_RUN_INFO_HPP_INCLUDED
+#define CATCH_TEST_RUN_INFO_HPP_INCLUDED
+
+
+namespace Catch {
+
+    struct TestRunInfo {
+        constexpr TestRunInfo(StringRef _name) : name(_name) {}
+        StringRef name;
+    };
+
+} // end namespace Catch
+
+#endif // CATCH_TEST_RUN_INFO_HPP_INCLUDED
+
+#include <map>
+#include <string>
+#include <vector>
+#include <iosfwd>
+
+namespace Catch {
+
+    struct ReporterDescription;
+    struct ListenerDescription;
+    struct TagInfo;
+    struct TestCaseInfo;
+    class TestCaseHandle;
+    class IConfig;
+    class IStream;
+    enum class ColourMode : std::uint8_t;
+
+    struct ReporterConfig {
+        ReporterConfig( IConfig const* _fullConfig,
+                        Detail::unique_ptr<IStream> _stream,
+                        ColourMode colourMode,
+                        std::map<std::string, std::string> customOptions );
+
+        ReporterConfig( ReporterConfig&& ) = default;
+        ReporterConfig& operator=( ReporterConfig&& ) = default;
+        ~ReporterConfig(); // = default
+
+        Detail::unique_ptr<IStream> takeStream() &&;
+        IConfig const* fullConfig() const;
+        ColourMode colourMode() const;
+        std::map<std::string, std::string> const& customOptions() const;
+
+    private:
+        Detail::unique_ptr<IStream> m_stream;
+        IConfig const* m_fullConfig;
+        ColourMode m_colourMode;
+        std::map<std::string, std::string> m_customOptions;
+    };
+
+    struct AssertionStats {
+        AssertionStats( AssertionResult const& _assertionResult,
+                        std::vector<MessageInfo> const& _infoMessages,
+                        Totals const& _totals );
+
+        AssertionStats( AssertionStats const& )              = default;
+        AssertionStats( AssertionStats && )                  = default;
+        AssertionStats& operator = ( AssertionStats const& ) = delete;
+        AssertionStats& operator = ( AssertionStats && )     = delete;
+
+        AssertionResult assertionResult;
+        std::vector<MessageInfo> infoMessages;
+        Totals totals;
+    };
+
+    struct SectionStats {
+        SectionStats(   SectionInfo&& _sectionInfo,
+                        Counts const& _assertions,
+                        double _durationInSeconds,
+                        bool _missingAssertions );
+
+        SectionInfo sectionInfo;
+        Counts assertions;
+        double durationInSeconds;
+        bool missingAssertions;
+    };
+
+    struct TestCaseStats {
+        TestCaseStats(  TestCaseInfo const& _testInfo,
+                        Totals const& _totals,
+                        std::string&& _stdOut,
+                        std::string&& _stdErr,
+                        bool _aborting );
+
+        TestCaseInfo const * testInfo;
+        Totals totals;
+        std::string stdOut;
+        std::string stdErr;
+        bool aborting;
+    };
+
+    struct TestRunStats {
+        TestRunStats(   TestRunInfo const& _runInfo,
+                        Totals const& _totals,
+                        bool _aborting );
+
+        TestRunInfo runInfo;
+        Totals totals;
+        bool aborting;
+    };
+
+    //! By setting up its preferences, a reporter can modify Catch2's behaviour
+    //! in some regards, e.g. it can request Catch2 to capture writes to
+    //! stdout/stderr during test execution, and pass them to the reporter.
+    struct ReporterPreferences {
+        //! Catch2 should redirect writes to stdout and pass them to the
+        //! reporter
+        bool shouldRedirectStdOut = false;
+        //! Catch2 should call `Reporter::assertionEnded` even for passing
+        //! assertions
+        bool shouldReportAllAssertions = false;
+    };
+
+    /**
+     * The common base for all reporters and event listeners
+     *
+     * Implementing classes must also implement:
+     *
+     *     //! User-friendly description of the reporter/listener type
+     *     static std::string getDescription()
+     *
+     * Generally shouldn't be derived from by users of Catch2 directly,
+     * instead they should derive from one of the utility bases that
+     * derive from this class.
+     */
+    class IEventListener {
+    protected:
+        //! Derived classes can set up their preferences here
+        ReporterPreferences m_preferences;
+        //! The test run's config as filled in from CLI and defaults
+        IConfig const* m_config;
+
+    public:
+        IEventListener( IConfig const* config ): m_config( config ) {}
+
+        virtual ~IEventListener(); // = default;
+
+        // Implementing class must also provide the following static methods:
+        // static std::string getDescription();
+
+        ReporterPreferences const& getPreferences() const {
+            return m_preferences;
+        }
+
+        //! Called when no test cases match provided test spec
+        virtual void noMatchingTestCases( StringRef unmatchedSpec ) = 0;
+        //! Called for all invalid test specs from the cli
+        virtual void reportInvalidTestSpec( StringRef invalidArgument ) = 0;
+
+        /**
+         * Called once in a testing run before tests are started
+         *
+         * Not called if tests won't be run (e.g. only listing will happen)
+         */
+        virtual void testRunStarting( TestRunInfo const& testRunInfo ) = 0;
+
+        //! Called _once_ for each TEST_CASE, no matter how many times it is entered
+        virtual void testCaseStarting( TestCaseInfo const& testInfo ) = 0;
+        //! Called _every time_ a TEST_CASE is entered, including repeats (due to sections)
+        virtual void testCasePartialStarting( TestCaseInfo const& testInfo, uint64_t partNumber ) = 0;
+        //! Called when a `SECTION` is being entered. Not called for skipped sections
+        virtual void sectionStarting( SectionInfo const& sectionInfo ) = 0;
+
+        //! Called when user-code is being probed before the actual benchmark runs
+        virtual void benchmarkPreparing( StringRef benchmarkName ) = 0;
+        //! Called after probe but before the user-code is being benchmarked
+        virtual void benchmarkStarting( BenchmarkInfo const& benchmarkInfo ) = 0;
+        //! Called with the benchmark results if benchmark successfully finishes
+        virtual void benchmarkEnded( BenchmarkStats<> const& benchmarkStats ) = 0;
+        //! Called if running the benchmarks fails for any reason
+        virtual void benchmarkFailed( StringRef benchmarkName ) = 0;
 
+        //! Called before assertion success/failure is evaluated
+        virtual void assertionStarting( AssertionInfo const& assertionInfo ) = 0;
 
-#endif // CATCH_GENERATORS_RANGE_HPP_INCLUDED
+        //! Called after assertion was fully evaluated
+        virtual void assertionEnded( AssertionStats const& assertionStats ) = 0;
 
-#endif // CATCH_GENERATORS_ALL_HPP_INCLUDED
+        //! Called after a `SECTION` has finished running
+        virtual void sectionEnded( SectionStats const& sectionStats ) = 0;
+        //! Called _every time_ a TEST_CASE is entered, including repeats (due to sections)
+        virtual void testCasePartialEnded(TestCaseStats const& testCaseStats, uint64_t partNumber ) = 0;
+        //! Called _once_ for each TEST_CASE, no matter how many times it is entered
+        virtual void testCaseEnded( TestCaseStats const& testCaseStats ) = 0;
+        /**
+         * Called once after all tests in a testing run are finished
+         *
+         * Not called if tests weren't run (e.g. only listings happened)
+         */
+        virtual void testRunEnded( TestRunStats const& testRunStats ) = 0;
 
+        /**
+         * Called with test cases that are skipped due to the test run aborting.
+         * NOT called for test cases that are explicitly skipped using the `SKIP` macro.
+         *
+         * Deprecated - will be removed in the next major release.
+         */
+        virtual void skipTest( TestCaseInfo const& testInfo ) = 0;
 
-/** \file
- * This is a convenience header for Catch2's interfaces. It includes
- * **all** of Catch2 headers related to interfaces.
- *
- * Generally the Catch2 users should use specific includes they need,
- * but this header can be used instead for ease-of-experimentation, or
- * just plain convenience, at the cost of somewhat increased compilation
- * times.
- *
- * When a new header is added to either the `interfaces` folder, or to
- * the corresponding internal subfolder, it should be added here.
- */
+        //! Called if a fatal error (signal/structured exception) occurred
+        virtual void fatalErrorEncountered( StringRef error ) = 0;
 
+        //! Writes out information about provided reporters using reporter-specific format
+        virtual void listReporters(std::vector<ReporterDescription> const& descriptions) = 0;
+        //! Writes out the provided listeners descriptions using reporter-specific format
+        virtual void listListeners(std::vector<ListenerDescription> const& descriptions) = 0;
+        //! Writes out information about provided tests using reporter-specific format
+        virtual void listTests(std::vector<TestCaseHandle> const& tests) = 0;
+        //! Writes out information about the provided tags using reporter-specific format
+        virtual void listTags(std::vector<TagInfo> const& tags) = 0;
+    };
+    using IEventListenerPtr = Detail::unique_ptr<IEventListener>;
 
-#ifndef CATCH_INTERFACES_ALL_HPP_INCLUDED
-#define CATCH_INTERFACES_ALL_HPP_INCLUDED
+} // end namespace Catch
 
+#endif // CATCH_INTERFACES_REPORTER_HPP_INCLUDED
 
 
 #ifndef CATCH_INTERFACES_REPORTER_FACTORY_HPP_INCLUDED
@@ -8337,89 +8830,79 @@ namespace Catch {
 #endif // CATCH_INTERFACES_REPORTER_FACTORY_HPP_INCLUDED
 
 
-#ifndef CATCH_INTERFACES_REPORTER_REGISTRY_HPP_INCLUDED
-#define CATCH_INTERFACES_REPORTER_REGISTRY_HPP_INCLUDED
+#ifndef CATCH_INTERFACES_TAG_ALIAS_REGISTRY_HPP_INCLUDED
+#define CATCH_INTERFACES_TAG_ALIAS_REGISTRY_HPP_INCLUDED
 
+#include <string>
 
+namespace Catch {
 
-#ifndef CATCH_CASE_INSENSITIVE_COMPARISONS_HPP_INCLUDED
-#define CATCH_CASE_INSENSITIVE_COMPARISONS_HPP_INCLUDED
+    struct TagAlias;
 
+    class ITagAliasRegistry {
+    public:
+        virtual ~ITagAliasRegistry(); // = default
+        // Nullptr if not present
+        virtual TagAlias const* find( std::string const& alias ) const = 0;
+        virtual std::string expandAliases( std::string const& unexpandedTestSpec ) const = 0;
 
-namespace Catch {
-    namespace Detail {
-        //! Provides case-insensitive `op<` semantics when called
-        struct CaseInsensitiveLess {
-            bool operator()( StringRef lhs,
-                             StringRef rhs ) const;
-        };
+        static ITagAliasRegistry const& get();
+    };
 
-        //! Provides case-insensitive `op==` semantics when called
-        struct CaseInsensitiveEqualTo {
-            bool operator()( StringRef lhs,
-                             StringRef rhs ) const;
-        };
+} // end namespace Catch
 
-    } // namespace Detail
-} // namespace Catch
+#endif // CATCH_INTERFACES_TAG_ALIAS_REGISTRY_HPP_INCLUDED
 
-#endif // CATCH_CASE_INSENSITIVE_COMPARISONS_HPP_INCLUDED
 
-#include <string>
+#ifndef CATCH_INTERFACES_TESTCASE_HPP_INCLUDED
+#define CATCH_INTERFACES_TESTCASE_HPP_INCLUDED
+
 #include <vector>
-#include <map>
 
 namespace Catch {
 
+    struct TestCaseInfo;
+    class TestCaseHandle;
     class IConfig;
 
-    class IEventListener;
-    using IEventListenerPtr = Detail::unique_ptr<IEventListener>;
-    class IReporterFactory;
-    using IReporterFactoryPtr = Detail::unique_ptr<IReporterFactory>;
-    struct ReporterConfig;
-    class EventListenerFactory;
-
-    class IReporterRegistry {
+    class ITestCaseRegistry {
     public:
-        using FactoryMap = std::map<std::string, IReporterFactoryPtr, Detail::CaseInsensitiveLess>;
-        using Listeners = std::vector<Detail::unique_ptr<EventListenerFactory>>;
-
-        virtual ~IReporterRegistry(); // = default
-        virtual IEventListenerPtr create( std::string const& name, ReporterConfig&& config ) const = 0;
-        virtual FactoryMap const& getFactories() const = 0;
-        virtual Listeners const& getListeners() const = 0;
+        virtual ~ITestCaseRegistry(); // = default
+        // TODO: this exists only for adding filenames to test cases -- let's expose this in a saner way later
+        virtual std::vector<TestCaseInfo* > const& getAllInfos() const = 0;
+        virtual std::vector<TestCaseHandle> const& getAllTests() const = 0;
+        virtual std::vector<TestCaseHandle> const& getAllTestsSorted( IConfig const& config ) const = 0;
     };
 
-} // end namespace Catch
-
-#endif // CATCH_INTERFACES_REPORTER_REGISTRY_HPP_INCLUDED
-
+}
 
-#ifndef CATCH_INTERFACES_TAG_ALIAS_REGISTRY_HPP_INCLUDED
-#define CATCH_INTERFACES_TAG_ALIAS_REGISTRY_HPP_INCLUDED
+#endif // CATCH_INTERFACES_TESTCASE_HPP_INCLUDED
 
-#include <string>
+#endif // CATCH_INTERFACES_ALL_HPP_INCLUDED
 
-namespace Catch {
 
-    struct TagAlias;
+#ifndef CATCH_CASE_INSENSITIVE_COMPARISONS_HPP_INCLUDED
+#define CATCH_CASE_INSENSITIVE_COMPARISONS_HPP_INCLUDED
 
-    class ITagAliasRegistry {
-    public:
-        virtual ~ITagAliasRegistry(); // = default
-        // Nullptr if not present
-        virtual TagAlias const* find( std::string const& alias ) const = 0;
-        virtual std::string expandAliases( std::string const& unexpandedTestSpec ) const = 0;
 
-        static ITagAliasRegistry const& get();
-    };
+namespace Catch {
+    namespace Detail {
+        //! Provides case-insensitive `op<` semantics when called
+        struct CaseInsensitiveLess {
+            bool operator()( StringRef lhs,
+                             StringRef rhs ) const;
+        };
 
-} // end namespace Catch
+        //! Provides case-insensitive `op==` semantics when called
+        struct CaseInsensitiveEqualTo {
+            bool operator()( StringRef lhs,
+                             StringRef rhs ) const;
+        };
 
-#endif // CATCH_INTERFACES_TAG_ALIAS_REGISTRY_HPP_INCLUDED
+    } // namespace Detail
+} // namespace Catch
 
-#endif // CATCH_INTERFACES_ALL_HPP_INCLUDED
+#endif // CATCH_CASE_INSENSITIVE_COMPARISONS_HPP_INCLUDED
 
 
 
@@ -8446,46 +8929,182 @@ namespace Catch {
 #    define CATCH_CONFIG_ANDROID_LOGWRITE
 #endif
 
-#endif // CATCH_CONFIG_ANDROID_LOGWRITE_HPP_INCLUDED
+#endif // CATCH_CONFIG_ANDROID_LOGWRITE_HPP_INCLUDED
+
+
+
+/** \file
+ * Wrapper for UNCAUGHT_EXCEPTIONS configuration option
+ *
+ * For some functionality, Catch2 requires to know whether there is
+ * an active exception. Because `std::uncaught_exception` is deprecated
+ * in C++17, we want to use `std::uncaught_exceptions` if possible.
+ */
+
+#ifndef CATCH_CONFIG_UNCAUGHT_EXCEPTIONS_HPP_INCLUDED
+#define CATCH_CONFIG_UNCAUGHT_EXCEPTIONS_HPP_INCLUDED
+
+
+#if defined(_MSC_VER)
+#  if _MSC_VER >= 1900 // Visual Studio 2015 or newer
+#    define CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS
+#  endif
+#endif
+
+
+#include <exception>
+
+#if defined(__cpp_lib_uncaught_exceptions) \
+    && !defined(CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS)
+
+#  define CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS
+#endif // __cpp_lib_uncaught_exceptions
+
+
+#if defined(CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS) \
+    && !defined(CATCH_CONFIG_NO_CPP17_UNCAUGHT_EXCEPTIONS) \
+    && !defined(CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS)
+
+#  define CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS
+#endif
+
+
+#endif // CATCH_CONFIG_UNCAUGHT_EXCEPTIONS_HPP_INCLUDED
+
+
+#ifndef CATCH_CONSOLE_COLOUR_HPP_INCLUDED
+#define CATCH_CONSOLE_COLOUR_HPP_INCLUDED
+
+
+#include <iosfwd>
+#include <cstdint>
+
+namespace Catch {
+
+    enum class ColourMode : std::uint8_t;
+    class IStream;
+
+    struct Colour {
+        enum Code {
+            None = 0,
+
+            White,
+            Red,
+            Green,
+            Blue,
+            Cyan,
+            Yellow,
+            Grey,
+
+            Bright = 0x10,
+
+            BrightRed = Bright | Red,
+            BrightGreen = Bright | Green,
+            LightGrey = Bright | Grey,
+            BrightWhite = Bright | White,
+            BrightYellow = Bright | Yellow,
+
+            // By intention
+            FileName = LightGrey,
+            Warning = BrightYellow,
+            ResultError = BrightRed,
+            ResultSuccess = BrightGreen,
+            ResultExpectedFailure = Warning,
+
+            Error = BrightRed,
+            Success = Green,
+            Skip = LightGrey,
+
+            OriginalExpression = Cyan,
+            ReconstructedExpression = BrightYellow,
+
+            SecondaryText = LightGrey,
+            Headers = White
+        };
+    };
+
+    class ColourImpl {
+    protected:
+        //! The associated stream of this ColourImpl instance
+        IStream* m_stream;
+    public:
+        ColourImpl( IStream* stream ): m_stream( stream ) {}
+
+        //! RAII wrapper around writing specific colour of text using specific
+        //! colour impl into a stream.
+        class ColourGuard {
+            ColourImpl const* m_colourImpl;
+            Colour::Code m_code;
+            bool m_engaged = false;
 
+        public:
+            //! Does **not** engage the guard/start the colour
+            ColourGuard( Colour::Code code,
+                         ColourImpl const* colour );
 
+            ColourGuard( ColourGuard const& rhs ) = delete;
+            ColourGuard& operator=( ColourGuard const& rhs ) = delete;
 
-/** \file
- * Wrapper for UNCAUGHT_EXCEPTIONS configuration option
- *
- * For some functionality, Catch2 requires to know whether there is
- * an active exception. Because `std::uncaught_exception` is deprecated
- * in C++17, we want to use `std::uncaught_exceptions` if possible.
- */
+            ColourGuard( ColourGuard&& rhs ) noexcept;
+            ColourGuard& operator=( ColourGuard&& rhs ) noexcept;
 
-#ifndef CATCH_CONFIG_UNCAUGHT_EXCEPTIONS_HPP_INCLUDED
-#define CATCH_CONFIG_UNCAUGHT_EXCEPTIONS_HPP_INCLUDED
+            //! Removes colour _if_ the guard was engaged
+            ~ColourGuard();
 
-#if defined(_MSC_VER)
-#  if _MSC_VER >= 1900 // Visual Studio 2015 or newer
-#    define CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS
-#  endif
-#endif
+            /**
+             * Explicitly engages colour for given stream.
+             *
+             * The API based on operator<< should be preferred.
+             */
+            ColourGuard& engage( std::ostream& stream ) &;
+            /**
+             * Explicitly engages colour for given stream.
+             *
+             * The API based on operator<< should be preferred.
+             */
+            ColourGuard&& engage( std::ostream& stream ) &&;
 
+        private:
+            //! Engages the guard and starts using colour
+            friend std::ostream& operator<<( std::ostream& lhs,
+                                             ColourGuard& guard ) {
+                guard.engageImpl( lhs );
+                return lhs;
+            }
+            //! Engages the guard and starts using colour
+            friend std::ostream& operator<<( std::ostream& lhs,
+                                            ColourGuard&& guard) {
+                guard.engageImpl( lhs );
+                return lhs;
+            }
 
-#include <exception>
+            void engageImpl( std::ostream& stream );
 
-#if defined(__cpp_lib_uncaught_exceptions) \
-    && !defined(CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS)
+        };
 
-#  define CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS
-#endif // __cpp_lib_uncaught_exceptions
+        virtual ~ColourImpl(); // = default
+        /**
+         * Creates a guard object for given colour and this colour impl
+         *
+         * **Important:**
+         * the guard starts disengaged, and has to be engaged explicitly.
+         */
+        ColourGuard guardColour( Colour::Code colourCode );
 
+    private:
+        virtual void use( Colour::Code colourCode ) const = 0;
+    };
 
-#if defined(CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS) \
-    && !defined(CATCH_CONFIG_NO_CPP17_UNCAUGHT_EXCEPTIONS) \
-    && !defined(CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS)
+    //! Provides ColourImpl based on global config and target compilation platform
+    Detail::unique_ptr<ColourImpl> makeColourImpl( ColourMode colourSelection,
+                                                   IStream* stream );
 
-#  define CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS
-#endif
+    //! Checks if specific colour impl has been compiled into the binary
+    bool isColourImplAvailable( ColourMode colourSelection );
 
+} // end namespace Catch
 
-#endif // CATCH_CONFIG_UNCAUGHT_EXCEPTIONS_HPP_INCLUDED
+#endif // CATCH_CONSOLE_COLOUR_HPP_INCLUDED
 
 
 #ifndef CATCH_CONSOLE_WIDTH_HPP_INCLUDED
@@ -8751,7 +9370,6 @@ namespace Catch {
         ~ExceptionTranslatorRegistry() override;
         void registerTranslator( Detail::unique_ptr<IExceptionTranslator>&& translator );
         std::string translateActiveException() const override;
-        std::string tryTranslators() const;
 
     private:
         ExceptionTranslators m_translators;
@@ -8764,7 +9382,6 @@ namespace Catch {
 #ifndef CATCH_FATAL_CONDITION_HANDLER_HPP_INCLUDED
 #define CATCH_FATAL_CONDITION_HANDLER_HPP_INCLUDED
 
-
 #include <cassert>
 
 namespace Catch {
@@ -8827,17 +9444,6 @@ namespace Catch {
 #define CATCH_FLOATING_POINT_HELPERS_HPP_INCLUDED
 
 
-
-#ifndef CATCH_POLYFILLS_HPP_INCLUDED
-#define CATCH_POLYFILLS_HPP_INCLUDED
-
-namespace Catch {
-    bool isnan(float f);
-    bool isnan(double d);
-}
-
-#endif // CATCH_POLYFILLS_HPP_INCLUDED
-
 #include <cassert>
 #include <cmath>
 #include <cstdint>
@@ -8850,6 +9456,11 @@ namespace Catch {
         uint32_t convertToBits(float f);
         uint64_t convertToBits(double d);
 
+        // Used when we know we want == comparison of two doubles
+        // to centralize warning suppression
+        bool directCompare( float lhs, float rhs );
+        bool directCompare( double lhs, double rhs );
+
     } // end namespace Detail
 
 
@@ -9126,6 +9737,119 @@ namespace Catch {
 #endif // CATCH_STREAM_HPP_INCLUDED
 
 
+#ifndef CATCH_JSONWRITER_HPP_INCLUDED
+#define CATCH_JSONWRITER_HPP_INCLUDED
+
+
+#include <cstdint>
+#include <sstream>
+
+namespace Catch {
+    class JsonObjectWriter;
+    class JsonArrayWriter;
+
+    struct JsonUtils {
+        static void indent( std::ostream& os, std::uint64_t level );
+        static void appendCommaNewline( std::ostream& os,
+                                        bool& should_comma,
+                                        std::uint64_t level );
+    };
+
+    class JsonValueWriter {
+    public:
+        JsonValueWriter( std::ostream& os );
+        JsonValueWriter( std::ostream& os, std::uint64_t indent_level );
+
+        JsonObjectWriter writeObject() &&;
+        JsonArrayWriter writeArray() &&;
+
+        template <typename T>
+        void write( T const& value ) && {
+            writeImpl( value, !std::is_arithmetic<T>::value );
+        }
+        void write( StringRef value ) &&;
+        void write( bool value ) &&;
+
+    private:
+        void writeImpl( StringRef value, bool quote );
+
+        // Without this SFINAE, this overload is a better match
+        // for `std::string`, `char const*`, `char const[N]` args.
+        // While it would still work, it would cause code bloat
+        // and multiple iteration over the strings
+        template <typename T,
+                  typename = typename std::enable_if_t<
+                      !std::is_convertible<T, StringRef>::value>>
+        void writeImpl( T const& value, bool quote_value ) {
+            m_sstream << value;
+            writeImpl( m_sstream.str(), quote_value );
+        }
+
+        std::ostream& m_os;
+        std::stringstream m_sstream;
+        std::uint64_t m_indent_level;
+    };
+
+    class JsonObjectWriter {
+    public:
+        JsonObjectWriter( std::ostream& os );
+        JsonObjectWriter( std::ostream& os, std::uint64_t indent_level );
+
+        JsonObjectWriter( JsonObjectWriter&& source );
+        JsonObjectWriter& operator=( JsonObjectWriter&& source ) = delete;
+
+        ~JsonObjectWriter();
+
+        JsonValueWriter write( StringRef key );
+
+    private:
+        std::ostream& m_os;
+        std::uint64_t m_indent_level;
+        bool m_should_comma = false;
+        bool m_active = true;
+    };
+
+    class JsonArrayWriter {
+    public:
+        JsonArrayWriter( std::ostream& os );
+        JsonArrayWriter( std::ostream& os, std::uint64_t indent_level );
+
+        JsonArrayWriter( JsonArrayWriter&& source );
+        JsonArrayWriter& operator=( JsonArrayWriter&& source ) = delete;
+
+        ~JsonArrayWriter();
+
+        JsonObjectWriter writeObject();
+        JsonArrayWriter writeArray();
+
+        template <typename T>
+        JsonArrayWriter& write( T const& value ) {
+            return writeImpl( value );
+        }
+
+        JsonArrayWriter& write( bool value );
+
+    private:
+        template <typename T>
+        JsonArrayWriter& writeImpl( T const& value ) {
+            JsonUtils::appendCommaNewline(
+                m_os, m_should_comma, m_indent_level + 1 );
+            JsonValueWriter{ m_os }.write( value );
+
+            return *this;
+        }
+
+        std::ostream& m_os;
+        std::uint64_t m_indent_level;
+        bool m_should_comma = false;
+        bool m_active = true;
+    };
+
+} // namespace Catch
+
+#endif // CATCH_JSONWRITER_HPP_INCLUDED
+
+
 #ifndef CATCH_LEAK_DETECTOR_HPP_INCLUDED
 #define CATCH_LEAK_DETECTOR_HPP_INCLUDED
 
@@ -9312,28 +10036,45 @@ namespace Catch {
 
 
 #include <map>
+#include <string>
+#include <vector>
 
 namespace Catch {
 
-    class ReporterRegistry : public IReporterRegistry {
-    public:
+    class IEventListener;
+    using IEventListenerPtr = Detail::unique_ptr<IEventListener>;
+    class IReporterFactory;
+    using IReporterFactoryPtr = Detail::unique_ptr<IReporterFactory>;
+    struct ReporterConfig;
+    class EventListenerFactory;
+
+    class ReporterRegistry {
+        struct ReporterRegistryImpl;
+        Detail::unique_ptr<ReporterRegistryImpl> m_impl;
 
+    public:
         ReporterRegistry();
-        ~ReporterRegistry() override; // = default, out of line to allow fwd decl
+        ~ReporterRegistry(); // = default;
 
-        IEventListenerPtr create( std::string const& name, ReporterConfig&& config ) const override;
+        IEventListenerPtr create( std::string const& name,
+                                  ReporterConfig&& config ) const;
 
-        void registerReporter( std::string const& name, IReporterFactoryPtr factory );
-        void registerListener( Detail::unique_ptr<EventListenerFactory> factory );
+        void registerReporter( std::string const& name,
+                               IReporterFactoryPtr factory );
 
-        FactoryMap const& getFactories() const override;
-        Listeners const& getListeners() const override;
+        void
+        registerListener( Detail::unique_ptr<EventListenerFactory> factory );
 
-    private:
-        FactoryMap m_factories;
-        Listeners m_listeners;
+        std::map<std::string,
+                 IReporterFactoryPtr,
+                 Detail::CaseInsensitiveLess> const&
+        getFactories() const;
+
+        std::vector<Detail::unique_ptr<EventListenerFactory>> const&
+        getListeners() const;
     };
-}
+
+} // end namespace Catch
 
 #endif // CATCH_REPORTER_REGISTRY_HPP_INCLUDED
 
@@ -9448,7 +10189,7 @@ namespace TestCaseTracking {
 
         //! Returns true if tracker run to completion (successfully or not)
         virtual bool isComplete() const = 0;
-        //! Returns true if tracker run to completion succesfully
+        //! Returns true if tracker run to completion successfully
         bool isSuccessfullyCompleted() const {
             return m_runState == CompletedSuccessfully;
         }
@@ -9582,13 +10323,14 @@ using TestCaseTracking::SectionTracker;
 
 namespace Catch {
 
-    class IMutableContext;
     class IGeneratorTracker;
     class IConfig;
+    class IEventListener;
+    using IEventListenerPtr = Detail::unique_ptr<IEventListener>;
 
     ///////////////////////////////////////////////////////////////////////////
 
-    class RunContext : public IResultCapture {
+    class RunContext final : public IResultCapture {
 
     public:
         RunContext( RunContext const& ) = delete;
@@ -9617,7 +10359,7 @@ namespace Catch {
                     AssertionReaction& reaction ) override;
         void handleUnexpectedInflightException
                 (   AssertionInfo const& info,
-                    std::string const& message,
+                    std::string&& message,
                     AssertionReaction& reaction ) override;
         void handleIncomplete
                 (   AssertionInfo const& info ) override;
@@ -9626,6 +10368,7 @@ namespace Catch {
                     ResultWas::OfType resultType,
                     AssertionReaction &reaction ) override;
 
+        void notifyAssertionStarted( AssertionInfo const& info ) override;
         bool sectionStarted( StringRef sectionName,
                              SourceLineInfo const& sectionLineInfo,
                              Counts& assertions ) override;
@@ -9676,7 +10419,7 @@ namespace Catch {
         void resetAssertionInfo();
         bool testForMissingAssertions( Counts& assertions );
 
-        void assertionEnded( AssertionResult const& result );
+        void assertionEnded( AssertionResult&& result );
         void reportExpr
                 (   AssertionInfo const &info,
                     ResultWas::OfType resultType,
@@ -9690,7 +10433,6 @@ namespace Catch {
         void handleUnfinishedSections();
 
         TestRunInfo m_runInfo;
-        IMutableContext& m_context;
         TestCaseHandle const* m_activeTestCase = nullptr;
         ITracker* m_testCaseTracker = nullptr;
         Optional<AssertionResult> m_lastResult;
@@ -9720,7 +10462,7 @@ namespace Catch {
 #ifndef CATCH_SHARDING_HPP_INCLUDED
 #define CATCH_SHARDING_HPP_INCLUDED
 
-
+#include <cassert>
 #include <cmath>
 #include <algorithm>
 
@@ -9947,24 +10689,20 @@ namespace Catch {
 
 namespace Catch {
 
-    class TestCaseHandle;
     class IConfig;
+    class ITestInvoker;
+    class TestCaseHandle;
     class TestSpec;
 
     std::vector<TestCaseHandle> sortTests( IConfig const& config, std::vector<TestCaseHandle> const& unsortedTestCases );
 
     bool isThrowSafe( TestCaseHandle const& testCase, IConfig const& config );
-    bool matchTest( TestCaseHandle const& testCase, TestSpec const& testSpec, IConfig const& config );
-
-    void enforceNoDuplicateTestCases( std::vector<TestCaseHandle> const& functions );
 
     std::vector<TestCaseHandle> filterTests( std::vector<TestCaseHandle> const& testCases, TestSpec const& testSpec, IConfig const& config );
     std::vector<TestCaseHandle> const& getAllTestCasesSorted( IConfig const& config );
 
     class TestRegistry : public ITestCaseRegistry {
     public:
-        ~TestRegistry() override = default;
-
         void registerTest( Detail::unique_ptr<TestCaseInfo> testInfo, Detail::unique_ptr<ITestInvoker> testInvoker );
 
         std::vector<TestCaseInfo*> const& getAllInfos() const override;
@@ -9985,18 +10723,6 @@ namespace Catch {
 
     ///////////////////////////////////////////////////////////////////////////
 
-    class TestInvokerAsFunction final : public ITestInvoker {
-        using TestType = void(*)();
-        TestType m_testAsFunction;
-    public:
-        TestInvokerAsFunction(TestType testAsFunction) noexcept:
-            m_testAsFunction(testAsFunction) {}
-
-        void invoke() const override;
-    };
-
-    ///////////////////////////////////////////////////////////////////////////
-
 
 } // end namespace Catch
 
@@ -10082,6 +10808,7 @@ namespace Catch {
 #ifndef CATCH_TEXTFLOW_HPP_INCLUDED
 #define CATCH_TEXTFLOW_HPP_INCLUDED
 
+
 #include <cassert>
 #include <string>
 #include <vector>
@@ -10110,7 +10837,7 @@ namespace Catch {
 
         public:
             /**
-             * Iterates "lines" in `Column` and return sthem
+             * Iterates "lines" in `Column` and returns them
              */
             class const_iterator {
                 friend Column;
@@ -10132,7 +10859,7 @@ namespace Catch {
                 // Calculates the length of the current line
                 void calcLength();
 
-                // Returns current indention width
+                // Returns current indentation width
                 size_t indentSize() const;
 
                 // Creates an indented and (optionally) suffixed string from
@@ -10164,20 +10891,35 @@ namespace Catch {
             using iterator = const_iterator;
 
             explicit Column( std::string const& text ): m_string( text ) {}
+            explicit Column( std::string&& text ):
+                m_string( CATCH_MOVE(text)) {}
 
-            Column& width( size_t newWidth ) {
+            Column& width( size_t newWidth ) & {
                 assert( newWidth > 0 );
                 m_width = newWidth;
                 return *this;
             }
-            Column& indent( size_t newIndent ) {
+            Column&& width( size_t newWidth ) && {
+                assert( newWidth > 0 );
+                m_width = newWidth;
+                return CATCH_MOVE( *this );
+            }
+            Column& indent( size_t newIndent ) & {
                 m_indent = newIndent;
                 return *this;
             }
-            Column& initialIndent( size_t newIndent ) {
+            Column&& indent( size_t newIndent ) && {
+                m_indent = newIndent;
+                return CATCH_MOVE( *this );
+            }
+            Column& initialIndent( size_t newIndent ) & {
                 m_initialIndent = newIndent;
                 return *this;
             }
+            Column&& initialIndent( size_t newIndent ) && {
+                m_initialIndent = newIndent;
+                return CATCH_MOVE( *this );
+            }
 
             size_t width() const { return m_width; }
             const_iterator begin() const { return const_iterator( *this ); }
@@ -10186,7 +10928,8 @@ namespace Catch {
             friend std::ostream& operator<<( std::ostream& os,
                                              Column const& col );
 
-            Columns operator+( Column const& other );
+            friend Columns operator+( Column const& lhs, Column const& rhs );
+            friend Columns operator+( Column&& lhs, Column&& rhs );
         };
 
         //! Creates a column that serves as an empty space of specific width
@@ -10230,8 +10973,10 @@ namespace Catch {
             iterator begin() const { return iterator( *this ); }
             iterator end() const { return { *this, iterator::EndTag() }; }
 
-            Columns& operator+=( Column const& col );
-            Columns operator+( Column const& col );
+            friend Columns& operator+=( Columns& lhs, Column const& rhs );
+            friend Columns& operator+=( Columns& lhs, Column&& rhs );
+            friend Columns operator+( Columns const& lhs, Column const& rhs );
+            friend Columns operator+( Columns&& lhs, Column&& rhs );
 
             friend std::ostream& operator<<( std::ostream& os,
                                              Columns const& cols );
@@ -10445,6 +11190,8 @@ namespace Catch {
 #define CATCH_MATCHERS_IMPL_HPP_INCLUDED
 
 
+#include <string>
+
 namespace Catch {
 
     template<typename ArgT, typename MatcherT>
@@ -11680,7 +12427,7 @@ namespace Catch {
 
         /**
          * Creates a matcher that checks if all elements in a range are equal
-         * to all elements in another range, in some permuation.
+         * to all elements in another range, in some permutation.
          *
          * Uses to provided predicate `predicate` to do the comparisons
          */
@@ -11850,11 +12597,10 @@ namespace Matchers {
             // - a more general approach would be via a compare template that defaults
             // to using !=. but could be specialised for, e.g. std::vector<T> etc
             // - then just call that directly
-            if (m_comparator.size() != v.size())
-                return false;
-            for (std::size_t i = 0; i < v.size(); ++i)
-                if (m_comparator[i] != v[i])
-                    return false;
+            if ( m_comparator.size() != v.size() ) { return false; }
+            for ( std::size_t i = 0; i < v.size(); ++i ) {
+                if ( !( m_comparator[i] == v[i] ) ) { return false; }
+            }
             return true;
         }
         std::string describe() const override {
@@ -12358,7 +13104,7 @@ namespace Catch {
         void skipTest(TestCaseInfo const&) override {}
 
     protected:
-        //! Should the cumulative base store the assertion expansion for succesful assertions?
+        //! Should the cumulative base store the assertion expansion for successful assertions?
         bool m_shouldStoreSuccesfulAssertions = true;
         //! Should the cumulative base store the assertion expansion for failed assertions?
         bool m_shouldStoreFailedAssertions = true;
@@ -12526,6 +13272,93 @@ namespace Catch {
 #endif // CATCH_REPORTER_HELPERS_HPP_INCLUDED
 
 
+
+#ifndef CATCH_REPORTER_JSON_HPP_INCLUDED
+#define CATCH_REPORTER_JSON_HPP_INCLUDED
+
+
+#include <stack>
+
+namespace Catch {
+    class JsonReporter : public StreamingReporterBase {
+    public:
+        JsonReporter( ReporterConfig&& config );
+
+        ~JsonReporter() override;
+
+        static std::string getDescription();
+
+    public: // StreamingReporterBase
+        void testRunStarting( TestRunInfo const& runInfo ) override;
+        void testRunEnded( TestRunStats const& runStats ) override;
+
+        void testCaseStarting( TestCaseInfo const& tcInfo ) override;
+        void testCaseEnded( TestCaseStats const& tcStats ) override;
+
+        void testCasePartialStarting( TestCaseInfo const& tcInfo,
+                                      uint64_t index ) override;
+        void testCasePartialEnded( TestCaseStats const& tcStats,
+                                   uint64_t index ) override;
+
+        void sectionStarting( SectionInfo const& sectionInfo ) override;
+        void sectionEnded( SectionStats const& sectionStats ) override;
+
+        void assertionStarting( AssertionInfo const& assertionInfo ) override;
+        void assertionEnded( AssertionStats const& assertionStats ) override;
+
+        //void testRunEndedCumulative() override;
+
+        void benchmarkPreparing( StringRef name ) override;
+        void benchmarkStarting( BenchmarkInfo const& ) override;
+        void benchmarkEnded( BenchmarkStats<> const& ) override;
+        void benchmarkFailed( StringRef error ) override;
+
+        void listReporters(
+            std::vector<ReporterDescription> const& descriptions ) override;
+        void listListeners(
+            std::vector<ListenerDescription> const& descriptions ) override;
+        void listTests( std::vector<TestCaseHandle> const& tests ) override;
+        void listTags( std::vector<TagInfo> const& tags ) override;
+
+    private:
+        Timer m_testCaseTimer;
+        enum class Writer {
+            Object,
+            Array
+        };
+
+        JsonArrayWriter& startArray();
+        JsonArrayWriter& startArray( StringRef key );
+
+        JsonObjectWriter& startObject();
+        JsonObjectWriter& startObject( StringRef key );
+
+        void endObject();
+        void endArray();
+
+        bool isInside( Writer writer );
+
+        void startListing();
+        void endListing();
+
+        // Invariant:
+        // When m_writers is not empty and its top element is
+        // - Writer::Object, then m_objectWriters is not be empty
+        // - Writer::Array,  then m_arrayWriters shall not be empty
+        std::stack<JsonObjectWriter> m_objectWriters{};
+        std::stack<JsonArrayWriter> m_arrayWriters{};
+        std::stack<Writer> m_writers{};
+
+        bool m_startedListing = false;
+
+        // std::size_t m_sectionDepth = 0;
+        // std::size_t m_sectionStarted = 0;
+    };
+} // namespace Catch
+
+#endif // CATCH_REPORTER_JSON_HPP_INCLUDED
+
+
 #ifndef CATCH_REPORTER_JUNIT_HPP_INCLUDED
 #define CATCH_REPORTER_JUNIT_HPP_INCLUDED
 
@@ -12537,8 +13370,6 @@ namespace Catch {
     public:
         JunitReporter(ReporterConfig&& _config);
 
-        ~JunitReporter() override = default;
-
         static std::string getDescription();
 
         void testRunStarting(TestRunInfo const& runInfo) override;
@@ -12665,7 +13496,8 @@ namespace Catch {
         //! independent on the reporter's concrete type
         void registerReporterImpl( std::string const& name,
                                    IReporterFactoryPtr reporterPtr );
-
+        //! Actually registers the factory, independent on listener's concrete type
+        void registerListenerImpl( Detail::unique_ptr<EventListenerFactory> listenerFactory );
     } // namespace Detail
 
     class IEventListener;
@@ -12726,7 +13558,7 @@ namespace Catch {
 
     public:
         ListenerRegistrar(StringRef listenerName) {
-            getMutableRegistryHub().registerListener( Detail::make_unique<TypedListenerFactory>(listenerName) );
+            registerListenerImpl( Detail::make_unique<TypedListenerFactory>(listenerName) );
         }
     };
 }
@@ -12778,8 +13610,6 @@ namespace Catch {
             m_shouldStoreSuccesfulAssertions = false;
         }
 
-        ~SonarQubeReporter() override = default;
-
         static std::string getDescription() {
             using namespace std::string_literals;
             return "Reports test results in the Generic Test Data SonarQube XML format"s;
@@ -12826,7 +13656,6 @@ namespace Catch {
             StreamingReporterBase( CATCH_MOVE(config) ) {
             m_preferences.shouldReportAllAssertions = true;
         }
-        ~TAPReporter() override = default;
 
         static std::string getDescription() {
             using namespace std::string_literals;
diff --git a/packages/Catch2/fuzzing/NullOStream.cpp b/packages/Catch2/fuzzing/NullOStream.cpp
index 53e0893dcc915922614f1665c89a64a449fc841e..e3a181e80bf718212da61b40c37a50f986f81c09 100644
--- a/packages/Catch2/fuzzing/NullOStream.cpp
+++ b/packages/Catch2/fuzzing/NullOStream.cpp
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 #include "NullOStream.h"
 
 void NullOStream::avoidOutOfLineVirtualCompilerWarning()
diff --git a/packages/Catch2/fuzzing/NullOStream.h b/packages/Catch2/fuzzing/NullOStream.h
index e1fe15b088733ba1dd455a64bada004b22bf6321..abbec09c882ef0dc5ee81f9630672a962d3b2d98 100644
--- a/packages/Catch2/fuzzing/NullOStream.h
+++ b/packages/Catch2/fuzzing/NullOStream.h
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 #pragma once
 
 #include <ostream>
diff --git a/packages/Catch2/fuzzing/fuzz_TestSpecParser.cpp b/packages/Catch2/fuzzing/fuzz_TestSpecParser.cpp
index af4de4062387da3e61a73cbb4a620c4dd4af9d8a..3aba8c84067c71a0730ee6926a20a75e2dfcc903 100644
--- a/packages/Catch2/fuzzing/fuzz_TestSpecParser.cpp
+++ b/packages/Catch2/fuzzing/fuzz_TestSpecParser.cpp
@@ -1,4 +1,10 @@
-//License: Boost 1.0
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
 //By Paul Dreik 2020
 
 #include <catch2/internal/catch_test_spec_parser.hpp>
diff --git a/packages/Catch2/fuzzing/fuzz_XmlWriter.cpp b/packages/Catch2/fuzzing/fuzz_XmlWriter.cpp
index f8e5a0d9a30b442bf02c3490c17eb7e0b7267f28..70c4ed80309d162f57750f4ad36acee7fd80201b 100644
--- a/packages/Catch2/fuzzing/fuzz_XmlWriter.cpp
+++ b/packages/Catch2/fuzzing/fuzz_XmlWriter.cpp
@@ -1,4 +1,10 @@
-//License: Boost 1.0
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
 //By Paul Dreik 2020
 
 #include <catch2/internal/catch_xmlwriter.hpp>
diff --git a/packages/Catch2/fuzzing/fuzz_textflow.cpp b/packages/Catch2/fuzzing/fuzz_textflow.cpp
index eafe79febf6163005fdf7312a3bffb9d50476b1e..7000f420f53998ecf8c02ab47f8cd947399c5aa2 100644
--- a/packages/Catch2/fuzzing/fuzz_textflow.cpp
+++ b/packages/Catch2/fuzzing/fuzz_textflow.cpp
@@ -1,4 +1,10 @@
-//License: Boost 1.0
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
 //By Paul Dreik 2020
 
 #include <catch2/internal/catch_textflow.hpp>
diff --git a/packages/Catch2/meson.build b/packages/Catch2/meson.build
index ed5033acd8820752dd3dde09374a13056704c68f..0a897520de1144724dbd4090af9121f4806a208e 100644
--- a/packages/Catch2/meson.build
+++ b/packages/Catch2/meson.build
@@ -8,7 +8,7 @@
 project(
   'catch2',
   'cpp',
-  version: '3.3.2', # CML version placeholder, don't delete
+  version: '3.5.2', # CML version placeholder, don't delete
   license: 'BSL-1.0',
   meson_version: '>=0.54.1',
 )
diff --git a/packages/Catch2/src/CMakeLists.txt b/packages/Catch2/src/CMakeLists.txt
index 0fdf931e6ee86de3412c1cf7e5aad6c2deaa1a83..eb805ddd0bfa6f7e81a232e3289968713894b559 100644
--- a/packages/Catch2/src/CMakeLists.txt
+++ b/packages/Catch2/src/CMakeLists.txt
@@ -33,6 +33,7 @@ set(BENCHMARK_HEADERS
 )
 set(BENCHMARK_SOURCES
   ${SOURCES_DIR}/benchmark/catch_chronometer.cpp
+  ${SOURCES_DIR}/benchmark/detail/catch_analyse.cpp
   ${SOURCES_DIR}/benchmark/detail/catch_benchmark_function.cpp
   ${SOURCES_DIR}/benchmark/detail/catch_run_for_at_least.cpp
   ${SOURCES_DIR}/benchmark/detail/catch_stats.cpp
@@ -92,6 +93,7 @@ set(IMPL_HEADERS
   ${SOURCES_DIR}/internal/catch_getenv.hpp
   ${SOURCES_DIR}/internal/catch_istream.hpp
   ${SOURCES_DIR}/internal/catch_is_permutation.hpp
+  ${SOURCES_DIR}/internal/catch_jsonwriter.hpp
   ${SOURCES_DIR}/internal/catch_lazy_expr.hpp
   ${SOURCES_DIR}/internal/catch_leak_detector.hpp
   ${SOURCES_DIR}/internal/catch_list.hpp
@@ -107,6 +109,8 @@ set(IMPL_HEADERS
   ${SOURCES_DIR}/internal/catch_polyfills.hpp
   ${SOURCES_DIR}/internal/catch_preprocessor.hpp
   ${SOURCES_DIR}/internal/catch_preprocessor_remove_parens.hpp
+  ${SOURCES_DIR}/internal/catch_random_floating_point_helpers.hpp
+  ${SOURCES_DIR}/internal/catch_random_integer_helpers.hpp
   ${SOURCES_DIR}/internal/catch_random_number_generator.hpp
   ${SOURCES_DIR}/internal/catch_random_seed_generation.hpp
   ${SOURCES_DIR}/internal/catch_reporter_registry.hpp
@@ -136,6 +140,8 @@ set(IMPL_HEADERS
   ${SOURCES_DIR}/internal/catch_textflow.hpp
   ${SOURCES_DIR}/internal/catch_to_string.hpp
   ${SOURCES_DIR}/internal/catch_uncaught_exceptions.hpp
+  ${SOURCES_DIR}/internal/catch_uniform_floating_point_distribution.hpp
+  ${SOURCES_DIR}/internal/catch_uniform_integer_distribution.hpp
   ${SOURCES_DIR}/internal/catch_unique_name.hpp
   ${SOURCES_DIR}/internal/catch_unique_ptr.hpp
   ${SOURCES_DIR}/internal/catch_void_type.hpp
@@ -176,6 +182,7 @@ set(IMPL_SOURCES
   ${SOURCES_DIR}/internal/catch_floating_point_helpers.cpp
   ${SOURCES_DIR}/internal/catch_getenv.cpp
   ${SOURCES_DIR}/internal/catch_istream.cpp
+  ${SOURCES_DIR}/internal/catch_jsonwriter.cpp
   ${SOURCES_DIR}/internal/catch_lazy_expr.cpp
   ${SOURCES_DIR}/internal/catch_leak_detector.cpp
   ${SOURCES_DIR}/internal/catch_list.cpp
@@ -288,6 +295,7 @@ set(REPORTER_HEADERS
   ${SOURCES_DIR}/reporters/catch_reporter_cumulative_base.hpp
   ${SOURCES_DIR}/reporters/catch_reporter_event_listener.hpp
   ${SOURCES_DIR}/reporters/catch_reporter_helpers.hpp
+  ${SOURCES_DIR}/reporters/catch_reporter_json.hpp
   ${SOURCES_DIR}/reporters/catch_reporter_junit.hpp
   ${SOURCES_DIR}/reporters/catch_reporter_multi.hpp
   ${SOURCES_DIR}/reporters/catch_reporter_registrars.hpp
@@ -306,6 +314,7 @@ set(REPORTER_SOURCES
   ${SOURCES_DIR}/reporters/catch_reporter_cumulative_base.cpp
   ${SOURCES_DIR}/reporters/catch_reporter_event_listener.cpp
   ${SOURCES_DIR}/reporters/catch_reporter_helpers.cpp
+  ${SOURCES_DIR}/reporters/catch_reporter_json.cpp
   ${SOURCES_DIR}/reporters/catch_reporter_junit.cpp
   ${SOURCES_DIR}/reporters/catch_reporter_multi.cpp
   ${SOURCES_DIR}/reporters/catch_reporter_registrars.cpp
@@ -339,7 +348,9 @@ source_group("generated headers"
 )
 
 add_library(Catch2 ${ALL_FILES})
-add_build_reproducibility_settings(Catch2)
+if (CATCH_ENABLE_REPRODUCIBLE_BUILD)
+    add_build_reproducibility_settings(Catch2)
+endif()
 add_library(Catch2::Catch2 ALIAS Catch2)
 
 if (ANDROID)
@@ -392,7 +403,9 @@ target_include_directories(Catch2
 add_library(Catch2WithMain
     ${SOURCES_DIR}/internal/catch_main.cpp
 )
-add_build_reproducibility_settings(Catch2WithMain)
+if (CATCH_ENABLE_REPRODUCIBLE_BUILD)
+    add_build_reproducibility_settings(Catch2WithMain)
+endif()
 add_library(Catch2::Catch2WithMain ALIAS Catch2WithMain)
 target_link_libraries(Catch2WithMain PUBLIC Catch2)
 set_target_properties(Catch2WithMain
diff --git a/packages/Catch2/src/catch2/benchmark/catch_benchmark.hpp b/packages/Catch2/src/catch2/benchmark/catch_benchmark.hpp
index 99d1c9df895afa1c2563a32c28876290c85cd64a..3db40bb048131125ea39c66a794e644241b2ce79 100644
--- a/packages/Catch2/src/catch2/benchmark/catch_benchmark.hpp
+++ b/packages/Catch2/src/catch2/benchmark/catch_benchmark.hpp
@@ -31,9 +31,7 @@
 #include <algorithm>
 #include <chrono>
 #include <exception>
-#include <functional>
 #include <string>
-#include <vector>
 #include <cmath>
 
 namespace Catch {
@@ -47,16 +45,18 @@ namespace Catch {
                 : fun(CATCH_MOVE(func)), name(CATCH_MOVE(benchmarkName)) {}
 
             template <typename Clock>
-            ExecutionPlan<FloatDuration<Clock>> prepare(const IConfig &cfg, Environment<FloatDuration<Clock>> env) const {
+            ExecutionPlan prepare(const IConfig &cfg, Environment env) const {
                 auto min_time = env.clock_resolution.mean * Detail::minimum_ticks;
                 auto run_time = std::max(min_time, std::chrono::duration_cast<decltype(min_time)>(cfg.benchmarkWarmupTime()));
-                auto&& test = Detail::run_for_at_least<Clock>(std::chrono::duration_cast<ClockDuration<Clock>>(run_time), 1, fun);
+                auto&& test = Detail::run_for_at_least<Clock>(std::chrono::duration_cast<IDuration>(run_time), 1, fun);
                 int new_iters = static_cast<int>(std::ceil(min_time * test.iterations / test.elapsed));
-                return { new_iters, test.elapsed / test.iterations * new_iters * cfg.benchmarkSamples(), fun, std::chrono::duration_cast<FloatDuration<Clock>>(cfg.benchmarkWarmupTime()), Detail::warmup_iterations };
+                return { new_iters, test.elapsed / test.iterations * new_iters * cfg.benchmarkSamples(), fun, std::chrono::duration_cast<FDuration>(cfg.benchmarkWarmupTime()), Detail::warmup_iterations };
             }
 
             template <typename Clock = default_clock>
             void run() {
+                static_assert( Clock::is_steady,
+                               "Benchmarking clock should be steady" );
                 auto const* cfg = getCurrentContext().getConfig();
 
                 auto env = Detail::measure_environment<Clock>();
@@ -83,8 +83,8 @@ namespace Catch {
                         return plan.template run<Clock>(*cfg, env);
                     });
 
-                    auto analysis = Detail::analyse(*cfg, env, samples.begin(), samples.end());
-                    BenchmarkStats<FloatDuration<Clock>> stats{ CATCH_MOVE(info), CATCH_MOVE(analysis.samples), analysis.mean, analysis.standard_deviation, analysis.outliers, analysis.outlier_variance };
+                    auto analysis = Detail::analyse(*cfg, samples.data(), samples.data() + samples.size());
+                    BenchmarkStats<> stats{ CATCH_MOVE(info), CATCH_MOVE(analysis.samples), analysis.mean, analysis.standard_deviation, analysis.outliers, analysis.outlier_variance };
                     getResultCapture().benchmarkEnded(stats);
                 } CATCH_CATCH_ANON (TestFailureException const&) {
                     getResultCapture().benchmarkFailed("Benchmark failed due to failed assertion"_sr);
diff --git a/packages/Catch2/src/catch2/benchmark/catch_chronometer.hpp b/packages/Catch2/src/catch2/benchmark/catch_chronometer.hpp
index c3f813060f989a91e63ee786e5213682e5abee18..95498e6be083affbe623e290bbdcd094d29cabcc 100644
--- a/packages/Catch2/src/catch2/benchmark/catch_chronometer.hpp
+++ b/packages/Catch2/src/catch2/benchmark/catch_chronometer.hpp
@@ -32,7 +32,10 @@ namespace Catch {
                 void start() override { started = Clock::now(); }
                 void finish() override { finished = Clock::now(); }
 
-                ClockDuration<Clock> elapsed() const { return finished - started; }
+                IDuration elapsed() const {
+                    return std::chrono::duration_cast<std::chrono::nanoseconds>(
+                        finished - started );
+                }
 
                 TimePoint<Clock> started;
                 TimePoint<Clock> finished;
diff --git a/packages/Catch2/src/catch2/benchmark/catch_clock.hpp b/packages/Catch2/src/catch2/benchmark/catch_clock.hpp
index cee46097d0b664954a38caeff026a7d81aa4853c..4068c4d29681ffaaa3d171f7cea1bd260e7e094f 100644
--- a/packages/Catch2/src/catch2/benchmark/catch_clock.hpp
+++ b/packages/Catch2/src/catch2/benchmark/catch_clock.hpp
@@ -11,28 +11,16 @@
 #define CATCH_CLOCK_HPP_INCLUDED
 
 #include <chrono>
-#include <ratio>
 
 namespace Catch {
     namespace Benchmark {
-        template <typename Clock>
-        using ClockDuration = typename Clock::duration;
-        template <typename Clock>
-        using FloatDuration = std::chrono::duration<double, typename Clock::period>;
+        using IDuration = std::chrono::nanoseconds;
+        using FDuration = std::chrono::duration<double, std::nano>;
 
         template <typename Clock>
         using TimePoint = typename Clock::time_point;
 
         using default_clock = std::chrono::steady_clock;
-
-        template <typename Clock>
-        struct now {
-            TimePoint<Clock> operator()() const {
-                return Clock::now();
-            }
-        };
-
-        using fp_seconds = std::chrono::duration<double, std::ratio<1>>;
     } // namespace Benchmark
 } // namespace Catch
 
diff --git a/packages/Catch2/src/catch2/benchmark/catch_environment.hpp b/packages/Catch2/src/catch2/benchmark/catch_environment.hpp
index de4d77df4f0781da089cd824171e0792848ba302..da3f2fa9532feed7a221c2c2007d700e220701ba 100644
--- a/packages/Catch2/src/catch2/benchmark/catch_environment.hpp
+++ b/packages/Catch2/src/catch2/benchmark/catch_environment.hpp
@@ -15,21 +15,13 @@
 
 namespace Catch {
     namespace Benchmark {
-        template <typename Duration>
         struct EnvironmentEstimate {
-            Duration mean;
+            FDuration mean;
             OutlierClassification outliers;
-
-            template <typename Duration2>
-            operator EnvironmentEstimate<Duration2>() const {
-                return { mean, outliers };
-            }
         };
-        template <typename Clock>
         struct Environment {
-            using clock_type = Clock;
-            EnvironmentEstimate<FloatDuration<Clock>> clock_resolution;
-            EnvironmentEstimate<FloatDuration<Clock>> clock_cost;
+            EnvironmentEstimate clock_resolution;
+            EnvironmentEstimate clock_cost;
         };
     } // namespace Benchmark
 } // namespace Catch
diff --git a/packages/Catch2/src/catch2/benchmark/catch_estimate.hpp b/packages/Catch2/src/catch2/benchmark/catch_estimate.hpp
index be594a189b482ad7f8c9ed00726647e85292b3ed..64383a2e5fb0c31365d5df2327a27a86ff8c288c 100644
--- a/packages/Catch2/src/catch2/benchmark/catch_estimate.hpp
+++ b/packages/Catch2/src/catch2/benchmark/catch_estimate.hpp
@@ -12,17 +12,12 @@
 
 namespace Catch {
     namespace Benchmark {
-        template <typename Duration>
+        template <typename Type>
         struct Estimate {
-            Duration point;
-            Duration lower_bound;
-            Duration upper_bound;
+            Type point;
+            Type lower_bound;
+            Type upper_bound;
             double confidence_interval;
-
-            template <typename Duration2>
-            operator Estimate<Duration2>() const {
-                return { point, lower_bound, upper_bound, confidence_interval };
-            }
         };
     } // namespace Benchmark
 } // namespace Catch
diff --git a/packages/Catch2/src/catch2/benchmark/catch_execution_plan.hpp b/packages/Catch2/src/catch2/benchmark/catch_execution_plan.hpp
index 4f60a64677850d1b5d4891bb70db7d26ec469c01..17ca589f5e0349ab25cb371bc851e9481063c757 100644
--- a/packages/Catch2/src/catch2/benchmark/catch_execution_plan.hpp
+++ b/packages/Catch2/src/catch2/benchmark/catch_execution_plan.hpp
@@ -21,33 +21,31 @@
 
 namespace Catch {
     namespace Benchmark {
-        template <typename Duration>
         struct ExecutionPlan {
             int iterations_per_sample;
-            Duration estimated_duration;
+            FDuration estimated_duration;
             Detail::BenchmarkFunction benchmark;
-            Duration warmup_time;
+            FDuration warmup_time;
             int warmup_iterations;
 
-            template <typename Duration2>
-            operator ExecutionPlan<Duration2>() const {
-                return { iterations_per_sample, estimated_duration, benchmark, warmup_time, warmup_iterations };
-            }
-
             template <typename Clock>
-            std::vector<FloatDuration<Clock>> run(const IConfig &cfg, Environment<FloatDuration<Clock>> env) const {
+            std::vector<FDuration> run(const IConfig &cfg, Environment env) const {
                 // warmup a bit
-                Detail::run_for_at_least<Clock>(std::chrono::duration_cast<ClockDuration<Clock>>(warmup_time), warmup_iterations, Detail::repeat(now<Clock>{}));
+                Detail::run_for_at_least<Clock>(
+                    std::chrono::duration_cast<IDuration>( warmup_time ),
+                    warmup_iterations,
+                    Detail::repeat( []() { return Clock::now(); } )
+                );
 
-                std::vector<FloatDuration<Clock>> times;
+                std::vector<FDuration> times;
                 const auto num_samples = cfg.benchmarkSamples();
                 times.reserve( num_samples );
                 for ( size_t i = 0; i < num_samples; ++i ) {
                     Detail::ChronometerModel<Clock> model;
                     this->benchmark( Chronometer( model, iterations_per_sample ) );
                     auto sample_time = model.elapsed() - env.clock_cost.mean;
-                    if ( sample_time < FloatDuration<Clock>::zero() ) {
-                        sample_time = FloatDuration<Clock>::zero();
+                    if ( sample_time < FDuration::zero() ) {
+                        sample_time = FDuration::zero();
                     }
                     times.push_back(sample_time / iterations_per_sample);
                 }
diff --git a/packages/Catch2/src/catch2/benchmark/catch_optimizer.hpp b/packages/Catch2/src/catch2/benchmark/catch_optimizer.hpp
index 02cf2073e40856db0e36c2051f0372209dabe83a..61e6571f6e366e9d4d031d62baf090ec71a834ab 100644
--- a/packages/Catch2/src/catch2/benchmark/catch_optimizer.hpp
+++ b/packages/Catch2/src/catch2/benchmark/catch_optimizer.hpp
@@ -70,7 +70,7 @@ namespace Catch {
 
         template <typename Fn, typename... Args>
         inline auto invoke_deoptimized(Fn&& fn, Args&&... args) -> std::enable_if_t<std::is_same<void, decltype(fn(args...))>::value> {
-            CATCH_FORWARD(fn) (CATCH_FORWARD(args)...);
+            CATCH_FORWARD((fn)) (CATCH_FORWARD(args)...);
         }
     } // namespace Benchmark
 } // namespace Catch
diff --git a/packages/Catch2/src/catch2/benchmark/catch_sample_analysis.hpp b/packages/Catch2/src/catch2/benchmark/catch_sample_analysis.hpp
index 97b8fe5083eeceb9c9e7ac76a74cb8a287c281da..aeb87d05a37823754d00aa1605d4062753bd640c 100644
--- a/packages/Catch2/src/catch2/benchmark/catch_sample_analysis.hpp
+++ b/packages/Catch2/src/catch2/benchmark/catch_sample_analysis.hpp
@@ -12,35 +12,18 @@
 
 #include <catch2/benchmark/catch_estimate.hpp>
 #include <catch2/benchmark/catch_outlier_classification.hpp>
-#include <catch2/internal/catch_move_and_forward.hpp>
+#include <catch2/benchmark/catch_clock.hpp>
 
 #include <vector>
 
 namespace Catch {
     namespace Benchmark {
-        template <typename Duration>
         struct SampleAnalysis {
-            std::vector<Duration> samples;
-            Estimate<Duration> mean;
-            Estimate<Duration> standard_deviation;
+            std::vector<FDuration> samples;
+            Estimate<FDuration> mean;
+            Estimate<FDuration> standard_deviation;
             OutlierClassification outliers;
             double outlier_variance;
-
-            template <typename Duration2>
-            operator SampleAnalysis<Duration2>() const {
-                std::vector<Duration2> samples2;
-                samples2.reserve(samples.size());
-                for (auto const& d : samples) {
-                    samples2.push_back(Duration2(d));
-                }
-                return {
-                    CATCH_MOVE(samples2),
-                    mean,
-                    standard_deviation,
-                    outliers,
-                    outlier_variance,
-                };
-            }
         };
     } // namespace Benchmark
 } // namespace Catch
diff --git a/packages/Catch2/src/catch2/benchmark/detail/catch_analyse.cpp b/packages/Catch2/src/catch2/benchmark/detail/catch_analyse.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7d27daf195d4e7cf1b584bea09db599f3f6e5adf
--- /dev/null
+++ b/packages/Catch2/src/catch2/benchmark/detail/catch_analyse.cpp
@@ -0,0 +1,85 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+// Adapted from donated nonius code.
+
+#include <catch2/benchmark/detail/catch_analyse.hpp>
+#include <catch2/benchmark/catch_clock.hpp>
+#include <catch2/benchmark/catch_sample_analysis.hpp>
+#include <catch2/benchmark/detail/catch_stats.hpp>
+#include <catch2/interfaces/catch_interfaces_config.hpp>
+#include <catch2/internal/catch_move_and_forward.hpp>
+
+#include <vector>
+
+namespace Catch {
+    namespace Benchmark {
+        namespace Detail {
+            SampleAnalysis analyse(const IConfig &cfg, FDuration* first, FDuration* last) {
+                if (!cfg.benchmarkNoAnalysis()) {
+                    std::vector<double> samples;
+                    samples.reserve(static_cast<size_t>(last - first));
+                    for (auto current = first; current != last; ++current) {
+                        samples.push_back( current->count() );
+                    }
+
+                    auto analysis = Catch::Benchmark::Detail::analyse_samples(
+                        cfg.benchmarkConfidenceInterval(),
+                        cfg.benchmarkResamples(),
+                        samples.data(),
+                        samples.data() + samples.size() );
+                    auto outliers = Catch::Benchmark::Detail::classify_outliers(
+                        samples.data(), samples.data() + samples.size() );
+
+                    auto wrap_estimate = [](Estimate<double> e) {
+                        return Estimate<FDuration> {
+                            FDuration(e.point),
+                                FDuration(e.lower_bound),
+                                FDuration(e.upper_bound),
+                                e.confidence_interval,
+                        };
+                    };
+                    std::vector<FDuration> samples2;
+                    samples2.reserve(samples.size());
+                    for (auto s : samples) {
+                        samples2.push_back( FDuration( s ) );
+                    }
+
+                    return {
+                        CATCH_MOVE(samples2),
+                        wrap_estimate(analysis.mean),
+                        wrap_estimate(analysis.standard_deviation),
+                        outliers,
+                        analysis.outlier_variance,
+                    };
+                } else {
+                    std::vector<FDuration> samples;
+                    samples.reserve(static_cast<size_t>(last - first));
+
+                    FDuration mean = FDuration(0);
+                    int i = 0;
+                    for (auto it = first; it < last; ++it, ++i) {
+                        samples.push_back(FDuration(*it));
+                        mean += FDuration(*it);
+                    }
+                    mean /= i;
+
+                    return SampleAnalysis{
+                        CATCH_MOVE(samples),
+                        Estimate<FDuration>{ mean, mean, mean, 0.0 },
+                        Estimate<FDuration>{ FDuration( 0 ),
+                                             FDuration( 0 ),
+                                             FDuration( 0 ),
+                                             0.0 },
+                        OutlierClassification{},
+                        0.0
+                    };
+                }
+            }
+        } // namespace Detail
+    } // namespace Benchmark
+} // namespace Catch
diff --git a/packages/Catch2/src/catch2/benchmark/detail/catch_analyse.hpp b/packages/Catch2/src/catch2/benchmark/detail/catch_analyse.hpp
index c932ff26a4d1bf0ff1c9364015bb426695916bc1..5e3f7b0f591c154fd4707cb8bb25205b68c7f01e 100644
--- a/packages/Catch2/src/catch2/benchmark/detail/catch_analyse.hpp
+++ b/packages/Catch2/src/catch2/benchmark/detail/catch_analyse.hpp
@@ -10,71 +10,16 @@
 #ifndef CATCH_ANALYSE_HPP_INCLUDED
 #define CATCH_ANALYSE_HPP_INCLUDED
 
-#include <catch2/benchmark/catch_environment.hpp>
+#include <catch2/benchmark/catch_clock.hpp>
 #include <catch2/benchmark/catch_sample_analysis.hpp>
-#include <catch2/benchmark/detail/catch_stats.hpp>
-#include <catch2/interfaces/catch_interfaces_config.hpp>
-#include <catch2/internal/catch_move_and_forward.hpp>
 
-#include <vector>
 
 namespace Catch {
+    class IConfig;
+
     namespace Benchmark {
         namespace Detail {
-            template <typename Duration, typename Iterator>
-            SampleAnalysis<Duration> analyse(const IConfig &cfg, Environment<Duration>, Iterator first, Iterator last) {
-                if (!cfg.benchmarkNoAnalysis()) {
-                    std::vector<double> samples;
-                    samples.reserve(static_cast<size_t>(last - first));
-                    for (auto current = first; current != last; ++current) {
-                        samples.push_back( current->count() );
-                    }
-
-                    auto analysis = Catch::Benchmark::Detail::analyse_samples(cfg.benchmarkConfidenceInterval(), cfg.benchmarkResamples(), samples.begin(), samples.end());
-                    auto outliers = Catch::Benchmark::Detail::classify_outliers(samples.begin(), samples.end());
-
-                    auto wrap_estimate = [](Estimate<double> e) {
-                        return Estimate<Duration> {
-                            Duration(e.point),
-                                Duration(e.lower_bound),
-                                Duration(e.upper_bound),
-                                e.confidence_interval,
-                        };
-                    };
-                    std::vector<Duration> samples2;
-                    samples2.reserve(samples.size());
-                    for (auto s : samples) {
-                        samples2.push_back( Duration( s ) );
-                    }
-
-                    return {
-                        CATCH_MOVE(samples2),
-                        wrap_estimate(analysis.mean),
-                        wrap_estimate(analysis.standard_deviation),
-                        outliers,
-                        analysis.outlier_variance,
-                    };
-                } else {
-                    std::vector<Duration> samples;
-                    samples.reserve(static_cast<size_t>(last - first));
-
-                    Duration mean = Duration(0);
-                    int i = 0;
-                    for (auto it = first; it < last; ++it, ++i) {
-                        samples.push_back(Duration(*it));
-                        mean += Duration(*it);
-                    }
-                    mean /= i;
-
-                    return {
-                        CATCH_MOVE(samples),
-                        Estimate<Duration>{mean, mean, mean, 0.0},
-                        Estimate<Duration>{Duration(0), Duration(0), Duration(0), 0.0},
-                        OutlierClassification{},
-                        0.0
-                    };
-                }
-            }
+            SampleAnalysis analyse(const IConfig &cfg, FDuration* first, FDuration* last);
         } // namespace Detail
     } // namespace Benchmark
 } // namespace Catch
diff --git a/packages/Catch2/src/catch2/benchmark/detail/catch_benchmark_stats.hpp b/packages/Catch2/src/catch2/benchmark/detail/catch_benchmark_stats.hpp
index a8b3494e97b35a5a36d866911a6fd80a1109200b..3633bc9f93a4dde52de32b7250b0d17c0096622b 100644
--- a/packages/Catch2/src/catch2/benchmark/detail/catch_benchmark_stats.hpp
+++ b/packages/Catch2/src/catch2/benchmark/detail/catch_benchmark_stats.hpp
@@ -8,7 +8,6 @@
 #ifndef CATCH_BENCHMARK_STATS_HPP_INCLUDED
 #define CATCH_BENCHMARK_STATS_HPP_INCLUDED
 
-#include <catch2/internal/catch_move_and_forward.hpp>
 #include <catch2/benchmark/catch_estimate.hpp>
 #include <catch2/benchmark/catch_outlier_classification.hpp>
 // The fwd decl & default specialization needs to be seen by VS2017 before
@@ -30,32 +29,17 @@ namespace Catch {
         double clockCost;
     };
 
-    template <class Duration>
+    // We need to keep template parameter for backwards compatibility,
+    // but we also do not want to use the template paraneter.
+    template <class Dummy>
     struct BenchmarkStats {
         BenchmarkInfo info;
 
-        std::vector<Duration> samples;
-        Benchmark::Estimate<Duration> mean;
-        Benchmark::Estimate<Duration> standardDeviation;
+        std::vector<Benchmark::FDuration> samples;
+        Benchmark::Estimate<Benchmark::FDuration> mean;
+        Benchmark::Estimate<Benchmark::FDuration> standardDeviation;
         Benchmark::OutlierClassification outliers;
         double outlierVariance;
-
-        template <typename Duration2>
-        operator BenchmarkStats<Duration2>() const {
-            std::vector<Duration2> samples2;
-            samples2.reserve(samples.size());
-            for (auto const& sample : samples) {
-                samples2.push_back(Duration2(sample));
-            }
-            return {
-                info,
-                CATCH_MOVE(samples2),
-                mean,
-                standardDeviation,
-                outliers,
-                outlierVariance,
-            };
-        }
     };
 
 
diff --git a/packages/Catch2/src/catch2/benchmark/detail/catch_benchmark_stats_fwd.hpp b/packages/Catch2/src/catch2/benchmark/detail/catch_benchmark_stats_fwd.hpp
index 607725613eae5bebacb2115873f63bce97e860f6..2ccc25d582b7524644503cfc84230015442fce1e 100644
--- a/packages/Catch2/src/catch2/benchmark/detail/catch_benchmark_stats_fwd.hpp
+++ b/packages/Catch2/src/catch2/benchmark/detail/catch_benchmark_stats_fwd.hpp
@@ -8,14 +8,14 @@
 #ifndef CATCH_BENCHMARK_STATS_FWD_HPP_INCLUDED
 #define CATCH_BENCHMARK_STATS_FWD_HPP_INCLUDED
 
-#include <chrono>
+#include <catch2/benchmark/catch_clock.hpp>
 
 namespace Catch {
 
     // We cannot forward declare the type with default template argument
     // multiple times, so it is split out into a separate header so that
     // we can prevent multiple declarations in dependees
-    template <typename Duration = std::chrono::duration<double, std::nano>>
+    template <typename Duration = Benchmark::FDuration>
     struct BenchmarkStats;
 
 } // end namespace Catch
diff --git a/packages/Catch2/src/catch2/benchmark/detail/catch_estimate_clock.hpp b/packages/Catch2/src/catch2/benchmark/detail/catch_estimate_clock.hpp
index 1e916ae4aca0cb7631f2ff21805a6c3905b0410d..8e3552796b51f742cdd4e08d5fa765f3fae9482d 100644
--- a/packages/Catch2/src/catch2/benchmark/detail/catch_estimate_clock.hpp
+++ b/packages/Catch2/src/catch2/benchmark/detail/catch_estimate_clock.hpp
@@ -55,23 +55,23 @@ namespace Catch {
 
             template <typename Clock>
             int warmup() {
-                return run_for_at_least<Clock>(std::chrono::duration_cast<ClockDuration<Clock>>(warmup_time), warmup_seed, &resolution<Clock>)
+                return run_for_at_least<Clock>(warmup_time, warmup_seed, &resolution<Clock>)
                     .iterations;
             }
             template <typename Clock>
-            EnvironmentEstimate<FloatDuration<Clock>> estimate_clock_resolution(int iterations) {
-                auto r = run_for_at_least<Clock>(std::chrono::duration_cast<ClockDuration<Clock>>(clock_resolution_estimation_time), iterations, &resolution<Clock>)
+            EnvironmentEstimate estimate_clock_resolution(int iterations) {
+                auto r = run_for_at_least<Clock>(clock_resolution_estimation_time, iterations, &resolution<Clock>)
                     .result;
                 return {
-                    FloatDuration<Clock>(mean(r.begin(), r.end())),
-                    classify_outliers(r.begin(), r.end()),
+                    FDuration(mean(r.data(), r.data() + r.size())),
+                    classify_outliers(r.data(), r.data() + r.size()),
                 };
             }
             template <typename Clock>
-            EnvironmentEstimate<FloatDuration<Clock>> estimate_clock_cost(FloatDuration<Clock> resolution) {
+            EnvironmentEstimate estimate_clock_cost(FDuration resolution) {
                 auto time_limit = (std::min)(
                     resolution * clock_cost_estimation_tick_limit,
-                    FloatDuration<Clock>(clock_cost_estimation_time_limit));
+                    FDuration(clock_cost_estimation_time_limit));
                 auto time_clock = [](int k) {
                     return Detail::measure<Clock>([k] {
                         for (int i = 0; i < k; ++i) {
@@ -82,7 +82,7 @@ namespace Catch {
                 };
                 time_clock(1);
                 int iters = clock_cost_estimation_iterations;
-                auto&& r = run_for_at_least<Clock>(std::chrono::duration_cast<ClockDuration<Clock>>(clock_cost_estimation_time), iters, time_clock);
+                auto&& r = run_for_at_least<Clock>(clock_cost_estimation_time, iters, time_clock);
                 std::vector<double> times;
                 int nsamples = static_cast<int>(std::ceil(time_limit / r.elapsed));
                 times.reserve(static_cast<size_t>(nsamples));
@@ -92,18 +92,18 @@ namespace Catch {
                             .count() ) );
                 }
                 return {
-                    FloatDuration<Clock>(mean(times.begin(), times.end())),
-                    classify_outliers(times.begin(), times.end()),
+                    FDuration(mean(times.data(), times.data() + times.size())),
+                    classify_outliers(times.data(), times.data() + times.size()),
                 };
             }
 
             template <typename Clock>
-            Environment<FloatDuration<Clock>> measure_environment() {
+            Environment measure_environment() {
 #if defined(__clang__)
 #    pragma clang diagnostic push
 #    pragma clang diagnostic ignored "-Wexit-time-destructors"
 #endif
-                static Catch::Detail::unique_ptr<Environment<FloatDuration<Clock>>> env;
+                static Catch::Detail::unique_ptr<Environment> env;
 #if defined(__clang__)
 #    pragma clang diagnostic pop
 #endif
@@ -115,7 +115,7 @@ namespace Catch {
                 auto resolution = Detail::estimate_clock_resolution<Clock>(iters);
                 auto cost = Detail::estimate_clock_cost<Clock>(resolution.mean);
 
-                env = Catch::Detail::make_unique<Environment<FloatDuration<Clock>>>( Environment<FloatDuration<Clock>>{resolution, cost} );
+                env = Catch::Detail::make_unique<Environment>( Environment{resolution, cost} );
                 return *env;
             }
         } // namespace Detail
diff --git a/packages/Catch2/src/catch2/benchmark/detail/catch_measure.hpp b/packages/Catch2/src/catch2/benchmark/detail/catch_measure.hpp
index 1a30efabe04b211d4f8e369d0c9675f6775754ac..37494a68f63b30db4ab3c3a293e125e23a0842bb 100644
--- a/packages/Catch2/src/catch2/benchmark/detail/catch_measure.hpp
+++ b/packages/Catch2/src/catch2/benchmark/detail/catch_measure.hpp
@@ -18,7 +18,7 @@ namespace Catch {
     namespace Benchmark {
         namespace Detail {
             template <typename Clock, typename Fun, typename... Args>
-            TimingOf<Clock, Fun, Args...> measure(Fun&& fun, Args&&... args) {
+            TimingOf<Fun, Args...> measure(Fun&& fun, Args&&... args) {
                 auto start = Clock::now();
                 auto&& r = Detail::complete_invoke(fun, CATCH_FORWARD(args)...);
                 auto end = Clock::now();
diff --git a/packages/Catch2/src/catch2/benchmark/detail/catch_run_for_at_least.hpp b/packages/Catch2/src/catch2/benchmark/detail/catch_run_for_at_least.hpp
index 976a4b2430518735a8f37849968844fb85bb77fb..4dfa8bbbb6300e1680ee398ebecc27c44fad92ec 100644
--- a/packages/Catch2/src/catch2/benchmark/detail/catch_run_for_at_least.hpp
+++ b/packages/Catch2/src/catch2/benchmark/detail/catch_run_for_at_least.hpp
@@ -24,11 +24,11 @@ namespace Catch {
     namespace Benchmark {
         namespace Detail {
             template <typename Clock, typename Fun>
-            TimingOf<Clock, Fun, int> measure_one(Fun&& fun, int iters, std::false_type) {
+            TimingOf<Fun, int> measure_one(Fun&& fun, int iters, std::false_type) {
                 return Detail::measure<Clock>(fun, iters);
             }
             template <typename Clock, typename Fun>
-            TimingOf<Clock, Fun, Chronometer> measure_one(Fun&& fun, int iters, std::true_type) {
+            TimingOf<Fun, Chronometer> measure_one(Fun&& fun, int iters, std::true_type) {
                 Detail::ChronometerModel<Clock> meter;
                 auto&& result = Detail::complete_invoke(fun, Chronometer(meter, iters));
 
@@ -43,8 +43,8 @@ namespace Catch {
             void throw_optimized_away_error();
 
             template <typename Clock, typename Fun>
-            TimingOf<Clock, Fun, run_for_at_least_argument_t<Clock, Fun>>
-                run_for_at_least(ClockDuration<Clock> how_long,
+            TimingOf<Fun, run_for_at_least_argument_t<Clock, Fun>>
+                run_for_at_least(IDuration how_long,
                                  const int initial_iterations,
                                  Fun&& fun) {
                 auto iters = initial_iterations;
diff --git a/packages/Catch2/src/catch2/benchmark/detail/catch_stats.cpp b/packages/Catch2/src/catch2/benchmark/detail/catch_stats.cpp
index ea483a3093c6dea4880ca636676228b23f2ae18d..52cee4eeaebd6a78f187db69fa932a152f201388 100644
--- a/packages/Catch2/src/catch2/benchmark/detail/catch_stats.cpp
+++ b/packages/Catch2/src/catch2/benchmark/detail/catch_stats.cpp
@@ -10,8 +10,12 @@
 #include <catch2/benchmark/detail/catch_stats.hpp>
 
 #include <catch2/internal/catch_compiler_capabilities.hpp>
+#include <catch2/internal/catch_floating_point_helpers.hpp>
+#include <catch2/internal/catch_random_number_generator.hpp>
 
+#include <algorithm>
 #include <cassert>
+#include <cmath>
 #include <cstddef>
 #include <numeric>
 #include <random>
@@ -30,28 +34,23 @@ namespace Catch {
                 static sample
                 resample( URng& rng,
                           unsigned int resamples,
-                          std::vector<double>::const_iterator first,
-                          std::vector<double>::const_iterator last,
+                          double const* first,
+                          double const* last,
                           Estimator& estimator ) {
                     auto n = static_cast<size_t>( last - first );
-                    std::uniform_int_distribution<decltype( n )> dist( 0,
-                                                                       n - 1 );
+                    std::uniform_int_distribution<size_t> dist( 0, n - 1 );
 
                     sample out;
                     out.reserve( resamples );
-                    // We allocate the vector outside the loop to avoid realloc
-                    // per resample
                     std::vector<double> resampled;
                     resampled.reserve( n );
                     for ( size_t i = 0; i < resamples; ++i ) {
                         resampled.clear();
                         for ( size_t s = 0; s < n; ++s ) {
-                            resampled.push_back(
-                                first[static_cast<std::ptrdiff_t>(
-                                    dist( rng ) )] );
+                            resampled.push_back( first[dist( rng )] );
                         }
                         const auto estimate =
-                            estimator( resampled.begin(), resampled.end() );
+                            estimator( resampled.data(), resampled.data() + resampled.size() );
                         out.push_back( estimate );
                     }
                     std::sort( out.begin(), out.end() );
@@ -168,8 +167,7 @@ namespace Catch {
                 }
 
                 static double
-                standard_deviation( std::vector<double>::const_iterator first,
-                                    std::vector<double>::const_iterator last ) {
+                standard_deviation( double const* first, double const* last ) {
                     auto m = Catch::Benchmark::Detail::mean( first, last );
                     double variance =
                         std::accumulate( first,
@@ -183,6 +181,23 @@ namespace Catch {
                     return std::sqrt( variance );
                 }
 
+                static sample jackknife( double ( *estimator )( double const*,
+                                                                double const* ),
+                                         double* first,
+                                         double* last ) {
+                    const auto second = first + 1;
+                    sample results;
+                    results.reserve( static_cast<size_t>( last - first ) );
+
+                    for ( auto it = first; it != last; ++it ) {
+                        std::iter_swap( it, first );
+                        results.push_back( estimator( second, last ) );
+                    }
+
+                    return results;
+                }
+
+
             } // namespace
         }     // namespace Detail
     }         // namespace Benchmark
@@ -192,23 +207,17 @@ namespace Catch {
     namespace Benchmark {
         namespace Detail {
 
-#if defined( __GNUC__ ) || defined( __clang__ )
-#    pragma GCC diagnostic push
-#    pragma GCC diagnostic ignored "-Wfloat-equal"
-#endif
-            bool directCompare( double lhs, double rhs ) { return lhs == rhs; }
-#if defined( __GNUC__ ) || defined( __clang__ )
-#    pragma GCC diagnostic pop
-#endif
-
-            double weighted_average_quantile(int k, int q, std::vector<double>::iterator first, std::vector<double>::iterator last) {
+            double weighted_average_quantile( int k,
+                                              int q,
+                                              double* first,
+                                              double* last ) {
                 auto count = last - first;
                 double idx = (count - 1) * k / static_cast<double>(q);
                 int j = static_cast<int>(idx);
                 double g = idx - j;
                 std::nth_element(first, first + j, last);
                 auto xj = first[j];
-                if ( directCompare( g, 0 ) ) {
+                if ( Catch::Detail::directCompare( g, 0 ) ) {
                     return xj;
                 }
 
@@ -217,12 +226,11 @@ namespace Catch {
             }
 
             OutlierClassification
-            classify_outliers( std::vector<double>::const_iterator first,
-                               std::vector<double>::const_iterator last ) {
+            classify_outliers( double const* first, double const* last ) {
                 std::vector<double> copy( first, last );
 
-                auto q1 = weighted_average_quantile( 1, 4, copy.begin(), copy.end() );
-                auto q3 = weighted_average_quantile( 3, 4, copy.begin(), copy.end() );
+                auto q1 = weighted_average_quantile( 1, 4, copy.data(), copy.data() + copy.size() );
+                auto q3 = weighted_average_quantile( 3, 4, copy.data(), copy.data() + copy.size() );
                 auto iqr = q3 - q1;
                 auto los = q1 - ( iqr * 3. );
                 auto lom = q1 - ( iqr * 1.5 );
@@ -246,8 +254,7 @@ namespace Catch {
                 return o;
             }
 
-            double mean( std::vector<double>::const_iterator first,
-                         std::vector<double>::const_iterator last ) {
+            double mean( double const* first, double const* last ) {
                 auto count = last - first;
                 double sum = 0.;
                 while (first != last) {
@@ -257,6 +264,9 @@ namespace Catch {
                 return sum / static_cast<double>(count);
             }
 
+            double normal_cdf( double x ) {
+                return std::erfc( -x / std::sqrt( 2.0 ) ) / 2.0;
+            }
 
             double erfc_inv(double x) {
                 return erf_inv(1.0 - x);
@@ -278,26 +288,77 @@ namespace Catch {
                 return result;
             }
 
-            bootstrap_analysis analyse_samples(double confidence_level,
-                                               unsigned int n_resamples,
-                                               std::vector<double>::iterator first,
-                                               std::vector<double>::iterator last) {
-                CATCH_INTERNAL_START_WARNINGS_SUPPRESSION
-                CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS
-                static std::random_device entropy;
-                CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+            Estimate<double>
+            bootstrap( double confidence_level,
+                       double* first,
+                       double* last,
+                       sample const& resample,
+                       double ( *estimator )( double const*, double const* ) ) {
+                auto n_samples = last - first;
+
+                double point = estimator( first, last );
+                // Degenerate case with a single sample
+                if ( n_samples == 1 )
+                    return { point, point, point, confidence_level };
+
+                sample jack = jackknife( estimator, first, last );
+                double jack_mean =
+                    mean( jack.data(), jack.data() + jack.size() );
+                double sum_squares = 0, sum_cubes = 0;
+                for ( double x : jack ) {
+                    auto difference = jack_mean - x;
+                    auto square = difference * difference;
+                    auto cube = square * difference;
+                    sum_squares += square;
+                    sum_cubes += cube;
+                }
 
-                auto n = static_cast<int>(last - first); // seriously, one can't use integral types without hell in C++
+                double accel = sum_cubes / ( 6 * std::pow( sum_squares, 1.5 ) );
+                long n = static_cast<long>( resample.size() );
+                double prob_n =
+                    std::count_if( resample.begin(),
+                                   resample.end(),
+                                   [point]( double x ) { return x < point; } ) /
+                    static_cast<double>( n );
+                // degenerate case with uniform samples
+                if ( Catch::Detail::directCompare( prob_n, 0. ) ) {
+                    return { point, point, point, confidence_level };
+                }
 
+                double bias = normal_quantile( prob_n );
+                double z1 = normal_quantile( ( 1. - confidence_level ) / 2. );
+
+                auto cumn = [n]( double x ) -> long {
+                    return std::lround( normal_cdf( x ) *
+                                        static_cast<double>( n ) );
+                };
+                auto a = [bias, accel]( double b ) {
+                    return bias + b / ( 1. - accel * b );
+                };
+                double b1 = bias + z1;
+                double b2 = bias - z1;
+                double a1 = a( b1 );
+                double a2 = a( b2 );
+                auto lo = static_cast<size_t>( (std::max)( cumn( a1 ), 0l ) );
+                auto hi =
+                    static_cast<size_t>( (std::min)( cumn( a2 ), n - 1 ) );
+
+                return { point, resample[lo], resample[hi], confidence_level };
+            }
+
+            bootstrap_analysis analyse_samples(double confidence_level,
+                                               unsigned int n_resamples,
+                                               double* first,
+                                               double* last) {
                 auto mean = &Detail::mean;
                 auto stddev = &standard_deviation;
 
 #if defined(CATCH_CONFIG_USE_ASYNC)
-                auto Estimate = [=](double(*f)(std::vector<double>::const_iterator,
-                                               std::vector<double>::const_iterator)) {
-                    auto seed = entropy();
+                auto Estimate = [=](double(*f)(double const*, double const*)) {
+                    std::random_device rd;
+                    auto seed = rd();
                     return std::async(std::launch::async, [=] {
-                        std::mt19937 rng(seed);
+                        SimplePcg32 rng( seed );
                         auto resampled = resample(rng, n_resamples, first, last, f);
                         return bootstrap(confidence_level, first, last, resampled, f);
                     });
@@ -309,10 +370,10 @@ namespace Catch {
                 auto mean_estimate = mean_future.get();
                 auto stddev_estimate = stddev_future.get();
 #else
-                auto Estimate = [=](double(*f)(std::vector<double>::const_iterator,
-                                               std::vector<double>::const_iterator)) {
-                    auto seed = entropy();
-                    std::mt19937 rng(seed);
+                auto Estimate = [=](double(*f)(double const* , double const*)) {
+                    std::random_device rd;
+                    auto seed = rd();
+                    SimplePcg32 rng( seed );
                     auto resampled = resample(rng, n_resamples, first, last, f);
                     return bootstrap(confidence_level, first, last, resampled, f);
                 };
@@ -321,6 +382,7 @@ namespace Catch {
                 auto stddev_estimate = Estimate(stddev);
 #endif // CATCH_USE_ASYNC
 
+                auto n = static_cast<int>(last - first); // seriously, one can't use integral types without hell in C++
                 double outlier_variance = Detail::outlier_variance(mean_estimate, stddev_estimate, n);
 
                 return { mean_estimate, stddev_estimate, outlier_variance };
diff --git a/packages/Catch2/src/catch2/benchmark/detail/catch_stats.hpp b/packages/Catch2/src/catch2/benchmark/detail/catch_stats.hpp
index c1ce56644271cd79c52d96ebf42a0afe0de787a2..3bea612f9051e30347ba398c214ba18149b49f5c 100644
--- a/packages/Catch2/src/catch2/benchmark/detail/catch_stats.hpp
+++ b/packages/Catch2/src/catch2/benchmark/detail/catch_stats.hpp
@@ -13,100 +13,35 @@
 #include <catch2/benchmark/catch_estimate.hpp>
 #include <catch2/benchmark/catch_outlier_classification.hpp>
 
-#include <algorithm>
 #include <vector>
-#include <cmath>
 
 namespace Catch {
     namespace Benchmark {
         namespace Detail {
             using sample = std::vector<double>;
 
-            // Used when we know we want == comparison of two doubles
-            // to centralize warning suppression
-            bool directCompare( double lhs, double rhs );
-
-            double weighted_average_quantile(int k, int q, std::vector<double>::iterator first, std::vector<double>::iterator last);
+            double weighted_average_quantile( int k,
+                                              int q,
+                                              double* first,
+                                              double* last );
 
             OutlierClassification
-            classify_outliers( std::vector<double>::const_iterator first,
-                               std::vector<double>::const_iterator last );
-
-            double mean( std::vector<double>::const_iterator first,
-                         std::vector<double>::const_iterator last );
-
-            template <typename Estimator>
-            sample jackknife(Estimator&& estimator,
-                             std::vector<double>::iterator first,
-                             std::vector<double>::iterator last) {
-                auto n = static_cast<size_t>(last - first);
-                auto second = first;
-                ++second;
-                sample results;
-                results.reserve(n);
-
-                for (auto it = first; it != last; ++it) {
-                    std::iter_swap(it, first);
-                    results.push_back(estimator(second, last));
-                }
-
-                return results;
-            }
-
-            inline double normal_cdf(double x) {
-                return std::erfc(-x / std::sqrt(2.0)) / 2.0;
-            }
+            classify_outliers( double const* first, double const* last );
+
+            double mean( double const* first, double const* last );
+
+            double normal_cdf( double x );
 
             double erfc_inv(double x);
 
             double normal_quantile(double p);
 
-            template <typename Estimator>
-            Estimate<double> bootstrap( double confidence_level,
-                                        std::vector<double>::iterator first,
-                                        std::vector<double>::iterator last,
-                                        sample const& resample,
-                                        Estimator&& estimator ) {
-                auto n_samples = last - first;
-
-                double point = estimator(first, last);
-                // Degenerate case with a single sample
-                if (n_samples == 1) return { point, point, point, confidence_level };
-
-                sample jack = jackknife(estimator, first, last);
-                double jack_mean = mean(jack.begin(), jack.end());
-                double sum_squares = 0, sum_cubes = 0;
-                for (double x : jack) {
-                    auto difference = jack_mean - x;
-                    auto square = difference * difference;
-                    auto cube = square * difference;
-                    sum_squares += square; sum_cubes += cube;
-                }
-
-                double accel = sum_cubes / (6 * std::pow(sum_squares, 1.5));
-                long n = static_cast<long>(resample.size());
-                double prob_n = std::count_if(resample.begin(), resample.end(), [point](double x) { return x < point; }) / static_cast<double>(n);
-                // degenerate case with uniform samples
-                if ( directCompare( prob_n, 0. ) ) {
-                    return { point, point, point, confidence_level };
-                }
-
-                double bias = normal_quantile(prob_n);
-                double z1 = normal_quantile((1. - confidence_level) / 2.);
-
-                auto cumn = [n]( double x ) -> long {
-                    return std::lround( normal_cdf( x ) * static_cast<double>(n) );
-                };
-                auto a = [bias, accel](double b) { return bias + b / (1. - accel * b); };
-                double b1 = bias + z1;
-                double b2 = bias - z1;
-                double a1 = a(b1);
-                double a2 = a(b2);
-                auto lo = static_cast<size_t>((std::max)(cumn(a1), 0l));
-                auto hi = static_cast<size_t>((std::min)(cumn(a2), n - 1));
-
-                return { point, resample[lo], resample[hi], confidence_level };
-            }
+            Estimate<double>
+            bootstrap( double confidence_level,
+                       double* first,
+                       double* last,
+                       sample const& resample,
+                       double ( *estimator )( double const*, double const* ) );
 
             struct bootstrap_analysis {
                 Estimate<double> mean;
@@ -116,8 +51,8 @@ namespace Catch {
 
             bootstrap_analysis analyse_samples(double confidence_level,
                                                unsigned int n_resamples,
-                                               std::vector<double>::iterator first,
-                                               std::vector<double>::iterator last);
+                                               double* first,
+                                               double* last);
         } // namespace Detail
     } // namespace Benchmark
 } // namespace Catch
diff --git a/packages/Catch2/src/catch2/benchmark/detail/catch_timing.hpp b/packages/Catch2/src/catch2/benchmark/detail/catch_timing.hpp
index f5c25571c4779a0cf7ac586c427e3d54ef277a62..da5671908135de9af3407f9b63f805209c15c364 100644
--- a/packages/Catch2/src/catch2/benchmark/detail/catch_timing.hpp
+++ b/packages/Catch2/src/catch2/benchmark/detail/catch_timing.hpp
@@ -17,14 +17,14 @@
 
 namespace Catch {
     namespace Benchmark {
-        template <typename Duration, typename Result>
+        template <typename Result>
         struct Timing {
-            Duration elapsed;
+            IDuration elapsed;
             Result result;
             int iterations;
         };
-        template <typename Clock, typename Func, typename... Args>
-        using TimingOf = Timing<ClockDuration<Clock>, Detail::CompleteType_t<FunctionReturnType<Func, Args...>>>;
+        template <typename Func, typename... Args>
+        using TimingOf = Timing<Detail::CompleteType_t<FunctionReturnType<Func, Args...>>>;
     } // namespace Benchmark
 } // namespace Catch
 
diff --git a/packages/Catch2/src/catch2/catch_all.hpp b/packages/Catch2/src/catch2/catch_all.hpp
index 70ec402d6c223eb99dd8a005f17b1c04760a6743..f2cc8536590902593334ed805aba76ffbcd5fd9c 100644
--- a/packages/Catch2/src/catch2/catch_all.hpp
+++ b/packages/Catch2/src/catch2/catch_all.hpp
@@ -54,6 +54,7 @@
 #include <catch2/internal/catch_compiler_capabilities.hpp>
 #include <catch2/internal/catch_config_android_logwrite.hpp>
 #include <catch2/internal/catch_config_counter.hpp>
+#include <catch2/internal/catch_config_prefix_messages.hpp>
 #include <catch2/internal/catch_config_static_analysis_support.hpp>
 #include <catch2/internal/catch_config_uncaught_exceptions.hpp>
 #include <catch2/internal/catch_config_wchar.hpp>
@@ -73,6 +74,7 @@
 #include <catch2/internal/catch_getenv.hpp>
 #include <catch2/internal/catch_is_permutation.hpp>
 #include <catch2/internal/catch_istream.hpp>
+#include <catch2/internal/catch_jsonwriter.hpp>
 #include <catch2/internal/catch_lazy_expr.hpp>
 #include <catch2/internal/catch_leak_detector.hpp>
 #include <catch2/internal/catch_list.hpp>
@@ -89,6 +91,8 @@
 #include <catch2/internal/catch_preprocessor.hpp>
 #include <catch2/internal/catch_preprocessor_internal_stringify.hpp>
 #include <catch2/internal/catch_preprocessor_remove_parens.hpp>
+#include <catch2/internal/catch_random_floating_point_helpers.hpp>
+#include <catch2/internal/catch_random_integer_helpers.hpp>
 #include <catch2/internal/catch_random_number_generator.hpp>
 #include <catch2/internal/catch_random_seed_generation.hpp>
 #include <catch2/internal/catch_reporter_registry.hpp>
@@ -118,6 +122,8 @@
 #include <catch2/internal/catch_textflow.hpp>
 #include <catch2/internal/catch_to_string.hpp>
 #include <catch2/internal/catch_uncaught_exceptions.hpp>
+#include <catch2/internal/catch_uniform_floating_point_distribution.hpp>
+#include <catch2/internal/catch_uniform_integer_distribution.hpp>
 #include <catch2/internal/catch_unique_name.hpp>
 #include <catch2/internal/catch_unique_ptr.hpp>
 #include <catch2/internal/catch_void_type.hpp>
diff --git a/packages/Catch2/src/catch2/catch_approx.cpp b/packages/Catch2/src/catch2/catch_approx.cpp
index 407586d1d5db4462d5c6bfa5f9b7e49812dcda56..9ad4ce3ee77be8a651fb345b3eee6376822f6531 100644
--- a/packages/Catch2/src/catch2/catch_approx.cpp
+++ b/packages/Catch2/src/catch2/catch_approx.cpp
@@ -70,10 +70,10 @@ namespace Catch {
     }
 
 namespace literals {
-    Approx operator "" _a(long double val) {
+    Approx operator ""_a(long double val) {
         return Approx(val);
     }
-    Approx operator "" _a(unsigned long long val) {
+    Approx operator ""_a(unsigned long long val) {
         return Approx(val);
     }
 } // end namespace literals
diff --git a/packages/Catch2/src/catch2/catch_config.hpp b/packages/Catch2/src/catch2/catch_config.hpp
index 784de4aa5bbc6361503e2637574463055fd44810..17e983e5cc8576c8fa051fbca99d0db8815472c6 100644
--- a/packages/Catch2/src/catch2/catch_config.hpp
+++ b/packages/Catch2/src/catch2/catch_config.hpp
@@ -69,7 +69,7 @@ namespace Catch {
         bool benchmarkNoAnalysis = false;
         unsigned int benchmarkSamples = 100;
         double benchmarkConfidenceInterval = 0.95;
-        unsigned int benchmarkResamples = 100000;
+        unsigned int benchmarkResamples = 100'000;
         std::chrono::milliseconds::rep benchmarkWarmupTime = 100;
 
         Verbosity verbosity = Verbosity::Normal;
diff --git a/packages/Catch2/src/catch2/catch_message.hpp b/packages/Catch2/src/catch2/catch_message.hpp
index b348ac87026143447a4b9cd5b02b991949356e0d..05325ee8f571eeb762fc0728a39b4b489cfd7d68 100644
--- a/packages/Catch2/src/catch2/catch_message.hpp
+++ b/packages/Catch2/src/catch2/catch_message.hpp
@@ -8,11 +8,13 @@
 #ifndef CATCH_MESSAGE_HPP_INCLUDED
 #define CATCH_MESSAGE_HPP_INCLUDED
 
+#include <catch2/internal/catch_config_prefix_messages.hpp>
 #include <catch2/internal/catch_result_type.hpp>
 #include <catch2/internal/catch_reusable_string_stream.hpp>
 #include <catch2/internal/catch_stream_end_stop.hpp>
 #include <catch2/internal/catch_message_info.hpp>
 #include <catch2/catch_tostring.hpp>
+#include <catch2/interfaces/catch_interfaces_capture.hpp>
 
 #include <string>
 #include <vector>
@@ -112,28 +114,28 @@ namespace Catch {
     Catch::getResultCapture().emplaceUnscopedMessage( Catch::MessageBuilder( macroName##_catch_sr, CATCH_INTERNAL_LINEINFO, Catch::ResultWas::Info ) << log )
 
 
-#if defined(CATCH_CONFIG_PREFIX_ALL) && !defined(CATCH_CONFIG_DISABLE)
+#if defined(CATCH_CONFIG_PREFIX_MESSAGES) && !defined(CATCH_CONFIG_DISABLE)
 
   #define CATCH_INFO( msg ) INTERNAL_CATCH_INFO( "CATCH_INFO", msg )
   #define CATCH_UNSCOPED_INFO( msg ) INTERNAL_CATCH_UNSCOPED_INFO( "CATCH_UNSCOPED_INFO", msg )
   #define CATCH_WARN( msg ) INTERNAL_CATCH_MSG( "CATCH_WARN", Catch::ResultWas::Warning, Catch::ResultDisposition::ContinueOnFailure, msg )
   #define CATCH_CAPTURE( ... ) INTERNAL_CATCH_CAPTURE( INTERNAL_CATCH_UNIQUE_NAME(capturer), "CATCH_CAPTURE", __VA_ARGS__ )
 
-#elif defined(CATCH_CONFIG_PREFIX_ALL) && defined(CATCH_CONFIG_DISABLE)
+#elif defined(CATCH_CONFIG_PREFIX_MESSAGES) && defined(CATCH_CONFIG_DISABLE)
 
   #define CATCH_INFO( msg )          (void)(0)
   #define CATCH_UNSCOPED_INFO( msg ) (void)(0)
   #define CATCH_WARN( msg )          (void)(0)
   #define CATCH_CAPTURE( ... )       (void)(0)
 
-#elif !defined(CATCH_CONFIG_PREFIX_ALL) && !defined(CATCH_CONFIG_DISABLE)
+#elif !defined(CATCH_CONFIG_PREFIX_MESSAGES) && !defined(CATCH_CONFIG_DISABLE)
 
   #define INFO( msg ) INTERNAL_CATCH_INFO( "INFO", msg )
   #define UNSCOPED_INFO( msg ) INTERNAL_CATCH_UNSCOPED_INFO( "UNSCOPED_INFO", msg )
   #define WARN( msg ) INTERNAL_CATCH_MSG( "WARN", Catch::ResultWas::Warning, Catch::ResultDisposition::ContinueOnFailure, msg )
   #define CAPTURE( ... ) INTERNAL_CATCH_CAPTURE( INTERNAL_CATCH_UNIQUE_NAME(capturer), "CAPTURE", __VA_ARGS__ )
 
-#elif !defined(CATCH_CONFIG_PREFIX_ALL) && defined(CATCH_CONFIG_DISABLE)
+#elif !defined(CATCH_CONFIG_PREFIX_MESSAGES) && defined(CATCH_CONFIG_DISABLE)
 
   #define INFO( msg )          (void)(0)
   #define UNSCOPED_INFO( msg ) (void)(0)
diff --git a/packages/Catch2/src/catch2/catch_test_case_info.cpp b/packages/Catch2/src/catch2/catch_test_case_info.cpp
index e9dad57787036172a30deb088c8c6a2bea1c6883..c38ee55acc44492f596d1058e941782214ce3dd9 100644
--- a/packages/Catch2/src/catch2/catch_test_case_info.cpp
+++ b/packages/Catch2/src/catch2/catch_test_case_info.cpp
@@ -9,6 +9,7 @@
 #include <catch2/internal/catch_enforce.hpp>
 #include <catch2/internal/catch_string_manip.hpp>
 #include <catch2/internal/catch_case_insensitive_comparisons.hpp>
+#include <catch2/internal/catch_test_registry.hpp>
 
 #include <cassert>
 #include <cctype>
diff --git a/packages/Catch2/src/catch2/catch_test_case_info.hpp b/packages/Catch2/src/catch2/catch_test_case_info.hpp
index 5ff3e3e7207e3002a69516fddb34f77f889afa99..a2f4b43ecb47ed2c4c8332d07d355ea2aed2b11c 100644
--- a/packages/Catch2/src/catch2/catch_test_case_info.hpp
+++ b/packages/Catch2/src/catch2/catch_test_case_info.hpp
@@ -8,10 +8,10 @@
 #ifndef CATCH_TEST_CASE_INFO_HPP_INCLUDED
 #define CATCH_TEST_CASE_INFO_HPP_INCLUDED
 
+#include <catch2/interfaces/catch_interfaces_test_invoker.hpp>
 #include <catch2/internal/catch_source_line_info.hpp>
 #include <catch2/internal/catch_noncopyable.hpp>
 #include <catch2/internal/catch_stringref.hpp>
-#include <catch2/internal/catch_test_registry.hpp>
 #include <catch2/internal/catch_unique_ptr.hpp>
 
 
@@ -44,6 +44,7 @@ namespace Catch {
     };
 
     class ITestInvoker;
+    struct NameAndTags;
 
     enum class TestCaseProperties : uint8_t {
         None = 0,
diff --git a/packages/Catch2/src/catch2/catch_tostring.hpp b/packages/Catch2/src/catch2/catch_tostring.hpp
index 788d824c899d4f5a7d66081d08a302f8f3b02908..f3fb0beb7938611e56d1cb765f2dab9370b1d350 100644
--- a/packages/Catch2/src/catch2/catch_tostring.hpp
+++ b/packages/Catch2/src/catch2/catch_tostring.hpp
@@ -398,6 +398,12 @@ namespace Catch {
             }
         }
     };
+    template <>
+    struct StringMaker<std::nullopt_t> {
+        static std::string convert(const std::nullopt_t&) {
+            return "{ }";
+        }
+    };
 }
 #endif // CATCH_CONFIG_ENABLE_OPTIONAL_STRINGMAKER
 
diff --git a/packages/Catch2/src/catch2/catch_user_config.hpp.in b/packages/Catch2/src/catch2/catch_user_config.hpp.in
index 11ab5a6d1aae25ca516859f84c6b46242d28fb39..10d61937f1abfef4509accfc43b587ffaa27dd3d 100644
--- a/packages/Catch2/src/catch2/catch_user_config.hpp.in
+++ b/packages/Catch2/src/catch2/catch_user_config.hpp.in
@@ -198,6 +198,7 @@
 #cmakedefine CATCH_CONFIG_FAST_COMPILE
 #cmakedefine CATCH_CONFIG_NOSTDOUT
 #cmakedefine CATCH_CONFIG_PREFIX_ALL
+#cmakedefine CATCH_CONFIG_PREFIX_MESSAGES
 #cmakedefine CATCH_CONFIG_WINDOWS_CRTDBG
 
 #cmakedefine CATCH_CONFIG_SHARED_LIBRARY
diff --git a/packages/Catch2/src/catch2/catch_version.cpp b/packages/Catch2/src/catch2/catch_version.cpp
index 19cab91b3db63f719726c752c85eab12f1380e38..4e67d968ccadeedb1d99ac7ed3f4386323e72b43 100644
--- a/packages/Catch2/src/catch2/catch_version.cpp
+++ b/packages/Catch2/src/catch2/catch_version.cpp
@@ -36,7 +36,7 @@ namespace Catch {
     }
 
     Version const& libraryVersion() {
-        static Version version( 3, 3, 2, "", 0 );
+        static Version version( 3, 5, 2, "", 0 );
         return version;
     }
 
diff --git a/packages/Catch2/src/catch2/catch_version_macros.hpp b/packages/Catch2/src/catch2/catch_version_macros.hpp
index 9ece85051120b099f0f70b1e217243f3cb15c320..be2a04d2f0b63d270f5be5b9fcde5f85235e0d03 100644
--- a/packages/Catch2/src/catch2/catch_version_macros.hpp
+++ b/packages/Catch2/src/catch2/catch_version_macros.hpp
@@ -9,7 +9,7 @@
 #define CATCH_VERSION_MACROS_HPP_INCLUDED
 
 #define CATCH_VERSION_MAJOR 3
-#define CATCH_VERSION_MINOR 3
+#define CATCH_VERSION_MINOR 5
 #define CATCH_VERSION_PATCH 2
 
 #endif // CATCH_VERSION_MACROS_HPP_INCLUDED
diff --git a/packages/Catch2/src/catch2/generators/catch_generators.hpp b/packages/Catch2/src/catch2/generators/catch_generators.hpp
index 117f190193ce241e11cf17a3c1ebfd78b3ededb9..0f35a9968a84d316dfe0e797023ce504b126887e 100644
--- a/packages/Catch2/src/catch2/generators/catch_generators.hpp
+++ b/packages/Catch2/src/catch2/generators/catch_generators.hpp
@@ -37,12 +37,6 @@ namespace Detail {
         }
 
     public:
-        ~IGenerator() override = default;
-        IGenerator() = default;
-        IGenerator(IGenerator const&) = default;
-        IGenerator& operator=(IGenerator const&) = default;
-
-
         // Returns the current element of the generator
         //
         // \Precondition The generator is either freshly constructed,
diff --git a/packages/Catch2/src/catch2/generators/catch_generators_random.cpp b/packages/Catch2/src/catch2/generators/catch_generators_random.cpp
index 2e3390fdfca69fa312abef527ab2b9662a25f5b1..00a8e634f7be360a557059ed32c72a3b1ed11c6b 100644
--- a/packages/Catch2/src/catch2/generators/catch_generators_random.cpp
+++ b/packages/Catch2/src/catch2/generators/catch_generators_random.cpp
@@ -7,7 +7,35 @@
 // SPDX-License-Identifier: BSL-1.0
 
 #include <catch2/generators/catch_generators_random.hpp>
-
 #include <catch2/internal/catch_context.hpp>
 
-std::uint32_t Catch::Generators::Detail::getSeed() { return sharedRng()(); }
+#include <random>
+
+namespace Catch {
+    namespace Generators {
+        namespace Detail {
+            std::uint32_t getSeed() { return sharedRng()(); }
+        } // namespace Detail
+
+        struct RandomFloatingGenerator<long double>::PImpl {
+            PImpl( long double a, long double b, uint32_t seed ):
+                rng( seed ), dist( a, b ) {}
+
+            Catch::SimplePcg32 rng;
+            std::uniform_real_distribution<long double> dist;
+        };
+
+        RandomFloatingGenerator<long double>::RandomFloatingGenerator(
+            long double a, long double b, std::uint32_t seed) :
+            m_pimpl(Catch::Detail::make_unique<PImpl>(a, b, seed)) {
+            static_cast<void>( next() );
+        }
+
+        RandomFloatingGenerator<long double>::~RandomFloatingGenerator() =
+            default;
+        bool RandomFloatingGenerator<long double>::next() {
+            m_current_number = m_pimpl->dist( m_pimpl->rng );
+            return true;
+        }
+    } // namespace Generators
+} // namespace Catch
diff --git a/packages/Catch2/src/catch2/generators/catch_generators_random.hpp b/packages/Catch2/src/catch2/generators/catch_generators_random.hpp
index bcd4888dc3d3e6bdb94113309429290f897e8626..712835619e943d38bebc97c4a0f643cb90c0906b 100644
--- a/packages/Catch2/src/catch2/generators/catch_generators_random.hpp
+++ b/packages/Catch2/src/catch2/generators/catch_generators_random.hpp
@@ -8,11 +8,11 @@
 #ifndef CATCH_GENERATORS_RANDOM_HPP_INCLUDED
 #define CATCH_GENERATORS_RANDOM_HPP_INCLUDED
 
-#include <catch2/internal/catch_context.hpp>
 #include <catch2/generators/catch_generators.hpp>
 #include <catch2/internal/catch_random_number_generator.hpp>
-
-#include <random>
+#include <catch2/internal/catch_uniform_integer_distribution.hpp>
+#include <catch2/internal/catch_uniform_floating_point_distribution.hpp>
+#include <catch2/internal/catch_unique_ptr.hpp>
 
 namespace Catch {
 namespace Generators {
@@ -26,7 +26,7 @@ namespace Detail {
 template <typename Float>
 class RandomFloatingGenerator final : public IGenerator<Float> {
     Catch::SimplePcg32 m_rng;
-    std::uniform_real_distribution<Float> m_dist;
+    Catch::uniform_floating_point_distribution<Float> m_dist;
     Float m_current_number;
 public:
     RandomFloatingGenerator( Float a, Float b, std::uint32_t seed ):
@@ -44,10 +44,27 @@ public:
     }
 };
 
+template <>
+class RandomFloatingGenerator<long double> final : public IGenerator<long double> {
+    // We still rely on <random> for this specialization, but we don't
+    // want to drag it into the header.
+    struct PImpl;
+    Catch::Detail::unique_ptr<PImpl> m_pimpl;
+    long double m_current_number;
+
+public:
+    RandomFloatingGenerator( long double a, long double b, std::uint32_t seed );
+
+    long double const& get() const override { return m_current_number; }
+    bool next() override;
+
+    ~RandomFloatingGenerator() override; // = default
+};
+
 template <typename Integer>
 class RandomIntegerGenerator final : public IGenerator<Integer> {
     Catch::SimplePcg32 m_rng;
-    std::uniform_int_distribution<Integer> m_dist;
+    Catch::uniform_integer_distribution<Integer> m_dist;
     Integer m_current_number;
 public:
     RandomIntegerGenerator( Integer a, Integer b, std::uint32_t seed ):
@@ -68,14 +85,6 @@ public:
 template <typename T>
 std::enable_if_t<std::is_integral<T>::value, GeneratorWrapper<T>>
 random(T a, T b) {
-    static_assert(
-        !std::is_same<T, char>::value &&
-        !std::is_same<T, int8_t>::value &&
-        !std::is_same<T, uint8_t>::value &&
-        !std::is_same<T, signed char>::value &&
-        !std::is_same<T, unsigned char>::value &&
-        !std::is_same<T, bool>::value,
-        "The requested type is not supported by the underlying random distributions from std" );
     return GeneratorWrapper<T>(
         Catch::Detail::make_unique<RandomIntegerGenerator<T>>(a, b, Detail::getSeed())
     );
diff --git a/packages/Catch2/src/catch2/generators/catch_generators_range.hpp b/packages/Catch2/src/catch2/generators/catch_generators_range.hpp
index 495acb9509b1e616014184da18f447f6073fcfc2..b67c1590ee4b6b3e41c1b3cd72d90bebec6749f5 100644
--- a/packages/Catch2/src/catch2/generators/catch_generators_range.hpp
+++ b/packages/Catch2/src/catch2/generators/catch_generators_range.hpp
@@ -96,10 +96,11 @@ GeneratorWrapper<ResultType> from_range(InputIterator from, InputSentinel to) {
     return GeneratorWrapper<ResultType>(Catch::Detail::make_unique<IteratorGenerator<ResultType>>(from, to));
 }
 
-template <typename Container,
-          typename ResultType = typename Container::value_type>
-GeneratorWrapper<ResultType> from_range(Container const& cnt) {
-    return GeneratorWrapper<ResultType>(Catch::Detail::make_unique<IteratorGenerator<ResultType>>(cnt.begin(), cnt.end()));
+template <typename Container>
+auto from_range(Container const& cnt) {
+    using std::begin;
+    using std::end;
+    return from_range( begin( cnt ), end( cnt ) );
 }
 
 
diff --git a/packages/Catch2/src/catch2/interfaces/catch_interfaces_reporter.cpp b/packages/Catch2/src/catch2/interfaces/catch_interfaces_reporter.cpp
index 3274bcf50ebd03afd651b2778f90b2a296fe3f91..90536bb366232c8f205646b472bcaa8fd207157d 100644
--- a/packages/Catch2/src/catch2/interfaces/catch_interfaces_reporter.cpp
+++ b/packages/Catch2/src/catch2/interfaces/catch_interfaces_reporter.cpp
@@ -7,19 +7,11 @@
 // SPDX-License-Identifier: BSL-1.0
 #include <catch2/interfaces/catch_interfaces_reporter.hpp>
 #include <catch2/interfaces/catch_interfaces_config.hpp>
-#include <catch2/internal/catch_console_colour.hpp>
-#include <catch2/internal/catch_console_width.hpp>
 #include <catch2/catch_message.hpp>
-#include <catch2/internal/catch_list.hpp>
-#include <catch2/internal/catch_string_manip.hpp>
-#include <catch2/catch_test_case_info.hpp>
-#include <catch2/reporters/catch_reporter_helpers.hpp>
 #include <catch2/internal/catch_move_and_forward.hpp>
 #include <catch2/internal/catch_istream.hpp>
 
-#include <algorithm>
 #include <cassert>
-#include <iomanip>
 
 namespace Catch {
 
diff --git a/packages/Catch2/src/catch2/interfaces/catch_interfaces_reporter.hpp b/packages/Catch2/src/catch2/interfaces/catch_interfaces_reporter.hpp
index b40fce3128960bdfaed980d31d7cf1dedeff81d6..a052c5db1d97b7d29da61a19b39effbb9a444cc9 100644
--- a/packages/Catch2/src/catch2/interfaces/catch_interfaces_reporter.hpp
+++ b/packages/Catch2/src/catch2/interfaces/catch_interfaces_reporter.hpp
@@ -15,7 +15,6 @@
 #include <catch2/internal/catch_stringref.hpp>
 #include <catch2/internal/catch_test_run_info.hpp>
 #include <catch2/internal/catch_unique_ptr.hpp>
-#include <catch2/internal/catch_move_and_forward.hpp>
 #include <catch2/benchmark/detail/catch_benchmark_stats.hpp>
 
 #include <map>
diff --git a/packages/Catch2/src/catch2/internal/catch_assertion_handler.cpp b/packages/Catch2/src/catch2/internal/catch_assertion_handler.cpp
index e5232f70c9a66c8e6f9b9a9e23633aa7433b3101..f650a7073a25c12fc39796a069329f67d1f53225 100644
--- a/packages/Catch2/src/catch2/internal/catch_assertion_handler.cpp
+++ b/packages/Catch2/src/catch2/internal/catch_assertion_handler.cpp
@@ -8,10 +8,8 @@
 #include <catch2/internal/catch_assertion_handler.hpp>
 #include <catch2/interfaces/catch_interfaces_config.hpp>
 #include <catch2/internal/catch_context.hpp>
-#include <catch2/internal/catch_enforce.hpp>
 #include <catch2/internal/catch_debugger.hpp>
 #include <catch2/internal/catch_test_failure_exception.hpp>
-#include <catch2/interfaces/catch_interfaces_registry_hub.hpp>
 #include <catch2/matchers/catch_matchers_string.hpp>
 
 namespace Catch {
diff --git a/packages/Catch2/src/catch2/internal/catch_clara.cpp b/packages/Catch2/src/catch2/internal/catch_clara.cpp
index c9bc76959d0b1fb87186020093251055d95fe65d..c76089eea10c5064a5e2b275397b776da134fdf4 100644
--- a/packages/Catch2/src/catch2/internal/catch_clara.cpp
+++ b/packages/Catch2/src/catch2/internal/catch_clara.cpp
@@ -11,6 +11,7 @@
 #include <catch2/internal/catch_platform.hpp>
 #include <catch2/internal/catch_string_manip.hpp>
 #include <catch2/internal/catch_textflow.hpp>
+#include <catch2/internal/catch_reusable_string_stream.hpp>
 
 #include <algorithm>
 #include <ostream>
@@ -24,13 +25,29 @@ namespace {
             ;
     }
 
-    std::string normaliseOpt( std::string const& optName ) {
-#ifdef CATCH_PLATFORM_WINDOWS
-        if ( optName[0] == '/' )
-            return "-" + optName.substr( 1 );
-        else
+    Catch::StringRef normaliseOpt( Catch::StringRef optName ) {
+        if ( optName[0] == '-'
+#if defined(CATCH_PLATFORM_WINDOWS)
+             || optName[0] == '/'
 #endif
-            return optName;
+        ) {
+            return optName.substr( 1, optName.size() );
+        }
+
+        return optName;
+    }
+
+    static size_t find_first_separator(Catch::StringRef sr) {
+        auto is_separator = []( char c ) {
+            return c == ' ' || c == ':' || c == '=';
+        };
+        size_t pos = 0;
+        while (pos < sr.size()) {
+            if (is_separator(sr[pos])) { return pos; }
+            ++pos;
+        }
+
+        return Catch::StringRef::npos;
     }
 
 } // namespace
@@ -48,23 +65,23 @@ namespace Catch {
                 }
 
                 if ( it != itEnd ) {
-                    auto const& next = *it;
+                    StringRef next = *it;
                     if ( isOptPrefix( next[0] ) ) {
-                        auto delimiterPos = next.find_first_of( " :=" );
-                        if ( delimiterPos != std::string::npos ) {
+                        auto delimiterPos = find_first_separator(next);
+                        if ( delimiterPos != StringRef::npos ) {
                             m_tokenBuffer.push_back(
                                 { TokenType::Option,
                                   next.substr( 0, delimiterPos ) } );
                             m_tokenBuffer.push_back(
                                 { TokenType::Argument,
-                                  next.substr( delimiterPos + 1 ) } );
+                                  next.substr( delimiterPos + 1, next.size() ) } );
                         } else {
                             if ( next[1] != '-' && next.size() > 2 ) {
-                                std::string opt = "- ";
+                                // Combined short args, e.g. "-ab" for "-a -b"
                                 for ( size_t i = 1; i < next.size(); ++i ) {
-                                    opt[1] = next[i];
                                     m_tokenBuffer.push_back(
-                                        { TokenType::Option, opt } );
+                                        { TokenType::Option,
+                                          next.substr( i, 1 ) } );
                                 }
                             } else {
                                 m_tokenBuffer.push_back(
@@ -124,12 +141,12 @@ namespace Catch {
             size_t ParserBase::cardinality() const { return 1; }
 
             InternalParseResult ParserBase::parse( Args const& args ) const {
-                return parse( args.exeName(), TokenStream( args ) );
+                return parse( static_cast<std::string>(args.exeName()), TokenStream( args ) );
             }
 
             ParseState::ParseState( ParseResultType type,
-                                    TokenStream const& remainingTokens ):
-                m_type( type ), m_remainingTokens( remainingTokens ) {}
+                                    TokenStream remainingTokens ):
+                m_type( type ), m_remainingTokens( CATCH_MOVE(remainingTokens) ) {}
 
             ParserResult BoundFlagRef::setFlag( bool flag ) {
                 m_ref = flag;
@@ -147,34 +164,34 @@ namespace Catch {
 } // namespace Detail
 
         Detail::InternalParseResult Arg::parse(std::string const&,
-                                               Detail::TokenStream const& tokens) const {
+                                               Detail::TokenStream tokens) const {
             auto validationResult = validate();
             if (!validationResult)
                 return Detail::InternalParseResult(validationResult);
 
-            auto remainingTokens = tokens;
-            auto const& token = *remainingTokens;
+            auto token = *tokens;
             if (token.type != Detail::TokenType::Argument)
                 return Detail::InternalParseResult::ok(Detail::ParseState(
-                    ParseResultType::NoMatch, remainingTokens));
+                    ParseResultType::NoMatch, CATCH_MOVE(tokens)));
 
             assert(!m_ref->isFlag());
             auto valueRef =
                 static_cast<Detail::BoundValueRefBase*>(m_ref.get());
 
-            auto result = valueRef->setValue(remainingTokens->token);
-            if (!result)
-                return Detail::InternalParseResult(result);
+            auto result = valueRef->setValue(static_cast<std::string>(token.token));
+            if ( !result )
+                return Detail::InternalParseResult( result );
             else
-                return Detail::InternalParseResult::ok(Detail::ParseState(
-                    ParseResultType::Matched, ++remainingTokens));
+                return Detail::InternalParseResult::ok(
+                    Detail::ParseState( ParseResultType::Matched,
+                                        CATCH_MOVE( ++tokens ) ) );
         }
 
         Opt::Opt(bool& ref) :
             ParserRefImpl(std::make_shared<Detail::BoundFlagRef>(ref)) {}
 
-        std::vector<Detail::HelpColumns> Opt::getHelpColumns() const {
-            std::ostringstream oss;
+        Detail::HelpColumns Opt::getHelpColumns() const {
+            ReusableStringStream oss;
             bool first = true;
             for (auto const& opt : m_optNames) {
                 if (first)
@@ -185,10 +202,10 @@ namespace Catch {
             }
             if (!m_hint.empty())
                 oss << " <" << m_hint << '>';
-            return { { oss.str(), m_description } };
+            return { oss.str(), m_description };
         }
 
-        bool Opt::isMatch(std::string const& optToken) const {
+        bool Opt::isMatch(StringRef optToken) const {
             auto normalisedToken = normaliseOpt(optToken);
             for (auto const& name : m_optNames) {
                 if (normaliseOpt(name) == normalisedToken)
@@ -198,15 +215,14 @@ namespace Catch {
         }
 
         Detail::InternalParseResult Opt::parse(std::string const&,
-                                       Detail::TokenStream const& tokens) const {
+                                       Detail::TokenStream tokens) const {
             auto validationResult = validate();
             if (!validationResult)
                 return Detail::InternalParseResult(validationResult);
 
-            auto remainingTokens = tokens;
-            if (remainingTokens &&
-                remainingTokens->type == Detail::TokenType::Option) {
-                auto const& token = *remainingTokens;
+            if (tokens &&
+                tokens->type == Detail::TokenType::Option) {
+                auto const& token = *tokens;
                 if (isMatch(token.token)) {
                     if (m_ref->isFlag()) {
                         auto flagRef =
@@ -218,35 +234,35 @@ namespace Catch {
                         if (result.value() ==
                             ParseResultType::ShortCircuitAll)
                             return Detail::InternalParseResult::ok(Detail::ParseState(
-                                result.value(), remainingTokens));
+                                result.value(), CATCH_MOVE(tokens)));
                     } else {
                         auto valueRef =
                             static_cast<Detail::BoundValueRefBase*>(
                                 m_ref.get());
-                        ++remainingTokens;
-                        if (!remainingTokens)
+                        ++tokens;
+                        if (!tokens)
                             return Detail::InternalParseResult::runtimeError(
                                 "Expected argument following " +
                                 token.token);
-                        auto const& argToken = *remainingTokens;
+                        auto const& argToken = *tokens;
                         if (argToken.type != Detail::TokenType::Argument)
                             return Detail::InternalParseResult::runtimeError(
                                 "Expected argument following " +
                                 token.token);
-                        const auto result = valueRef->setValue(argToken.token);
+                        const auto result = valueRef->setValue(static_cast<std::string>(argToken.token));
                         if (!result)
                             return Detail::InternalParseResult(result);
                         if (result.value() ==
                             ParseResultType::ShortCircuitAll)
                             return Detail::InternalParseResult::ok(Detail::ParseState(
-                                result.value(), remainingTokens));
+                                result.value(), CATCH_MOVE(tokens)));
                     }
                     return Detail::InternalParseResult::ok(Detail::ParseState(
-                        ParseResultType::Matched, ++remainingTokens));
+                        ParseResultType::Matched, CATCH_MOVE(++tokens)));
                 }
             }
             return Detail::InternalParseResult::ok(
-                Detail::ParseState(ParseResultType::NoMatch, remainingTokens));
+                Detail::ParseState(ParseResultType::NoMatch, CATCH_MOVE(tokens)));
         }
 
         Detail::Result Opt::validate() const {
@@ -278,9 +294,9 @@ namespace Catch {
 
         Detail::InternalParseResult
             ExeName::parse(std::string const&,
-                           Detail::TokenStream const& tokens) const {
+                           Detail::TokenStream tokens) const {
             return Detail::InternalParseResult::ok(
-                Detail::ParseState(ParseResultType::NoMatch, tokens));
+                Detail::ParseState(ParseResultType::NoMatch, CATCH_MOVE(tokens)));
         }
 
         ParserResult ExeName::set(std::string const& newName) {
@@ -310,9 +326,9 @@ namespace Catch {
 
         std::vector<Detail::HelpColumns> Parser::getHelpColumns() const {
             std::vector<Detail::HelpColumns> cols;
+            cols.reserve( m_options.size() );
             for ( auto const& o : m_options ) {
-                auto childCols = o.getHelpColumns();
-                cols.insert( cols.end(), childCols.begin(), childCols.end() );
+                cols.push_back(o.getHelpColumns());
             }
             return cols;
         }
@@ -350,12 +366,12 @@ namespace Catch {
 
             optWidth = ( std::min )( optWidth, consoleWidth / 2 );
 
-            for ( auto const& cols : rows ) {
-                auto row = TextFlow::Column( cols.left )
+            for ( auto& cols : rows ) {
+                auto row = TextFlow::Column( CATCH_MOVE(cols.left) )
                                .width( optWidth )
                                .indent( 2 ) +
                            TextFlow::Spacer( 4 ) +
-                           TextFlow::Column( cols.right )
+                           TextFlow::Column( static_cast<std::string>(cols.descriptions) )
                                .width( consoleWidth - 7 - optWidth );
                 os << row << '\n';
             }
@@ -377,7 +393,7 @@ namespace Catch {
 
         Detail::InternalParseResult
         Parser::parse( std::string const& exeName,
-                       Detail::TokenStream const& tokens ) const {
+                       Detail::TokenStream tokens ) const {
 
             struct ParserInfo {
                 ParserBase const* parser = nullptr;
@@ -395,7 +411,7 @@ namespace Catch {
             m_exeName.set( exeName );
 
             auto result = Detail::InternalParseResult::ok(
-                Detail::ParseState( ParseResultType::NoMatch, tokens ) );
+                Detail::ParseState( ParseResultType::NoMatch, CATCH_MOVE(tokens) ) );
             while ( result.value().remainingTokens() ) {
                 bool tokenParsed = false;
 
@@ -403,7 +419,7 @@ namespace Catch {
                     if ( parseInfo.parser->cardinality() == 0 ||
                          parseInfo.count < parseInfo.parser->cardinality() ) {
                         result = parseInfo.parser->parse(
-                            exeName, result.value().remainingTokens() );
+                            exeName, CATCH_MOVE(result).value().remainingTokens() );
                         if ( !result )
                             return result;
                         if ( result.value().type() !=
@@ -429,7 +445,7 @@ namespace Catch {
         Args::Args(int argc, char const* const* argv) :
             m_exeName(argv[0]), m_args(argv + 1, argv + argc) {}
 
-        Args::Args(std::initializer_list<std::string> args) :
+        Args::Args(std::initializer_list<StringRef> args) :
             m_exeName(*args.begin()),
             m_args(args.begin() + 1, args.end()) {}
 
diff --git a/packages/Catch2/src/catch2/internal/catch_clara.hpp b/packages/Catch2/src/catch2/internal/catch_clara.hpp
index 9117b65e84d51ff0c82d05904107a45afa61fcc9..d869593bff873c2e9e0b5f0f24fbf9c35e6bf199 100644
--- a/packages/Catch2/src/catch2/internal/catch_clara.hpp
+++ b/packages/Catch2/src/catch2/internal/catch_clara.hpp
@@ -29,6 +29,7 @@
 #    endif
 #endif
 
+#include <catch2/internal/catch_stringref.hpp>
 #include <catch2/internal/catch_move_and_forward.hpp>
 #include <catch2/internal/catch_noncopyable.hpp>
 #include <catch2/internal/catch_void_type.hpp>
@@ -101,17 +102,16 @@ namespace Catch {
             enum class TokenType { Option, Argument };
             struct Token {
                 TokenType type;
-                std::string token;
+                StringRef token;
             };
 
             // Abstracts iterators into args as a stream of tokens, with option
             // arguments uniformly handled
             class TokenStream {
-                using Iterator = std::vector<std::string>::const_iterator;
+                using Iterator = std::vector<StringRef>::const_iterator;
                 Iterator it;
                 Iterator itEnd;
                 std::vector<Token> m_tokenBuffer;
-
                 void loadBuffer();
 
             public:
@@ -163,12 +163,17 @@ namespace Catch {
                 ResultType m_type;
             };
 
-            template <typename T> class ResultValueBase : public ResultBase {
+            template <typename T>
+            class ResultValueBase : public ResultBase {
             public:
-                auto value() const -> T const& {
+                T const& value() const& {
                     enforceOk();
                     return m_value;
                 }
+                T&& value() && {
+                    enforceOk();
+                    return CATCH_MOVE( m_value );
+                }
 
             protected:
                 ResultValueBase( ResultType type ): ResultBase( type ) {}
@@ -178,13 +183,23 @@ namespace Catch {
                     if ( m_type == ResultType::Ok )
                         new ( &m_value ) T( other.m_value );
                 }
+                ResultValueBase( ResultValueBase&& other ):
+                    ResultBase( other ) {
+                    if ( m_type == ResultType::Ok )
+                        new ( &m_value ) T( CATCH_MOVE(other.m_value) );
+                }
+
 
-                ResultValueBase( ResultType, T const& value ): ResultBase( ResultType::Ok ) {
+                ResultValueBase( ResultType, T const& value ):
+                    ResultBase( ResultType::Ok ) {
                     new ( &m_value ) T( value );
                 }
+                ResultValueBase( ResultType, T&& value ):
+                    ResultBase( ResultType::Ok ) {
+                    new ( &m_value ) T( CATCH_MOVE(value) );
+                }
 
-                auto operator=( ResultValueBase const& other )
-                    -> ResultValueBase& {
+                ResultValueBase& operator=( ResultValueBase const& other ) {
                     if ( m_type == ResultType::Ok )
                         m_value.~T();
                     ResultBase::operator=( other );
@@ -192,6 +207,14 @@ namespace Catch {
                         new ( &m_value ) T( other.m_value );
                     return *this;
                 }
+                ResultValueBase& operator=( ResultValueBase&& other ) {
+                    if ( m_type == ResultType::Ok ) m_value.~T();
+                    ResultBase::operator=( other );
+                    if ( m_type == ResultType::Ok )
+                        new ( &m_value ) T( CATCH_MOVE(other.m_value) );
+                    return *this;
+                }
+
 
                 ~ResultValueBase() override {
                     if ( m_type == ResultType::Ok )
@@ -219,8 +242,8 @@ namespace Catch {
                 }
 
                 template <typename U>
-                static auto ok( U const& value ) -> BasicResult {
-                    return { ResultType::Ok, value };
+                static auto ok( U&& value ) -> BasicResult {
+                    return { ResultType::Ok, CATCH_FORWARD(value) };
                 }
                 static auto ok() -> BasicResult { return { ResultType::Ok }; }
                 static auto logicError( std::string&& message )
@@ -267,12 +290,15 @@ namespace Catch {
             class ParseState {
             public:
                 ParseState( ParseResultType type,
-                            TokenStream const& remainingTokens );
+                            TokenStream remainingTokens );
 
                 ParseResultType type() const { return m_type; }
-                TokenStream const& remainingTokens() const {
+                TokenStream const& remainingTokens() const& {
                     return m_remainingTokens;
                 }
+                TokenStream&& remainingTokens() && {
+                    return CATCH_MOVE( m_remainingTokens );
+                }
 
             private:
                 ParseResultType m_type;
@@ -285,7 +311,7 @@ namespace Catch {
 
             struct HelpColumns {
                 std::string left;
-                std::string right;
+                StringRef descriptions;
             };
 
             template <typename T>
@@ -445,7 +471,7 @@ namespace Catch {
                 virtual ~ParserBase() = default;
                 virtual auto validate() const -> Result { return Result::ok(); }
                 virtual auto parse( std::string const& exeName,
-                                    TokenStream const& tokens ) const
+                                    TokenStream tokens ) const
                     -> InternalParseResult = 0;
                 virtual size_t cardinality() const;
 
@@ -465,8 +491,8 @@ namespace Catch {
             protected:
                 Optionality m_optionality = Optionality::Optional;
                 std::shared_ptr<BoundRef> m_ref;
-                std::string m_hint;
-                std::string m_description;
+                StringRef m_hint;
+                StringRef m_description;
 
                 explicit ParserRefImpl( std::shared_ptr<BoundRef> const& ref ):
                     m_ref( ref ) {}
@@ -475,28 +501,32 @@ namespace Catch {
                 template <typename LambdaT>
                 ParserRefImpl( accept_many_t,
                                LambdaT const& ref,
-                               std::string const& hint ):
+                               StringRef hint ):
                     m_ref( std::make_shared<BoundManyLambda<LambdaT>>( ref ) ),
                     m_hint( hint ) {}
 
                 template <typename T,
                           typename = typename std::enable_if_t<
                               !Detail::is_unary_function<T>::value>>
-                ParserRefImpl( T& ref, std::string const& hint ):
+                ParserRefImpl( T& ref, StringRef hint ):
                     m_ref( std::make_shared<BoundValueRef<T>>( ref ) ),
                     m_hint( hint ) {}
 
                 template <typename LambdaT,
                           typename = typename std::enable_if_t<
                               Detail::is_unary_function<LambdaT>::value>>
-                ParserRefImpl( LambdaT const& ref, std::string const& hint ):
+                ParserRefImpl( LambdaT const& ref, StringRef hint ):
                     m_ref( std::make_shared<BoundLambda<LambdaT>>( ref ) ),
                     m_hint( hint ) {}
 
-                auto operator()( std::string const& description ) -> DerivedT& {
+                DerivedT& operator()( StringRef description ) & {
                     m_description = description;
                     return static_cast<DerivedT&>( *this );
                 }
+                DerivedT&& operator()( StringRef description ) && {
+                    m_description = description;
+                    return static_cast<DerivedT&&>( *this );
+                }
 
                 auto optional() -> DerivedT& {
                     m_optionality = Optionality::Optional;
@@ -519,7 +549,7 @@ namespace Catch {
                         return 1;
                 }
 
-                std::string const& hint() const { return m_hint; }
+                StringRef hint() const { return m_hint; }
             };
 
         } // namespace detail
@@ -533,13 +563,13 @@ namespace Catch {
 
             Detail::InternalParseResult
                 parse(std::string const&,
-                      Detail::TokenStream const& tokens) const override;
+                      Detail::TokenStream tokens) const override;
         };
 
         // A parser for options
         class Opt : public Detail::ParserRefImpl<Opt> {
         protected:
-            std::vector<std::string> m_optNames;
+            std::vector<StringRef> m_optNames;
 
         public:
             template <typename LambdaT>
@@ -552,33 +582,37 @@ namespace Catch {
             template <typename LambdaT,
                       typename = typename std::enable_if_t<
                           Detail::is_unary_function<LambdaT>::value>>
-            Opt( LambdaT const& ref, std::string const& hint ):
+            Opt( LambdaT const& ref, StringRef hint ):
                 ParserRefImpl( ref, hint ) {}
 
             template <typename LambdaT>
-            Opt( accept_many_t, LambdaT const& ref, std::string const& hint ):
+            Opt( accept_many_t, LambdaT const& ref, StringRef hint ):
                 ParserRefImpl( accept_many, ref, hint ) {}
 
             template <typename T,
                       typename = typename std::enable_if_t<
                           !Detail::is_unary_function<T>::value>>
-            Opt( T& ref, std::string const& hint ):
+            Opt( T& ref, StringRef hint ):
                 ParserRefImpl( ref, hint ) {}
 
-            auto operator[](std::string const& optName) -> Opt& {
+            Opt& operator[]( StringRef optName ) & {
                 m_optNames.push_back(optName);
                 return *this;
             }
+            Opt&& operator[]( StringRef optName ) && {
+                m_optNames.push_back( optName );
+                return CATCH_MOVE(*this);
+            }
 
-            std::vector<Detail::HelpColumns> getHelpColumns() const;
+            Detail::HelpColumns getHelpColumns() const;
 
-            bool isMatch(std::string const& optToken) const;
+            bool isMatch(StringRef optToken) const;
 
             using ParserBase::parse;
 
             Detail::InternalParseResult
                 parse(std::string const&,
-                      Detail::TokenStream const& tokens) const override;
+                      Detail::TokenStream tokens) const override;
 
             Detail::Result validate() const override;
         };
@@ -601,7 +635,7 @@ namespace Catch {
             // handled specially
             Detail::InternalParseResult
                 parse(std::string const&,
-                      Detail::TokenStream const& tokens) const override;
+                      Detail::TokenStream tokens) const override;
 
             std::string const& name() const { return *m_name; }
             Detail::ParserResult set(std::string const& newName);
@@ -626,16 +660,28 @@ namespace Catch {
                 return *this;
             }
 
-            auto operator|=(Opt const& opt) -> Parser& {
-                m_options.push_back(opt);
-                return *this;
+            friend Parser& operator|=( Parser& p, Opt const& opt ) {
+                p.m_options.push_back( opt );
+                return p;
+            }
+            friend Parser& operator|=( Parser& p, Opt&& opt ) {
+                p.m_options.push_back( CATCH_MOVE(opt) );
+                return p;
             }
 
             Parser& operator|=(Parser const& other);
 
             template <typename T>
-            auto operator|(T const& other) const -> Parser {
-                return Parser(*this) |= other;
+            friend Parser operator|( Parser const& p, T&& rhs ) {
+                Parser temp( p );
+                temp |= rhs;
+                return temp;
+            }
+
+            template <typename T>
+            friend Parser operator|( Parser&& p, T&& rhs ) {
+                p |= CATCH_FORWARD(rhs);
+                return CATCH_MOVE(p);
             }
 
             std::vector<Detail::HelpColumns> getHelpColumns() const;
@@ -653,21 +699,23 @@ namespace Catch {
             using ParserBase::parse;
             Detail::InternalParseResult
                 parse(std::string const& exeName,
-                      Detail::TokenStream const& tokens) const override;
+                      Detail::TokenStream tokens) const override;
         };
 
-        // Transport for raw args (copied from main args, or supplied via
-        // init list for testing)
+        /**
+         * Wrapper over argc + argv, assumes that the inputs outlive it
+         */
         class Args {
             friend Detail::TokenStream;
-            std::string m_exeName;
-            std::vector<std::string> m_args;
+            StringRef m_exeName;
+            std::vector<StringRef> m_args;
 
         public:
             Args(int argc, char const* const* argv);
-            Args(std::initializer_list<std::string> args);
+            // Helper constructor for testing
+            Args(std::initializer_list<StringRef> args);
 
-            std::string const& exeName() const { return m_exeName; }
+            StringRef exeName() const { return m_exeName; }
         };
 
 
diff --git a/packages/Catch2/src/catch2/internal/catch_commandline.cpp b/packages/Catch2/src/catch2/internal/catch_commandline.cpp
index 4ac1847b20165d6ec0facba40c8bdbe350f7f8df..c29a801d3e5e6cf5343075ed8fe71d79af34b92e 100644
--- a/packages/Catch2/src/catch2/internal/catch_commandline.cpp
+++ b/packages/Catch2/src/catch2/internal/catch_commandline.cpp
@@ -9,6 +9,7 @@
 
 #include <catch2/catch_config.hpp>
 #include <catch2/internal/catch_string_manip.hpp>
+#include <catch2/interfaces/catch_interfaces_config.hpp>
 #include <catch2/interfaces/catch_interfaces_registry_hub.hpp>
 #include <catch2/internal/catch_reporter_registry.hpp>
 #include <catch2/internal/catch_console_colour.hpp>
@@ -300,8 +301,8 @@ namespace Catch {
                 ( "split the tests to execute into this many groups" )
             | Opt( setShardIndex, "shard index" )
                 ["--shard-index"]
-                ( "index of the group of tests to execute (see --shard-count)" ) |
-            Opt( config.allowZeroTests )
+                ( "index of the group of tests to execute (see --shard-count)" )
+            | Opt( config.allowZeroTests )
                 ["--allow-running-no-tests"]
                 ( "Treat 'No tests run' as a success" )
             | Arg( config.testsOrTags, "test name|pattern|tags" )
diff --git a/packages/Catch2/src/catch2/internal/catch_compiler_capabilities.hpp b/packages/Catch2/src/catch2/internal/catch_compiler_capabilities.hpp
index fc5d4f31506bb67583aa9c9adcc396aa7d793b08..dacae01b7000b86fc171f763978e80de9c2ec8d0 100644
--- a/packages/Catch2/src/catch2/internal/catch_compiler_capabilities.hpp
+++ b/packages/Catch2/src/catch2/internal/catch_compiler_capabilities.hpp
@@ -156,7 +156,9 @@
 
 ////////////////////////////////////////////////////////////////////////////////
 // Assume that some platforms do not support getenv.
-#if defined(CATCH_PLATFORM_WINDOWS_UWP) || defined(CATCH_PLATFORM_PLAYSTATION)
+#if defined( CATCH_PLATFORM_WINDOWS_UWP ) ||                                   \
+    defined( CATCH_PLATFORM_PLAYSTATION ) ||                                   \
+    defined( _GAMING_XBOX )
 #    define CATCH_INTERNAL_CONFIG_NO_GETENV
 #else
 #    define CATCH_INTERNAL_CONFIG_GETENV
diff --git a/packages/Catch2/src/catch2/internal/catch_config_prefix_messages.hpp b/packages/Catch2/src/catch2/internal/catch_config_prefix_messages.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..be1e9a963ad505c4ee592f291b3f6cc84e1fcd65
--- /dev/null
+++ b/packages/Catch2/src/catch2/internal/catch_config_prefix_messages.hpp
@@ -0,0 +1,29 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
+/** \file
+ * Wrapper for the CATCH_CONFIG_PREFIX_MESSAGES configuration option
+ *
+ * CATCH_CONFIG_PREFIX_ALL can be used to avoid clashes with other macros
+ * by prepending CATCH_. This may not be desirable if the only clashes are with
+ * logger macros such as INFO and WARN. In this cases
+ * CATCH_CONFIG_PREFIX_MESSAGES can be used to only prefix a small subset
+ * of relevant macros.
+ *
+ */
+
+#ifndef CATCH_CONFIG_PREFIX_MESSAGES_HPP_INCLUDED
+#define CATCH_CONFIG_PREFIX_MESSAGES_HPP_INCLUDED
+
+#include <catch2/catch_user_config.hpp>
+
+#if defined(CATCH_CONFIG_PREFIX_ALL) && !defined(CATCH_CONFIG_PREFIX_MESSAGES)
+    #define CATCH_CONFIG_PREFIX_MESSAGES
+#endif
+
+#endif // CATCH_CONFIG_PREFIX_MESSAGES_HPP_INCLUDED
diff --git a/packages/Catch2/src/catch2/internal/catch_enum_values_registry.cpp b/packages/Catch2/src/catch2/internal/catch_enum_values_registry.cpp
index 7e8bf5e5e3eb0841924a6fb6bcdaf2a99887d5cc..a94b60881e7913615a179fb7f7540074c95e597d 100644
--- a/packages/Catch2/src/catch2/internal/catch_enum_values_registry.cpp
+++ b/packages/Catch2/src/catch2/internal/catch_enum_values_registry.cpp
@@ -39,7 +39,7 @@ namespace Catch {
             return parsed;
         }
 
-        EnumInfo::~EnumInfo() {}
+        EnumInfo::~EnumInfo() = default;
 
         StringRef EnumInfo::lookup( int value ) const {
             for( auto const& valueToName : m_values ) {
diff --git a/packages/Catch2/src/catch2/internal/catch_exception_translator_registry.cpp b/packages/Catch2/src/catch2/internal/catch_exception_translator_registry.cpp
index f3d47c0cd6056fd815cf275ac4671d2dcd71261d..1eb61147366e9d96779417e59d8a8d719c4957e3 100644
--- a/packages/Catch2/src/catch2/internal/catch_exception_translator_registry.cpp
+++ b/packages/Catch2/src/catch2/internal/catch_exception_translator_registry.cpp
@@ -15,6 +15,7 @@
 
 namespace Catch {
 
+#if !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
     namespace {
         static std::string tryTranslators(
             std::vector<
@@ -28,9 +29,9 @@ namespace Catch {
         }
 
     }
+#endif //!defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
 
-    ExceptionTranslatorRegistry::~ExceptionTranslatorRegistry() {
-    }
+    ExceptionTranslatorRegistry::~ExceptionTranslatorRegistry() = default;
 
     void ExceptionTranslatorRegistry::registerTranslator( Detail::unique_ptr<IExceptionTranslator>&& translator ) {
         m_translators.push_back( CATCH_MOVE( translator ) );
diff --git a/packages/Catch2/src/catch2/internal/catch_fatal_condition_handler.cpp b/packages/Catch2/src/catch2/internal/catch_fatal_condition_handler.cpp
index f9702b1847aa0c1d45806e6dcf9f0251dff72c3c..9ef5b21795bf7625e1c32803255310a05d493c79 100644
--- a/packages/Catch2/src/catch2/internal/catch_fatal_condition_handler.cpp
+++ b/packages/Catch2/src/catch2/internal/catch_fatal_condition_handler.cpp
@@ -26,6 +26,7 @@
 
 #include <catch2/internal/catch_fatal_condition_handler.hpp>
 
+#include <catch2/internal/catch_compiler_capabilities.hpp>
 #include <catch2/internal/catch_context.hpp>
 #include <catch2/internal/catch_enforce.hpp>
 #include <catch2/interfaces/catch_interfaces_capture.hpp>
diff --git a/packages/Catch2/src/catch2/internal/catch_fatal_condition_handler.hpp b/packages/Catch2/src/catch2/internal/catch_fatal_condition_handler.hpp
index ce07f9b6a7bd813190598a4642789398f990d3f4..81728b563a15ac73024ab37ef4395842bc94bbf8 100644
--- a/packages/Catch2/src/catch2/internal/catch_fatal_condition_handler.hpp
+++ b/packages/Catch2/src/catch2/internal/catch_fatal_condition_handler.hpp
@@ -8,9 +8,6 @@
 #ifndef CATCH_FATAL_CONDITION_HANDLER_HPP_INCLUDED
 #define CATCH_FATAL_CONDITION_HANDLER_HPP_INCLUDED
 
-#include <catch2/internal/catch_platform.hpp>
-#include <catch2/internal/catch_compiler_capabilities.hpp>
-
 #include <cassert>
 
 namespace Catch {
diff --git a/packages/Catch2/src/catch2/internal/catch_floating_point_helpers.cpp b/packages/Catch2/src/catch2/internal/catch_floating_point_helpers.cpp
index e30ee43422ab870afe6ff17bfea08a6ad44848b2..9631ed6d211a61272b68b0f52385a4e3e318d1f5 100644
--- a/packages/Catch2/src/catch2/internal/catch_floating_point_helpers.cpp
+++ b/packages/Catch2/src/catch2/internal/catch_floating_point_helpers.cpp
@@ -27,6 +27,17 @@ namespace Catch {
             return i;
         }
 
+#if defined( __GNUC__ ) || defined( __clang__ )
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wfloat-equal"
+#endif
+        bool directCompare( float lhs, float rhs ) { return lhs == rhs; }
+        bool directCompare( double lhs, double rhs ) { return lhs == rhs; }
+#if defined( __GNUC__ ) || defined( __clang__ )
+#    pragma GCC diagnostic pop
+#endif
+
+
     } // end namespace Detail
 } // end namespace Catch
 
diff --git a/packages/Catch2/src/catch2/internal/catch_floating_point_helpers.hpp b/packages/Catch2/src/catch2/internal/catch_floating_point_helpers.hpp
index ca883c613c1b625f8235d263c68c8763719b086f..b2143726d4c4566eddb3f2d53997cb77baa57327 100644
--- a/packages/Catch2/src/catch2/internal/catch_floating_point_helpers.hpp
+++ b/packages/Catch2/src/catch2/internal/catch_floating_point_helpers.hpp
@@ -22,6 +22,11 @@ namespace Catch {
         uint32_t convertToBits(float f);
         uint64_t convertToBits(double d);
 
+        // Used when we know we want == comparison of two doubles
+        // to centralize warning suppression
+        bool directCompare( float lhs, float rhs );
+        bool directCompare( double lhs, double rhs );
+
     } // end namespace Detail
 
 
diff --git a/packages/Catch2/src/catch2/internal/catch_istream.cpp b/packages/Catch2/src/catch2/internal/catch_istream.cpp
index bf0f66e42320d18480531076b19b14774472b318..2867ce747caf48871713bc84a4fd1cf77be990c2 100644
--- a/packages/Catch2/src/catch2/internal/catch_istream.cpp
+++ b/packages/Catch2/src/catch2/internal/catch_istream.cpp
@@ -80,7 +80,6 @@ namespace Detail {
                 CATCH_ENFORCE( !m_ofs.fail(), "Unable to open file: '" << filename << '\'' );
                 m_ofs << std::unitbuf;
             }
-            ~FileStream() override = default;
         public: // IStream
             std::ostream& stream() override {
                 return m_ofs;
@@ -95,7 +94,6 @@ namespace Detail {
             // Store the streambuf from cout up-front because
             // cout may get redirected when running tests
             CoutStream() : m_os( Catch::cout().rdbuf() ) {}
-            ~CoutStream() override = default;
 
         public: // IStream
             std::ostream& stream() override { return m_os; }
@@ -109,7 +107,6 @@ namespace Detail {
             // Store the streambuf from cerr up-front because
             // cout may get redirected when running tests
             CerrStream(): m_os( Catch::cerr().rdbuf() ) {}
-            ~CerrStream() override = default;
 
         public: // IStream
             std::ostream& stream() override { return m_os; }
@@ -127,8 +124,6 @@ namespace Detail {
                 m_os( m_streamBuf.get() )
             {}
 
-            ~DebugOutStream() override = default;
-
         public: // IStream
             std::ostream& stream() override { return m_os; }
         };
diff --git a/packages/Catch2/src/catch2/internal/catch_jsonwriter.cpp b/packages/Catch2/src/catch2/internal/catch_jsonwriter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ff65a9d346c924b9722fd79f2dd5cac09c29784d
--- /dev/null
+++ b/packages/Catch2/src/catch2/internal/catch_jsonwriter.cpp
@@ -0,0 +1,148 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+#include <catch2/internal/catch_enforce.hpp>
+#include <catch2/internal/catch_jsonwriter.hpp>
+
+namespace Catch {
+    void JsonUtils::indent( std::ostream& os, std::uint64_t level ) {
+        for ( std::uint64_t i = 0; i < level; ++i ) {
+            os << "  ";
+        }
+    }
+    void JsonUtils::appendCommaNewline( std::ostream& os,
+                                        bool& should_comma,
+                                        std::uint64_t level ) {
+        if ( should_comma ) { os << ','; }
+        should_comma = true;
+        os << '\n';
+        indent( os, level );
+    }
+
+    JsonObjectWriter::JsonObjectWriter( std::ostream& os ):
+        JsonObjectWriter{ os, 0 } {}
+
+    JsonObjectWriter::JsonObjectWriter( std::ostream& os,
+                                        std::uint64_t indent_level ):
+        m_os{ os }, m_indent_level{ indent_level } {
+        m_os << '{';
+    }
+    JsonObjectWriter::JsonObjectWriter( JsonObjectWriter&& source ):
+        m_os{ source.m_os },
+        m_indent_level{ source.m_indent_level },
+        m_should_comma{ source.m_should_comma },
+        m_active{ source.m_active } {
+        source.m_active = false;
+    }
+
+    JsonObjectWriter::~JsonObjectWriter() {
+        if ( !m_active ) { return; }
+
+        m_os << '\n';
+        JsonUtils::indent( m_os, m_indent_level );
+        m_os << '}';
+    }
+
+    JsonValueWriter JsonObjectWriter::write( StringRef key ) {
+        JsonUtils::appendCommaNewline(
+            m_os, m_should_comma, m_indent_level + 1 );
+
+        m_os << '"' << key << "\": ";
+        return JsonValueWriter{ m_os, m_indent_level + 1 };
+    }
+
+    JsonArrayWriter::JsonArrayWriter( std::ostream& os ):
+        JsonArrayWriter{ os, 0 } {}
+    JsonArrayWriter::JsonArrayWriter( std::ostream& os,
+                                      std::uint64_t indent_level ):
+        m_os{ os }, m_indent_level{ indent_level } {
+        m_os << '[';
+    }
+    JsonArrayWriter::JsonArrayWriter( JsonArrayWriter&& source ):
+        m_os{ source.m_os },
+        m_indent_level{ source.m_indent_level },
+        m_should_comma{ source.m_should_comma },
+        m_active{ source.m_active } {
+        source.m_active = false;
+    }
+    JsonArrayWriter::~JsonArrayWriter() {
+        if ( !m_active ) { return; }
+
+        m_os << '\n';
+        JsonUtils::indent( m_os, m_indent_level );
+        m_os << ']';
+    }
+
+    JsonObjectWriter JsonArrayWriter::writeObject() {
+        JsonUtils::appendCommaNewline(
+            m_os, m_should_comma, m_indent_level + 1 );
+        return JsonObjectWriter{ m_os, m_indent_level + 1 };
+    }
+
+    JsonArrayWriter JsonArrayWriter::writeArray() {
+        JsonUtils::appendCommaNewline(
+            m_os, m_should_comma, m_indent_level + 1 );
+        return JsonArrayWriter{ m_os, m_indent_level + 1 };
+    }
+
+    JsonArrayWriter& JsonArrayWriter::write( bool value ) {
+        return writeImpl( value );
+    }
+
+    JsonValueWriter::JsonValueWriter( std::ostream& os ):
+        JsonValueWriter{ os, 0 } {}
+
+    JsonValueWriter::JsonValueWriter( std::ostream& os,
+                                      std::uint64_t indent_level ):
+        m_os{ os }, m_indent_level{ indent_level } {}
+
+    JsonObjectWriter JsonValueWriter::writeObject() && {
+        return JsonObjectWriter{ m_os, m_indent_level };
+    }
+
+    JsonArrayWriter JsonValueWriter::writeArray() && {
+        return JsonArrayWriter{ m_os, m_indent_level };
+    }
+
+    void JsonValueWriter::write( Catch::StringRef value ) && {
+        writeImpl( value, true );
+    }
+
+    void JsonValueWriter::write( bool value ) && {
+        writeImpl( value ? "true"_sr : "false"_sr, false );
+    }
+
+    void JsonValueWriter::writeImpl( Catch::StringRef value, bool quote ) {
+        if ( quote ) { m_os << '"'; }
+        for (char c : value) {
+            // Escape list taken from https://www.json.org/json-en.html,
+            // string definition.
+            // Note that while forward slash _can_ be escaped, it does
+            // not have to be, if JSON is not further embedded somewhere
+            // where forward slash is meaningful.
+            if ( c == '"' ) {
+                m_os << "\\\"";
+            } else if ( c == '\\' ) {
+                m_os << "\\\\";
+            } else if ( c == '\b' ) {
+                m_os << "\\b";
+            } else if ( c == '\f' ) {
+                m_os << "\\f";
+            } else if ( c == '\n' ) {
+                m_os << "\\n";
+            } else if ( c == '\r' ) {
+                m_os << "\\r";
+            } else if ( c == '\t' ) {
+                m_os << "\\t";
+            } else {
+                m_os << c;
+            }
+        }
+        if ( quote ) { m_os << '"'; }
+    }
+
+} // namespace Catch
diff --git a/packages/Catch2/src/catch2/internal/catch_jsonwriter.hpp b/packages/Catch2/src/catch2/internal/catch_jsonwriter.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..59c044e45049cac79d5e3dbf690fabe97385eb16
--- /dev/null
+++ b/packages/Catch2/src/catch2/internal/catch_jsonwriter.hpp
@@ -0,0 +1,120 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+#ifndef CATCH_JSONWRITER_HPP_INCLUDED
+#define CATCH_JSONWRITER_HPP_INCLUDED
+
+#include <catch2/internal/catch_reusable_string_stream.hpp>
+#include <catch2/internal/catch_stringref.hpp>
+
+#include <cstdint>
+#include <sstream>
+
+namespace Catch {
+    class JsonObjectWriter;
+    class JsonArrayWriter;
+
+    struct JsonUtils {
+        static void indent( std::ostream& os, std::uint64_t level );
+        static void appendCommaNewline( std::ostream& os,
+                                        bool& should_comma,
+                                        std::uint64_t level );
+    };
+
+    class JsonValueWriter {
+    public:
+        JsonValueWriter( std::ostream& os );
+        JsonValueWriter( std::ostream& os, std::uint64_t indent_level );
+
+        JsonObjectWriter writeObject() &&;
+        JsonArrayWriter writeArray() &&;
+
+        template <typename T>
+        void write( T const& value ) && {
+            writeImpl( value, !std::is_arithmetic<T>::value );
+        }
+        void write( StringRef value ) &&;
+        void write( bool value ) &&;
+
+    private:
+        void writeImpl( StringRef value, bool quote );
+
+        // Without this SFINAE, this overload is a better match
+        // for `std::string`, `char const*`, `char const[N]` args.
+        // While it would still work, it would cause code bloat
+        // and multiple iteration over the strings
+        template <typename T,
+                  typename = typename std::enable_if_t<
+                      !std::is_convertible<T, StringRef>::value>>
+        void writeImpl( T const& value, bool quote_value ) {
+            m_sstream << value;
+            writeImpl( m_sstream.str(), quote_value );
+        }
+
+        std::ostream& m_os;
+        std::stringstream m_sstream;
+        std::uint64_t m_indent_level;
+    };
+
+    class JsonObjectWriter {
+    public:
+        JsonObjectWriter( std::ostream& os );
+        JsonObjectWriter( std::ostream& os, std::uint64_t indent_level );
+
+        JsonObjectWriter( JsonObjectWriter&& source );
+        JsonObjectWriter& operator=( JsonObjectWriter&& source ) = delete;
+
+        ~JsonObjectWriter();
+
+        JsonValueWriter write( StringRef key );
+
+    private:
+        std::ostream& m_os;
+        std::uint64_t m_indent_level;
+        bool m_should_comma = false;
+        bool m_active = true;
+    };
+
+    class JsonArrayWriter {
+    public:
+        JsonArrayWriter( std::ostream& os );
+        JsonArrayWriter( std::ostream& os, std::uint64_t indent_level );
+
+        JsonArrayWriter( JsonArrayWriter&& source );
+        JsonArrayWriter& operator=( JsonArrayWriter&& source ) = delete;
+
+        ~JsonArrayWriter();
+
+        JsonObjectWriter writeObject();
+        JsonArrayWriter writeArray();
+
+        template <typename T>
+        JsonArrayWriter& write( T const& value ) {
+            return writeImpl( value );
+        }
+
+        JsonArrayWriter& write( bool value );
+
+    private:
+        template <typename T>
+        JsonArrayWriter& writeImpl( T const& value ) {
+            JsonUtils::appendCommaNewline(
+                m_os, m_should_comma, m_indent_level + 1 );
+            JsonValueWriter{ m_os }.write( value );
+
+            return *this;
+        }
+
+        std::ostream& m_os;
+        std::uint64_t m_indent_level;
+        bool m_should_comma = false;
+        bool m_active = true;
+    };
+
+} // namespace Catch
+
+#endif // CATCH_JSONWRITER_HPP_INCLUDED
diff --git a/packages/Catch2/src/catch2/internal/catch_leak_detector.cpp b/packages/Catch2/src/catch2/internal/catch_leak_detector.cpp
index 7389eaf7787dc640f0a6e5817991ba0a548e307b..691bc772e186c41b35f69a04bdd5fe9579febe61 100644
--- a/packages/Catch2/src/catch2/internal/catch_leak_detector.cpp
+++ b/packages/Catch2/src/catch2/internal/catch_leak_detector.cpp
@@ -29,7 +29,7 @@ namespace Catch {
 
 #else // ^^ Windows crt debug heap enabled // Windows crt debug heap disabled vv
 
-    Catch::LeakDetector::LeakDetector() {}
+    Catch::LeakDetector::LeakDetector() = default;
 
 #endif // CATCH_CONFIG_WINDOWS_CRTDBG
 
diff --git a/packages/Catch2/src/catch2/internal/catch_list.cpp b/packages/Catch2/src/catch2/internal/catch_list.cpp
index 97e4c59315b7d74c40fcffa71215d583ab0a7776..5bd06a2aefdbdefb9d109ceb80904b3da7cdd106 100644
--- a/packages/Catch2/src/catch2/internal/catch_list.cpp
+++ b/packages/Catch2/src/catch2/internal/catch_list.cpp
@@ -14,10 +14,7 @@
 #include <catch2/internal/catch_reporter_registry.hpp>
 #include <catch2/internal/catch_move_and_forward.hpp>
 #include <catch2/internal/catch_case_insensitive_comparisons.hpp>
-
-#include <catch2/internal/catch_context.hpp>
 #include <catch2/catch_config.hpp>
-#include <catch2/catch_test_spec.hpp>
 #include <catch2/catch_test_case_info.hpp>
 
 namespace Catch {
diff --git a/packages/Catch2/src/catch2/internal/catch_polyfills.cpp b/packages/Catch2/src/catch2/internal/catch_polyfills.cpp
index 96efad5dded864ba4c7e27c68caa451335fc88a8..776c224396708ab22ae210566f5c6b7ed50b321d 100644
--- a/packages/Catch2/src/catch2/internal/catch_polyfills.cpp
+++ b/packages/Catch2/src/catch2/internal/catch_polyfills.cpp
@@ -31,4 +31,12 @@ namespace Catch {
     }
 #endif
 
+#if !defined( CATCH_CONFIG_GLOBAL_NEXTAFTER )
+    float nextafter( float x, float y ) { return std::nextafter( x, y ); }
+    double nextafter( double x, double y ) { return std::nextafter( x, y ); }
+#else
+    float nextafter( float x, float y ) { return ::nextafterf( x, y ); }
+    double nextafter( double x, double y ) { return ::nextafter( x, y ); }
+#endif
+
 } // end namespace Catch
diff --git a/packages/Catch2/src/catch2/internal/catch_polyfills.hpp b/packages/Catch2/src/catch2/internal/catch_polyfills.hpp
index 23a9332bc75dbd236efed770ba2aa74461f696db..4503f8f2a2f11f1b467cc0a3dd4659369a660cfa 100644
--- a/packages/Catch2/src/catch2/internal/catch_polyfills.hpp
+++ b/packages/Catch2/src/catch2/internal/catch_polyfills.hpp
@@ -9,8 +9,13 @@
 #define CATCH_POLYFILLS_HPP_INCLUDED
 
 namespace Catch {
+
     bool isnan(float f);
     bool isnan(double d);
+
+    float nextafter(float x, float y);
+    double nextafter(double x, double y);
+
 }
 
 #endif // CATCH_POLYFILLS_HPP_INCLUDED
diff --git a/packages/Catch2/src/catch2/internal/catch_random_floating_point_helpers.hpp b/packages/Catch2/src/catch2/internal/catch_random_floating_point_helpers.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c59c053916ea903a99f7ba55cb38d2d03ca2db10
--- /dev/null
+++ b/packages/Catch2/src/catch2/internal/catch_random_floating_point_helpers.hpp
@@ -0,0 +1,94 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
+#ifndef CATCH_RANDOM_FLOATING_POINT_HELPERS_HPP_INCLUDED
+#define CATCH_RANDOM_FLOATING_POINT_HELPERS_HPP_INCLUDED
+
+#include <catch2/internal/catch_polyfills.hpp>
+
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+
+namespace Catch {
+
+    namespace Detail {
+        /**
+         * Returns the largest magnitude of 1-ULP distance inside the [a, b] range.
+         *
+         * Assumes `a < b`.
+         */
+        template <typename FloatType>
+        FloatType gamma(FloatType a, FloatType b) {
+            static_assert( std::is_floating_point<FloatType>::value,
+                           "gamma returns the largest ULP magnitude within "
+                           "floating point range [a, b]. This only makes sense "
+                           "for floating point types" );
+            assert( a <= b );
+
+            const auto gamma_up = Catch::nextafter( a, std::numeric_limits<FloatType>::infinity() ) - a;
+            const auto gamma_down = b - Catch::nextafter( b, -std::numeric_limits<FloatType>::infinity() );
+
+            return gamma_up < gamma_down ? gamma_down : gamma_up;
+        }
+
+        template <typename FloatingPoint>
+        struct DistanceTypePicker;
+        template <>
+        struct DistanceTypePicker<float> {
+            using type = std::uint32_t;
+        };
+        template <>
+        struct DistanceTypePicker<double> {
+            using type = std::uint64_t;
+        };
+
+        template <typename T>
+        using DistanceType = typename DistanceTypePicker<T>::type;
+
+#if defined( __GNUC__ ) || defined( __clang__ )
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wfloat-equal"
+#endif
+        /**
+         * Computes the number of equi-distant floats in [a, b]
+         *
+         * Since not every range can be split into equidistant floats
+         * exactly, we actually compute ceil(b/distance - a/distance),
+         * because in those cases we want to overcount.
+         *
+         * Uses modified Dekker's FastTwoSum algorithm to handle rounding.
+         */
+        template <typename FloatType>
+        DistanceType<FloatType>
+        count_equidistant_floats( FloatType a, FloatType b, FloatType distance ) {
+            assert( a <= b );
+            // We get distance as gamma for our uniform float distribution,
+            // so this will round perfectly.
+            const auto ag = a / distance;
+            const auto bg = b / distance;
+
+            const auto s = bg - ag;
+            const auto err = ( std::fabs( a ) <= std::fabs( b ) )
+                                 ? -ag - ( s - bg )
+                                 : bg - ( s + ag );
+            const auto ceil_s = static_cast<DistanceType<FloatType>>( std::ceil( s ) );
+
+            return ( ceil_s != s ) ? ceil_s : ceil_s + ( err > 0 );
+        }
+#if defined( __GNUC__ ) || defined( __clang__ )
+#    pragma GCC diagnostic pop
+#endif
+
+    }
+
+} // end namespace Catch
+
+#endif // CATCH_RANDOM_FLOATING_POINT_HELPERS_HPP_INCLUDED
diff --git a/packages/Catch2/src/catch2/internal/catch_random_integer_helpers.hpp b/packages/Catch2/src/catch2/internal/catch_random_integer_helpers.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1c450f05c3d4d93b494dea335bff2bd15e08a652
--- /dev/null
+++ b/packages/Catch2/src/catch2/internal/catch_random_integer_helpers.hpp
@@ -0,0 +1,202 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
+#ifndef CATCH_RANDOM_INTEGER_HELPERS_HPP_INCLUDED
+#define CATCH_RANDOM_INTEGER_HELPERS_HPP_INCLUDED
+
+#include <climits>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+namespace Catch {
+    namespace Detail {
+
+        template <std::size_t>
+        struct SizedUnsignedType;
+#define SizedUnsignedTypeHelper( TYPE )        \
+    template <>                                \
+    struct SizedUnsignedType<sizeof( TYPE )> { \
+        using type = TYPE;                     \
+    }
+
+        SizedUnsignedTypeHelper( std::uint8_t );
+        SizedUnsignedTypeHelper( std::uint16_t );
+        SizedUnsignedTypeHelper( std::uint32_t );
+        SizedUnsignedTypeHelper( std::uint64_t );
+#undef SizedUnsignedTypeHelper
+
+        template <std::size_t sz>
+        using SizedUnsignedType_t = typename SizedUnsignedType<sz>::type;
+
+        template <typename T>
+        using DoubleWidthUnsignedType_t = SizedUnsignedType_t<2 * sizeof( T )>;
+
+        template <typename T>
+        struct ExtendedMultResult {
+            T upper;
+            T lower;
+            friend bool operator==( ExtendedMultResult const& lhs,
+                                    ExtendedMultResult const& rhs ) {
+                return lhs.upper == rhs.upper && lhs.lower == rhs.lower;
+            }
+        };
+
+        // Returns 128 bit result of multiplying lhs and rhs
+        constexpr ExtendedMultResult<std::uint64_t>
+        extendedMult( std::uint64_t lhs, std::uint64_t rhs ) {
+            // We use the simple long multiplication approach for
+            // correctness, we can use platform specific builtins
+            // for performance later.
+
+            // Split the lhs and rhs into two 32bit "digits", so that we can
+            // do 64 bit arithmetic to handle carry bits.
+            //            32b    32b    32b    32b
+            //     lhs                  L1     L2
+            //   * rhs                  R1     R2
+            //            ------------------------
+            //                       |  R2 * L2  |
+            //                 |  R2 * L1  |
+            //                 |  R1 * L2  |
+            //           |  R1 * L1  |
+            //           -------------------------
+            //           |  a  |  b  |  c  |  d  |
+
+#define CarryBits( x ) ( x >> 32 )
+#define Digits( x ) ( x & 0xFF'FF'FF'FF )
+
+            auto r2l2 = Digits( rhs ) * Digits( lhs );
+            auto r2l1 = Digits( rhs ) * CarryBits( lhs );
+            auto r1l2 = CarryBits( rhs ) * Digits( lhs );
+            auto r1l1 = CarryBits( rhs ) * CarryBits( lhs );
+
+            // Sum to columns first
+            auto d = Digits( r2l2 );
+            auto c = CarryBits( r2l2 ) + Digits( r2l1 ) + Digits( r1l2 );
+            auto b = CarryBits( r2l1 ) + CarryBits( r1l2 ) + Digits( r1l1 );
+            auto a = CarryBits( r1l1 );
+
+            // Propagate carries between columns
+            c += CarryBits( d );
+            b += CarryBits( c );
+            a += CarryBits( b );
+
+            // Remove the used carries
+            c = Digits( c );
+            b = Digits( b );
+            a = Digits( a );
+
+#undef CarryBits
+#undef Digits
+
+            return {
+                a << 32 | b, // upper 64 bits
+                c << 32 | d  // lower 64 bits
+            };
+        }
+
+        template <typename UInt>
+        constexpr ExtendedMultResult<UInt> extendedMult( UInt lhs, UInt rhs ) {
+            static_assert( std::is_unsigned<UInt>::value,
+                           "extendedMult can only handle unsigned integers" );
+            static_assert( sizeof( UInt ) < sizeof( std::uint64_t ),
+                           "Generic extendedMult can only handle types smaller "
+                           "than uint64_t" );
+            using WideType = DoubleWidthUnsignedType_t<UInt>;
+
+            auto result = WideType( lhs ) * WideType( rhs );
+            return {
+                static_cast<UInt>( result >> ( CHAR_BIT * sizeof( UInt ) ) ),
+                static_cast<UInt>( result & UInt( -1 ) ) };
+        }
+
+
+        template <typename TargetType,
+                  typename Generator>
+            std::enable_if_t<sizeof(typename Generator::result_type) >= sizeof(TargetType),
+            TargetType> fillBitsFrom(Generator& gen) {
+            using gresult_type = typename Generator::result_type;
+            static_assert( std::is_unsigned<TargetType>::value, "Only unsigned integers are supported" );
+            static_assert( Generator::min() == 0 &&
+                           Generator::max() == static_cast<gresult_type>( -1 ),
+                           "Generator must be able to output all numbers in its result type (effectively it must be a random bit generator)" );
+
+            // We want to return the top bits from a generator, as they are
+            // usually considered higher quality.
+            constexpr auto generated_bits = sizeof( gresult_type ) * CHAR_BIT;
+            constexpr auto return_bits = sizeof( TargetType ) * CHAR_BIT;
+
+            return static_cast<TargetType>( gen() >>
+                                            ( generated_bits - return_bits) );
+        }
+
+        template <typename TargetType,
+                  typename Generator>
+            std::enable_if_t<sizeof(typename Generator::result_type) < sizeof(TargetType),
+            TargetType> fillBitsFrom(Generator& gen) {
+            using gresult_type = typename Generator::result_type;
+            static_assert( std::is_unsigned<TargetType>::value,
+                           "Only unsigned integers are supported" );
+            static_assert( Generator::min() == 0 &&
+                           Generator::max() == static_cast<gresult_type>( -1 ),
+                           "Generator must be able to output all numbers in its result type (effectively it must be a random bit generator)" );
+
+            constexpr auto generated_bits = sizeof( gresult_type ) * CHAR_BIT;
+            constexpr auto return_bits = sizeof( TargetType ) * CHAR_BIT;
+            std::size_t filled_bits = 0;
+            TargetType ret = 0;
+            do {
+                ret <<= generated_bits;
+                ret |= gen();
+                filled_bits += generated_bits;
+            } while ( filled_bits < return_bits );
+
+            return ret;
+        }
+
+        /*
+         * Transposes numbers into unsigned type while keeping their ordering
+         *
+         * This means that signed types are changed so that the ordering is
+         * [INT_MIN, ..., -1, 0, ..., INT_MAX], rather than order we would
+         * get by simple casting ([0, ..., INT_MAX, INT_MIN, ..., -1])
+         */
+        template <typename OriginalType, typename UnsignedType>
+        std::enable_if_t<std::is_signed<OriginalType>::value, UnsignedType>
+        transposeToNaturalOrder( UnsignedType in ) {
+            static_assert(
+                sizeof( OriginalType ) == sizeof( UnsignedType ),
+                "reordering requires the same sized types on both sides" );
+            static_assert( std::is_unsigned<UnsignedType>::value,
+                           "Input type must be unsigned" );
+            // Assuming 2s complement (standardized in current C++), the
+            // positive and negative numbers are already internally ordered,
+            // and their difference is in the top bit. Swapping it orders
+            // them the desired way.
+            constexpr auto highest_bit =
+                UnsignedType( 1 ) << ( sizeof( UnsignedType ) * CHAR_BIT - 1 );
+            return static_cast<UnsignedType>( in ^ highest_bit );
+        }
+
+
+
+        template <typename OriginalType,
+                  typename UnsignedType>
+        std::enable_if_t<std::is_unsigned<OriginalType>::value, UnsignedType>
+            transposeToNaturalOrder(UnsignedType in) {
+            static_assert(
+                sizeof( OriginalType ) == sizeof( UnsignedType ),
+                "reordering requires the same sized types on both sides" );
+            static_assert( std::is_unsigned<UnsignedType>::value, "Input type must be unsigned" );
+            // No reordering is needed for unsigned -> unsigned
+            return in;
+        }
+    } // namespace Detail
+} // namespace Catch
+
+#endif // CATCH_RANDOM_INTEGER_HELPERS_HPP_INCLUDED
diff --git a/packages/Catch2/src/catch2/internal/catch_random_seed_generation.cpp b/packages/Catch2/src/catch2/internal/catch_random_seed_generation.cpp
index 40c468cb4ba3d1debd3bb1db0f7755fe570cde51..fdc3fa19e35ab7f5d7381e7551c2efeec7270d56 100644
--- a/packages/Catch2/src/catch2/internal/catch_random_seed_generation.cpp
+++ b/packages/Catch2/src/catch2/internal/catch_random_seed_generation.cpp
@@ -9,6 +9,7 @@
 #include <catch2/internal/catch_random_seed_generation.hpp>
 
 #include <catch2/internal/catch_enforce.hpp>
+#include <catch2/internal/catch_random_integer_helpers.hpp>
 
 #include <ctime>
 #include <random>
@@ -21,10 +22,10 @@ namespace Catch {
             return static_cast<std::uint32_t>( std::time( nullptr ) );
 
         case GenerateFrom::Default:
-        case GenerateFrom::RandomDevice:
-            // In theory, a platform could have random_device that returns just
-            // 16 bits. That is still some randomness, so we don't care too much
-            return static_cast<std::uint32_t>( std::random_device{}() );
+        case GenerateFrom::RandomDevice: {
+            std::random_device rd;
+            return Detail::fillBitsFrom<std::uint32_t>( rd );
+        }
 
         default:
             CATCH_ERROR("Unknown generation method");
diff --git a/packages/Catch2/src/catch2/internal/catch_reporter_registry.cpp b/packages/Catch2/src/catch2/internal/catch_reporter_registry.cpp
index b2b0cd074da4a09f2c2ecbb7574417d69595e63f..cea8c4dc9ae090e817a1a662a75d95ff519ea715 100644
--- a/packages/Catch2/src/catch2/internal/catch_reporter_registry.cpp
+++ b/packages/Catch2/src/catch2/internal/catch_reporter_registry.cpp
@@ -6,13 +6,14 @@
 
 // SPDX-License-Identifier: BSL-1.0
 
-#include <catch2/internal/catch_reporter_registry.hpp>
 #include <catch2/interfaces/catch_interfaces_reporter_factory.hpp>
 #include <catch2/internal/catch_enforce.hpp>
 #include <catch2/internal/catch_move_and_forward.hpp>
+#include <catch2/internal/catch_reporter_registry.hpp>
 #include <catch2/reporters/catch_reporter_automake.hpp>
 #include <catch2/reporters/catch_reporter_compact.hpp>
 #include <catch2/reporters/catch_reporter_console.hpp>
+#include <catch2/reporters/catch_reporter_json.hpp>
 #include <catch2/reporters/catch_reporter_junit.hpp>
 #include <catch2/reporters/catch_reporter_registrars.hpp>
 #include <catch2/reporters/catch_reporter_sonarqube.hpp>
@@ -47,6 +48,8 @@ namespace Catch {
             Detail::make_unique<ReporterFactory<TeamCityReporter>>();
         m_impl->factories["XML"] =
             Detail::make_unique<ReporterFactory<XmlReporter>>();
+        m_impl->factories["JSON"] =
+            Detail::make_unique<ReporterFactory<JsonReporter>>();
     }
 
     ReporterRegistry::~ReporterRegistry() = default;
diff --git a/packages/Catch2/src/catch2/internal/catch_reporter_spec_parser.hpp b/packages/Catch2/src/catch2/internal/catch_reporter_spec_parser.hpp
index d446ce98b4614e0c7c804c141e48d7d680316575..9f447ee2f73e0a2ed78b35ed06ac96dfa201fc44 100644
--- a/packages/Catch2/src/catch2/internal/catch_reporter_spec_parser.hpp
+++ b/packages/Catch2/src/catch2/internal/catch_reporter_spec_parser.hpp
@@ -8,7 +8,7 @@
 #ifndef CATCH_REPORTER_SPEC_PARSER_HPP_INCLUDED
 #define CATCH_REPORTER_SPEC_PARSER_HPP_INCLUDED
 
-#include <catch2/internal/catch_console_colour.hpp>
+#include <catch2/interfaces/catch_interfaces_config.hpp>
 #include <catch2/internal/catch_optional.hpp>
 #include <catch2/internal/catch_stringref.hpp>
 
diff --git a/packages/Catch2/src/catch2/internal/catch_run_context.cpp b/packages/Catch2/src/catch2/internal/catch_run_context.cpp
index 6f15cfb1a4bbf34767d357c3eb5deccc3d31a11e..77b476d82a2313fa6c8c9e2eccd6be9be9420f2f 100644
--- a/packages/Catch2/src/catch2/internal/catch_run_context.cpp
+++ b/packages/Catch2/src/catch2/internal/catch_run_context.cpp
@@ -20,6 +20,7 @@
 #include <catch2/internal/catch_output_redirect.hpp>
 #include <catch2/internal/catch_assertion_handler.hpp>
 #include <catch2/internal/catch_test_failure_exception.hpp>
+#include <catch2/internal/catch_result_type.hpp>
 
 #include <cassert>
 #include <algorithm>
@@ -37,7 +38,6 @@ namespace Catch {
                     TrackerContext& ctx,
                     ITracker* parent ):
                     TrackerBase( CATCH_MOVE( nameAndLocation ), ctx, parent ) {}
-                ~GeneratorTracker() override = default;
 
                 static GeneratorTracker*
                 acquire( TrackerContext& ctx,
@@ -293,13 +293,14 @@ namespace Catch {
             m_messageScopes.clear();
         }
 
-        // Reset working state
-        resetAssertionInfo();
+        // Reset working state. assertion info will be reset after
+        // populateReaction is run if it is needed
         m_lastResult = CATCH_MOVE( result );
     }
     void RunContext::resetAssertionInfo() {
         m_lastAssertionInfo.macroName = StringRef();
         m_lastAssertionInfo.capturedExpression = "{Unknown expression after the reported line}"_sr;
+        m_lastAssertionInfo.resultDisposition = ResultDisposition::Normal;
     }
 
     void RunContext::notifyAssertionStarted( AssertionInfo const& info ) {
@@ -447,6 +448,7 @@ namespace Catch {
         AssertionResult result(m_lastAssertionInfo, CATCH_MOVE(tempResult));
 
         assertionEnded(CATCH_MOVE(result) );
+        resetAssertionInfo();
 
         handleUnfinishedSections();
 
@@ -583,6 +585,7 @@ namespace Catch {
             reportExpr(info, ResultWas::ExpressionFailed, &expr, negated );
             populateReaction( reaction );
         }
+        resetAssertionInfo();
     }
     void RunContext::reportExpr(
             AssertionInfo const &info,
@@ -621,6 +624,7 @@ namespace Catch {
             // considered "OK"
             reaction.shouldSkip = true;
         }
+        resetAssertionInfo();
     }
     void RunContext::handleUnexpectedExceptionNotThrown(
             AssertionInfo const& info,
@@ -641,6 +645,7 @@ namespace Catch {
         AssertionResult assertionResult{ info, CATCH_MOVE(data) };
         assertionEnded( CATCH_MOVE(assertionResult) );
         populateReaction( reaction );
+        resetAssertionInfo();
     }
 
     void RunContext::populateReaction( AssertionReaction& reaction ) {
@@ -658,6 +663,7 @@ namespace Catch {
         data.message = "Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE"s;
         AssertionResult assertionResult{ info, CATCH_MOVE( data ) };
         assertionEnded( CATCH_MOVE(assertionResult) );
+        resetAssertionInfo();
     }
     void RunContext::handleNonExpr(
             AssertionInfo const &info,
@@ -672,6 +678,7 @@ namespace Catch {
         const auto isOk = assertionResult.isOk();
         assertionEnded( CATCH_MOVE(assertionResult) );
         if ( !isOk ) { populateReaction( reaction ); }
+        resetAssertionInfo();
     }
 
 
diff --git a/packages/Catch2/src/catch2/internal/catch_section.cpp b/packages/Catch2/src/catch2/internal/catch_section.cpp
index 061732b1d875c4433eb8c11632b434786a6d6ac6..677c2164c2ab8503ecfc2f6d92d12ceaf4d409ed 100644
--- a/packages/Catch2/src/catch2/internal/catch_section.cpp
+++ b/packages/Catch2/src/catch2/internal/catch_section.cpp
@@ -6,7 +6,7 @@
 
 // SPDX-License-Identifier: BSL-1.0
 #include <catch2/internal/catch_section.hpp>
-#include <catch2/internal/catch_run_context.hpp>
+#include <catch2/interfaces/catch_interfaces_capture.hpp>
 #include <catch2/internal/catch_uncaught_exceptions.hpp>
 #include <catch2/internal/catch_move_and_forward.hpp>
 
diff --git a/packages/Catch2/src/catch2/internal/catch_section.hpp b/packages/Catch2/src/catch2/internal/catch_section.hpp
index bd92bdf462b03e37aca9966bb6e6c09ac79b11ba..8c894eeb8742e20578793b7d2e34d4bc8fe85768 100644
--- a/packages/Catch2/src/catch2/internal/catch_section.hpp
+++ b/packages/Catch2/src/catch2/internal/catch_section.hpp
@@ -78,7 +78,7 @@ namespace Catch {
         CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                           \
         CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS                    \
         CATCH_INTERNAL_SUPPRESS_SHADOW_WARNINGS                             \
-        if ( [[maybe_unused]] int catchInternalPreviousSectionHint =        \
+        if ( [[maybe_unused]] const int catchInternalPreviousSectionHint =  \
                  catchInternalSectionHint,                                  \
              catchInternalSectionHint = Catch::Detail::GetNewSectionHint(); \
              catchInternalPreviousSectionHint == __LINE__ )                 \
@@ -88,7 +88,7 @@ namespace Catch {
         CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                           \
         CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS                    \
         CATCH_INTERNAL_SUPPRESS_SHADOW_WARNINGS                             \
-        if ( [[maybe_unused]] int catchInternalPreviousSectionHint =        \
+        if ( [[maybe_unused]] const int catchInternalPreviousSectionHint =  \
                  catchInternalSectionHint,                                  \
              catchInternalSectionHint = Catch::Detail::GetNewSectionHint(); \
              catchInternalPreviousSectionHint == __LINE__ )                 \
diff --git a/packages/Catch2/src/catch2/internal/catch_sharding.hpp b/packages/Catch2/src/catch2/internal/catch_sharding.hpp
index d0e4cfa13fa62c98338cf9752732aaa3a8f9d40e..22561f4bf1f4a712f16cf18e68de123683edafba 100644
--- a/packages/Catch2/src/catch2/internal/catch_sharding.hpp
+++ b/packages/Catch2/src/catch2/internal/catch_sharding.hpp
@@ -8,8 +8,7 @@
 #ifndef CATCH_SHARDING_HPP_INCLUDED
 #define CATCH_SHARDING_HPP_INCLUDED
 
-#include <catch2/catch_session.hpp>
-
+#include <cassert>
 #include <cmath>
 #include <algorithm>
 
diff --git a/packages/Catch2/src/catch2/internal/catch_stringref.hpp b/packages/Catch2/src/catch2/internal/catch_stringref.hpp
index 99bb9a986af71ddb3a0101a05a4e6f7eea6dfe4a..4b9212bfae9389d21edae831ca74b42576c75df2 100644
--- a/packages/Catch2/src/catch2/internal/catch_stringref.hpp
+++ b/packages/Catch2/src/catch2/internal/catch_stringref.hpp
@@ -25,6 +25,8 @@ namespace Catch {
         using size_type = std::size_t;
         using const_iterator = const char*;
 
+        static constexpr size_type npos{ static_cast<size_type>( -1 ) };
+
     private:
         static constexpr char const* const s_empty = "";
 
@@ -75,7 +77,7 @@ namespace Catch {
         }
 
         // Returns a substring of [start, start + length).
-        // If start + length > size(), then the substring is [start, start + size()).
+        // If start + length > size(), then the substring is [start, size()).
         // If start > size(), then the substring is empty.
         constexpr StringRef substr(size_type start, size_type length) const noexcept {
             if (start < m_size) {
diff --git a/packages/Catch2/src/catch2/internal/catch_tag_alias_registry.cpp b/packages/Catch2/src/catch2/internal/catch_tag_alias_registry.cpp
index b7c6b9ec74312ba3e71bd121143796f533cc0c70..510df167f9643bcb21f13f93c893373fb796ada5 100644
--- a/packages/Catch2/src/catch2/internal/catch_tag_alias_registry.cpp
+++ b/packages/Catch2/src/catch2/internal/catch_tag_alias_registry.cpp
@@ -6,14 +6,13 @@
 
 // SPDX-License-Identifier: BSL-1.0
 #include <catch2/internal/catch_tag_alias_registry.hpp>
-#include <catch2/internal/catch_console_colour.hpp>
 #include <catch2/internal/catch_enforce.hpp>
 #include <catch2/interfaces/catch_interfaces_registry_hub.hpp>
 #include <catch2/internal/catch_string_manip.hpp>
 
 namespace Catch {
 
-    TagAliasRegistry::~TagAliasRegistry() {}
+    TagAliasRegistry::~TagAliasRegistry() = default;
 
     TagAlias const* TagAliasRegistry::find( std::string const& alias ) const {
         auto it = m_registry.find( alias );
diff --git a/packages/Catch2/src/catch2/internal/catch_test_case_registry_impl.cpp b/packages/Catch2/src/catch2/internal/catch_test_case_registry_impl.cpp
index f1702979e7c79b8fa78410778dc4d2ccd5df5811..c2b052daf705ba99f3068c6d95c1eff411d356f0 100644
--- a/packages/Catch2/src/catch2/internal/catch_test_case_registry_impl.cpp
+++ b/packages/Catch2/src/catch2/internal/catch_test_case_registry_impl.cpp
@@ -7,12 +7,9 @@
 // SPDX-License-Identifier: BSL-1.0
 #include <catch2/internal/catch_test_case_registry_impl.hpp>
 
-#include <catch2/internal/catch_context.hpp>
 #include <catch2/internal/catch_enforce.hpp>
 #include <catch2/interfaces/catch_interfaces_config.hpp>
 #include <catch2/interfaces/catch_interfaces_registry_hub.hpp>
-#include <catch2/internal/catch_random_number_generator.hpp>
-#include <catch2/internal/catch_run_context.hpp>
 #include <catch2/internal/catch_sharding.hpp>
 #include <catch2/catch_test_case_info.hpp>
 #include <catch2/catch_test_spec.hpp>
@@ -73,7 +70,6 @@ namespace Catch {
             return sorted;
         }
         case TestRunOrder::Randomized: {
-            seedRng(config);
             using TestWithHash = std::pair<TestCaseInfoHasher::hash_t, TestCaseHandle>;
 
             TestCaseInfoHasher h{ config.rngSeed() };
diff --git a/packages/Catch2/src/catch2/internal/catch_test_case_registry_impl.hpp b/packages/Catch2/src/catch2/internal/catch_test_case_registry_impl.hpp
index a4a27ed122ddbc86c267ef42f902390c96f248b7..99a38498f8deb94c9e80a3b97486ce8ff0ba22e4 100644
--- a/packages/Catch2/src/catch2/internal/catch_test_case_registry_impl.hpp
+++ b/packages/Catch2/src/catch2/internal/catch_test_case_registry_impl.hpp
@@ -30,8 +30,6 @@ namespace Catch {
 
     class TestRegistry : public ITestCaseRegistry {
     public:
-        ~TestRegistry() override = default;
-
         void registerTest( Detail::unique_ptr<TestCaseInfo> testInfo, Detail::unique_ptr<ITestInvoker> testInvoker );
 
         std::vector<TestCaseInfo*> const& getAllInfos() const override;
diff --git a/packages/Catch2/src/catch2/internal/catch_test_macro_impl.hpp b/packages/Catch2/src/catch2/internal/catch_test_macro_impl.hpp
index 39366023f426193457241be4d5c4dc079fa07374..0d95650fb52e7f13066fe79275a0defdda3b6a62 100644
--- a/packages/Catch2/src/catch2/internal/catch_test_macro_impl.hpp
+++ b/packages/Catch2/src/catch2/internal/catch_test_macro_impl.hpp
@@ -34,7 +34,7 @@
 #else // CATCH_CONFIG_FAST_COMPILE
 
 #define INTERNAL_CATCH_TRY try
-#define INTERNAL_CATCH_CATCH( handler ) catch(...) { handler.handleUnexpectedInflightException(); }
+#define INTERNAL_CATCH_CATCH( handler ) catch(...) { (handler).handleUnexpectedInflightException(); }
 
 #endif
 
@@ -49,7 +49,7 @@
         INTERNAL_CATCH_TRY { \
             CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
             CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS \
-            catchAssertionHandler.handleExpr( Catch::Decomposer() <= __VA_ARGS__ ); \
+            catchAssertionHandler.handleExpr( Catch::Decomposer() <= __VA_ARGS__ ); /* NOLINT(bugprone-chained-comparison) */ \
             CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \
         } INTERNAL_CATCH_CATCH( catchAssertionHandler ) \
         INTERNAL_CATCH_REACT( catchAssertionHandler ) \
diff --git a/packages/Catch2/src/catch2/internal/catch_test_registry.hpp b/packages/Catch2/src/catch2/internal/catch_test_registry.hpp
index d248d3cf904a6f90fb6813929a501bfcacf8281a..7766fe111381d444562c28397b96fc6a3fe4513b 100644
--- a/packages/Catch2/src/catch2/internal/catch_test_registry.hpp
+++ b/packages/Catch2/src/catch2/internal/catch_test_registry.hpp
@@ -113,7 +113,7 @@ static int catchInternalSectionHint = 0;
         CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS                           \
         CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS                   \
         static const Catch::Detail::DummyUse INTERNAL_CATCH_UNIQUE_NAME(   \
-            dummyUser )( &fname );                                         \
+            dummyUser )( &(fname) );                                       \
         CATCH_INTERNAL_SUPPRESS_SHADOW_WARNINGS                            \
         static void fname( [[maybe_unused]] int catchInternalSectionHint ) \
             CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
diff --git a/packages/Catch2/src/catch2/internal/catch_textflow.cpp b/packages/Catch2/src/catch2/internal/catch_textflow.cpp
index 7eac973258e8adcfc319d0e21b53217ea221a39b..857fd2b9f436037bd9ba75c8f6e03279e6762771 100644
--- a/packages/Catch2/src/catch2/internal/catch_textflow.cpp
+++ b/packages/Catch2/src/catch2/internal/catch_textflow.cpp
@@ -233,23 +233,36 @@ namespace Catch {
             return os;
         }
 
-        Columns Column::operator+( Column const& other ) {
+        Columns operator+(Column const& lhs, Column const& rhs) {
             Columns cols;
-            cols += *this;
-            cols += other;
+            cols += lhs;
+            cols += rhs;
             return cols;
         }
-
-        Columns& Columns::operator+=( Column const& col ) {
-            m_columns.push_back( col );
-            return *this;
+        Columns operator+(Column&& lhs, Column&& rhs) {
+            Columns cols;
+            cols += CATCH_MOVE( lhs );
+            cols += CATCH_MOVE( rhs );
+            return cols;
         }
 
-        Columns Columns::operator+( Column const& col ) {
-            Columns combined = *this;
-            combined += col;
+        Columns& operator+=(Columns& lhs, Column const& rhs) {
+            lhs.m_columns.push_back( rhs );
+            return lhs;
+        }
+        Columns& operator+=(Columns& lhs, Column&& rhs) {
+            lhs.m_columns.push_back( CATCH_MOVE(rhs) );
+            return lhs;
+        }
+        Columns operator+( Columns const& lhs, Column const& rhs ) {
+            auto combined( lhs );
+            combined += rhs;
             return combined;
         }
+        Columns operator+( Columns&& lhs, Column&& rhs ) {
+            lhs += CATCH_MOVE( rhs );
+            return CATCH_MOVE( lhs );
+        }
 
     } // namespace TextFlow
 } // namespace Catch
diff --git a/packages/Catch2/src/catch2/internal/catch_textflow.hpp b/packages/Catch2/src/catch2/internal/catch_textflow.hpp
index 0776ab9227d01f7ed99c11fcb771b57c3f778c0c..a78451d559cfc33790f9a601db658f551a998e92 100644
--- a/packages/Catch2/src/catch2/internal/catch_textflow.hpp
+++ b/packages/Catch2/src/catch2/internal/catch_textflow.hpp
@@ -8,8 +8,10 @@
 #ifndef CATCH_TEXTFLOW_HPP_INCLUDED
 #define CATCH_TEXTFLOW_HPP_INCLUDED
 
-#include <cassert>
 #include <catch2/internal/catch_console_width.hpp>
+#include <catch2/internal/catch_move_and_forward.hpp>
+
+#include <cassert>
 #include <string>
 #include <vector>
 
@@ -37,7 +39,7 @@ namespace Catch {
 
         public:
             /**
-             * Iterates "lines" in `Column` and return sthem
+             * Iterates "lines" in `Column` and returns them
              */
             class const_iterator {
                 friend Column;
@@ -91,20 +93,35 @@ namespace Catch {
             using iterator = const_iterator;
 
             explicit Column( std::string const& text ): m_string( text ) {}
+            explicit Column( std::string&& text ):
+                m_string( CATCH_MOVE(text)) {}
 
-            Column& width( size_t newWidth ) {
+            Column& width( size_t newWidth ) & {
                 assert( newWidth > 0 );
                 m_width = newWidth;
                 return *this;
             }
-            Column& indent( size_t newIndent ) {
+            Column&& width( size_t newWidth ) && {
+                assert( newWidth > 0 );
+                m_width = newWidth;
+                return CATCH_MOVE( *this );
+            }
+            Column& indent( size_t newIndent ) & {
                 m_indent = newIndent;
                 return *this;
             }
-            Column& initialIndent( size_t newIndent ) {
+            Column&& indent( size_t newIndent ) && {
+                m_indent = newIndent;
+                return CATCH_MOVE( *this );
+            }
+            Column& initialIndent( size_t newIndent ) & {
                 m_initialIndent = newIndent;
                 return *this;
             }
+            Column&& initialIndent( size_t newIndent ) && {
+                m_initialIndent = newIndent;
+                return CATCH_MOVE( *this );
+            }
 
             size_t width() const { return m_width; }
             const_iterator begin() const { return const_iterator( *this ); }
@@ -113,7 +130,8 @@ namespace Catch {
             friend std::ostream& operator<<( std::ostream& os,
                                              Column const& col );
 
-            Columns operator+( Column const& other );
+            friend Columns operator+( Column const& lhs, Column const& rhs );
+            friend Columns operator+( Column&& lhs, Column&& rhs );
         };
 
         //! Creates a column that serves as an empty space of specific width
@@ -157,8 +175,10 @@ namespace Catch {
             iterator begin() const { return iterator( *this ); }
             iterator end() const { return { *this, iterator::EndTag() }; }
 
-            Columns& operator+=( Column const& col );
-            Columns operator+( Column const& col );
+            friend Columns& operator+=( Columns& lhs, Column const& rhs );
+            friend Columns& operator+=( Columns& lhs, Column&& rhs );
+            friend Columns operator+( Columns const& lhs, Column const& rhs );
+            friend Columns operator+( Columns&& lhs, Column&& rhs );
 
             friend std::ostream& operator<<( std::ostream& os,
                                              Columns const& cols );
diff --git a/packages/Catch2/src/catch2/internal/catch_uncaught_exceptions.cpp b/packages/Catch2/src/catch2/internal/catch_uncaught_exceptions.cpp
index 704d6e1ca92cd9f70d36bc95224c34d59efe9f1b..8cfabc0f8beb9c97934fddae87e002e4acbc8818 100644
--- a/packages/Catch2/src/catch2/internal/catch_uncaught_exceptions.cpp
+++ b/packages/Catch2/src/catch2/internal/catch_uncaught_exceptions.cpp
@@ -7,7 +7,6 @@
 // SPDX-License-Identifier: BSL-1.0
 
 #include <catch2/internal/catch_uncaught_exceptions.hpp>
-#include <catch2/internal/catch_compiler_capabilities.hpp>
 #include <catch2/internal/catch_config_uncaught_exceptions.hpp>
 #include <catch2/catch_user_config.hpp>
 
diff --git a/packages/Catch2/src/catch2/internal/catch_uniform_floating_point_distribution.hpp b/packages/Catch2/src/catch2/internal/catch_uniform_floating_point_distribution.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..23d03b43c36f4a09ceaba7596f85354bfb14461b
--- /dev/null
+++ b/packages/Catch2/src/catch2/internal/catch_uniform_floating_point_distribution.hpp
@@ -0,0 +1,131 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
+#ifndef CATCH_UNIFORM_FLOATING_POINT_DISTRIBUTION_HPP_INCLUDED
+#define CATCH_UNIFORM_FLOATING_POINT_DISTRIBUTION_HPP_INCLUDED
+
+#include <catch2/internal/catch_random_floating_point_helpers.hpp>
+#include <catch2/internal/catch_uniform_integer_distribution.hpp>
+
+#include <cmath>
+#include <type_traits>
+
+namespace Catch {
+
+    namespace Detail {
+#if defined( __GNUC__ ) || defined( __clang__ )
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wfloat-equal"
+#endif
+        // The issue with overflow only happens with maximal ULP and HUGE
+        // distance, e.g. when generating numbers in [-inf, inf] for given
+        // type. So we only check for the largest possible ULP in the
+        // type, and return something that does not overflow to inf in 1 mult.
+        constexpr std::uint64_t calculate_max_steps_in_one_go(double gamma) {
+            if ( gamma == 1.99584030953472e+292 ) { return 9007199254740991; }
+            return static_cast<std::uint64_t>( -1 );
+        }
+        constexpr std::uint32_t calculate_max_steps_in_one_go(float gamma) {
+            if ( gamma == 2.028241e+31f ) { return 16777215; }
+            return static_cast<std::uint32_t>( -1 );
+        }
+#if defined( __GNUC__ ) || defined( __clang__ )
+#    pragma GCC diagnostic pop
+#endif
+    }
+
+/**
+ * Implementation of uniform distribution on floating point numbers.
+ *
+ * Note that we support only `float` and `double` types, because these
+ * usually mean the same thing across different platform. `long double`
+ * varies wildly by platform and thus we cannot provide reproducible
+ * implementation. Also note that we don't implement all parts of
+ * distribution per standard: this distribution is not serializable, nor
+ * can the range be arbitrarily reset.
+ *
+ * The implementation also uses different approach than the one taken by
+ * `std::uniform_real_distribution`, where instead of generating a number
+ * between [0, 1) and then multiplying the range bounds with it, we first
+ * split the [a, b] range into a set of equidistributed floating point
+ * numbers, and then use uniform int distribution to pick which one to
+ * return.
+ *
+ * This has the advantage of guaranteeing uniformity (the multiplication
+ * method loses uniformity due to rounding when multiplying floats), except
+ * for small non-uniformity at one side of the interval, where we have
+ * to deal with the fact that not every interval is splittable into
+ * equidistributed floats.
+ *
+ * Based on "Drawing random floating-point numbers from an interval" by
+ * Frederic Goualard.
+ */
+template <typename FloatType>
+class uniform_floating_point_distribution {
+    static_assert(std::is_floating_point<FloatType>::value, "...");
+    static_assert(!std::is_same<FloatType, long double>::value,
+                  "We do not support long double due to inconsistent behaviour between platforms");
+
+    using WidthType = Detail::DistanceType<FloatType>;
+
+    FloatType m_a, m_b;
+    FloatType m_ulp_magnitude;
+    WidthType m_floats_in_range;
+    uniform_integer_distribution<WidthType> m_int_dist;
+
+    // In specific cases, we can overflow into `inf` when computing the
+    // `steps * g` offset. To avoid this, we don't offset by more than this
+    // in one multiply + addition.
+    WidthType m_max_steps_in_one_go;
+    // We don't want to do the magnitude check every call to `operator()`
+    bool m_a_has_leq_magnitude;
+
+public:
+    using result_type = FloatType;
+
+    uniform_floating_point_distribution( FloatType a, FloatType b ):
+        m_a( a ),
+        m_b( b ),
+        m_ulp_magnitude( Detail::gamma( m_a, m_b ) ),
+        m_floats_in_range( Detail::count_equidistant_floats( m_a, m_b, m_ulp_magnitude ) ),
+        m_int_dist(0, m_floats_in_range),
+        m_max_steps_in_one_go( Detail::calculate_max_steps_in_one_go(m_ulp_magnitude)),
+        m_a_has_leq_magnitude(std::fabs(m_a) <= std::fabs(m_b))
+    {
+        assert( a <= b );
+    }
+
+    template <typename Generator>
+    result_type operator()( Generator& g ) {
+        WidthType steps = m_int_dist( g );
+        if ( m_a_has_leq_magnitude ) {
+            if ( steps == m_floats_in_range ) { return m_a; }
+            auto b = m_b;
+            while (steps > m_max_steps_in_one_go) {
+                b -= m_max_steps_in_one_go * m_ulp_magnitude;
+                steps -= m_max_steps_in_one_go;
+            }
+            return b - steps * m_ulp_magnitude;
+        } else {
+            if ( steps == m_floats_in_range ) { return m_b; }
+            auto a = m_a;
+            while (steps > m_max_steps_in_one_go) {
+                a += m_max_steps_in_one_go * m_ulp_magnitude;
+                steps -= m_max_steps_in_one_go;
+            }
+            return a + steps * m_ulp_magnitude;
+        }
+    }
+
+    result_type a() const { return m_a; }
+    result_type b() const { return m_b; }
+};
+
+} // end namespace Catch
+
+#endif // CATCH_UNIFORM_FLOATING_POINT_DISTRIBUTION_HPP_INCLUDED
diff --git a/packages/Catch2/src/catch2/internal/catch_uniform_integer_distribution.hpp b/packages/Catch2/src/catch2/internal/catch_uniform_integer_distribution.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..afa2015d9f0fd9825bc2f3c8e4235b0696ad16af
--- /dev/null
+++ b/packages/Catch2/src/catch2/internal/catch_uniform_integer_distribution.hpp
@@ -0,0 +1,124 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
+#ifndef CATCH_UNIFORM_INTEGER_DISTRIBUTION_HPP_INCLUDED
+#define CATCH_UNIFORM_INTEGER_DISTRIBUTION_HPP_INCLUDED
+
+#include <catch2/internal/catch_random_integer_helpers.hpp>
+
+namespace Catch {
+
+    namespace Detail {
+        // Indirection to enable make_unsigned<bool> behaviour.
+        template <typename T>
+        struct make_unsigned {
+            using type = std::make_unsigned_t<T>;
+        };
+
+        template <>
+        struct make_unsigned<bool> {
+            using type = uint8_t;
+        };
+
+        template <typename T>
+        using make_unsigned_t = typename make_unsigned<T>::type;
+    }
+
+/**
+ * Implementation of uniform distribution on integers.
+ *
+ * Unlike `std::uniform_int_distribution`, this implementation supports
+ * various 1 byte integral types, including bool (but you should not
+ * actually use it for bools).
+ *
+ * The underlying algorithm is based on the one described in "Fast Random
+ * Integer Generation in an Interval" by Daniel Lemire, but has been
+ * optimized under the assumption of reuse of the same distribution object.
+ */
+template <typename IntegerType>
+class uniform_integer_distribution {
+    static_assert(std::is_integral<IntegerType>::value, "...");
+
+    using UnsignedIntegerType = Detail::make_unsigned_t<IntegerType>;
+
+    // Only the left bound is stored, and we store it converted to its
+    // unsigned image. This avoids having to do the conversions inside
+    // the operator(), at the cost of having to do the conversion in
+    // the a() getter. The right bound is only needed in the b() getter,
+    // so we recompute it there from other stored data.
+    UnsignedIntegerType m_a;
+
+    // How many different values are there in [a, b]. a == b => 1, can be 0 for distribution over all values in the type.
+    UnsignedIntegerType m_ab_distance;
+
+    // We hoisted this out of the main generation function. Technically,
+    // this means that using this distribution will be slower than Lemire's
+    // algorithm if this distribution instance will be used only few times,
+    // but it will be faster if it is used many times. Since Catch2 uses
+    // distributions only to implement random generators, we assume that each
+    // distribution will be reused many times and this is an optimization.
+    UnsignedIntegerType m_rejection_threshold = 0;
+
+    UnsignedIntegerType computeDistance(IntegerType a, IntegerType b) const {
+        // This overflows and returns 0 if a == 0 and b == TYPE_MAX.
+        // We handle that later when generating the number.
+        return transposeTo(b) - transposeTo(a) + 1;
+    }
+
+    static UnsignedIntegerType computeRejectionThreshold(UnsignedIntegerType ab_distance) {
+        // distance == 0 means that we will return all possible values from
+        // the type's range, and that we shouldn't reject anything.
+        if ( ab_distance == 0 ) { return 0; }
+        return ( ~ab_distance + 1 ) % ab_distance;
+    }
+
+    static UnsignedIntegerType transposeTo(IntegerType in) {
+        return Detail::transposeToNaturalOrder<IntegerType>(
+            static_cast<UnsignedIntegerType>( in ) );
+    }
+    static IntegerType transposeBack(UnsignedIntegerType in) {
+        return static_cast<IntegerType>(
+            Detail::transposeToNaturalOrder<IntegerType>(in) );
+    }
+
+public:
+    using result_type = IntegerType;
+
+    uniform_integer_distribution( IntegerType a, IntegerType b ):
+        m_a( transposeTo(a) ),
+        m_ab_distance( computeDistance(a, b) ),
+        m_rejection_threshold( computeRejectionThreshold(m_ab_distance) ) {
+        assert( a <= b );
+    }
+
+    template <typename Generator>
+    result_type operator()( Generator& g ) {
+        // All possible values of result_type are valid.
+        if ( m_ab_distance == 0 ) {
+            return transposeBack( Detail::fillBitsFrom<UnsignedIntegerType>( g ) );
+        }
+
+        auto random_number = Detail::fillBitsFrom<UnsignedIntegerType>( g );
+        auto emul = Detail::extendedMult( random_number, m_ab_distance );
+        // Unlike Lemire's algorithm we skip the ab_distance check, since
+        // we precomputed the rejection threshold, which is always tighter.
+        while (emul.lower < m_rejection_threshold) {
+            random_number = Detail::fillBitsFrom<UnsignedIntegerType>( g );
+            emul = Detail::extendedMult( random_number, m_ab_distance );
+        }
+
+        return transposeBack(m_a + emul.upper);
+    }
+
+    result_type a() const { return transposeBack(m_a); }
+    result_type b() const { return transposeBack(m_ab_distance + m_a - 1); }
+};
+
+} // end namespace Catch
+
+#endif // CATCH_UNIFORM_INTEGER_DISTRIBUTION_HPP_INCLUDED
diff --git a/packages/Catch2/src/catch2/matchers/catch_matchers_floating_point.cpp b/packages/Catch2/src/catch2/matchers/catch_matchers_floating_point.cpp
index 6e596466ecabf20b88881aecd049c0114862d561..206332ef732663947badaf2331c7b6a1563022c3 100644
--- a/packages/Catch2/src/catch2/matchers/catch_matchers_floating_point.cpp
+++ b/packages/Catch2/src/catch2/matchers/catch_matchers_floating_point.cpp
@@ -38,26 +38,11 @@ namespace {
         return ulpDist <= maxUlpDiff;
     }
 
-#if defined(CATCH_CONFIG_GLOBAL_NEXTAFTER)
-
-    float nextafter(float x, float y) {
-        return ::nextafterf(x, y);
-    }
-
-    double nextafter(double x, double y) {
-        return ::nextafter(x, y);
-    }
-
-#endif // ^^^ CATCH_CONFIG_GLOBAL_NEXTAFTER ^^^
 
 template <typename FP>
 FP step(FP start, FP direction, uint64_t steps) {
     for (uint64_t i = 0; i < steps; ++i) {
-#if defined(CATCH_CONFIG_GLOBAL_NEXTAFTER)
         start = Catch::nextafter(start, direction);
-#else
-        start = std::nextafter(start, direction);
-#endif
     }
     return start;
 }
diff --git a/packages/Catch2/src/catch2/meson.build b/packages/Catch2/src/catch2/meson.build
index 2e9469d8419ed99e188ab5d4f233080ad5a0c9b1..cc45e6419a1b96f19a8db1328066b600a331c077 100644
--- a/packages/Catch2/src/catch2/meson.build
+++ b/packages/Catch2/src/catch2/meson.build
@@ -18,6 +18,8 @@ configure_file(
   configuration: conf_data,
 )
 
+fs = import('fs')
+
 benchmark_headers = [
   'benchmark/catch_benchmark.hpp',
   'benchmark/catch_benchmark_all.hpp',
@@ -45,6 +47,7 @@ benchmark_headers = [
 
 benchmark_sources = files(
   'benchmark/catch_chronometer.cpp',
+  'benchmark/detail/catch_analyse.cpp',
   'benchmark/detail/catch_benchmark_function.cpp',
   'benchmark/detail/catch_run_for_at_least.cpp',
   'benchmark/detail/catch_stats.cpp',
@@ -97,6 +100,7 @@ internal_headers = [
   'internal/catch_getenv.hpp',
   'internal/catch_istream.hpp',
   'internal/catch_is_permutation.hpp',
+  'internal/catch_jsonwriter.hpp',
   'internal/catch_lazy_expr.hpp',
   'internal/catch_leak_detector.hpp',
   'internal/catch_list.hpp',
@@ -113,6 +117,8 @@ internal_headers = [
   'internal/catch_preprocessor.hpp',
   'internal/catch_preprocessor_internal_stringify.hpp',
   'internal/catch_preprocessor_remove_parens.hpp',
+  'internal/catch_random_floating_point_helpers.hpp',
+  'internal/catch_random_integer_helpers.hpp',
   'internal/catch_random_number_generator.hpp',
   'internal/catch_random_seed_generation.hpp',
   'internal/catch_reporter_registry.hpp',
@@ -142,6 +148,8 @@ internal_headers = [
   'internal/catch_textflow.hpp',
   'internal/catch_to_string.hpp',
   'internal/catch_uncaught_exceptions.hpp',
+  'internal/catch_uniform_floating_point_distribution.hpp',
+  'internal/catch_uniform_integer_distribution.hpp',
   'internal/catch_unique_name.hpp',
   'internal/catch_unique_ptr.hpp',
   'internal/catch_void_type.hpp',
@@ -156,6 +164,7 @@ internal_headers = [
   'matchers/catch_matchers_floating_point.hpp',
   'matchers/catch_matchers_predicate.hpp',
   'matchers/catch_matchers_quantifiers.hpp',
+  'matchers/catch_matchers_range_equals.hpp',
   'matchers/catch_matchers_string.hpp',
   'matchers/catch_matchers_templated.hpp',
   'matchers/catch_matchers_vector.hpp',
@@ -212,6 +221,7 @@ internal_sources = files(
   'internal/catch_floating_point_helpers.cpp',
   'internal/catch_getenv.cpp',
   'internal/catch_istream.cpp',
+  'internal/catch_jsonwriter.cpp',
   'internal/catch_lazy_expr.cpp',
   'internal/catch_leak_detector.cpp',
   'internal/catch_list.cpp',
@@ -278,6 +288,7 @@ reporter_headers = [
   'reporters/catch_reporter_cumulative_base.hpp',
   'reporters/catch_reporter_event_listener.hpp',
   'reporters/catch_reporter_helpers.hpp',
+  'reporters/catch_reporter_json.hpp',
   'reporters/catch_reporter_junit.hpp',
   'reporters/catch_reporter_multi.hpp',
   'reporters/catch_reporter_registrars.hpp',
@@ -297,6 +308,7 @@ reporter_sources = files(
   'reporters/catch_reporter_cumulative_base.cpp',
   'reporters/catch_reporter_event_listener.cpp',
   'reporters/catch_reporter_helpers.cpp',
+  'reporters/catch_reporter_json.cpp',
   'reporters/catch_reporter_junit.cpp',
   'reporters/catch_reporter_multi.cpp',
   'reporters/catch_reporter_registrars.cpp',
@@ -330,9 +342,19 @@ foreach file : headers
   install_headers(file, subdir: join_paths(include_subdir, folder))
 endforeach
 
+catch2_dependencies = []
+# Check if this is an Android NDK build.
+if ((host_machine.system() == 'android') or
+  # Check if this is an Android Termux build.
+  (host_machine.system() == 'linux' and fs.is_dir('/data/data/com.termux')))
+  log_dep = meson.get_compiler('cpp').find_library('log')
+  catch2_dependencies += log_dep
+endif
+
 catch2 = static_library(
   'Catch2',
   sources,
+  dependencies: catch2_dependencies,
   include_directories: '..',
   install: true,
 )
diff --git a/packages/Catch2/src/catch2/reporters/catch_reporter_automake.cpp b/packages/Catch2/src/catch2/reporters/catch_reporter_automake.cpp
index 993b594b8566206c998a4c2c471d7218d9662bd3..5e506a6bca44ea61ec3ee018e7ab21149abab3a4 100644
--- a/packages/Catch2/src/catch2/reporters/catch_reporter_automake.cpp
+++ b/packages/Catch2/src/catch2/reporters/catch_reporter_automake.cpp
@@ -12,7 +12,7 @@
 
 namespace Catch {
 
-    AutomakeReporter::~AutomakeReporter() {}
+    AutomakeReporter::~AutomakeReporter() = default;
 
     void AutomakeReporter::testCaseEnded(TestCaseStats const& _testCaseStats) {
         // Possible values to emit are PASS, XFAIL, SKIP, FAIL, XPASS and ERROR.
diff --git a/packages/Catch2/src/catch2/reporters/catch_reporter_compact.cpp b/packages/Catch2/src/catch2/reporters/catch_reporter_compact.cpp
index 88acb6a465ac8864e6691d96d5e384899a0481b7..0f855944e86082744ee95c23231ecb562b3253a7 100644
--- a/packages/Catch2/src/catch2/reporters/catch_reporter_compact.cpp
+++ b/packages/Catch2/src/catch2/reporters/catch_reporter_compact.cpp
@@ -249,6 +249,6 @@ private:
             StreamingReporterBase::testRunEnded( _testRunStats );
         }
 
-        CompactReporter::~CompactReporter() {}
+        CompactReporter::~CompactReporter() = default;
 
 } // end namespace Catch
diff --git a/packages/Catch2/src/catch2/reporters/catch_reporter_console.cpp b/packages/Catch2/src/catch2/reporters/catch_reporter_console.cpp
index a46b22cf086c3fad78a002ab72f8232b2156855b..f3b8b5b14dc3d57c4b50c0592c3d187442da6d8a 100644
--- a/packages/Catch2/src/catch2/reporters/catch_reporter_console.cpp
+++ b/packages/Catch2/src/catch2/reporters/catch_reporter_console.cpp
@@ -209,15 +209,9 @@ findMax( std::size_t& i, std::size_t& j, std::size_t& k, std::size_t& l ) {
         return l;
 }
 
-enum class Justification { Left, Right };
-
-struct ColumnInfo {
-    std::string name;
-    std::size_t width;
-    Justification justification;
-};
 struct ColumnBreak {};
 struct RowBreak {};
+struct OutputFlush {};
 
 class Duration {
     enum class Unit {
@@ -292,6 +286,14 @@ public:
 };
 } // end anon namespace
 
+enum class Justification { Left, Right };
+
+struct ColumnInfo {
+    std::string name;
+    std::size_t width;
+    Justification justification;
+};
+
 class TablePrinter {
     std::ostream& m_os;
     std::vector<ColumnInfo> m_columnInfos;
@@ -314,11 +316,10 @@ public:
             *this << RowBreak();
 
 			TextFlow::Columns headerCols;
-			auto spacer = TextFlow::Spacer(2);
 			for (auto const& info : m_columnInfos) {
                 assert(info.width > 2);
 				headerCols += TextFlow::Column(info.name).width(info.width - 2);
-				headerCols += spacer;
+                headerCols += TextFlow::Spacer( 2 );
 			}
 			m_os << headerCols << '\n';
 
@@ -334,12 +335,12 @@ public:
     }
 
     template<typename T>
-    friend TablePrinter& operator << (TablePrinter& tp, T const& value) {
+    friend TablePrinter& operator<< (TablePrinter& tp, T const& value) {
         tp.m_oss << value;
         return tp;
     }
 
-    friend TablePrinter& operator << (TablePrinter& tp, ColumnBreak) {
+    friend TablePrinter& operator<< (TablePrinter& tp, ColumnBreak) {
         auto colStr = tp.m_oss.str();
         const auto strSize = colStr.size();
         tp.m_oss.str("");
@@ -361,13 +362,18 @@ public:
         return tp;
     }
 
-    friend TablePrinter& operator << (TablePrinter& tp, RowBreak) {
+    friend TablePrinter& operator<< (TablePrinter& tp, RowBreak) {
         if (tp.m_currentColumn > 0) {
             tp.m_os << '\n';
             tp.m_currentColumn = -1;
         }
         return tp;
     }
+
+    friend TablePrinter& operator<<(TablePrinter& tp, OutputFlush) {
+        tp.m_os << std::flush;
+        return tp;
+    }
 };
 
 ConsoleReporter::ConsoleReporter(ReporterConfig&& config):
@@ -389,7 +395,7 @@ ConsoleReporter::ConsoleReporter(ReporterConfig&& config):
                 { "benchmark name", CATCH_CONFIG_CONSOLE_WIDTH - 43, Justification::Left },
                 { "samples      mean       std dev", 14, Justification::Right },
                 { "iterations   low mean   low std dev", 14, Justification::Right },
-                { "estimated    high mean  high std dev", 14, Justification::Right }
+                { "est run time high mean  high std dev", 14, Justification::Right }
             };
         }
     }())) {}
@@ -473,8 +479,11 @@ void ConsoleReporter::benchmarkPreparing( StringRef name ) {
 void ConsoleReporter::benchmarkStarting(BenchmarkInfo const& info) {
     (*m_tablePrinter) << info.samples << ColumnBreak()
         << info.iterations << ColumnBreak();
-    if (!m_config->benchmarkNoAnalysis())
-        (*m_tablePrinter) << Duration(info.estimatedDuration) << ColumnBreak();
+    if ( !m_config->benchmarkNoAnalysis() ) {
+        ( *m_tablePrinter )
+            << Duration( info.estimatedDuration ) << ColumnBreak();
+    }
+    ( *m_tablePrinter ) << OutputFlush{};
 }
 void ConsoleReporter::benchmarkEnded(BenchmarkStats<> const& stats) {
     if (m_config->benchmarkNoAnalysis())
diff --git a/packages/Catch2/src/catch2/reporters/catch_reporter_json.cpp b/packages/Catch2/src/catch2/reporters/catch_reporter_json.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1f0db8b0db6f8b63c2f4d10cabe6070e5e33d525
--- /dev/null
+++ b/packages/Catch2/src/catch2/reporters/catch_reporter_json.cpp
@@ -0,0 +1,372 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+//
+#include <catch2/catch_test_case_info.hpp>
+#include <catch2/catch_test_spec.hpp>
+#include <catch2/catch_version.hpp>
+#include <catch2/interfaces/catch_interfaces_config.hpp>
+#include <catch2/internal/catch_list.hpp>
+#include <catch2/internal/catch_string_manip.hpp>
+#include <catch2/reporters/catch_reporter_json.hpp>
+
+namespace Catch {
+    namespace {
+        void writeSourceInfo( JsonObjectWriter& writer,
+                              SourceLineInfo const& sourceInfo ) {
+            auto source_location_writer =
+                writer.write( "source-location"_sr ).writeObject();
+            source_location_writer.write( "filename"_sr )
+                .write( sourceInfo.file );
+            source_location_writer.write( "line"_sr ).write( sourceInfo.line );
+        }
+
+        void writeTags( JsonArrayWriter writer, std::vector<Tag> const& tags ) {
+            for ( auto const& tag : tags ) {
+                writer.write( tag.original );
+            }
+        }
+
+        void writeProperties( JsonArrayWriter writer,
+                              TestCaseInfo const& info ) {
+            if ( info.isHidden() ) { writer.write( "is-hidden"_sr ); }
+            if ( info.okToFail() ) { writer.write( "ok-to-fail"_sr ); }
+            if ( info.expectedToFail() ) {
+                writer.write( "expected-to-fail"_sr );
+            }
+            if ( info.throws() ) { writer.write( "throws"_sr ); }
+        }
+
+    } // namespace
+
+    JsonReporter::JsonReporter( ReporterConfig&& config ):
+        StreamingReporterBase{ CATCH_MOVE( config ) } {
+
+        m_preferences.shouldRedirectStdOut = true;
+        // TBD: Do we want to report all assertions? XML reporter does
+        //      not, but for machine-parseable reporters I think the answer
+        //      should be yes.
+        m_preferences.shouldReportAllAssertions = true;
+
+        m_objectWriters.emplace( m_stream );
+        m_writers.emplace( Writer::Object );
+        auto& writer = m_objectWriters.top();
+
+        writer.write( "version"_sr ).write( 1 );
+
+        {
+            auto metadata_writer = writer.write( "metadata"_sr ).writeObject();
+            metadata_writer.write( "name"_sr ).write( m_config->name() );
+            metadata_writer.write( "rng-seed"_sr ).write( m_config->rngSeed() );
+            metadata_writer.write( "catch2-version"_sr )
+                .write( libraryVersion() );
+            if ( m_config->testSpec().hasFilters() ) {
+                metadata_writer.write( "filters"_sr )
+                    .write( m_config->testSpec() );
+            }
+        }
+    }
+
+    JsonReporter::~JsonReporter() {
+        endListing();
+        // TODO: Ensure this closes the top level object, add asserts
+        assert( m_writers.size() == 1 && "Only the top level object should be open" );
+        assert( m_writers.top() == Writer::Object );
+        endObject();
+        m_stream << '\n' << std::flush;
+        assert( m_writers.empty() );
+    }
+
+    JsonArrayWriter& JsonReporter::startArray() {
+        m_arrayWriters.emplace( m_arrayWriters.top().writeArray() );
+        m_writers.emplace( Writer::Array );
+        return m_arrayWriters.top();
+    }
+    JsonArrayWriter& JsonReporter::startArray( StringRef key ) {
+        m_arrayWriters.emplace(
+            m_objectWriters.top().write( key ).writeArray() );
+        m_writers.emplace( Writer::Array );
+        return m_arrayWriters.top();
+    }
+
+    JsonObjectWriter& JsonReporter::startObject() {
+        m_objectWriters.emplace( m_arrayWriters.top().writeObject() );
+        m_writers.emplace( Writer::Object );
+        return m_objectWriters.top();
+    }
+    JsonObjectWriter& JsonReporter::startObject( StringRef key ) {
+        m_objectWriters.emplace(
+            m_objectWriters.top().write( key ).writeObject() );
+        m_writers.emplace( Writer::Object );
+        return m_objectWriters.top();
+    }
+
+    void JsonReporter::endObject() {
+        assert( isInside( Writer::Object ) );
+        m_objectWriters.pop();
+        m_writers.pop();
+    }
+    void JsonReporter::endArray() {
+        assert( isInside( Writer::Array ) );
+        m_arrayWriters.pop();
+        m_writers.pop();
+    }
+
+    bool JsonReporter::isInside( Writer writer ) {
+        return !m_writers.empty() && m_writers.top() == writer;
+    }
+
+    void JsonReporter::startListing() {
+        if ( !m_startedListing ) { startObject( "listings"_sr ); }
+        m_startedListing = true;
+    }
+    void JsonReporter::endListing() {
+        if ( m_startedListing ) { endObject(); }
+        m_startedListing = false;
+    }
+
+    std::string JsonReporter::getDescription() {
+        return "Outputs listings as JSON. Test listing is Work-in-Progress!";
+    }
+
+    void JsonReporter::testRunStarting( TestRunInfo const& testInfo ) {
+        StreamingReporterBase::testRunStarting( testInfo );
+        endListing();
+
+        assert( isInside( Writer::Object ) );
+        startObject( "test-run"_sr );
+        startArray( "test-cases"_sr );
+    }
+
+     static void writeCounts( JsonObjectWriter&& writer, Counts const& counts ) {
+        writer.write( "passed"_sr ).write( counts.passed );
+        writer.write( "failed"_sr ).write( counts.failed );
+        writer.write( "fail-but-ok"_sr ).write( counts.failedButOk );
+        writer.write( "skipped"_sr ).write( counts.skipped );
+    }
+
+    void JsonReporter::testRunEnded(TestRunStats const& runStats) {
+        assert( isInside( Writer::Array ) );
+        // End "test-cases"
+        endArray();
+
+        {
+            auto totals =
+                m_objectWriters.top().write( "totals"_sr ).writeObject();
+            writeCounts( totals.write( "assertions"_sr ).writeObject(),
+                         runStats.totals.assertions );
+            writeCounts( totals.write( "test-cases"_sr ).writeObject(),
+                         runStats.totals.testCases );
+        }
+
+        // End the "test-run" object
+        endObject();
+    }
+
+    void JsonReporter::testCaseStarting( TestCaseInfo const& tcInfo ) {
+        StreamingReporterBase::testCaseStarting( tcInfo );
+
+        assert( isInside( Writer::Array ) &&
+                "We should be in the 'test-cases' array" );
+        startObject();
+        // "test-info" prelude
+        {
+            auto testInfo =
+                m_objectWriters.top().write( "test-info"_sr ).writeObject();
+            // TODO: handle testName vs className!!
+            testInfo.write( "name"_sr ).write( tcInfo.name );
+            writeSourceInfo(testInfo, tcInfo.lineInfo);
+            writeTags( testInfo.write( "tags"_sr ).writeArray(), tcInfo.tags );
+            writeProperties( testInfo.write( "properties"_sr ).writeArray(),
+                             tcInfo );
+        }
+
+
+        // Start the array for individual test runs (testCasePartial pairs)
+        startArray( "runs"_sr );
+    }
+
+    void JsonReporter::testCaseEnded( TestCaseStats const& tcStats ) {
+        StreamingReporterBase::testCaseEnded( tcStats );
+
+        // We need to close the 'runs' array before finishing the test case
+        assert( isInside( Writer::Array ) );
+        endArray();
+
+        {
+            auto totals =
+                m_objectWriters.top().write( "totals"_sr ).writeObject();
+            writeCounts( totals.write( "assertions"_sr ).writeObject(),
+                         tcStats.totals.assertions );
+            // We do not write the test case totals, because there will always be just one test case here.
+            // TODO: overall "result" -> success, skip, fail here? Or in partial result?
+        }
+        // We do not write out stderr/stdout, because we instead wrote those out in partial runs
+
+        // TODO: aborting?
+
+        // And we also close this test case's object
+        assert( isInside( Writer::Object ) );
+        endObject();
+    }
+
+    void JsonReporter::testCasePartialStarting( TestCaseInfo const& /*tcInfo*/,
+                                                uint64_t index ) {
+        startObject();
+        m_objectWriters.top().write( "run-idx"_sr ).write( index );
+        startArray( "path"_sr );
+        // TODO: we want to delay most of the printing to the 'root' section
+        // TODO: childSection key name?
+    }
+
+    void JsonReporter::testCasePartialEnded( TestCaseStats const& tcStats,
+                                             uint64_t /*index*/ ) {
+        // Fixme: the top level section handles this.
+        //// path object
+        endArray();
+        if ( !tcStats.stdOut.empty() ) {
+            m_objectWriters.top()
+                .write( "captured-stdout"_sr )
+                .write( tcStats.stdOut );
+        }
+        if ( !tcStats.stdErr.empty() ) {
+            m_objectWriters.top()
+                .write( "captured-stderr"_sr )
+                .write( tcStats.stdErr );
+        }
+        {
+            auto totals =
+                m_objectWriters.top().write( "totals"_sr ).writeObject();
+            writeCounts( totals.write( "assertions"_sr ).writeObject(),
+                         tcStats.totals.assertions );
+            // We do not write the test case totals, because there will
+            // always be just one test case here.
+            // TODO: overall "result" -> success, skip, fail here? Or in
+            // partial result?
+        }
+        // TODO: aborting?
+        // run object
+        endObject();
+    }
+
+    void JsonReporter::sectionStarting( SectionInfo const& sectionInfo ) {
+        assert( isInside( Writer::Array ) &&
+                "Section should always start inside an object" );
+        // We want to nest top level sections, even though it shares name
+        // and source loc with the TEST_CASE
+        auto& sectionObject = startObject();
+        sectionObject.write( "kind"_sr ).write( "section"_sr );
+        sectionObject.write( "name"_sr ).write( sectionInfo.name );
+        writeSourceInfo( m_objectWriters.top(), sectionInfo.lineInfo );
+
+
+        // TBD: Do we want to create this event lazily? It would become
+        //      rather complex, but we could do it, and it would look
+        //      better for empty sections. OTOH, empty sections should
+        //      be rare.
+        startArray( "path"_sr );
+    }
+    void JsonReporter::sectionEnded( SectionStats const& /*sectionStats */) {
+        // End the subpath array
+        endArray();
+        // TODO: metadata
+        // TODO: what info do we have here?
+
+        // End the section object
+        endObject();
+    }
+
+    void JsonReporter::assertionStarting( AssertionInfo const& /*assertionInfo*/ ) {}
+    void JsonReporter::assertionEnded( AssertionStats const& assertionStats ) {
+        // TODO: There is lot of different things to handle here, but
+        //       we can fill it in later, after we show that the basic
+        //       outline and streaming reporter impl works well enough.
+        //if ( !m_config->includeSuccessfulResults()
+        //    && assertionStats.assertionResult.isOk() ) {
+        //    return;
+        //}
+        assert( isInside( Writer::Array ) );
+        auto assertionObject = m_arrayWriters.top().writeObject();
+
+        assertionObject.write( "kind"_sr ).write( "assertion"_sr );
+        writeSourceInfo( assertionObject,
+                         assertionStats.assertionResult.getSourceInfo() );
+        assertionObject.write( "status"_sr )
+            .write( assertionStats.assertionResult.isOk() );
+        // TODO: handling of result.
+        // TODO: messages
+        // TODO: totals?
+    }
+
+
+    void JsonReporter::benchmarkPreparing( StringRef name ) { (void)name; }
+    void JsonReporter::benchmarkStarting( BenchmarkInfo const& ) {}
+    void JsonReporter::benchmarkEnded( BenchmarkStats<> const& ) {}
+    void JsonReporter::benchmarkFailed( StringRef error ) { (void)error; }
+
+    void JsonReporter::listReporters(
+        std::vector<ReporterDescription> const& descriptions ) {
+        startListing();
+
+        auto writer =
+            m_objectWriters.top().write( "reporters"_sr ).writeArray();
+        for ( auto const& desc : descriptions ) {
+            auto desc_writer = writer.writeObject();
+            desc_writer.write( "name"_sr ).write( desc.name );
+            desc_writer.write( "description"_sr ).write( desc.description );
+        }
+    }
+    void JsonReporter::listListeners(
+        std::vector<ListenerDescription> const& descriptions ) {
+        startListing();
+
+        auto writer =
+            m_objectWriters.top().write( "listeners"_sr ).writeArray();
+
+        for ( auto const& desc : descriptions ) {
+            auto desc_writer = writer.writeObject();
+            desc_writer.write( "name"_sr ).write( desc.name );
+            desc_writer.write( "description"_sr ).write( desc.description );
+        }
+    }
+    void JsonReporter::listTests( std::vector<TestCaseHandle> const& tests ) {
+        startListing();
+
+        auto writer = m_objectWriters.top().write( "tests"_sr ).writeArray();
+
+        for ( auto const& test : tests ) {
+            auto desc_writer = writer.writeObject();
+            auto const& info = test.getTestCaseInfo();
+
+            desc_writer.write( "name"_sr ).write( info.name );
+            desc_writer.write( "class-name"_sr ).write( info.className );
+            {
+                auto tag_writer = desc_writer.write( "tags"_sr ).writeArray();
+                for ( auto const& tag : info.tags ) {
+                    tag_writer.write( tag.original );
+                }
+            }
+            writeSourceInfo( desc_writer, info.lineInfo );
+        }
+    }
+    void JsonReporter::listTags( std::vector<TagInfo> const& tags ) {
+        startListing();
+
+        auto writer = m_objectWriters.top().write( "tags"_sr ).writeArray();
+        for ( auto const& tag : tags ) {
+            auto tag_writer = writer.writeObject();
+            {
+                auto aliases_writer =
+                    tag_writer.write( "aliases"_sr ).writeArray();
+                for ( auto alias : tag.spellings ) {
+                    aliases_writer.write( alias );
+                }
+            }
+            tag_writer.write( "count"_sr ).write( tag.count );
+        }
+    }
+} // namespace Catch
diff --git a/packages/Catch2/src/catch2/reporters/catch_reporter_json.hpp b/packages/Catch2/src/catch2/reporters/catch_reporter_json.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c938ca3944b8e5aab0c584abe1544fb99724347e
--- /dev/null
+++ b/packages/Catch2/src/catch2/reporters/catch_reporter_json.hpp
@@ -0,0 +1,95 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
+#ifndef CATCH_REPORTER_JSON_HPP_INCLUDED
+#define CATCH_REPORTER_JSON_HPP_INCLUDED
+
+#include <catch2/catch_timer.hpp>
+#include <catch2/internal/catch_jsonwriter.hpp>
+#include <catch2/reporters/catch_reporter_streaming_base.hpp>
+
+#include <stack>
+
+namespace Catch {
+    class JsonReporter : public StreamingReporterBase {
+    public:
+        JsonReporter( ReporterConfig&& config );
+
+        ~JsonReporter() override;
+
+        static std::string getDescription();
+
+    public: // StreamingReporterBase
+        void testRunStarting( TestRunInfo const& runInfo ) override;
+        void testRunEnded( TestRunStats const& runStats ) override;
+
+        void testCaseStarting( TestCaseInfo const& tcInfo ) override;
+        void testCaseEnded( TestCaseStats const& tcStats ) override;
+
+        void testCasePartialStarting( TestCaseInfo const& tcInfo,
+                                      uint64_t index ) override;
+        void testCasePartialEnded( TestCaseStats const& tcStats,
+                                   uint64_t index ) override;
+
+        void sectionStarting( SectionInfo const& sectionInfo ) override;
+        void sectionEnded( SectionStats const& sectionStats ) override;
+
+        void assertionStarting( AssertionInfo const& assertionInfo ) override;
+        void assertionEnded( AssertionStats const& assertionStats ) override;
+
+        //void testRunEndedCumulative() override;
+
+        void benchmarkPreparing( StringRef name ) override;
+        void benchmarkStarting( BenchmarkInfo const& ) override;
+        void benchmarkEnded( BenchmarkStats<> const& ) override;
+        void benchmarkFailed( StringRef error ) override;
+
+        void listReporters(
+            std::vector<ReporterDescription> const& descriptions ) override;
+        void listListeners(
+            std::vector<ListenerDescription> const& descriptions ) override;
+        void listTests( std::vector<TestCaseHandle> const& tests ) override;
+        void listTags( std::vector<TagInfo> const& tags ) override;
+
+    private:
+        Timer m_testCaseTimer;
+        enum class Writer {
+            Object,
+            Array
+        };
+
+        JsonArrayWriter& startArray();
+        JsonArrayWriter& startArray( StringRef key );
+
+        JsonObjectWriter& startObject();
+        JsonObjectWriter& startObject( StringRef key );
+
+        void endObject();
+        void endArray();
+
+        bool isInside( Writer writer );
+
+        void startListing();
+        void endListing();
+
+        // Invariant:
+        // When m_writers is not empty and its top element is
+        // - Writer::Object, then m_objectWriters is not be empty
+        // - Writer::Array,  then m_arrayWriters shall not be empty
+        std::stack<JsonObjectWriter> m_objectWriters{};
+        std::stack<JsonArrayWriter> m_arrayWriters{};
+        std::stack<Writer> m_writers{};
+
+        bool m_startedListing = false;
+
+        // std::size_t m_sectionDepth = 0;
+        // std::size_t m_sectionStarted = 0;
+    };
+} // namespace Catch
+
+#endif // CATCH_REPORTER_JSON_HPP_INCLUDED
diff --git a/packages/Catch2/src/catch2/reporters/catch_reporter_junit.hpp b/packages/Catch2/src/catch2/reporters/catch_reporter_junit.hpp
index 87c7c5679e4f2ee1ad5fff5f10b55393572987d7..7cb53c25b56635de5466335b0ad1fa54d1ec70cc 100644
--- a/packages/Catch2/src/catch2/reporters/catch_reporter_junit.hpp
+++ b/packages/Catch2/src/catch2/reporters/catch_reporter_junit.hpp
@@ -19,8 +19,6 @@ namespace Catch {
     public:
         JunitReporter(ReporterConfig&& _config);
 
-        ~JunitReporter() override = default;
-
         static std::string getDescription();
 
         void testRunStarting(TestRunInfo const& runInfo) override;
diff --git a/packages/Catch2/src/catch2/reporters/catch_reporter_sonarqube.hpp b/packages/Catch2/src/catch2/reporters/catch_reporter_sonarqube.hpp
index cad6deec8ca0a0b82377a01c5e84d4ff90b91ba1..d26af62e89ea825fc866243773ea8c4d61b7d9dc 100644
--- a/packages/Catch2/src/catch2/reporters/catch_reporter_sonarqube.hpp
+++ b/packages/Catch2/src/catch2/reporters/catch_reporter_sonarqube.hpp
@@ -25,8 +25,6 @@ namespace Catch {
             m_shouldStoreSuccesfulAssertions = false;
         }
 
-        ~SonarQubeReporter() override = default;
-
         static std::string getDescription() {
             using namespace std::string_literals;
             return "Reports test results in the Generic Test Data SonarQube XML format"s;
diff --git a/packages/Catch2/src/catch2/reporters/catch_reporter_tap.hpp b/packages/Catch2/src/catch2/reporters/catch_reporter_tap.hpp
index fe45df63e8acffd718e0af45be6bc491681a3e92..e6889bb110ec6c46c262053c9c09f5b52f26c802 100644
--- a/packages/Catch2/src/catch2/reporters/catch_reporter_tap.hpp
+++ b/packages/Catch2/src/catch2/reporters/catch_reporter_tap.hpp
@@ -19,7 +19,6 @@ namespace Catch {
             StreamingReporterBase( CATCH_MOVE(config) ) {
             m_preferences.shouldReportAllAssertions = true;
         }
-        ~TAPReporter() override = default;
 
         static std::string getDescription() {
             using namespace std::string_literals;
diff --git a/packages/Catch2/src/catch2/reporters/catch_reporter_teamcity.cpp b/packages/Catch2/src/catch2/reporters/catch_reporter_teamcity.cpp
index 320728007e9fbbab52aec1bd70c3d88a58a0068f..38aa55a656947c359c71375bc54b7cbde81e49d6 100644
--- a/packages/Catch2/src/catch2/reporters/catch_reporter_teamcity.cpp
+++ b/packages/Catch2/src/catch2/reporters/catch_reporter_teamcity.cpp
@@ -45,7 +45,7 @@ namespace Catch {
     } // end anonymous namespace
 
 
-    TeamCityReporter::~TeamCityReporter() {}
+    TeamCityReporter::~TeamCityReporter() = default;
 
     void TeamCityReporter::testRunStarting( TestRunInfo const& runInfo ) {
         m_stream << "##teamcity[testSuiteStarted name='" << escape( runInfo.name )
diff --git a/packages/Catch2/src/catch2/reporters/catch_reporter_xml.cpp b/packages/Catch2/src/catch2/reporters/catch_reporter_xml.cpp
index f80cd2470a246ba9ef329c1c9c6451ad04c00ab6..35a3028ee2a92e7314181cd2f9e4e69f2ac01710 100644
--- a/packages/Catch2/src/catch2/reporters/catch_reporter_xml.cpp
+++ b/packages/Catch2/src/catch2/reporters/catch_reporter_xml.cpp
@@ -234,26 +234,23 @@ namespace Catch {
     }
 
     void XmlReporter::benchmarkEnded(BenchmarkStats<> const& benchmarkStats) {
-        m_xml.startElement("mean")
+        m_xml.scopedElement("mean")
             .writeAttribute("value"_sr, benchmarkStats.mean.point.count())
             .writeAttribute("lowerBound"_sr, benchmarkStats.mean.lower_bound.count())
             .writeAttribute("upperBound"_sr, benchmarkStats.mean.upper_bound.count())
             .writeAttribute("ci"_sr, benchmarkStats.mean.confidence_interval);
-        m_xml.endElement();
-        m_xml.startElement("standardDeviation")
+        m_xml.scopedElement("standardDeviation")
             .writeAttribute("value"_sr, benchmarkStats.standardDeviation.point.count())
             .writeAttribute("lowerBound"_sr, benchmarkStats.standardDeviation.lower_bound.count())
             .writeAttribute("upperBound"_sr, benchmarkStats.standardDeviation.upper_bound.count())
             .writeAttribute("ci"_sr, benchmarkStats.standardDeviation.confidence_interval);
-        m_xml.endElement();
-        m_xml.startElement("outliers")
+        m_xml.scopedElement("outliers")
             .writeAttribute("variance"_sr, benchmarkStats.outlierVariance)
             .writeAttribute("lowMild"_sr, benchmarkStats.outliers.low_mild)
             .writeAttribute("lowSevere"_sr, benchmarkStats.outliers.low_severe)
             .writeAttribute("highMild"_sr, benchmarkStats.outliers.high_mild)
             .writeAttribute("highSevere"_sr, benchmarkStats.outliers.high_severe);
         m_xml.endElement();
-        m_xml.endElement();
     }
 
     void XmlReporter::benchmarkFailed(StringRef error) {
diff --git a/packages/Catch2/src/catch2/reporters/catch_reporters_all.hpp b/packages/Catch2/src/catch2/reporters/catch_reporters_all.hpp
index 16f7bd70cdcae5f28757ad4cc94d98ca9a913ae0..5c713fe1446c3e4176740a3cf95f74714c096d61 100644
--- a/packages/Catch2/src/catch2/reporters/catch_reporters_all.hpp
+++ b/packages/Catch2/src/catch2/reporters/catch_reporters_all.hpp
@@ -28,6 +28,7 @@
 #include <catch2/reporters/catch_reporter_cumulative_base.hpp>
 #include <catch2/reporters/catch_reporter_event_listener.hpp>
 #include <catch2/reporters/catch_reporter_helpers.hpp>
+#include <catch2/reporters/catch_reporter_json.hpp>
 #include <catch2/reporters/catch_reporter_junit.hpp>
 #include <catch2/reporters/catch_reporter_multi.hpp>
 #include <catch2/reporters/catch_reporter_registrars.hpp>
diff --git a/packages/Catch2/tests/CMakeLists.txt b/packages/Catch2/tests/CMakeLists.txt
index 7be57abec1856d6b2116f012ed641551ae3e48a9..d3ab14a7f6cd79bb5296ca51e821ae07756789ba 100644
--- a/packages/Catch2/tests/CMakeLists.txt
+++ b/packages/Catch2/tests/CMakeLists.txt
@@ -78,6 +78,7 @@ endif(MSVC) #Temporary workaround
 set(TEST_SOURCES
         ${SELF_TEST_DIR}/TestRegistrations.cpp
         ${SELF_TEST_DIR}/IntrospectiveTests/Algorithms.tests.cpp
+        ${SELF_TEST_DIR}/IntrospectiveTests/AssertionHandler.tests.cpp
         ${SELF_TEST_DIR}/IntrospectiveTests/Clara.tests.cpp
         ${SELF_TEST_DIR}/IntrospectiveTests/CmdLine.tests.cpp
         ${SELF_TEST_DIR}/IntrospectiveTests/CmdLineHelpers.tests.cpp
@@ -85,7 +86,9 @@ set(TEST_SOURCES
         ${SELF_TEST_DIR}/IntrospectiveTests/Details.tests.cpp
         ${SELF_TEST_DIR}/IntrospectiveTests/FloatingPoint.tests.cpp
         ${SELF_TEST_DIR}/IntrospectiveTests/GeneratorsImpl.tests.cpp
+        ${SELF_TEST_DIR}/IntrospectiveTests/Integer.tests.cpp
         ${SELF_TEST_DIR}/IntrospectiveTests/InternalBenchmark.tests.cpp
+        ${SELF_TEST_DIR}/IntrospectiveTests/Json.tests.cpp
         ${SELF_TEST_DIR}/IntrospectiveTests/Parse.tests.cpp
         ${SELF_TEST_DIR}/IntrospectiveTests/PartTracker.tests.cpp
         ${SELF_TEST_DIR}/IntrospectiveTests/RandomNumberGeneration.tests.cpp
@@ -622,6 +625,18 @@ if (CATCH_ENABLE_CONFIGURE_TESTS)
     endforeach()
 endif()
 
+if (CATCH_ENABLE_CMAKE_HELPER_TESTS)
+    add_test(NAME "CMakeHelper::DiscoverTests"
+      COMMAND
+        "${PYTHON_EXECUTABLE}" "${CMAKE_CURRENT_LIST_DIR}/TestScripts/DiscoverTests/VerifyRegistration.py" "${CATCH_DIR}" "${CMAKE_CURRENT_BINARY_DIR}"
+    )
+    set_tests_properties("CMakeHelper::DiscoverTests"
+      PROPERTIES
+        COST 240
+        LABELS "uses-python"
+    )
+endif()
+
 foreach (reporterName # "Automake" - the simple .trs format does not support any kind of comments/metadata
                       "compact"
                       "console"
@@ -629,7 +644,8 @@ foreach (reporterName # "Automake" - the simple .trs format does not support any
                       "SonarQube"
                       "TAP"
                       # "TeamCity" - does not seem to support test suite-level metadata/comments
-                      "XML")
+                      "XML"
+                      "JSON")
 
     add_test(NAME "Reporters:Filters:${reporterName}"
       COMMAND
@@ -639,6 +655,8 @@ foreach (reporterName # "Automake" - the simple .trs format does not support any
     # Different regex for these two reporters, because the commas end up xml-escaped
     if (reporterName MATCHES "JUnit|XML")
       set(testCaseNameFormat "&quot;CaseInsensitiveLess is case insensitive&quot;")
+    elseif(reporterName MATCHES "JSON")
+      set(testCaseNameFormat "\\\\\"CaseInsensitiveLess is case insensitive\\\\\"")
     else()
       set(testCaseNameFormat "\"CaseInsensitiveLess is case insensitive\"")
     endif()
diff --git a/packages/Catch2/tests/ExtraTests/X20-AssertionStartingEventGoesBeforeAssertionIsEvaluated.cpp b/packages/Catch2/tests/ExtraTests/X20-AssertionStartingEventGoesBeforeAssertionIsEvaluated.cpp
index ef5b46b95de33876b1356240a460e06e219f3d0e..6f44bf691b4e6ff1dda73f0f6332e16f2ce4c626 100644
--- a/packages/Catch2/tests/ExtraTests/X20-AssertionStartingEventGoesBeforeAssertionIsEvaluated.cpp
+++ b/packages/Catch2/tests/ExtraTests/X20-AssertionStartingEventGoesBeforeAssertionIsEvaluated.cpp
@@ -7,11 +7,10 @@
 // SPDX-License-Identifier: BSL-1.0
 
 /**\file
- * TODO: FIXES Registers custom reporter that reports testCase* events
+ * Registers an event listener to increments counter of assertionStarting events.
  *
- * The resulting executable can then be used by an external Python script
- * to verify that testCase{Starting,Ended} and testCasePartial{Starting,Ended}
- * events are properly nested.
+ * Different assertion macros then check that the counter is at expected
+ * value when they are evaluated.
  */
 
 #include <catch2/catch_test_macros.hpp>
@@ -23,9 +22,6 @@ namespace {
 
     static size_t assertion_starting_events_seen = 0;
 
-    // TODO: custom matcher to check that "assertion_starting_events_seen" has
-    // the right number of checks
-
     class AssertionStartingListener : public Catch::EventListenerBase {
     public:
         AssertionStartingListener( Catch::IConfig const* config ):
diff --git a/packages/Catch2/tests/SelfTest/Baselines/automake.sw.approved.txt b/packages/Catch2/tests/SelfTest/Baselines/automake.sw.approved.txt
index 6b5938a67b2402df688c5aabd08f5aa23e58dbb7..88c23e17376a86f3a177b262095f41c16abe59e7 100644
--- a/packages/Catch2/tests/SelfTest/Baselines/automake.sw.approved.txt
+++ b/packages/Catch2/tests/SelfTest/Baselines/automake.sw.approved.txt
@@ -154,6 +154,7 @@ Nor would this
 :test-result: PASS Filter generator throws exception for empty generator
 :test-result: PASS Floating point matchers: double
 :test-result: PASS Floating point matchers: float
+:test-result: PASS GENERATE can combine literals and generators
 :test-result: PASS Generators -- adapters
 :test-result: PASS Generators -- simple
 :test-result: PASS Generators internals
@@ -162,12 +163,16 @@ Nor would this
 :test-result: PASS Hashers with same seed produce same hash
 :test-result: PASS Hashing different test cases produces different result
 :test-result: PASS Hashing test case produces same hash across multiple calls
+:test-result: FAIL INFO and UNSCOPED_INFO can stream multiple arguments
 :test-result: FAIL INFO and WARN do not abort tests
 :test-result: FAIL INFO gets logged on failure
 :test-result: FAIL INFO gets logged on failure, even if captured before successful assertions
 :test-result: FAIL INFO is reset for each loop
+:test-result: XFAIL Incomplete AssertionHandler
 :test-result: XFAIL Inequality checks that should fail
 :test-result: PASS Inequality checks that should succeed
+:test-result: PASS JsonWriter
+:test-result: PASS JsonWriter escapes charaters in strings properly
 :test-result: PASS Lambdas in assertions
 :test-result: PASS Less-than inequalities with different epsilons
 :test-result: PASS ManuallyRegistered
@@ -265,6 +270,8 @@ Message from section two
 :test-result: PASS Testing checked-if
 :test-result: XFAIL Testing checked-if 2
 :test-result: XFAIL Testing checked-if 3
+:test-result: XFAIL Testing checked-if 4
+:test-result: XFAIL Testing checked-if 5
 :test-result: FAIL The NO_FAIL macro reports a failure but does not fail the test
 :test-result: PASS The default listing implementation write to provided stream
 :test-result: FAIL This test 'should' fail but doesn't
@@ -408,6 +415,7 @@ b1!
 :test-result: PASS tuple<string,string>
 :test-result: PASS tuple<tuple<int>,tuple<>,float>
 :test-result: PASS uniform samples
+:test-result: PASS uniform_integer_distribution can return the bounds
 :test-result: PASS unique_ptr reimplementation: basic functionality
 :test-result: PASS vec<vec<string,alloc>> -> toString
 :test-result: PASS vector<bool> -> toString
diff --git a/packages/Catch2/tests/SelfTest/Baselines/automake.sw.multi.approved.txt b/packages/Catch2/tests/SelfTest/Baselines/automake.sw.multi.approved.txt
index cd56e6487121bd0629b83ca3f3607cfd4df6a860..a37b1a2b5d6feb00a5d73fc28e396281235e4539 100644
--- a/packages/Catch2/tests/SelfTest/Baselines/automake.sw.multi.approved.txt
+++ b/packages/Catch2/tests/SelfTest/Baselines/automake.sw.multi.approved.txt
@@ -152,6 +152,7 @@
 :test-result: PASS Filter generator throws exception for empty generator
 :test-result: PASS Floating point matchers: double
 :test-result: PASS Floating point matchers: float
+:test-result: PASS GENERATE can combine literals and generators
 :test-result: PASS Generators -- adapters
 :test-result: PASS Generators -- simple
 :test-result: PASS Generators internals
@@ -160,12 +161,16 @@
 :test-result: PASS Hashers with same seed produce same hash
 :test-result: PASS Hashing different test cases produces different result
 :test-result: PASS Hashing test case produces same hash across multiple calls
+:test-result: FAIL INFO and UNSCOPED_INFO can stream multiple arguments
 :test-result: FAIL INFO and WARN do not abort tests
 :test-result: FAIL INFO gets logged on failure
 :test-result: FAIL INFO gets logged on failure, even if captured before successful assertions
 :test-result: FAIL INFO is reset for each loop
+:test-result: XFAIL Incomplete AssertionHandler
 :test-result: XFAIL Inequality checks that should fail
 :test-result: PASS Inequality checks that should succeed
+:test-result: PASS JsonWriter
+:test-result: PASS JsonWriter escapes charaters in strings properly
 :test-result: PASS Lambdas in assertions
 :test-result: PASS Less-than inequalities with different epsilons
 :test-result: PASS ManuallyRegistered
@@ -258,6 +263,8 @@
 :test-result: PASS Testing checked-if
 :test-result: XFAIL Testing checked-if 2
 :test-result: XFAIL Testing checked-if 3
+:test-result: XFAIL Testing checked-if 4
+:test-result: XFAIL Testing checked-if 5
 :test-result: FAIL The NO_FAIL macro reports a failure but does not fail the test
 :test-result: PASS The default listing implementation write to provided stream
 :test-result: FAIL This test 'should' fail but doesn't
@@ -397,6 +404,7 @@
 :test-result: PASS tuple<string,string>
 :test-result: PASS tuple<tuple<int>,tuple<>,float>
 :test-result: PASS uniform samples
+:test-result: PASS uniform_integer_distribution can return the bounds
 :test-result: PASS unique_ptr reimplementation: basic functionality
 :test-result: PASS vec<vec<string,alloc>> -> toString
 :test-result: PASS vector<bool> -> toString
diff --git a/packages/Catch2/tests/SelfTest/Baselines/compact.sw.approved.txt b/packages/Catch2/tests/SelfTest/Baselines/compact.sw.approved.txt
index be7a4120358629363c6bb134ec364c602de737e4..0669fdbbbea26a9f55ad449f42582ecde6ace993 100644
--- a/packages/Catch2/tests/SelfTest/Baselines/compact.sw.approved.txt
+++ b/packages/Catch2/tests/SelfTest/Baselines/compact.sw.approved.txt
@@ -331,7 +331,7 @@ MatchersRanges.tests.cpp:<line number>: passed: inner_lists_are_empty.front(), I
 MatchersRanges.tests.cpp:<line number>: passed: has_empty{}, !IsEmpty() for: {?} not is empty
 MatchersRanges.tests.cpp:<line number>: passed: unrelated::ADL_empty{}, IsEmpty() for: {?} is empty
 Message.tests.cpp:<line number>: passed: with 7 messages: 'a := 1' and 'b := 2' and 'c := 3' and 'a + b := 3' and 'a+b := 3' and 'c > b := true' and 'a == 1 := true'
-Message.tests.cpp:<line number>: passed: with 7 messages: 'std::vector<int>{1, 2, 3}[0, 1, 2] := 3' and 'std::vector<int>{1, 2, 3}[(0, 1)] := 2' and 'std::vector<int>{1, 2, 3}[0] := 1' and '(helper_1436<int, int>{12, -12}) := { 12, -12 }' and '(helper_1436<int, int>(-12, 12)) := { -12, 12 }' and '(1, 2) := 2' and '(2, 3) := 3'
+Message.tests.cpp:<line number>: passed: with 7 messages: 'custom_index_op<int>{1, 2, 3}[0, 1, 2] := 0' and 'custom_index_op<int>{1, 2, 3}[(0, 1)] := 0' and 'custom_index_op<int>{1, 2, 3}[0] := 0' and '(helper_1436<int, int>{12, -12}) := { 12, -12 }' and '(helper_1436<int, int>(-12, 12)) := { -12, 12 }' and '(1, 2) := 2' and '(2, 3) := 3'
 Message.tests.cpp:<line number>: passed: with 11 messages: '("comma, in string", "escaped, \", ") := "escaped, ", "' and '"single quote in string,'," := "single quote in string,',"' and '"some escapes, \\,\\\\" := "some escapes, \,\\"' and '"some, ), unmatched, } prenheses {[<" := "some, ), unmatched, } prenheses {[<"' and ''"' := '"'' and ''\'' := '''' and '',' := ','' and ''}' := '}'' and '')' := ')'' and ''(' := '('' and ''{' := '{''
 ToStringGeneral.tests.cpp:<line number>: passed: true with 1 message: 'i := 2'
 ToStringGeneral.tests.cpp:<line number>: passed: true with 1 message: '3'
@@ -666,6 +666,10 @@ Matchers.tests.cpp:<line number>: passed: 1., !IsNaN() for: 1.0 not is NaN
 Generators.tests.cpp:<line number>: passed: i % 2 == 0 for: 0 == 0
 Generators.tests.cpp:<line number>: passed: i % 2 == 0 for: 0 == 0
 Generators.tests.cpp:<line number>: passed: i % 2 == 0 for: 0 == 0
+Generators.tests.cpp:<line number>: passed: i % 2 == 0 for: 0 == 0
+Generators.tests.cpp:<line number>: passed: i % 2 == 0 for: 0 == 0
+Generators.tests.cpp:<line number>: passed: i % 2 == 0 for: 0 == 0
+Generators.tests.cpp:<line number>: passed: i % 2 == 0 for: 0 == 0
 Generators.tests.cpp:<line number>: passed: filter([] (int) {return false; }, value(1)), Catch::GeneratorException
 Generators.tests.cpp:<line number>: passed: i < 4 for: 1 < 4
 Generators.tests.cpp:<line number>: passed: i < 4 for: 2 < 4
@@ -944,6 +948,7 @@ TestCaseInfoHasher.tests.cpp:<line number>: passed: h( dummy1 ) != h( dummy2 ) f
 TestCaseInfoHasher.tests.cpp:<line number>: passed: h( dummy ) == h( dummy ) for: 3422778688 (0x<hex digits>)
 ==
 3422778688 (0x<hex digits>)
+Message.tests.cpp:<line number>: failed: explicitly with 3 messages: 'This info has multiple parts.' and 'This unscoped info has multiple parts.' and 'Show infos!'
 Message.tests.cpp:<line number>: warning: 'this is a message' with 1 message: 'this is a warning'
 Message.tests.cpp:<line number>: failed: a == 1 for: 2 == 1 with 2 messages: 'this message should be logged' and 'so should this'
 Message.tests.cpp:<line number>: passed: a == 2 for: 2 == 2 with 1 message: 'this message may be logged later'
@@ -961,6 +966,7 @@ Message.tests.cpp:<line number>: passed: i < 10 for: 7 < 10 with 2 messages: 'cu
 Message.tests.cpp:<line number>: passed: i < 10 for: 8 < 10 with 2 messages: 'current counter 8' and 'i := 8'
 Message.tests.cpp:<line number>: passed: i < 10 for: 9 < 10 with 2 messages: 'current counter 9' and 'i := 9'
 Message.tests.cpp:<line number>: failed: i < 10 for: 10 < 10 with 2 messages: 'current counter 10' and 'i := 10'
+AssertionHandler.tests.cpp:<line number>: failed: unexpected exception with message: 'Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE'; expression was: Dummy
 Condition.tests.cpp:<line number>: failed: data.int_seven != 7 for: 7 != 7
 Condition.tests.cpp:<line number>: failed: data.float_nine_point_one != Approx( 9.1f ) for: 9.1f != Approx( 9.1000003815 )
 Condition.tests.cpp:<line number>: failed: data.double_pi != Approx( 3.1415926535 ) for: 3.1415926535 != Approx( 3.1415926535 )
@@ -977,6 +983,91 @@ Condition.tests.cpp:<line number>: passed: data.str_hello != "goodbye" for: "hel
 Condition.tests.cpp:<line number>: passed: data.str_hello != "hell" for: "hello" != "hell"
 Condition.tests.cpp:<line number>: passed: data.str_hello != "hello1" for: "hello" != "hello1"
 Condition.tests.cpp:<line number>: passed: data.str_hello.size() != 6 for: 5 != 6
+Json.tests.cpp:<line number>: passed: stream.str() == "" for: "" == ""
+Json.tests.cpp:<line number>: passed: stream.str() == "{\n}" for: "{
+}"
+==
+"{
+}"
+Json.tests.cpp:<line number>: passed: stream.str(), ContainsSubstring( "\"int\": 1," ) && ContainsSubstring( "\"double\": 1.5," ) && ContainsSubstring( "\"true\": true," ) && ContainsSubstring( "\"false\": false," ) && ContainsSubstring( "\"string\": \"this is a string\"," ) && ContainsSubstring( "\"array\": [\n    1,\n    2\n  ]\n}" ) for: "{
+  "int": 1,
+  "double": 1.5,
+  "true": true,
+  "false": false,
+  "string": "this is a string",
+  "array": [
+    1,
+    2
+  ]
+}" ( contains: ""int": 1," and contains: ""double": 1.5," and contains: ""true": true," and contains: ""false": false," and contains: ""string": "this is a string"," and contains: ""array": [
+    1,
+    2
+  ]
+}" )
+Json.tests.cpp:<line number>: passed: stream.str(), ContainsSubstring( "\"empty_object\": {\n  }," ) && ContainsSubstring( "\"fully_object\": {\n    \"key\": 1\n  }" ) for: "{
+  "empty_object": {
+  },
+  "fully_object": {
+    "key": 1
+  }
+}" ( contains: ""empty_object": {
+  }," and contains: ""fully_object": {
+    "key": 1
+  }" )
+Json.tests.cpp:<line number>: passed: stream.str() == "[\n]" for: "[
+]"
+==
+"[
+]"
+Json.tests.cpp:<line number>: passed: stream.str() == "[\n  1,\n  1.5,\n  true,\n  false,\n  \"this is a string\",\n  {\n    \"object\": 42\n  },\n  [\n    \"array\",\n    42.5\n  ]\n]" for: "[
+  1,
+  1.5,
+  true,
+  false,
+  "this is a string",
+  {
+    "object": 42
+  },
+  [
+    "array",
+    42.5
+  ]
+]"
+==
+"[
+  1,
+  1.5,
+  true,
+  false,
+  "this is a string",
+  {
+    "object": 42
+  },
+  [
+    "array",
+    42.5
+  ]
+]"
+Json.tests.cpp:<line number>: passed: stream.str() == "{\n}" for: "{
+}"
+==
+"{
+}"
+Json.tests.cpp:<line number>: passed: stream.str() == "[\n]" for: "[
+]"
+==
+"[
+]"
+Json.tests.cpp:<line number>: passed: stream.str() == "\"custom\"" for: ""custom"" == ""custom""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\\"\"" for: ""\""" == ""\"""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\\\\"" for: ""\\"" == ""\\""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"/\"" for: ""/"" == ""/""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\b\"" for: ""\b"" == ""\b""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\f\"" for: ""\f"" == ""\f""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\n\"" for: ""\n"" == ""\n""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\r\"" for: ""\r"" == ""\r""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\t\"" for: ""\t"" == ""\t""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\\\/\\t\\r\\n\"" for: ""\\/\t\r\n"" == ""\\/\t\r\n""
 Compilation.tests.cpp:<line number>: passed: []() { return true; }() for: true
 Approx.tests.cpp:<line number>: passed: d <= Approx( 1.24 ) for: 1.23 <= Approx( 1.24 )
 Approx.tests.cpp:<line number>: passed: d <= Approx( 1.23 ) for: 1.23 <= Approx( 1.23 )
@@ -1341,6 +1432,60 @@ Reporters.tests.cpp:<line number>: passed: listingString, ContainsSubstring( "fa
 
 " ( contains: "fake test name" and contains: "fakeTestTag" ) with 1 message: 'Tested reporter: console'
 Reporters.tests.cpp:<line number>: passed: !(factories.empty()) for: !false
+Reporters.tests.cpp:<line number>: passed: listingString, ContainsSubstring("fakeTag"s) for: "{
+  "version": 1,
+  "metadata": {
+    "name": "",
+    "rng-seed": 1234,
+    "catch2-version": "<version>"
+  },
+  "listings": {
+    "tags": [
+      {
+        "aliases": [
+          "fakeTag"
+        ],
+        "count": 1
+      }
+    ]" contains: "fakeTag" with 1 message: 'Tested reporter: JSON'
+Reporters.tests.cpp:<line number>: passed: !(factories.empty()) for: !false
+Reporters.tests.cpp:<line number>: passed: listingString, ContainsSubstring("fake reporter"s) for: "{
+  "version": 1,
+  "metadata": {
+    "name": "",
+    "rng-seed": 1234,
+    "catch2-version": "<version>"
+  },
+  "listings": {
+    "reporters": [
+      {
+        "name": "fake reporter",
+        "description": "fake description"
+      }
+    ]" contains: "fake reporter" with 1 message: 'Tested reporter: JSON'
+Reporters.tests.cpp:<line number>: passed: !(factories.empty()) for: !false
+Reporters.tests.cpp:<line number>: passed: listingString, ContainsSubstring( "fake test name"s ) && ContainsSubstring( "fakeTestTag"s ) for: "{
+  "version": 1,
+  "metadata": {
+    "name": "",
+    "rng-seed": 1234,
+    "catch2-version": "<version>"
+  },
+  "listings": {
+    "tests": [
+      {
+        "name": "fake test name",
+        "class-name": "",
+        "tags": [
+          "fakeTestTag"
+        ],
+        "source-location": {
+          "filename": "fake-file.cpp",
+          "line": 123456789
+        }
+      }
+    ]" ( contains: "fake test name" and contains: "fakeTestTag" ) with 1 message: 'Tested reporter: JSON'
+Reporters.tests.cpp:<line number>: passed: !(factories.empty()) for: !false
 Reporters.tests.cpp:<line number>: passed: listingString, ContainsSubstring("fakeTag"s) for: "<?xml version="1.0" encoding="UTF-8"?>
 All available tags:
    1  [fakeTag]
@@ -1750,6 +1895,10 @@ Misc.tests.cpp:<line number>: passed: true
 Misc.tests.cpp:<line number>: failed: explicitly
 Misc.tests.cpp:<line number>: failed - but was ok: false
 Misc.tests.cpp:<line number>: failed: explicitly
+Misc.tests.cpp:<line number>: passed: true
+Misc.tests.cpp:<line number>: failed: unexpected exception with message: 'Uncaught exception should fail!'; expression was: {Unknown expression after the reported line}
+Misc.tests.cpp:<line number>: failed - but was ok: false
+Misc.tests.cpp:<line number>: failed: unexpected exception with message: 'Uncaught exception should fail!'; expression was: {Unknown expression after the reported line}
 Message.tests.cpp:<line number>: failed - but was ok: 1 == 2
 Reporters.tests.cpp:<line number>: passed: listingString, ContainsSubstring("[fakeTag]"s) for: "All available tags:
    1  [fakeTag]
@@ -2473,6 +2622,8 @@ InternalBenchmark.tests.cpp:<line number>: passed: e.point == 23 for: 23.0 == 23
 InternalBenchmark.tests.cpp:<line number>: passed: e.upper_bound == 23 for: 23.0 == 23
 InternalBenchmark.tests.cpp:<line number>: passed: e.lower_bound == 23 for: 23.0 == 23
 InternalBenchmark.tests.cpp:<line number>: passed: e.confidence_interval == 0.95 for: 0.95 == 0.95
+RandomNumberGeneration.tests.cpp:<line number>: passed: dist.a() == -10 for: -10 == -10
+RandomNumberGeneration.tests.cpp:<line number>: passed: dist.b() == 10 for: 10 == 10
 UniquePtr.tests.cpp:<line number>: passed: !(ptr) for: !{?}
 UniquePtr.tests.cpp:<line number>: passed: ptr.get() == 0 for: 0 == 0
 UniquePtr.tests.cpp:<line number>: passed: ptr for: {?}
@@ -2538,7 +2689,7 @@ InternalBenchmark.tests.cpp:<line number>: passed: med == 18. for: 18.0 == 18.0
 InternalBenchmark.tests.cpp:<line number>: passed: q3 == 23. for: 23.0 == 23.0
 Misc.tests.cpp:<line number>: passed:
 Misc.tests.cpp:<line number>: passed:
-test cases:  409 |  308 passed |  84 failed | 6 skipped | 11 failed as expected
-assertions: 2225 | 2048 passed | 145 failed | 32 failed as expected
+test cases:  417 |  312 passed |  85 failed | 6 skipped | 14 failed as expected
+assertions: 2260 | 2079 passed | 146 failed | 35 failed as expected
 
 
diff --git a/packages/Catch2/tests/SelfTest/Baselines/compact.sw.multi.approved.txt b/packages/Catch2/tests/SelfTest/Baselines/compact.sw.multi.approved.txt
index 6c48ab917fb8b1d1aba40105e3d4c58c8bb14cd7..214fef74b8100bd2a3088cd213545c74555ae086 100644
--- a/packages/Catch2/tests/SelfTest/Baselines/compact.sw.multi.approved.txt
+++ b/packages/Catch2/tests/SelfTest/Baselines/compact.sw.multi.approved.txt
@@ -329,7 +329,7 @@ MatchersRanges.tests.cpp:<line number>: passed: inner_lists_are_empty.front(), I
 MatchersRanges.tests.cpp:<line number>: passed: has_empty{}, !IsEmpty() for: {?} not is empty
 MatchersRanges.tests.cpp:<line number>: passed: unrelated::ADL_empty{}, IsEmpty() for: {?} is empty
 Message.tests.cpp:<line number>: passed: with 7 messages: 'a := 1' and 'b := 2' and 'c := 3' and 'a + b := 3' and 'a+b := 3' and 'c > b := true' and 'a == 1 := true'
-Message.tests.cpp:<line number>: passed: with 7 messages: 'std::vector<int>{1, 2, 3}[0, 1, 2] := 3' and 'std::vector<int>{1, 2, 3}[(0, 1)] := 2' and 'std::vector<int>{1, 2, 3}[0] := 1' and '(helper_1436<int, int>{12, -12}) := { 12, -12 }' and '(helper_1436<int, int>(-12, 12)) := { -12, 12 }' and '(1, 2) := 2' and '(2, 3) := 3'
+Message.tests.cpp:<line number>: passed: with 7 messages: 'custom_index_op<int>{1, 2, 3}[0, 1, 2] := 0' and 'custom_index_op<int>{1, 2, 3}[(0, 1)] := 0' and 'custom_index_op<int>{1, 2, 3}[0] := 0' and '(helper_1436<int, int>{12, -12}) := { 12, -12 }' and '(helper_1436<int, int>(-12, 12)) := { -12, 12 }' and '(1, 2) := 2' and '(2, 3) := 3'
 Message.tests.cpp:<line number>: passed: with 11 messages: '("comma, in string", "escaped, \", ") := "escaped, ", "' and '"single quote in string,'," := "single quote in string,',"' and '"some escapes, \\,\\\\" := "some escapes, \,\\"' and '"some, ), unmatched, } prenheses {[<" := "some, ), unmatched, } prenheses {[<"' and ''"' := '"'' and ''\'' := '''' and '',' := ','' and ''}' := '}'' and '')' := ')'' and ''(' := '('' and ''{' := '{''
 ToStringGeneral.tests.cpp:<line number>: passed: true with 1 message: 'i := 2'
 ToStringGeneral.tests.cpp:<line number>: passed: true with 1 message: '3'
@@ -664,6 +664,10 @@ Matchers.tests.cpp:<line number>: passed: 1., !IsNaN() for: 1.0 not is NaN
 Generators.tests.cpp:<line number>: passed: i % 2 == 0 for: 0 == 0
 Generators.tests.cpp:<line number>: passed: i % 2 == 0 for: 0 == 0
 Generators.tests.cpp:<line number>: passed: i % 2 == 0 for: 0 == 0
+Generators.tests.cpp:<line number>: passed: i % 2 == 0 for: 0 == 0
+Generators.tests.cpp:<line number>: passed: i % 2 == 0 for: 0 == 0
+Generators.tests.cpp:<line number>: passed: i % 2 == 0 for: 0 == 0
+Generators.tests.cpp:<line number>: passed: i % 2 == 0 for: 0 == 0
 Generators.tests.cpp:<line number>: passed: filter([] (int) {return false; }, value(1)), Catch::GeneratorException
 Generators.tests.cpp:<line number>: passed: i < 4 for: 1 < 4
 Generators.tests.cpp:<line number>: passed: i < 4 for: 2 < 4
@@ -942,6 +946,7 @@ TestCaseInfoHasher.tests.cpp:<line number>: passed: h( dummy1 ) != h( dummy2 ) f
 TestCaseInfoHasher.tests.cpp:<line number>: passed: h( dummy ) == h( dummy ) for: 3422778688 (0x<hex digits>)
 ==
 3422778688 (0x<hex digits>)
+Message.tests.cpp:<line number>: failed: explicitly with 3 messages: 'This info has multiple parts.' and 'This unscoped info has multiple parts.' and 'Show infos!'
 Message.tests.cpp:<line number>: warning: 'this is a message' with 1 message: 'this is a warning'
 Message.tests.cpp:<line number>: failed: a == 1 for: 2 == 1 with 2 messages: 'this message should be logged' and 'so should this'
 Message.tests.cpp:<line number>: passed: a == 2 for: 2 == 2 with 1 message: 'this message may be logged later'
@@ -959,6 +964,7 @@ Message.tests.cpp:<line number>: passed: i < 10 for: 7 < 10 with 2 messages: 'cu
 Message.tests.cpp:<line number>: passed: i < 10 for: 8 < 10 with 2 messages: 'current counter 8' and 'i := 8'
 Message.tests.cpp:<line number>: passed: i < 10 for: 9 < 10 with 2 messages: 'current counter 9' and 'i := 9'
 Message.tests.cpp:<line number>: failed: i < 10 for: 10 < 10 with 2 messages: 'current counter 10' and 'i := 10'
+AssertionHandler.tests.cpp:<line number>: failed: unexpected exception with message: 'Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE'; expression was: Dummy
 Condition.tests.cpp:<line number>: failed: data.int_seven != 7 for: 7 != 7
 Condition.tests.cpp:<line number>: failed: data.float_nine_point_one != Approx( 9.1f ) for: 9.1f != Approx( 9.1000003815 )
 Condition.tests.cpp:<line number>: failed: data.double_pi != Approx( 3.1415926535 ) for: 3.1415926535 != Approx( 3.1415926535 )
@@ -975,6 +981,91 @@ Condition.tests.cpp:<line number>: passed: data.str_hello != "goodbye" for: "hel
 Condition.tests.cpp:<line number>: passed: data.str_hello != "hell" for: "hello" != "hell"
 Condition.tests.cpp:<line number>: passed: data.str_hello != "hello1" for: "hello" != "hello1"
 Condition.tests.cpp:<line number>: passed: data.str_hello.size() != 6 for: 5 != 6
+Json.tests.cpp:<line number>: passed: stream.str() == "" for: "" == ""
+Json.tests.cpp:<line number>: passed: stream.str() == "{\n}" for: "{
+}"
+==
+"{
+}"
+Json.tests.cpp:<line number>: passed: stream.str(), ContainsSubstring( "\"int\": 1," ) && ContainsSubstring( "\"double\": 1.5," ) && ContainsSubstring( "\"true\": true," ) && ContainsSubstring( "\"false\": false," ) && ContainsSubstring( "\"string\": \"this is a string\"," ) && ContainsSubstring( "\"array\": [\n    1,\n    2\n  ]\n}" ) for: "{
+  "int": 1,
+  "double": 1.5,
+  "true": true,
+  "false": false,
+  "string": "this is a string",
+  "array": [
+    1,
+    2
+  ]
+}" ( contains: ""int": 1," and contains: ""double": 1.5," and contains: ""true": true," and contains: ""false": false," and contains: ""string": "this is a string"," and contains: ""array": [
+    1,
+    2
+  ]
+}" )
+Json.tests.cpp:<line number>: passed: stream.str(), ContainsSubstring( "\"empty_object\": {\n  }," ) && ContainsSubstring( "\"fully_object\": {\n    \"key\": 1\n  }" ) for: "{
+  "empty_object": {
+  },
+  "fully_object": {
+    "key": 1
+  }
+}" ( contains: ""empty_object": {
+  }," and contains: ""fully_object": {
+    "key": 1
+  }" )
+Json.tests.cpp:<line number>: passed: stream.str() == "[\n]" for: "[
+]"
+==
+"[
+]"
+Json.tests.cpp:<line number>: passed: stream.str() == "[\n  1,\n  1.5,\n  true,\n  false,\n  \"this is a string\",\n  {\n    \"object\": 42\n  },\n  [\n    \"array\",\n    42.5\n  ]\n]" for: "[
+  1,
+  1.5,
+  true,
+  false,
+  "this is a string",
+  {
+    "object": 42
+  },
+  [
+    "array",
+    42.5
+  ]
+]"
+==
+"[
+  1,
+  1.5,
+  true,
+  false,
+  "this is a string",
+  {
+    "object": 42
+  },
+  [
+    "array",
+    42.5
+  ]
+]"
+Json.tests.cpp:<line number>: passed: stream.str() == "{\n}" for: "{
+}"
+==
+"{
+}"
+Json.tests.cpp:<line number>: passed: stream.str() == "[\n]" for: "[
+]"
+==
+"[
+]"
+Json.tests.cpp:<line number>: passed: stream.str() == "\"custom\"" for: ""custom"" == ""custom""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\\"\"" for: ""\""" == ""\"""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\\\\"" for: ""\\"" == ""\\""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"/\"" for: ""/"" == ""/""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\b\"" for: ""\b"" == ""\b""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\f\"" for: ""\f"" == ""\f""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\n\"" for: ""\n"" == ""\n""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\r\"" for: ""\r"" == ""\r""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\t\"" for: ""\t"" == ""\t""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\\\/\\t\\r\\n\"" for: ""\\/\t\r\n"" == ""\\/\t\r\n""
 Compilation.tests.cpp:<line number>: passed: []() { return true; }() for: true
 Approx.tests.cpp:<line number>: passed: d <= Approx( 1.24 ) for: 1.23 <= Approx( 1.24 )
 Approx.tests.cpp:<line number>: passed: d <= Approx( 1.23 ) for: 1.23 <= Approx( 1.23 )
@@ -1339,6 +1430,60 @@ Reporters.tests.cpp:<line number>: passed: listingString, ContainsSubstring( "fa
 
 " ( contains: "fake test name" and contains: "fakeTestTag" ) with 1 message: 'Tested reporter: console'
 Reporters.tests.cpp:<line number>: passed: !(factories.empty()) for: !false
+Reporters.tests.cpp:<line number>: passed: listingString, ContainsSubstring("fakeTag"s) for: "{
+  "version": 1,
+  "metadata": {
+    "name": "",
+    "rng-seed": 1234,
+    "catch2-version": "<version>"
+  },
+  "listings": {
+    "tags": [
+      {
+        "aliases": [
+          "fakeTag"
+        ],
+        "count": 1
+      }
+    ]" contains: "fakeTag" with 1 message: 'Tested reporter: JSON'
+Reporters.tests.cpp:<line number>: passed: !(factories.empty()) for: !false
+Reporters.tests.cpp:<line number>: passed: listingString, ContainsSubstring("fake reporter"s) for: "{
+  "version": 1,
+  "metadata": {
+    "name": "",
+    "rng-seed": 1234,
+    "catch2-version": "<version>"
+  },
+  "listings": {
+    "reporters": [
+      {
+        "name": "fake reporter",
+        "description": "fake description"
+      }
+    ]" contains: "fake reporter" with 1 message: 'Tested reporter: JSON'
+Reporters.tests.cpp:<line number>: passed: !(factories.empty()) for: !false
+Reporters.tests.cpp:<line number>: passed: listingString, ContainsSubstring( "fake test name"s ) && ContainsSubstring( "fakeTestTag"s ) for: "{
+  "version": 1,
+  "metadata": {
+    "name": "",
+    "rng-seed": 1234,
+    "catch2-version": "<version>"
+  },
+  "listings": {
+    "tests": [
+      {
+        "name": "fake test name",
+        "class-name": "",
+        "tags": [
+          "fakeTestTag"
+        ],
+        "source-location": {
+          "filename": "fake-file.cpp",
+          "line": 123456789
+        }
+      }
+    ]" ( contains: "fake test name" and contains: "fakeTestTag" ) with 1 message: 'Tested reporter: JSON'
+Reporters.tests.cpp:<line number>: passed: !(factories.empty()) for: !false
 Reporters.tests.cpp:<line number>: passed: listingString, ContainsSubstring("fakeTag"s) for: "<?xml version="1.0" encoding="UTF-8"?>
 All available tags:
    1  [fakeTag]
@@ -1743,6 +1888,10 @@ Misc.tests.cpp:<line number>: passed: true
 Misc.tests.cpp:<line number>: failed: explicitly
 Misc.tests.cpp:<line number>: failed - but was ok: false
 Misc.tests.cpp:<line number>: failed: explicitly
+Misc.tests.cpp:<line number>: passed: true
+Misc.tests.cpp:<line number>: failed: unexpected exception with message: 'Uncaught exception should fail!'; expression was: {Unknown expression after the reported line}
+Misc.tests.cpp:<line number>: failed - but was ok: false
+Misc.tests.cpp:<line number>: failed: unexpected exception with message: 'Uncaught exception should fail!'; expression was: {Unknown expression after the reported line}
 Message.tests.cpp:<line number>: failed - but was ok: 1 == 2
 Reporters.tests.cpp:<line number>: passed: listingString, ContainsSubstring("[fakeTag]"s) for: "All available tags:
    1  [fakeTag]
@@ -2462,6 +2611,8 @@ InternalBenchmark.tests.cpp:<line number>: passed: e.point == 23 for: 23.0 == 23
 InternalBenchmark.tests.cpp:<line number>: passed: e.upper_bound == 23 for: 23.0 == 23
 InternalBenchmark.tests.cpp:<line number>: passed: e.lower_bound == 23 for: 23.0 == 23
 InternalBenchmark.tests.cpp:<line number>: passed: e.confidence_interval == 0.95 for: 0.95 == 0.95
+RandomNumberGeneration.tests.cpp:<line number>: passed: dist.a() == -10 for: -10 == -10
+RandomNumberGeneration.tests.cpp:<line number>: passed: dist.b() == 10 for: 10 == 10
 UniquePtr.tests.cpp:<line number>: passed: !(ptr) for: !{?}
 UniquePtr.tests.cpp:<line number>: passed: ptr.get() == 0 for: 0 == 0
 UniquePtr.tests.cpp:<line number>: passed: ptr for: {?}
@@ -2527,7 +2678,7 @@ InternalBenchmark.tests.cpp:<line number>: passed: med == 18. for: 18.0 == 18.0
 InternalBenchmark.tests.cpp:<line number>: passed: q3 == 23. for: 23.0 == 23.0
 Misc.tests.cpp:<line number>: passed:
 Misc.tests.cpp:<line number>: passed:
-test cases:  409 |  308 passed |  84 failed | 6 skipped | 11 failed as expected
-assertions: 2225 | 2048 passed | 145 failed | 32 failed as expected
+test cases:  417 |  312 passed |  85 failed | 6 skipped | 14 failed as expected
+assertions: 2260 | 2079 passed | 146 failed | 35 failed as expected
 
 
diff --git a/packages/Catch2/tests/SelfTest/Baselines/console.std.approved.txt b/packages/Catch2/tests/SelfTest/Baselines/console.std.approved.txt
index 0945f0dfb8c4f4dda4ba9b5dbb46eadc43cdb671..2542625656cb623db585749d003a4c9a7d3d221d 100644
--- a/packages/Catch2/tests/SelfTest/Baselines/console.std.approved.txt
+++ b/packages/Catch2/tests/SelfTest/Baselines/console.std.approved.txt
@@ -599,6 +599,18 @@ explicitly with message:
 Message.tests.cpp:<line number>: warning:
   This message appears in the output
 
+-------------------------------------------------------------------------------
+INFO and UNSCOPED_INFO can stream multiple arguments
+-------------------------------------------------------------------------------
+Message.tests.cpp:<line number>
+...............................................................................
+
+Message.tests.cpp:<line number>: FAILED:
+explicitly with messages:
+  This info has multiple parts.
+  This unscoped info has multiple parts.
+  Show infos!
+
 -------------------------------------------------------------------------------
 INFO and WARN do not abort tests
 -------------------------------------------------------------------------------
@@ -659,6 +671,17 @@ with messages:
   current counter 10
   i := 10
 
+-------------------------------------------------------------------------------
+Incomplete AssertionHandler
+-------------------------------------------------------------------------------
+AssertionHandler.tests.cpp:<line number>
+...............................................................................
+
+AssertionHandler.tests.cpp:<line number>: FAILED:
+  REQUIRE( Dummy )
+due to unexpected exception with message:
+  Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE
+
 -------------------------------------------------------------------------------
 Inequality checks that should fail
 -------------------------------------------------------------------------------
@@ -997,6 +1020,28 @@ Misc.tests.cpp:<line number>
 
 Misc.tests.cpp:<line number>: FAILED:
 
+-------------------------------------------------------------------------------
+Testing checked-if 4
+-------------------------------------------------------------------------------
+Misc.tests.cpp:<line number>
+...............................................................................
+
+Misc.tests.cpp:<line number>: FAILED:
+  {Unknown expression after the reported line}
+due to unexpected exception with message:
+  Uncaught exception should fail!
+
+-------------------------------------------------------------------------------
+Testing checked-if 5
+-------------------------------------------------------------------------------
+Misc.tests.cpp:<line number>
+...............................................................................
+
+Misc.tests.cpp:<line number>: FAILED:
+  {Unknown expression after the reported line}
+due to unexpected exception with message:
+  Uncaught exception should fail!
+
 -------------------------------------------------------------------------------
 Thrown string literals are translated
 -------------------------------------------------------------------------------
@@ -1543,6 +1588,6 @@ due to unexpected exception with message:
   Why would you throw a std::string?
 
 ===============================================================================
-test cases:  409 |  322 passed |  69 failed | 7 skipped | 11 failed as expected
-assertions: 2208 | 2048 passed | 128 failed | 32 failed as expected
+test cases:  417 |  326 passed |  70 failed | 7 skipped | 14 failed as expected
+assertions: 2243 | 2079 passed | 129 failed | 35 failed as expected
 
diff --git a/packages/Catch2/tests/SelfTest/Baselines/console.sw.approved.txt b/packages/Catch2/tests/SelfTest/Baselines/console.sw.approved.txt
index 150980e82ff902f791fec3ee357653d1a67b6c07..077b7bf750b7424dcc6fbc0498851a6a0a2a0e96 100644
--- a/packages/Catch2/tests/SelfTest/Baselines/console.sw.approved.txt
+++ b/packages/Catch2/tests/SelfTest/Baselines/console.sw.approved.txt
@@ -2740,9 +2740,9 @@ Message.tests.cpp:<line number>
 
 Message.tests.cpp:<line number>: PASSED:
 with messages:
-  std::vector<int>{1, 2, 3}[0, 1, 2] := 3
-  std::vector<int>{1, 2, 3}[(0, 1)] := 2
-  std::vector<int>{1, 2, 3}[0] := 1
+  custom_index_op<int>{1, 2, 3}[0, 1, 2] := 0
+  custom_index_op<int>{1, 2, 3}[(0, 1)] := 0
+  custom_index_op<int>{1, 2, 3}[0] := 0
   (helper_1436<int, int>{12, -12}) := { 12, -12 }
   (helper_1436<int, int>(-12, 12)) := { -12, 12 }
   (1, 2) := 2
@@ -4889,6 +4889,50 @@ Matchers.tests.cpp:<line number>: PASSED:
 with expansion:
   1.0 not is NaN
 
+-------------------------------------------------------------------------------
+GENERATE can combine literals and generators
+-------------------------------------------------------------------------------
+Generators.tests.cpp:<line number>
+...............................................................................
+
+Generators.tests.cpp:<line number>: PASSED:
+  REQUIRE( i % 2 == 0 )
+with expansion:
+  0 == 0
+
+-------------------------------------------------------------------------------
+GENERATE can combine literals and generators
+-------------------------------------------------------------------------------
+Generators.tests.cpp:<line number>
+...............................................................................
+
+Generators.tests.cpp:<line number>: PASSED:
+  REQUIRE( i % 2 == 0 )
+with expansion:
+  0 == 0
+
+-------------------------------------------------------------------------------
+GENERATE can combine literals and generators
+-------------------------------------------------------------------------------
+Generators.tests.cpp:<line number>
+...............................................................................
+
+Generators.tests.cpp:<line number>: PASSED:
+  REQUIRE( i % 2 == 0 )
+with expansion:
+  0 == 0
+
+-------------------------------------------------------------------------------
+GENERATE can combine literals and generators
+-------------------------------------------------------------------------------
+Generators.tests.cpp:<line number>
+...............................................................................
+
+Generators.tests.cpp:<line number>: PASSED:
+  REQUIRE( i % 2 == 0 )
+with expansion:
+  0 == 0
+
 -------------------------------------------------------------------------------
 Generators -- adapters
   Filtering by predicate
@@ -6982,6 +7026,18 @@ with expansion:
   ==
   3422778688 (0x<hex digits>)
 
+-------------------------------------------------------------------------------
+INFO and UNSCOPED_INFO can stream multiple arguments
+-------------------------------------------------------------------------------
+Message.tests.cpp:<line number>
+...............................................................................
+
+Message.tests.cpp:<line number>: FAILED:
+explicitly with messages:
+  This info has multiple parts.
+  This unscoped info has multiple parts.
+  Show infos!
+
 -------------------------------------------------------------------------------
 INFO and WARN do not abort tests
 -------------------------------------------------------------------------------
@@ -7143,6 +7199,17 @@ with messages:
   current counter 10
   i := 10
 
+-------------------------------------------------------------------------------
+Incomplete AssertionHandler
+-------------------------------------------------------------------------------
+AssertionHandler.tests.cpp:<line number>
+...............................................................................
+
+AssertionHandler.tests.cpp:<line number>: FAILED:
+  REQUIRE( Dummy )
+due to unexpected exception with message:
+  Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE
+
 -------------------------------------------------------------------------------
 Inequality checks that should fail
 -------------------------------------------------------------------------------
@@ -7235,6 +7302,291 @@ Condition.tests.cpp:<line number>: PASSED:
 with expansion:
   5 != 6
 
+-------------------------------------------------------------------------------
+JsonWriter
+  Newly constructed JsonWriter does nothing
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( stream.str() == "" )
+with expansion:
+  "" == ""
+
+-------------------------------------------------------------------------------
+JsonWriter
+  Calling writeObject will create an empty pair of braces
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( stream.str() == "{\n}" )
+with expansion:
+  "{
+  }"
+  ==
+  "{
+  }"
+
+-------------------------------------------------------------------------------
+JsonWriter
+  Calling writeObject with key will create an object to write the value
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE_THAT( stream.str(), ContainsSubstring( "\"int\": 1," ) && ContainsSubstring( "\"double\": 1.5," ) && ContainsSubstring( "\"true\": true," ) && ContainsSubstring( "\"false\": false," ) && ContainsSubstring( "\"string\": \"this is a string\"," ) && ContainsSubstring( "\"array\": [\n    1,\n    2\n  ]\n}" ) )
+with expansion:
+  "{
+    "int": 1,
+    "double": 1.5,
+    "true": true,
+    "false": false,
+    "string": "this is a string",
+    "array": [
+      1,
+      2
+    ]
+  }" ( contains: ""int": 1," and contains: ""double": 1.5," and contains:
+  ""true": true," and contains: ""false": false," and contains: ""string":
+  "this is a string"," and contains: ""array": [
+      1,
+      2
+    ]
+  }" )
+
+-------------------------------------------------------------------------------
+JsonWriter
+  nesting objects
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE_THAT( stream.str(), ContainsSubstring( "\"empty_object\": {\n  }," ) && ContainsSubstring( "\"fully_object\": {\n    \"key\": 1\n  }" ) )
+with expansion:
+  "{
+    "empty_object": {
+    },
+    "fully_object": {
+      "key": 1
+    }
+  }" ( contains: ""empty_object": {
+    }," and contains: ""fully_object": {
+      "key": 1
+    }" )
+
+-------------------------------------------------------------------------------
+JsonWriter
+  Calling writeArray will create an empty pair of braces
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( stream.str() == "[\n]" )
+with expansion:
+  "[
+  ]"
+  ==
+  "[
+  ]"
+
+-------------------------------------------------------------------------------
+JsonWriter
+  Calling writeArray creates array to write the values to
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( stream.str() == "[\n  1,\n  1.5,\n  true,\n  false,\n  \"this is a string\",\n  {\n    \"object\": 42\n  },\n  [\n    \"array\",\n    42.5\n  ]\n]" )
+with expansion:
+  "[
+    1,
+    1.5,
+    true,
+    false,
+    "this is a string",
+    {
+      "object": 42
+    },
+    [
+      "array",
+      42.5
+    ]
+  ]"
+  ==
+  "[
+    1,
+    1.5,
+    true,
+    false,
+    "this is a string",
+    {
+      "object": 42
+    },
+    [
+      "array",
+      42.5
+    ]
+  ]"
+
+-------------------------------------------------------------------------------
+JsonWriter
+  Moved from JsonObjectWriter shall not insert superfluous brace
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( stream.str() == "{\n}" )
+with expansion:
+  "{
+  }"
+  ==
+  "{
+  }"
+
+-------------------------------------------------------------------------------
+JsonWriter
+  Moved from JsonArrayWriter shall not insert superfluous bracket
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( stream.str() == "[\n]" )
+with expansion:
+  "[
+  ]"
+  ==
+  "[
+  ]"
+
+-------------------------------------------------------------------------------
+JsonWriter
+  Custom class shall be quoted
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( stream.str() == "\"custom\"" )
+with expansion:
+  ""custom"" == ""custom""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  Quote in a string is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\\"\"" )
+with expansion:
+  ""\""" == ""\"""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  Backslash in a string is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\\\\"" )
+with expansion:
+  ""\\"" == ""\\""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  Forward slash in a string is **not** escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"/\"" )
+with expansion:
+  ""/"" == ""/""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  Backspace in a string is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\b\"" )
+with expansion:
+  ""\b"" == ""\b""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  Formfeed in a string is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\f\"" )
+with expansion:
+  ""\f"" == ""\f""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  linefeed in a string is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\n\"" )
+with expansion:
+  ""\n"" == ""\n""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  carriage return in a string is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\r\"" )
+with expansion:
+  ""\r"" == ""\r""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  tab in a string is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\t\"" )
+with expansion:
+  ""\t"" == ""\t""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  combination of characters is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\\\/\\t\\r\\n\"" )
+with expansion:
+  ""\\/\t\r\n"" == ""\\/\t\r\n""
+
 -------------------------------------------------------------------------------
 Lambdas in assertions
 -------------------------------------------------------------------------------
@@ -9733,6 +10085,129 @@ Reporter's write listings to provided stream
 Reporters.tests.cpp:<line number>
 ...............................................................................
 
+Reporters.tests.cpp:<line number>: PASSED:
+  REQUIRE_FALSE( factories.empty() )
+with expansion:
+  !false
+
+-------------------------------------------------------------------------------
+Reporter's write listings to provided stream
+  JSON reporter lists tags
+-------------------------------------------------------------------------------
+Reporters.tests.cpp:<line number>
+...............................................................................
+
+Reporters.tests.cpp:<line number>: PASSED:
+  REQUIRE_THAT( listingString, ContainsSubstring("fakeTag"s) )
+with expansion:
+  "{
+    "version": 1,
+    "metadata": {
+      "name": "",
+      "rng-seed": 1234,
+      "catch2-version": "<version>"
+    },
+    "listings": {
+      "tags": [
+        {
+          "aliases": [
+            "fakeTag"
+          ],
+          "count": 1
+        }
+      ]" contains: "fakeTag"
+with message:
+  Tested reporter: JSON
+
+-------------------------------------------------------------------------------
+Reporter's write listings to provided stream
+-------------------------------------------------------------------------------
+Reporters.tests.cpp:<line number>
+...............................................................................
+
+Reporters.tests.cpp:<line number>: PASSED:
+  REQUIRE_FALSE( factories.empty() )
+with expansion:
+  !false
+
+-------------------------------------------------------------------------------
+Reporter's write listings to provided stream
+  JSON reporter lists reporters
+-------------------------------------------------------------------------------
+Reporters.tests.cpp:<line number>
+...............................................................................
+
+Reporters.tests.cpp:<line number>: PASSED:
+  REQUIRE_THAT( listingString, ContainsSubstring("fake reporter"s) )
+with expansion:
+  "{
+    "version": 1,
+    "metadata": {
+      "name": "",
+      "rng-seed": 1234,
+      "catch2-version": "<version>"
+    },
+    "listings": {
+      "reporters": [
+        {
+          "name": "fake reporter",
+          "description": "fake description"
+        }
+      ]" contains: "fake reporter"
+with message:
+  Tested reporter: JSON
+
+-------------------------------------------------------------------------------
+Reporter's write listings to provided stream
+-------------------------------------------------------------------------------
+Reporters.tests.cpp:<line number>
+...............................................................................
+
+Reporters.tests.cpp:<line number>: PASSED:
+  REQUIRE_FALSE( factories.empty() )
+with expansion:
+  !false
+
+-------------------------------------------------------------------------------
+Reporter's write listings to provided stream
+  JSON reporter lists tests
+-------------------------------------------------------------------------------
+Reporters.tests.cpp:<line number>
+...............................................................................
+
+Reporters.tests.cpp:<line number>: PASSED:
+  REQUIRE_THAT( listingString, ContainsSubstring( "fake test name"s ) && ContainsSubstring( "fakeTestTag"s ) )
+with expansion:
+  "{
+    "version": 1,
+    "metadata": {
+      "name": "",
+      "rng-seed": 1234,
+      "catch2-version": "<version>"
+    },
+    "listings": {
+      "tests": [
+        {
+          "name": "fake test name",
+          "class-name": "",
+          "tags": [
+            "fakeTestTag"
+          ],
+          "source-location": {
+            "filename": "fake-file.cpp",
+            "line": 123456789
+          }
+        }
+      ]" ( contains: "fake test name" and contains: "fakeTestTag" )
+with message:
+  Tested reporter: JSON
+
+-------------------------------------------------------------------------------
+Reporter's write listings to provided stream
+-------------------------------------------------------------------------------
+Reporters.tests.cpp:<line number>
+...............................................................................
+
 Reporters.tests.cpp:<line number>: PASSED:
   REQUIRE_FALSE( factories.empty() )
 with expansion:
@@ -12522,6 +12997,34 @@ Misc.tests.cpp:<line number>: FAILED - but was ok:
 
 Misc.tests.cpp:<line number>: FAILED:
 
+-------------------------------------------------------------------------------
+Testing checked-if 4
+-------------------------------------------------------------------------------
+Misc.tests.cpp:<line number>
+...............................................................................
+
+Misc.tests.cpp:<line number>: PASSED:
+  CHECKED_ELSE( true )
+
+Misc.tests.cpp:<line number>: FAILED:
+  {Unknown expression after the reported line}
+due to unexpected exception with message:
+  Uncaught exception should fail!
+
+-------------------------------------------------------------------------------
+Testing checked-if 5
+-------------------------------------------------------------------------------
+Misc.tests.cpp:<line number>
+...............................................................................
+
+Misc.tests.cpp:<line number>: FAILED - but was ok:
+  CHECKED_ELSE( false )
+
+Misc.tests.cpp:<line number>: FAILED:
+  {Unknown expression after the reported line}
+due to unexpected exception with message:
+  Uncaught exception should fail!
+
 -------------------------------------------------------------------------------
 The NO_FAIL macro reports a failure but does not fail the test
 -------------------------------------------------------------------------------
@@ -17745,6 +18248,22 @@ InternalBenchmark.tests.cpp:<line number>: PASSED:
 with expansion:
   0.95 == 0.95
 
+-------------------------------------------------------------------------------
+uniform_integer_distribution can return the bounds
+-------------------------------------------------------------------------------
+RandomNumberGeneration.tests.cpp:<line number>
+...............................................................................
+
+RandomNumberGeneration.tests.cpp:<line number>: PASSED:
+  REQUIRE( dist.a() == -10 )
+with expansion:
+  -10 == -10
+
+RandomNumberGeneration.tests.cpp:<line number>: PASSED:
+  REQUIRE( dist.b() == 10 )
+with expansion:
+  10 == 10
+
 -------------------------------------------------------------------------------
 unique_ptr reimplementation: basic functionality
   Default constructed unique_ptr is empty
@@ -18232,6 +18751,6 @@ Misc.tests.cpp:<line number>
 Misc.tests.cpp:<line number>: PASSED:
 
 ===============================================================================
-test cases:  409 |  308 passed |  84 failed | 6 skipped | 11 failed as expected
-assertions: 2225 | 2048 passed | 145 failed | 32 failed as expected
+test cases:  417 |  312 passed |  85 failed | 6 skipped | 14 failed as expected
+assertions: 2260 | 2079 passed | 146 failed | 35 failed as expected
 
diff --git a/packages/Catch2/tests/SelfTest/Baselines/console.sw.multi.approved.txt b/packages/Catch2/tests/SelfTest/Baselines/console.sw.multi.approved.txt
index 4cc942dd49d739cf7f8e95ee1a0ab0bc6242e3ea..5d204990c6cdbe1aab46c615d2dbc6fbe418a32b 100644
--- a/packages/Catch2/tests/SelfTest/Baselines/console.sw.multi.approved.txt
+++ b/packages/Catch2/tests/SelfTest/Baselines/console.sw.multi.approved.txt
@@ -2738,9 +2738,9 @@ Message.tests.cpp:<line number>
 
 Message.tests.cpp:<line number>: PASSED:
 with messages:
-  std::vector<int>{1, 2, 3}[0, 1, 2] := 3
-  std::vector<int>{1, 2, 3}[(0, 1)] := 2
-  std::vector<int>{1, 2, 3}[0] := 1
+  custom_index_op<int>{1, 2, 3}[0, 1, 2] := 0
+  custom_index_op<int>{1, 2, 3}[(0, 1)] := 0
+  custom_index_op<int>{1, 2, 3}[0] := 0
   (helper_1436<int, int>{12, -12}) := { 12, -12 }
   (helper_1436<int, int>(-12, 12)) := { -12, 12 }
   (1, 2) := 2
@@ -4887,6 +4887,50 @@ Matchers.tests.cpp:<line number>: PASSED:
 with expansion:
   1.0 not is NaN
 
+-------------------------------------------------------------------------------
+GENERATE can combine literals and generators
+-------------------------------------------------------------------------------
+Generators.tests.cpp:<line number>
+...............................................................................
+
+Generators.tests.cpp:<line number>: PASSED:
+  REQUIRE( i % 2 == 0 )
+with expansion:
+  0 == 0
+
+-------------------------------------------------------------------------------
+GENERATE can combine literals and generators
+-------------------------------------------------------------------------------
+Generators.tests.cpp:<line number>
+...............................................................................
+
+Generators.tests.cpp:<line number>: PASSED:
+  REQUIRE( i % 2 == 0 )
+with expansion:
+  0 == 0
+
+-------------------------------------------------------------------------------
+GENERATE can combine literals and generators
+-------------------------------------------------------------------------------
+Generators.tests.cpp:<line number>
+...............................................................................
+
+Generators.tests.cpp:<line number>: PASSED:
+  REQUIRE( i % 2 == 0 )
+with expansion:
+  0 == 0
+
+-------------------------------------------------------------------------------
+GENERATE can combine literals and generators
+-------------------------------------------------------------------------------
+Generators.tests.cpp:<line number>
+...............................................................................
+
+Generators.tests.cpp:<line number>: PASSED:
+  REQUIRE( i % 2 == 0 )
+with expansion:
+  0 == 0
+
 -------------------------------------------------------------------------------
 Generators -- adapters
   Filtering by predicate
@@ -6980,6 +7024,18 @@ with expansion:
   ==
   3422778688 (0x<hex digits>)
 
+-------------------------------------------------------------------------------
+INFO and UNSCOPED_INFO can stream multiple arguments
+-------------------------------------------------------------------------------
+Message.tests.cpp:<line number>
+...............................................................................
+
+Message.tests.cpp:<line number>: FAILED:
+explicitly with messages:
+  This info has multiple parts.
+  This unscoped info has multiple parts.
+  Show infos!
+
 -------------------------------------------------------------------------------
 INFO and WARN do not abort tests
 -------------------------------------------------------------------------------
@@ -7141,6 +7197,17 @@ with messages:
   current counter 10
   i := 10
 
+-------------------------------------------------------------------------------
+Incomplete AssertionHandler
+-------------------------------------------------------------------------------
+AssertionHandler.tests.cpp:<line number>
+...............................................................................
+
+AssertionHandler.tests.cpp:<line number>: FAILED:
+  REQUIRE( Dummy )
+due to unexpected exception with message:
+  Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE
+
 -------------------------------------------------------------------------------
 Inequality checks that should fail
 -------------------------------------------------------------------------------
@@ -7233,6 +7300,291 @@ Condition.tests.cpp:<line number>: PASSED:
 with expansion:
   5 != 6
 
+-------------------------------------------------------------------------------
+JsonWriter
+  Newly constructed JsonWriter does nothing
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( stream.str() == "" )
+with expansion:
+  "" == ""
+
+-------------------------------------------------------------------------------
+JsonWriter
+  Calling writeObject will create an empty pair of braces
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( stream.str() == "{\n}" )
+with expansion:
+  "{
+  }"
+  ==
+  "{
+  }"
+
+-------------------------------------------------------------------------------
+JsonWriter
+  Calling writeObject with key will create an object to write the value
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE_THAT( stream.str(), ContainsSubstring( "\"int\": 1," ) && ContainsSubstring( "\"double\": 1.5," ) && ContainsSubstring( "\"true\": true," ) && ContainsSubstring( "\"false\": false," ) && ContainsSubstring( "\"string\": \"this is a string\"," ) && ContainsSubstring( "\"array\": [\n    1,\n    2\n  ]\n}" ) )
+with expansion:
+  "{
+    "int": 1,
+    "double": 1.5,
+    "true": true,
+    "false": false,
+    "string": "this is a string",
+    "array": [
+      1,
+      2
+    ]
+  }" ( contains: ""int": 1," and contains: ""double": 1.5," and contains:
+  ""true": true," and contains: ""false": false," and contains: ""string":
+  "this is a string"," and contains: ""array": [
+      1,
+      2
+    ]
+  }" )
+
+-------------------------------------------------------------------------------
+JsonWriter
+  nesting objects
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE_THAT( stream.str(), ContainsSubstring( "\"empty_object\": {\n  }," ) && ContainsSubstring( "\"fully_object\": {\n    \"key\": 1\n  }" ) )
+with expansion:
+  "{
+    "empty_object": {
+    },
+    "fully_object": {
+      "key": 1
+    }
+  }" ( contains: ""empty_object": {
+    }," and contains: ""fully_object": {
+      "key": 1
+    }" )
+
+-------------------------------------------------------------------------------
+JsonWriter
+  Calling writeArray will create an empty pair of braces
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( stream.str() == "[\n]" )
+with expansion:
+  "[
+  ]"
+  ==
+  "[
+  ]"
+
+-------------------------------------------------------------------------------
+JsonWriter
+  Calling writeArray creates array to write the values to
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( stream.str() == "[\n  1,\n  1.5,\n  true,\n  false,\n  \"this is a string\",\n  {\n    \"object\": 42\n  },\n  [\n    \"array\",\n    42.5\n  ]\n]" )
+with expansion:
+  "[
+    1,
+    1.5,
+    true,
+    false,
+    "this is a string",
+    {
+      "object": 42
+    },
+    [
+      "array",
+      42.5
+    ]
+  ]"
+  ==
+  "[
+    1,
+    1.5,
+    true,
+    false,
+    "this is a string",
+    {
+      "object": 42
+    },
+    [
+      "array",
+      42.5
+    ]
+  ]"
+
+-------------------------------------------------------------------------------
+JsonWriter
+  Moved from JsonObjectWriter shall not insert superfluous brace
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( stream.str() == "{\n}" )
+with expansion:
+  "{
+  }"
+  ==
+  "{
+  }"
+
+-------------------------------------------------------------------------------
+JsonWriter
+  Moved from JsonArrayWriter shall not insert superfluous bracket
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( stream.str() == "[\n]" )
+with expansion:
+  "[
+  ]"
+  ==
+  "[
+  ]"
+
+-------------------------------------------------------------------------------
+JsonWriter
+  Custom class shall be quoted
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( stream.str() == "\"custom\"" )
+with expansion:
+  ""custom"" == ""custom""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  Quote in a string is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\\"\"" )
+with expansion:
+  ""\""" == ""\"""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  Backslash in a string is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\\\\"" )
+with expansion:
+  ""\\"" == ""\\""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  Forward slash in a string is **not** escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"/\"" )
+with expansion:
+  ""/"" == ""/""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  Backspace in a string is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\b\"" )
+with expansion:
+  ""\b"" == ""\b""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  Formfeed in a string is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\f\"" )
+with expansion:
+  ""\f"" == ""\f""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  linefeed in a string is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\n\"" )
+with expansion:
+  ""\n"" == ""\n""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  carriage return in a string is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\r\"" )
+with expansion:
+  ""\r"" == ""\r""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  tab in a string is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\t\"" )
+with expansion:
+  ""\t"" == ""\t""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  combination of characters is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\\\/\\t\\r\\n\"" )
+with expansion:
+  ""\\/\t\r\n"" == ""\\/\t\r\n""
+
 -------------------------------------------------------------------------------
 Lambdas in assertions
 -------------------------------------------------------------------------------
@@ -9731,6 +10083,129 @@ Reporter's write listings to provided stream
 Reporters.tests.cpp:<line number>
 ...............................................................................
 
+Reporters.tests.cpp:<line number>: PASSED:
+  REQUIRE_FALSE( factories.empty() )
+with expansion:
+  !false
+
+-------------------------------------------------------------------------------
+Reporter's write listings to provided stream
+  JSON reporter lists tags
+-------------------------------------------------------------------------------
+Reporters.tests.cpp:<line number>
+...............................................................................
+
+Reporters.tests.cpp:<line number>: PASSED:
+  REQUIRE_THAT( listingString, ContainsSubstring("fakeTag"s) )
+with expansion:
+  "{
+    "version": 1,
+    "metadata": {
+      "name": "",
+      "rng-seed": 1234,
+      "catch2-version": "<version>"
+    },
+    "listings": {
+      "tags": [
+        {
+          "aliases": [
+            "fakeTag"
+          ],
+          "count": 1
+        }
+      ]" contains: "fakeTag"
+with message:
+  Tested reporter: JSON
+
+-------------------------------------------------------------------------------
+Reporter's write listings to provided stream
+-------------------------------------------------------------------------------
+Reporters.tests.cpp:<line number>
+...............................................................................
+
+Reporters.tests.cpp:<line number>: PASSED:
+  REQUIRE_FALSE( factories.empty() )
+with expansion:
+  !false
+
+-------------------------------------------------------------------------------
+Reporter's write listings to provided stream
+  JSON reporter lists reporters
+-------------------------------------------------------------------------------
+Reporters.tests.cpp:<line number>
+...............................................................................
+
+Reporters.tests.cpp:<line number>: PASSED:
+  REQUIRE_THAT( listingString, ContainsSubstring("fake reporter"s) )
+with expansion:
+  "{
+    "version": 1,
+    "metadata": {
+      "name": "",
+      "rng-seed": 1234,
+      "catch2-version": "<version>"
+    },
+    "listings": {
+      "reporters": [
+        {
+          "name": "fake reporter",
+          "description": "fake description"
+        }
+      ]" contains: "fake reporter"
+with message:
+  Tested reporter: JSON
+
+-------------------------------------------------------------------------------
+Reporter's write listings to provided stream
+-------------------------------------------------------------------------------
+Reporters.tests.cpp:<line number>
+...............................................................................
+
+Reporters.tests.cpp:<line number>: PASSED:
+  REQUIRE_FALSE( factories.empty() )
+with expansion:
+  !false
+
+-------------------------------------------------------------------------------
+Reporter's write listings to provided stream
+  JSON reporter lists tests
+-------------------------------------------------------------------------------
+Reporters.tests.cpp:<line number>
+...............................................................................
+
+Reporters.tests.cpp:<line number>: PASSED:
+  REQUIRE_THAT( listingString, ContainsSubstring( "fake test name"s ) && ContainsSubstring( "fakeTestTag"s ) )
+with expansion:
+  "{
+    "version": 1,
+    "metadata": {
+      "name": "",
+      "rng-seed": 1234,
+      "catch2-version": "<version>"
+    },
+    "listings": {
+      "tests": [
+        {
+          "name": "fake test name",
+          "class-name": "",
+          "tags": [
+            "fakeTestTag"
+          ],
+          "source-location": {
+            "filename": "fake-file.cpp",
+            "line": 123456789
+          }
+        }
+      ]" ( contains: "fake test name" and contains: "fakeTestTag" )
+with message:
+  Tested reporter: JSON
+
+-------------------------------------------------------------------------------
+Reporter's write listings to provided stream
+-------------------------------------------------------------------------------
+Reporters.tests.cpp:<line number>
+...............................................................................
+
 Reporters.tests.cpp:<line number>: PASSED:
   REQUIRE_FALSE( factories.empty() )
 with expansion:
@@ -12515,6 +12990,34 @@ Misc.tests.cpp:<line number>: FAILED - but was ok:
 
 Misc.tests.cpp:<line number>: FAILED:
 
+-------------------------------------------------------------------------------
+Testing checked-if 4
+-------------------------------------------------------------------------------
+Misc.tests.cpp:<line number>
+...............................................................................
+
+Misc.tests.cpp:<line number>: PASSED:
+  CHECKED_ELSE( true )
+
+Misc.tests.cpp:<line number>: FAILED:
+  {Unknown expression after the reported line}
+due to unexpected exception with message:
+  Uncaught exception should fail!
+
+-------------------------------------------------------------------------------
+Testing checked-if 5
+-------------------------------------------------------------------------------
+Misc.tests.cpp:<line number>
+...............................................................................
+
+Misc.tests.cpp:<line number>: FAILED - but was ok:
+  CHECKED_ELSE( false )
+
+Misc.tests.cpp:<line number>: FAILED:
+  {Unknown expression after the reported line}
+due to unexpected exception with message:
+  Uncaught exception should fail!
+
 -------------------------------------------------------------------------------
 The NO_FAIL macro reports a failure but does not fail the test
 -------------------------------------------------------------------------------
@@ -17734,6 +18237,22 @@ InternalBenchmark.tests.cpp:<line number>: PASSED:
 with expansion:
   0.95 == 0.95
 
+-------------------------------------------------------------------------------
+uniform_integer_distribution can return the bounds
+-------------------------------------------------------------------------------
+RandomNumberGeneration.tests.cpp:<line number>
+...............................................................................
+
+RandomNumberGeneration.tests.cpp:<line number>: PASSED:
+  REQUIRE( dist.a() == -10 )
+with expansion:
+  -10 == -10
+
+RandomNumberGeneration.tests.cpp:<line number>: PASSED:
+  REQUIRE( dist.b() == 10 )
+with expansion:
+  10 == 10
+
 -------------------------------------------------------------------------------
 unique_ptr reimplementation: basic functionality
   Default constructed unique_ptr is empty
@@ -18221,6 +18740,6 @@ Misc.tests.cpp:<line number>
 Misc.tests.cpp:<line number>: PASSED:
 
 ===============================================================================
-test cases:  409 |  308 passed |  84 failed | 6 skipped | 11 failed as expected
-assertions: 2225 | 2048 passed | 145 failed | 32 failed as expected
+test cases:  417 |  312 passed |  85 failed | 6 skipped | 14 failed as expected
+assertions: 2260 | 2079 passed | 146 failed | 35 failed as expected
 
diff --git a/packages/Catch2/tests/SelfTest/Baselines/junit.sw.approved.txt b/packages/Catch2/tests/SelfTest/Baselines/junit.sw.approved.txt
index c992154c414f38d1e9d3b0843f4dbd6f81a455a3..48eccfc3d1542348b0a3ce7d4f0b7e2172b13037 100644
--- a/packages/Catch2/tests/SelfTest/Baselines/junit.sw.approved.txt
+++ b/packages/Catch2/tests/SelfTest/Baselines/junit.sw.approved.txt
@@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <testsuitesloose text artifact
 >
-  <testsuite name="<exe-name>" errors="17" failures="128" skipped="12" tests="2237" hostname="tbd" time="{duration}" timestamp="{iso8601-timestamp}">
+  <testsuite name="<exe-name>" errors="17" failures="129" skipped="12" tests="2272" hostname="tbd" time="{duration}" timestamp="{iso8601-timestamp}">
     <properties>
       <property name="random-seed" value="1"/>
       <property name="filters" value="&quot;*&quot; ~[!nonportable] ~[!benchmark] ~[approvals]"/>
@@ -708,6 +708,7 @@ at Message.tests.cpp:<line number>
     <testcase classname="<exe-name>.global" name="Floating point matchers: float/Composed" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Floating point matchers: float/Constructor validation" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Floating point matchers: float/IsNaN" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="GENERATE can combine literals and generators" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Generators -- adapters/Filtering by predicate/Basic usage" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Generators -- adapters/Filtering by predicate/Throws if there are no matching values" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Generators -- adapters/Shortening a range" time="{duration}" status="run"/>
@@ -752,6 +753,15 @@ at Message.tests.cpp:<line number>
     <testcase classname="<exe-name>.global" name="Hashing different test cases produces different result/Different classname" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Hashing different test cases produces different result/Different tags" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Hashing test case produces same hash across multiple calls" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="INFO and UNSCOPED_INFO can stream multiple arguments" time="{duration}" status="run">
+      <failure type="FAIL">
+FAILED:
+Show infos!
+This info has multiple parts.
+This unscoped info has multiple parts.
+at Message.tests.cpp:<line number>
+      </failure>
+    </testcase>
     <testcase classname="<exe-name>.global" name="INFO and WARN do not abort tests" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="INFO gets logged on failure" time="{duration}" status="run">
       <failure message="a == 1" type="REQUIRE">
@@ -796,6 +806,15 @@ i := 10
 at Message.tests.cpp:<line number>
       </failure>
     </testcase>
+    <testcase classname="<exe-name>.global" name="Incomplete AssertionHandler" time="{duration}" status="run">
+      <skipped message="TEST_CASE tagged with !mayfail"/>
+      <error message="Dummy" type="REQUIRE">
+FAILED:
+  REQUIRE( Dummy )
+Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE
+at AssertionHandler.tests.cpp:<line number>
+      </error>
+    </testcase>
     <testcase classname="<exe-name>.global" name="Inequality checks that should fail" time="{duration}" status="run">
       <skipped message="TEST_CASE tagged with !mayfail"/>
       <failure message="data.int_seven != 7" type="CHECK">
@@ -835,6 +854,24 @@ at Condition.tests.cpp:<line number>
       </failure>
     </testcase>
     <testcase classname="<exe-name>.global" name="Inequality checks that should succeed" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Newly constructed JsonWriter does nothing" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Calling writeObject will create an empty pair of braces" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Calling writeObject with key will create an object to write the value" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/nesting objects" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Calling writeArray will create an empty pair of braces" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Calling writeArray creates array to write the values to" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Moved from JsonObjectWriter shall not insert superfluous brace" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Moved from JsonArrayWriter shall not insert superfluous bracket" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Custom class shall be quoted" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/Quote in a string is escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/Backslash in a string is escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/Forward slash in a string is **not** escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/Backspace in a string is escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/Formfeed in a string is escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/linefeed in a string is escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/carriage return in a string is escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/tab in a string is escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/combination of characters is escaped" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Lambdas in assertions" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Less-than inequalities with different epsilons" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="ManuallyRegistered" time="{duration}" status="run"/>
@@ -1172,6 +1209,9 @@ at Matchers.tests.cpp:<line number>
     <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/console reporter lists tags" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/console reporter lists reporters" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/console reporter lists tests" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/JSON reporter lists tags" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/JSON reporter lists reporters" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/JSON reporter lists tests" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/JUnit reporter lists tags" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/JUnit reporter lists reporters" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/JUnit reporter lists tests" time="{duration}" status="run"/>
@@ -1360,6 +1400,24 @@ FAILED:
 at Misc.tests.cpp:<line number>
       </failure>
     </testcase>
+    <testcase classname="<exe-name>.global" name="Testing checked-if 4" time="{duration}" status="run">
+      <skipped message="TEST_CASE tagged with !mayfail"/>
+      <error message="{Unknown expression after the reported line}">
+FAILED:
+  {Unknown expression after the reported line}
+Uncaught exception should fail!
+at Misc.tests.cpp:<line number>
+      </error>
+    </testcase>
+    <testcase classname="<exe-name>.global" name="Testing checked-if 5" time="{duration}" status="run">
+      <skipped message="TEST_CASE tagged with !mayfail"/>
+      <error message="{Unknown expression after the reported line}">
+FAILED:
+  {Unknown expression after the reported line}
+Uncaught exception should fail!
+at Misc.tests.cpp:<line number>
+      </error>
+    </testcase>
     <testcase classname="<exe-name>.global" name="The NO_FAIL macro reports a failure but does not fail the test" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="The default listing implementation write to provided stream/Listing tags" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="The default listing implementation write to provided stream/Listing reporters" time="{duration}" status="run"/>
@@ -2017,6 +2075,7 @@ at Exception.tests.cpp:<line number>
     <testcase classname="<exe-name>.global" name="tuple&lt;string,string>" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="tuple&lt;tuple&lt;int>,tuple&lt;>,float>" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="uniform samples" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="uniform_integer_distribution can return the bounds" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="unique_ptr reimplementation: basic functionality/Default constructed unique_ptr is empty" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="unique_ptr reimplementation: basic functionality/Take ownership of allocation" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="unique_ptr reimplementation: basic functionality/Take ownership of allocation/Plain reset deallocates" time="{duration}" status="run"/>
diff --git a/packages/Catch2/tests/SelfTest/Baselines/junit.sw.multi.approved.txt b/packages/Catch2/tests/SelfTest/Baselines/junit.sw.multi.approved.txt
index 79c3236506da7d2fce43fa4a1db8b6d30ddf3d37..d270c88fb6dc8fe3e0af840c8d72ba665d5b146a 100644
--- a/packages/Catch2/tests/SelfTest/Baselines/junit.sw.multi.approved.txt
+++ b/packages/Catch2/tests/SelfTest/Baselines/junit.sw.multi.approved.txt
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <testsuites>
-  <testsuite name="<exe-name>" errors="17" failures="128" skipped="12" tests="2237" hostname="tbd" time="{duration}" timestamp="{iso8601-timestamp}">
+  <testsuite name="<exe-name>" errors="17" failures="129" skipped="12" tests="2272" hostname="tbd" time="{duration}" timestamp="{iso8601-timestamp}">
     <properties>
       <property name="random-seed" value="1"/>
       <property name="filters" value="&quot;*&quot; ~[!nonportable] ~[!benchmark] ~[approvals]"/>
@@ -707,6 +707,7 @@ at Message.tests.cpp:<line number>
     <testcase classname="<exe-name>.global" name="Floating point matchers: float/Composed" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Floating point matchers: float/Constructor validation" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Floating point matchers: float/IsNaN" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="GENERATE can combine literals and generators" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Generators -- adapters/Filtering by predicate/Basic usage" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Generators -- adapters/Filtering by predicate/Throws if there are no matching values" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Generators -- adapters/Shortening a range" time="{duration}" status="run"/>
@@ -751,6 +752,15 @@ at Message.tests.cpp:<line number>
     <testcase classname="<exe-name>.global" name="Hashing different test cases produces different result/Different classname" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Hashing different test cases produces different result/Different tags" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Hashing test case produces same hash across multiple calls" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="INFO and UNSCOPED_INFO can stream multiple arguments" time="{duration}" status="run">
+      <failure type="FAIL">
+FAILED:
+Show infos!
+This info has multiple parts.
+This unscoped info has multiple parts.
+at Message.tests.cpp:<line number>
+      </failure>
+    </testcase>
     <testcase classname="<exe-name>.global" name="INFO and WARN do not abort tests" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="INFO gets logged on failure" time="{duration}" status="run">
       <failure message="a == 1" type="REQUIRE">
@@ -795,6 +805,15 @@ i := 10
 at Message.tests.cpp:<line number>
       </failure>
     </testcase>
+    <testcase classname="<exe-name>.global" name="Incomplete AssertionHandler" time="{duration}" status="run">
+      <skipped message="TEST_CASE tagged with !mayfail"/>
+      <error message="Dummy" type="REQUIRE">
+FAILED:
+  REQUIRE( Dummy )
+Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE
+at AssertionHandler.tests.cpp:<line number>
+      </error>
+    </testcase>
     <testcase classname="<exe-name>.global" name="Inequality checks that should fail" time="{duration}" status="run">
       <skipped message="TEST_CASE tagged with !mayfail"/>
       <failure message="data.int_seven != 7" type="CHECK">
@@ -834,6 +853,24 @@ at Condition.tests.cpp:<line number>
       </failure>
     </testcase>
     <testcase classname="<exe-name>.global" name="Inequality checks that should succeed" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Newly constructed JsonWriter does nothing" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Calling writeObject will create an empty pair of braces" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Calling writeObject with key will create an object to write the value" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/nesting objects" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Calling writeArray will create an empty pair of braces" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Calling writeArray creates array to write the values to" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Moved from JsonObjectWriter shall not insert superfluous brace" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Moved from JsonArrayWriter shall not insert superfluous bracket" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Custom class shall be quoted" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/Quote in a string is escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/Backslash in a string is escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/Forward slash in a string is **not** escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/Backspace in a string is escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/Formfeed in a string is escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/linefeed in a string is escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/carriage return in a string is escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/tab in a string is escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/combination of characters is escaped" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Lambdas in assertions" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Less-than inequalities with different epsilons" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="ManuallyRegistered" time="{duration}" status="run"/>
@@ -1171,6 +1208,9 @@ at Matchers.tests.cpp:<line number>
     <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/console reporter lists tags" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/console reporter lists reporters" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/console reporter lists tests" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/JSON reporter lists tags" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/JSON reporter lists reporters" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/JSON reporter lists tests" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/JUnit reporter lists tags" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/JUnit reporter lists reporters" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/JUnit reporter lists tests" time="{duration}" status="run"/>
@@ -1359,6 +1399,24 @@ FAILED:
 at Misc.tests.cpp:<line number>
       </failure>
     </testcase>
+    <testcase classname="<exe-name>.global" name="Testing checked-if 4" time="{duration}" status="run">
+      <skipped message="TEST_CASE tagged with !mayfail"/>
+      <error message="{Unknown expression after the reported line}">
+FAILED:
+  {Unknown expression after the reported line}
+Uncaught exception should fail!
+at Misc.tests.cpp:<line number>
+      </error>
+    </testcase>
+    <testcase classname="<exe-name>.global" name="Testing checked-if 5" time="{duration}" status="run">
+      <skipped message="TEST_CASE tagged with !mayfail"/>
+      <error message="{Unknown expression after the reported line}">
+FAILED:
+  {Unknown expression after the reported line}
+Uncaught exception should fail!
+at Misc.tests.cpp:<line number>
+      </error>
+    </testcase>
     <testcase classname="<exe-name>.global" name="The NO_FAIL macro reports a failure but does not fail the test" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="The default listing implementation write to provided stream/Listing tags" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="The default listing implementation write to provided stream/Listing reporters" time="{duration}" status="run"/>
@@ -2016,6 +2074,7 @@ at Exception.tests.cpp:<line number>
     <testcase classname="<exe-name>.global" name="tuple&lt;string,string>" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="tuple&lt;tuple&lt;int>,tuple&lt;>,float>" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="uniform samples" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="uniform_integer_distribution can return the bounds" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="unique_ptr reimplementation: basic functionality/Default constructed unique_ptr is empty" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="unique_ptr reimplementation: basic functionality/Take ownership of allocation" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="unique_ptr reimplementation: basic functionality/Take ownership of allocation/Plain reset deallocates" time="{duration}" status="run"/>
diff --git a/packages/Catch2/tests/SelfTest/Baselines/sonarqube.sw.approved.txt b/packages/Catch2/tests/SelfTest/Baselines/sonarqube.sw.approved.txt
index 592887f9c43fa5ccfcd1f8234c06cb639a640112..36b05e54dc7b1df63ca93888271d8e827757cb21 100644
--- a/packages/Catch2/tests/SelfTest/Baselines/sonarqube.sw.approved.txt
+++ b/packages/Catch2/tests/SelfTest/Baselines/sonarqube.sw.approved.txt
@@ -2,6 +2,16 @@
 <!-- filters='"*" ~[!nonportable] ~[!benchmark] ~[approvals]' rng-seed=1 -->
 <testExecutions version="1"loose text artifact
 >
+  <file path="tests/<exe-name>/IntrospectiveTests/AssertionHandler.tests.cpp">
+    <testCase name="Incomplete AssertionHandler" duration="{duration}">
+      <skipped message="REQUIRE(Dummy)">
+FAILED:
+	REQUIRE( Dummy )
+Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE
+at AssertionHandler.tests.cpp:<line number>
+      </skipped>
+    </testCase>
+  </file>
   <file path="tests/<exe-name>/IntrospectiveTests/Clara.tests.cpp">
     <testCase name="Clara::Arg supports single-arg parse the way Opt does" duration="{duration}"/>
     <testCase name="Clara::Opt supports accept-many lambdas/Parsing fails on multiple options without accept_many" duration="{duration}"/>
@@ -120,6 +130,26 @@
     <testCase name="warmup" duration="{duration}"/>
     <testCase name="weighted_average_quantile" duration="{duration}"/>
   </file>
+  <file path="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp">
+    <testCase name="JsonWriter/Newly constructed JsonWriter does nothing" duration="{duration}"/>
+    <testCase name="JsonWriter/Calling writeObject will create an empty pair of braces" duration="{duration}"/>
+    <testCase name="JsonWriter/Calling writeObject with key will create an object to write the value" duration="{duration}"/>
+    <testCase name="JsonWriter/nesting objects" duration="{duration}"/>
+    <testCase name="JsonWriter/Calling writeArray will create an empty pair of braces" duration="{duration}"/>
+    <testCase name="JsonWriter/Calling writeArray creates array to write the values to" duration="{duration}"/>
+    <testCase name="JsonWriter/Moved from JsonObjectWriter shall not insert superfluous brace" duration="{duration}"/>
+    <testCase name="JsonWriter/Moved from JsonArrayWriter shall not insert superfluous bracket" duration="{duration}"/>
+    <testCase name="JsonWriter/Custom class shall be quoted" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/Quote in a string is escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/Backslash in a string is escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/Forward slash in a string is **not** escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/Backspace in a string is escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/Formfeed in a string is escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/linefeed in a string is escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/carriage return in a string is escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/tab in a string is escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/combination of characters is escaped" duration="{duration}"/>
+  </file>
   <file path="tests/<exe-name>/IntrospectiveTests/Parse.tests.cpp">
     <testCase name="Parse uints/proper inputs" duration="{duration}"/>
     <testCase name="Parse uints/Bad inputs" duration="{duration}"/>
@@ -151,6 +181,7 @@
     <testCase name="Our PCG implementation provides expected results for known seeds/Specific seed" duration="{duration}"/>
     <testCase name="Random seed generation accepts known methods" duration="{duration}"/>
     <testCase name="Random seed generation reports unknown methods" duration="{duration}"/>
+    <testCase name="uniform_integer_distribution can return the bounds" duration="{duration}"/>
   </file>
   <file path="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp">
     <testCase name="Multireporter calls reporters and listeners in correct order" duration="{duration}"/>
@@ -168,6 +199,9 @@
     <testCase name="Reporter's write listings to provided stream/console reporter lists tags" duration="{duration}"/>
     <testCase name="Reporter's write listings to provided stream/console reporter lists reporters" duration="{duration}"/>
     <testCase name="Reporter's write listings to provided stream/console reporter lists tests" duration="{duration}"/>
+    <testCase name="Reporter's write listings to provided stream/JSON reporter lists tags" duration="{duration}"/>
+    <testCase name="Reporter's write listings to provided stream/JSON reporter lists reporters" duration="{duration}"/>
+    <testCase name="Reporter's write listings to provided stream/JSON reporter lists tests" duration="{duration}"/>
     <testCase name="Reporter's write listings to provided stream/JUnit reporter lists tags" duration="{duration}"/>
     <testCase name="Reporter's write listings to provided stream/JUnit reporter lists reporters" duration="{duration}"/>
     <testCase name="Reporter's write listings to provided stream/JUnit reporter lists tests" duration="{duration}"/>
@@ -1038,6 +1072,7 @@ at Generators.tests.cpp:<line number>
     <testCase name="Copy and then generate a range/from var and iterators" duration="{duration}"/>
     <testCase name="Copy and then generate a range/From a temporary container" duration="{duration}"/>
     <testCase name="Copy and then generate a range/Final validation" duration="{duration}"/>
+    <testCase name="GENERATE can combine literals and generators" duration="{duration}"/>
     <testCase name="Generators -- adapters/Filtering by predicate/Basic usage" duration="{duration}"/>
     <testCase name="Generators -- adapters/Filtering by predicate/Throws if there are no matching values" duration="{duration}"/>
     <testCase name="Generators -- adapters/Shortening a range" duration="{duration}"/>
@@ -1467,6 +1502,15 @@ at Message.tests.cpp:<line number>
       <failure message="FAIL_CHECK()">
 FAILED:
 This is a failure
+at Message.tests.cpp:<line number>
+      </failure>
+    </testCase>
+    <testCase name="INFO and UNSCOPED_INFO can stream multiple arguments" duration="{duration}">
+      <failure message="FAIL()">
+FAILED:
+Show infos!
+This info has multiple parts.
+This unscoped info has multiple parts.
 at Message.tests.cpp:<line number>
       </failure>
     </testCase>
@@ -1727,6 +1771,22 @@ at Misc.tests.cpp:<line number>
     <testCase name="Testing checked-if 3" duration="{duration}">
       <skipped message="FAIL()">
 FAILED:
+at Misc.tests.cpp:<line number>
+      </skipped>
+    </testCase>
+    <testCase name="Testing checked-if 4" duration="{duration}">
+      <skipped message="({Unknown expression after the reported line})">
+FAILED:
+	{Unknown expression after the reported line}
+Uncaught exception should fail!
+at Misc.tests.cpp:<line number>
+      </skipped>
+    </testCase>
+    <testCase name="Testing checked-if 5" duration="{duration}">
+      <skipped message="({Unknown expression after the reported line})">
+FAILED:
+	{Unknown expression after the reported line}
+Uncaught exception should fail!
 at Misc.tests.cpp:<line number>
       </skipped>
     </testCase>
diff --git a/packages/Catch2/tests/SelfTest/Baselines/sonarqube.sw.multi.approved.txt b/packages/Catch2/tests/SelfTest/Baselines/sonarqube.sw.multi.approved.txt
index 3509287f788c0346c0681014e08e1c7fdb182223..c9d3d205bb92921a07f0e1de25759355a31b65ab 100644
--- a/packages/Catch2/tests/SelfTest/Baselines/sonarqube.sw.multi.approved.txt
+++ b/packages/Catch2/tests/SelfTest/Baselines/sonarqube.sw.multi.approved.txt
@@ -1,6 +1,16 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!-- filters='"*" ~[!nonportable] ~[!benchmark] ~[approvals]' rng-seed=1 -->
 <testExecutions version="1">
+  <file path="tests/<exe-name>/IntrospectiveTests/AssertionHandler.tests.cpp">
+    <testCase name="Incomplete AssertionHandler" duration="{duration}">
+      <skipped message="REQUIRE(Dummy)">
+FAILED:
+	REQUIRE( Dummy )
+Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE
+at AssertionHandler.tests.cpp:<line number>
+      </skipped>
+    </testCase>
+  </file>
   <file path="tests/<exe-name>/IntrospectiveTests/Clara.tests.cpp">
     <testCase name="Clara::Arg supports single-arg parse the way Opt does" duration="{duration}"/>
     <testCase name="Clara::Opt supports accept-many lambdas/Parsing fails on multiple options without accept_many" duration="{duration}"/>
@@ -119,6 +129,26 @@
     <testCase name="warmup" duration="{duration}"/>
     <testCase name="weighted_average_quantile" duration="{duration}"/>
   </file>
+  <file path="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp">
+    <testCase name="JsonWriter/Newly constructed JsonWriter does nothing" duration="{duration}"/>
+    <testCase name="JsonWriter/Calling writeObject will create an empty pair of braces" duration="{duration}"/>
+    <testCase name="JsonWriter/Calling writeObject with key will create an object to write the value" duration="{duration}"/>
+    <testCase name="JsonWriter/nesting objects" duration="{duration}"/>
+    <testCase name="JsonWriter/Calling writeArray will create an empty pair of braces" duration="{duration}"/>
+    <testCase name="JsonWriter/Calling writeArray creates array to write the values to" duration="{duration}"/>
+    <testCase name="JsonWriter/Moved from JsonObjectWriter shall not insert superfluous brace" duration="{duration}"/>
+    <testCase name="JsonWriter/Moved from JsonArrayWriter shall not insert superfluous bracket" duration="{duration}"/>
+    <testCase name="JsonWriter/Custom class shall be quoted" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/Quote in a string is escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/Backslash in a string is escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/Forward slash in a string is **not** escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/Backspace in a string is escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/Formfeed in a string is escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/linefeed in a string is escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/carriage return in a string is escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/tab in a string is escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/combination of characters is escaped" duration="{duration}"/>
+  </file>
   <file path="tests/<exe-name>/IntrospectiveTests/Parse.tests.cpp">
     <testCase name="Parse uints/proper inputs" duration="{duration}"/>
     <testCase name="Parse uints/Bad inputs" duration="{duration}"/>
@@ -150,6 +180,7 @@
     <testCase name="Our PCG implementation provides expected results for known seeds/Specific seed" duration="{duration}"/>
     <testCase name="Random seed generation accepts known methods" duration="{duration}"/>
     <testCase name="Random seed generation reports unknown methods" duration="{duration}"/>
+    <testCase name="uniform_integer_distribution can return the bounds" duration="{duration}"/>
   </file>
   <file path="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp">
     <testCase name="Multireporter calls reporters and listeners in correct order" duration="{duration}"/>
@@ -167,6 +198,9 @@
     <testCase name="Reporter's write listings to provided stream/console reporter lists tags" duration="{duration}"/>
     <testCase name="Reporter's write listings to provided stream/console reporter lists reporters" duration="{duration}"/>
     <testCase name="Reporter's write listings to provided stream/console reporter lists tests" duration="{duration}"/>
+    <testCase name="Reporter's write listings to provided stream/JSON reporter lists tags" duration="{duration}"/>
+    <testCase name="Reporter's write listings to provided stream/JSON reporter lists reporters" duration="{duration}"/>
+    <testCase name="Reporter's write listings to provided stream/JSON reporter lists tests" duration="{duration}"/>
     <testCase name="Reporter's write listings to provided stream/JUnit reporter lists tags" duration="{duration}"/>
     <testCase name="Reporter's write listings to provided stream/JUnit reporter lists reporters" duration="{duration}"/>
     <testCase name="Reporter's write listings to provided stream/JUnit reporter lists tests" duration="{duration}"/>
@@ -1037,6 +1071,7 @@ at Generators.tests.cpp:<line number>
     <testCase name="Copy and then generate a range/from var and iterators" duration="{duration}"/>
     <testCase name="Copy and then generate a range/From a temporary container" duration="{duration}"/>
     <testCase name="Copy and then generate a range/Final validation" duration="{duration}"/>
+    <testCase name="GENERATE can combine literals and generators" duration="{duration}"/>
     <testCase name="Generators -- adapters/Filtering by predicate/Basic usage" duration="{duration}"/>
     <testCase name="Generators -- adapters/Filtering by predicate/Throws if there are no matching values" duration="{duration}"/>
     <testCase name="Generators -- adapters/Shortening a range" duration="{duration}"/>
@@ -1466,6 +1501,15 @@ at Message.tests.cpp:<line number>
       <failure message="FAIL_CHECK()">
 FAILED:
 This is a failure
+at Message.tests.cpp:<line number>
+      </failure>
+    </testCase>
+    <testCase name="INFO and UNSCOPED_INFO can stream multiple arguments" duration="{duration}">
+      <failure message="FAIL()">
+FAILED:
+Show infos!
+This info has multiple parts.
+This unscoped info has multiple parts.
 at Message.tests.cpp:<line number>
       </failure>
     </testCase>
@@ -1726,6 +1770,22 @@ at Misc.tests.cpp:<line number>
     <testCase name="Testing checked-if 3" duration="{duration}">
       <skipped message="FAIL()">
 FAILED:
+at Misc.tests.cpp:<line number>
+      </skipped>
+    </testCase>
+    <testCase name="Testing checked-if 4" duration="{duration}">
+      <skipped message="({Unknown expression after the reported line})">
+FAILED:
+	{Unknown expression after the reported line}
+Uncaught exception should fail!
+at Misc.tests.cpp:<line number>
+      </skipped>
+    </testCase>
+    <testCase name="Testing checked-if 5" duration="{duration}">
+      <skipped message="({Unknown expression after the reported line})">
+FAILED:
+	{Unknown expression after the reported line}
+Uncaught exception should fail!
 at Misc.tests.cpp:<line number>
       </skipped>
     </testCase>
diff --git a/packages/Catch2/tests/SelfTest/Baselines/tap.sw.approved.txt b/packages/Catch2/tests/SelfTest/Baselines/tap.sw.approved.txt
index acd0a1c14934133acd85defa52b363fce72ddcae..a02dbd9543b1581da2dd5a38ca12911c01d0d4da 100644
--- a/packages/Catch2/tests/SelfTest/Baselines/tap.sw.approved.txt
+++ b/packages/Catch2/tests/SelfTest/Baselines/tap.sw.approved.txt
@@ -659,7 +659,7 @@ ok {test-number} - unrelated::ADL_empty{}, IsEmpty() for: {?} is empty
 # CAPTURE can deal with complex expressions
 ok {test-number} - with 7 messages: 'a := 1' and 'b := 2' and 'c := 3' and 'a + b := 3' and 'a+b := 3' and 'c > b := true' and 'a == 1 := true'
 # CAPTURE can deal with complex expressions involving commas
-ok {test-number} - with 7 messages: 'std::vector<int>{1, 2, 3}[0, 1, 2] := 3' and 'std::vector<int>{1, 2, 3}[(0, 1)] := 2' and 'std::vector<int>{1, 2, 3}[0] := 1' and '(helper_1436<int, int>{12, -12}) := { 12, -12 }' and '(helper_1436<int, int>(-12, 12)) := { -12, 12 }' and '(1, 2) := 2' and '(2, 3) := 3'
+ok {test-number} - with 7 messages: 'custom_index_op<int>{1, 2, 3}[0, 1, 2] := 0' and 'custom_index_op<int>{1, 2, 3}[(0, 1)] := 0' and 'custom_index_op<int>{1, 2, 3}[0] := 0' and '(helper_1436<int, int>{12, -12}) := { 12, -12 }' and '(helper_1436<int, int>(-12, 12)) := { -12, 12 }' and '(1, 2) := 2' and '(2, 3) := 3'
 # CAPTURE parses string and character constants
 ok {test-number} - with 11 messages: '("comma, in string", "escaped, \", ") := "escaped, ", "' and '"single quote in string,'," := "single quote in string,',"' and '"some escapes, \\,\\\\" := "some escapes, \,\\"' and '"some, ), unmatched, } prenheses {[<" := "some, ), unmatched, } prenheses {[<"' and ''"' := '"'' and ''\'' := '''' and '',' := ','' and ''}' := '}'' and '')' := ')'' and ''(' := '('' and ''{' := '{''
 # Capture and info messages
@@ -1258,6 +1258,14 @@ ok {test-number} - WithinRel( 1.f, -0.2f ), std::domain_error
 ok {test-number} - WithinRel( 1.f, 1.f ), std::domain_error
 # Floating point matchers: float
 ok {test-number} - 1., !IsNaN() for: 1.0 not is NaN
+# GENERATE can combine literals and generators
+ok {test-number} - i % 2 == 0 for: 0 == 0
+# GENERATE can combine literals and generators
+ok {test-number} - i % 2 == 0 for: 0 == 0
+# GENERATE can combine literals and generators
+ok {test-number} - i % 2 == 0 for: 0 == 0
+# GENERATE can combine literals and generators
+ok {test-number} - i % 2 == 0 for: 0 == 0
 # Generators -- adapters
 ok {test-number} - i % 2 == 0 for: 0 == 0
 # Generators -- adapters
@@ -1796,6 +1804,8 @@ ok {test-number} - h( dummy1 ) != h( dummy2 ) for: 2673152918 (0x<hex digits>) !
 ok {test-number} - h( dummy1 ) != h( dummy2 ) for: 2074929312 (0x<hex digits>) != 3429949824 (0x<hex digits>)
 # Hashing test case produces same hash across multiple calls
 ok {test-number} - h( dummy ) == h( dummy ) for: 3422778688 (0x<hex digits>) == 3422778688 (0x<hex digits>)
+# INFO and UNSCOPED_INFO can stream multiple arguments
+not ok {test-number} - explicitly with 3 messages: 'This info has multiple parts.' and 'This unscoped info has multiple parts.' and 'Show infos!'
 # INFO and WARN do not abort tests
 warning {test-number} - 'this is a message' with 1 message: 'this is a warning'
 # INFO gets logged on failure
@@ -1830,6 +1840,8 @@ ok {test-number} - i < 10 for: 8 < 10 with 2 messages: 'current counter 8' and '
 ok {test-number} - i < 10 for: 9 < 10 with 2 messages: 'current counter 9' and 'i := 9'
 # INFO is reset for each loop
 not ok {test-number} - i < 10 for: 10 < 10 with 2 messages: 'current counter 10' and 'i := 10'
+# Incomplete AssertionHandler
+not ok {test-number} - unexpected exception with message: 'Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE'; expression was: Dummy
 # Inequality checks that should fail
 not ok {test-number} - data.int_seven != 7 for: 7 != 7
 # Inequality checks that should fail
@@ -1862,6 +1874,42 @@ ok {test-number} - data.str_hello != "hell" for: "hello" != "hell"
 ok {test-number} - data.str_hello != "hello1" for: "hello" != "hello1"
 # Inequality checks that should succeed
 ok {test-number} - data.str_hello.size() != 6 for: 5 != 6
+# JsonWriter
+ok {test-number} - stream.str() == "" for: "" == ""
+# JsonWriter
+ok {test-number} - stream.str() == "{\n}" for: "{ }" == "{ }"
+# JsonWriter
+ok {test-number} - stream.str(), ContainsSubstring( "\"int\": 1," ) && ContainsSubstring( "\"double\": 1.5," ) && ContainsSubstring( "\"true\": true," ) && ContainsSubstring( "\"false\": false," ) && ContainsSubstring( "\"string\": \"this is a string\"," ) && ContainsSubstring( "\"array\": [\n    1,\n    2\n  ]\n}" ) for: "{   "int": 1,   "double": 1.5,   "true": true,   "false": false,   "string": "this is a string",   "array": [     1,     2   ] }" ( contains: ""int": 1," and contains: ""double": 1.5," and contains: ""true": true," and contains: ""false": false," and contains: ""string": "this is a string"," and contains: ""array": [     1,     2   ] }" )
+# JsonWriter
+ok {test-number} - stream.str(), ContainsSubstring( "\"empty_object\": {\n  }," ) && ContainsSubstring( "\"fully_object\": {\n    \"key\": 1\n  }" ) for: "{   "empty_object": {   },   "fully_object": {     "key": 1   } }" ( contains: ""empty_object": {   }," and contains: ""fully_object": {     "key": 1   }" )
+# JsonWriter
+ok {test-number} - stream.str() == "[\n]" for: "[ ]" == "[ ]"
+# JsonWriter
+ok {test-number} - stream.str() == "[\n  1,\n  1.5,\n  true,\n  false,\n  \"this is a string\",\n  {\n    \"object\": 42\n  },\n  [\n    \"array\",\n    42.5\n  ]\n]" for: "[   1,   1.5,   true,   false,   "this is a string",   {     "object": 42   },   [     "array",     42.5   ] ]" == "[   1,   1.5,   true,   false,   "this is a string",   {     "object": 42   },   [     "array",     42.5   ] ]"
+# JsonWriter
+ok {test-number} - stream.str() == "{\n}" for: "{ }" == "{ }"
+# JsonWriter
+ok {test-number} - stream.str() == "[\n]" for: "[ ]" == "[ ]"
+# JsonWriter
+ok {test-number} - stream.str() == "\"custom\"" for: ""custom"" == ""custom""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\\"\"" for: ""\""" == ""\"""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\\\\"" for: ""\\"" == ""\\""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"/\"" for: ""/"" == ""/""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\b\"" for: ""\b"" == ""\b""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\f\"" for: ""\f"" == ""\f""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\n\"" for: ""\n"" == ""\n""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\r\"" for: ""\r"" == ""\r""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\t\"" for: ""\t"" == ""\t""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\\\/\\t\\r\\n\"" for: ""\\/\t\r\n"" == ""\\/\t\r\n""
 # Lambdas in assertions
 ok {test-number} - []() { return true; }() for: true
 # Less-than inequalities with different epsilons
@@ -2455,6 +2503,18 @@ ok {test-number} - listingString, ContainsSubstring( "fake test name"s ) && Cont
 # Reporter's write listings to provided stream
 ok {test-number} - !(factories.empty()) for: !false
 # Reporter's write listings to provided stream
+ok {test-number} - listingString, ContainsSubstring("fakeTag"s) for: "{   "version": 1,   "metadata": {     "name": "",     "rng-seed": 1234,     "catch2-version": "<version>"   },   "listings": {     "tags": [       {         "aliases": [           "fakeTag"         ],         "count": 1       }     ]" contains: "fakeTag" with 1 message: 'Tested reporter: JSON'
+# Reporter's write listings to provided stream
+ok {test-number} - !(factories.empty()) for: !false
+# Reporter's write listings to provided stream
+ok {test-number} - listingString, ContainsSubstring("fake reporter"s) for: "{   "version": 1,   "metadata": {     "name": "",     "rng-seed": 1234,     "catch2-version": "<version>"   },   "listings": {     "reporters": [       {         "name": "fake reporter",         "description": "fake description"       }     ]" contains: "fake reporter" with 1 message: 'Tested reporter: JSON'
+# Reporter's write listings to provided stream
+ok {test-number} - !(factories.empty()) for: !false
+# Reporter's write listings to provided stream
+ok {test-number} - listingString, ContainsSubstring( "fake test name"s ) && ContainsSubstring( "fakeTestTag"s ) for: "{   "version": 1,   "metadata": {     "name": "",     "rng-seed": 1234,     "catch2-version": "<version>"   },   "listings": {     "tests": [       {         "name": "fake test name",         "class-name": "",         "tags": [           "fakeTestTag"         ],         "source-location": {           "filename": "fake-file.cpp",           "line": 123456789         }       }     ]" ( contains: "fake test name" and contains: "fakeTestTag" ) with 1 message: 'Tested reporter: JSON'
+# Reporter's write listings to provided stream
+ok {test-number} - !(factories.empty()) for: !false
+# Reporter's write listings to provided stream
 ok {test-number} - listingString, ContainsSubstring("fakeTag"s) for: "<?xml version="1.0" encoding="UTF-8"?> All available tags:    1  [fakeTag] 1 tag  " contains: "fakeTag" with 1 message: 'Tested reporter: JUnit'
 # Reporter's write listings to provided stream
 ok {test-number} - !(factories.empty()) for: !false
@@ -3067,6 +3127,14 @@ not ok {test-number} - explicitly
 ok {test-number} - false  # TODO
 # Testing checked-if 3
 not ok {test-number} - explicitly
+# Testing checked-if 4
+ok {test-number} - true
+# Testing checked-if 4
+not ok {test-number} - unexpected exception with message: 'Uncaught exception should fail!'; expression was: {Unknown expression after the reported line}
+# Testing checked-if 5
+ok {test-number} - false  # TODO
+# Testing checked-if 5
+not ok {test-number} - unexpected exception with message: 'Uncaught exception should fail!'; expression was: {Unknown expression after the reported line}
 # The NO_FAIL macro reports a failure but does not fail the test
 ok {test-number} - 1 == 2  # TODO
 # The default listing implementation write to provided stream
@@ -4355,6 +4423,10 @@ ok {test-number} - e.upper_bound == 23 for: 23.0 == 23
 ok {test-number} - e.lower_bound == 23 for: 23.0 == 23
 # uniform samples
 ok {test-number} - e.confidence_interval == 0.95 for: 0.95 == 0.95
+# uniform_integer_distribution can return the bounds
+ok {test-number} - dist.a() == -10 for: -10 == -10
+# uniform_integer_distribution can return the bounds
+ok {test-number} - dist.b() == 10 for: 10 == 10
 # unique_ptr reimplementation: basic functionality
 ok {test-number} - !(ptr) for: !{?}
 # unique_ptr reimplementation: basic functionality
@@ -4477,5 +4549,5 @@ ok {test-number} - q3 == 23. for: 23.0 == 23.0
 ok {test-number} -
 # xmlentitycheck
 ok {test-number} -
-1..2237
+1..2272
 
diff --git a/packages/Catch2/tests/SelfTest/Baselines/tap.sw.multi.approved.txt b/packages/Catch2/tests/SelfTest/Baselines/tap.sw.multi.approved.txt
index 033290497de6d943581107ed1b2026cdde1c3ccd..13449bd40b8e90b3ae64bdee30d76a2b18ef58ae 100644
--- a/packages/Catch2/tests/SelfTest/Baselines/tap.sw.multi.approved.txt
+++ b/packages/Catch2/tests/SelfTest/Baselines/tap.sw.multi.approved.txt
@@ -657,7 +657,7 @@ ok {test-number} - unrelated::ADL_empty{}, IsEmpty() for: {?} is empty
 # CAPTURE can deal with complex expressions
 ok {test-number} - with 7 messages: 'a := 1' and 'b := 2' and 'c := 3' and 'a + b := 3' and 'a+b := 3' and 'c > b := true' and 'a == 1 := true'
 # CAPTURE can deal with complex expressions involving commas
-ok {test-number} - with 7 messages: 'std::vector<int>{1, 2, 3}[0, 1, 2] := 3' and 'std::vector<int>{1, 2, 3}[(0, 1)] := 2' and 'std::vector<int>{1, 2, 3}[0] := 1' and '(helper_1436<int, int>{12, -12}) := { 12, -12 }' and '(helper_1436<int, int>(-12, 12)) := { -12, 12 }' and '(1, 2) := 2' and '(2, 3) := 3'
+ok {test-number} - with 7 messages: 'custom_index_op<int>{1, 2, 3}[0, 1, 2] := 0' and 'custom_index_op<int>{1, 2, 3}[(0, 1)] := 0' and 'custom_index_op<int>{1, 2, 3}[0] := 0' and '(helper_1436<int, int>{12, -12}) := { 12, -12 }' and '(helper_1436<int, int>(-12, 12)) := { -12, 12 }' and '(1, 2) := 2' and '(2, 3) := 3'
 # CAPTURE parses string and character constants
 ok {test-number} - with 11 messages: '("comma, in string", "escaped, \", ") := "escaped, ", "' and '"single quote in string,'," := "single quote in string,',"' and '"some escapes, \\,\\\\" := "some escapes, \,\\"' and '"some, ), unmatched, } prenheses {[<" := "some, ), unmatched, } prenheses {[<"' and ''"' := '"'' and ''\'' := '''' and '',' := ','' and ''}' := '}'' and '')' := ')'' and ''(' := '('' and ''{' := '{''
 # Capture and info messages
@@ -1256,6 +1256,14 @@ ok {test-number} - WithinRel( 1.f, -0.2f ), std::domain_error
 ok {test-number} - WithinRel( 1.f, 1.f ), std::domain_error
 # Floating point matchers: float
 ok {test-number} - 1., !IsNaN() for: 1.0 not is NaN
+# GENERATE can combine literals and generators
+ok {test-number} - i % 2 == 0 for: 0 == 0
+# GENERATE can combine literals and generators
+ok {test-number} - i % 2 == 0 for: 0 == 0
+# GENERATE can combine literals and generators
+ok {test-number} - i % 2 == 0 for: 0 == 0
+# GENERATE can combine literals and generators
+ok {test-number} - i % 2 == 0 for: 0 == 0
 # Generators -- adapters
 ok {test-number} - i % 2 == 0 for: 0 == 0
 # Generators -- adapters
@@ -1794,6 +1802,8 @@ ok {test-number} - h( dummy1 ) != h( dummy2 ) for: 2673152918 (0x<hex digits>) !
 ok {test-number} - h( dummy1 ) != h( dummy2 ) for: 2074929312 (0x<hex digits>) != 3429949824 (0x<hex digits>)
 # Hashing test case produces same hash across multiple calls
 ok {test-number} - h( dummy ) == h( dummy ) for: 3422778688 (0x<hex digits>) == 3422778688 (0x<hex digits>)
+# INFO and UNSCOPED_INFO can stream multiple arguments
+not ok {test-number} - explicitly with 3 messages: 'This info has multiple parts.' and 'This unscoped info has multiple parts.' and 'Show infos!'
 # INFO and WARN do not abort tests
 warning {test-number} - 'this is a message' with 1 message: 'this is a warning'
 # INFO gets logged on failure
@@ -1828,6 +1838,8 @@ ok {test-number} - i < 10 for: 8 < 10 with 2 messages: 'current counter 8' and '
 ok {test-number} - i < 10 for: 9 < 10 with 2 messages: 'current counter 9' and 'i := 9'
 # INFO is reset for each loop
 not ok {test-number} - i < 10 for: 10 < 10 with 2 messages: 'current counter 10' and 'i := 10'
+# Incomplete AssertionHandler
+not ok {test-number} - unexpected exception with message: 'Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE'; expression was: Dummy
 # Inequality checks that should fail
 not ok {test-number} - data.int_seven != 7 for: 7 != 7
 # Inequality checks that should fail
@@ -1860,6 +1872,42 @@ ok {test-number} - data.str_hello != "hell" for: "hello" != "hell"
 ok {test-number} - data.str_hello != "hello1" for: "hello" != "hello1"
 # Inequality checks that should succeed
 ok {test-number} - data.str_hello.size() != 6 for: 5 != 6
+# JsonWriter
+ok {test-number} - stream.str() == "" for: "" == ""
+# JsonWriter
+ok {test-number} - stream.str() == "{\n}" for: "{ }" == "{ }"
+# JsonWriter
+ok {test-number} - stream.str(), ContainsSubstring( "\"int\": 1," ) && ContainsSubstring( "\"double\": 1.5," ) && ContainsSubstring( "\"true\": true," ) && ContainsSubstring( "\"false\": false," ) && ContainsSubstring( "\"string\": \"this is a string\"," ) && ContainsSubstring( "\"array\": [\n    1,\n    2\n  ]\n}" ) for: "{   "int": 1,   "double": 1.5,   "true": true,   "false": false,   "string": "this is a string",   "array": [     1,     2   ] }" ( contains: ""int": 1," and contains: ""double": 1.5," and contains: ""true": true," and contains: ""false": false," and contains: ""string": "this is a string"," and contains: ""array": [     1,     2   ] }" )
+# JsonWriter
+ok {test-number} - stream.str(), ContainsSubstring( "\"empty_object\": {\n  }," ) && ContainsSubstring( "\"fully_object\": {\n    \"key\": 1\n  }" ) for: "{   "empty_object": {   },   "fully_object": {     "key": 1   } }" ( contains: ""empty_object": {   }," and contains: ""fully_object": {     "key": 1   }" )
+# JsonWriter
+ok {test-number} - stream.str() == "[\n]" for: "[ ]" == "[ ]"
+# JsonWriter
+ok {test-number} - stream.str() == "[\n  1,\n  1.5,\n  true,\n  false,\n  \"this is a string\",\n  {\n    \"object\": 42\n  },\n  [\n    \"array\",\n    42.5\n  ]\n]" for: "[   1,   1.5,   true,   false,   "this is a string",   {     "object": 42   },   [     "array",     42.5   ] ]" == "[   1,   1.5,   true,   false,   "this is a string",   {     "object": 42   },   [     "array",     42.5   ] ]"
+# JsonWriter
+ok {test-number} - stream.str() == "{\n}" for: "{ }" == "{ }"
+# JsonWriter
+ok {test-number} - stream.str() == "[\n]" for: "[ ]" == "[ ]"
+# JsonWriter
+ok {test-number} - stream.str() == "\"custom\"" for: ""custom"" == ""custom""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\\"\"" for: ""\""" == ""\"""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\\\\"" for: ""\\"" == ""\\""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"/\"" for: ""/"" == ""/""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\b\"" for: ""\b"" == ""\b""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\f\"" for: ""\f"" == ""\f""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\n\"" for: ""\n"" == ""\n""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\r\"" for: ""\r"" == ""\r""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\t\"" for: ""\t"" == ""\t""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\\\/\\t\\r\\n\"" for: ""\\/\t\r\n"" == ""\\/\t\r\n""
 # Lambdas in assertions
 ok {test-number} - []() { return true; }() for: true
 # Less-than inequalities with different epsilons
@@ -2453,6 +2501,18 @@ ok {test-number} - listingString, ContainsSubstring( "fake test name"s ) && Cont
 # Reporter's write listings to provided stream
 ok {test-number} - !(factories.empty()) for: !false
 # Reporter's write listings to provided stream
+ok {test-number} - listingString, ContainsSubstring("fakeTag"s) for: "{   "version": 1,   "metadata": {     "name": "",     "rng-seed": 1234,     "catch2-version": "<version>"   },   "listings": {     "tags": [       {         "aliases": [           "fakeTag"         ],         "count": 1       }     ]" contains: "fakeTag" with 1 message: 'Tested reporter: JSON'
+# Reporter's write listings to provided stream
+ok {test-number} - !(factories.empty()) for: !false
+# Reporter's write listings to provided stream
+ok {test-number} - listingString, ContainsSubstring("fake reporter"s) for: "{   "version": 1,   "metadata": {     "name": "",     "rng-seed": 1234,     "catch2-version": "<version>"   },   "listings": {     "reporters": [       {         "name": "fake reporter",         "description": "fake description"       }     ]" contains: "fake reporter" with 1 message: 'Tested reporter: JSON'
+# Reporter's write listings to provided stream
+ok {test-number} - !(factories.empty()) for: !false
+# Reporter's write listings to provided stream
+ok {test-number} - listingString, ContainsSubstring( "fake test name"s ) && ContainsSubstring( "fakeTestTag"s ) for: "{   "version": 1,   "metadata": {     "name": "",     "rng-seed": 1234,     "catch2-version": "<version>"   },   "listings": {     "tests": [       {         "name": "fake test name",         "class-name": "",         "tags": [           "fakeTestTag"         ],         "source-location": {           "filename": "fake-file.cpp",           "line": 123456789         }       }     ]" ( contains: "fake test name" and contains: "fakeTestTag" ) with 1 message: 'Tested reporter: JSON'
+# Reporter's write listings to provided stream
+ok {test-number} - !(factories.empty()) for: !false
+# Reporter's write listings to provided stream
 ok {test-number} - listingString, ContainsSubstring("fakeTag"s) for: "<?xml version="1.0" encoding="UTF-8"?> All available tags:    1  [fakeTag] 1 tag  " contains: "fakeTag" with 1 message: 'Tested reporter: JUnit'
 # Reporter's write listings to provided stream
 ok {test-number} - !(factories.empty()) for: !false
@@ -3060,6 +3120,14 @@ not ok {test-number} - explicitly
 ok {test-number} - false  # TODO
 # Testing checked-if 3
 not ok {test-number} - explicitly
+# Testing checked-if 4
+ok {test-number} - true
+# Testing checked-if 4
+not ok {test-number} - unexpected exception with message: 'Uncaught exception should fail!'; expression was: {Unknown expression after the reported line}
+# Testing checked-if 5
+ok {test-number} - false  # TODO
+# Testing checked-if 5
+not ok {test-number} - unexpected exception with message: 'Uncaught exception should fail!'; expression was: {Unknown expression after the reported line}
 # The NO_FAIL macro reports a failure but does not fail the test
 ok {test-number} - 1 == 2  # TODO
 # The default listing implementation write to provided stream
@@ -4344,6 +4412,10 @@ ok {test-number} - e.upper_bound == 23 for: 23.0 == 23
 ok {test-number} - e.lower_bound == 23 for: 23.0 == 23
 # uniform samples
 ok {test-number} - e.confidence_interval == 0.95 for: 0.95 == 0.95
+# uniform_integer_distribution can return the bounds
+ok {test-number} - dist.a() == -10 for: -10 == -10
+# uniform_integer_distribution can return the bounds
+ok {test-number} - dist.b() == 10 for: 10 == 10
 # unique_ptr reimplementation: basic functionality
 ok {test-number} - !(ptr) for: !{?}
 # unique_ptr reimplementation: basic functionality
@@ -4466,5 +4538,5 @@ ok {test-number} - q3 == 23. for: 23.0 == 23.0
 ok {test-number} -
 # xmlentitycheck
 ok {test-number} -
-1..2237
+1..2272
 
diff --git a/packages/Catch2/tests/SelfTest/Baselines/teamcity.sw.approved.txt b/packages/Catch2/tests/SelfTest/Baselines/teamcity.sw.approved.txt
index a298633a16618590108f292f8d14a428e1ba47b6..2a2c40cfc883648deac03e38bb2809e2cd735d60 100644
--- a/packages/Catch2/tests/SelfTest/Baselines/teamcity.sw.approved.txt
+++ b/packages/Catch2/tests/SelfTest/Baselines/teamcity.sw.approved.txt
@@ -377,6 +377,8 @@
 ##teamcity[testFinished name='Floating point matchers: double' duration="{duration}"]
 ##teamcity[testStarted name='Floating point matchers: float']
 ##teamcity[testFinished name='Floating point matchers: float' duration="{duration}"]
+##teamcity[testStarted name='GENERATE can combine literals and generators']
+##teamcity[testFinished name='GENERATE can combine literals and generators' duration="{duration}"]
 ##teamcity[testStarted name='Generators -- adapters']
 ##teamcity[testFinished name='Generators -- adapters' duration="{duration}"]
 ##teamcity[testStarted name='Generators -- simple']
@@ -393,6 +395,9 @@
 ##teamcity[testFinished name='Hashing different test cases produces different result' duration="{duration}"]
 ##teamcity[testStarted name='Hashing test case produces same hash across multiple calls']
 ##teamcity[testFinished name='Hashing test case produces same hash across multiple calls' duration="{duration}"]
+##teamcity[testStarted name='INFO and UNSCOPED_INFO can stream multiple arguments']
+##teamcity[testFailed name='INFO and UNSCOPED_INFO can stream multiple arguments' message='Message.tests.cpp:<line number>|n...............................................................................|n|nMessage.tests.cpp:<line number>|nexplicit failure with messages:|n  "This info has multiple parts."|n  "This unscoped info has multiple parts."|n  "Show infos!"']
+##teamcity[testFinished name='INFO and UNSCOPED_INFO can stream multiple arguments' duration="{duration}"]
 ##teamcity[testStarted name='INFO and WARN do not abort tests']
 ##teamcity[testFinished name='INFO and WARN do not abort tests' duration="{duration}"]
 ##teamcity[testStarted name='INFO gets logged on failure']
@@ -405,6 +410,9 @@
 ##teamcity[testStarted name='INFO is reset for each loop']
 ##teamcity[testFailed name='INFO is reset for each loop' message='Message.tests.cpp:<line number>|n...............................................................................|n|nMessage.tests.cpp:<line number>|nexpression failed with messages:|n  "current counter 10"|n  "i := 10"|n  REQUIRE( i < 10 )|nwith expansion:|n  10 < 10|n']
 ##teamcity[testFinished name='INFO is reset for each loop' duration="{duration}"]
+##teamcity[testStarted name='Incomplete AssertionHandler']
+##teamcity[testIgnored name='Incomplete AssertionHandler' message='AssertionHandler.tests.cpp:<line number>|n...............................................................................|n|nAssertionHandler.tests.cpp:<line number>|nunexpected exception with message:|n  "Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE"|n  REQUIRE( Dummy )|nwith expansion:|n  Dummy|n- failure ignore as test marked as |'ok to fail|'|n']
+##teamcity[testFinished name='Incomplete AssertionHandler' duration="{duration}"]
 ##teamcity[testStarted name='Inequality checks that should fail']
 ##teamcity[testIgnored name='Inequality checks that should fail' message='Condition.tests.cpp:<line number>|n...............................................................................|n|nCondition.tests.cpp:<line number>|nexpression failed|n  CHECK( data.int_seven != 7 )|nwith expansion:|n  7 != 7|n- failure ignore as test marked as |'ok to fail|'|n']
 ##teamcity[testIgnored name='Inequality checks that should fail' message='Condition.tests.cpp:<line number>|nexpression failed|n  CHECK( data.float_nine_point_one != Approx( 9.1f ) )|nwith expansion:|n  9.1f != Approx( 9.1000003815 )|n- failure ignore as test marked as |'ok to fail|'|n']
@@ -414,6 +422,10 @@
 ##teamcity[testFinished name='Inequality checks that should fail' duration="{duration}"]
 ##teamcity[testStarted name='Inequality checks that should succeed']
 ##teamcity[testFinished name='Inequality checks that should succeed' duration="{duration}"]
+##teamcity[testStarted name='JsonWriter']
+##teamcity[testFinished name='JsonWriter' duration="{duration}"]
+##teamcity[testStarted name='JsonWriter escapes charaters in strings properly']
+##teamcity[testFinished name='JsonWriter escapes charaters in strings properly' duration="{duration}"]
 ##teamcity[testStarted name='Lambdas in assertions']
 ##teamcity[testFinished name='Lambdas in assertions' duration="{duration}"]
 ##teamcity[testStarted name='Less-than inequalities with different epsilons']
@@ -639,6 +651,12 @@
 ##teamcity[testStarted name='Testing checked-if 3']
 ##teamcity[testIgnored name='Testing checked-if 3' message='Misc.tests.cpp:<line number>|n...............................................................................|n|nMisc.tests.cpp:<line number>|nexplicit failure- failure ignore as test marked as |'ok to fail|'|n']
 ##teamcity[testFinished name='Testing checked-if 3' duration="{duration}"]
+##teamcity[testStarted name='Testing checked-if 4']
+##teamcity[testIgnored name='Testing checked-if 4' message='Misc.tests.cpp:<line number>|n...............................................................................|n|nMisc.tests.cpp:<line number>|nunexpected exception with message:|n  "Uncaught exception should fail!"|n  {Unknown expression after the reported line}|nwith expansion:|n  {Unknown expression after the reported line}|n- failure ignore as test marked as |'ok to fail|'|n']
+##teamcity[testFinished name='Testing checked-if 4' duration="{duration}"]
+##teamcity[testStarted name='Testing checked-if 5']
+##teamcity[testIgnored name='Testing checked-if 5' message='Misc.tests.cpp:<line number>|n...............................................................................|n|nMisc.tests.cpp:<line number>|nunexpected exception with message:|n  "Uncaught exception should fail!"|n  {Unknown expression after the reported line}|nwith expansion:|n  {Unknown expression after the reported line}|n- failure ignore as test marked as |'ok to fail|'|n']
+##teamcity[testFinished name='Testing checked-if 5' duration="{duration}"]
 ##teamcity[testStarted name='The NO_FAIL macro reports a failure but does not fail the test']
 ##teamcity[testFinished name='The NO_FAIL macro reports a failure but does not fail the test' duration="{duration}"]
 ##teamcity[testStarted name='The default listing implementation write to provided stream']
@@ -976,6 +994,8 @@ loose text artifact
 ##teamcity[testFinished name='tuple<tuple<int>,tuple<>,float>' duration="{duration}"]
 ##teamcity[testStarted name='uniform samples']
 ##teamcity[testFinished name='uniform samples' duration="{duration}"]
+##teamcity[testStarted name='uniform_integer_distribution can return the bounds']
+##teamcity[testFinished name='uniform_integer_distribution can return the bounds' duration="{duration}"]
 ##teamcity[testStarted name='unique_ptr reimplementation: basic functionality']
 ##teamcity[testFinished name='unique_ptr reimplementation: basic functionality' duration="{duration}"]
 ##teamcity[testStarted name='vec<vec<string,alloc>> -> toString']
diff --git a/packages/Catch2/tests/SelfTest/Baselines/teamcity.sw.multi.approved.txt b/packages/Catch2/tests/SelfTest/Baselines/teamcity.sw.multi.approved.txt
index 861d64715b55a33c980cc2d135d069dcf9ad5459..24ed5d9887d592517e847f6060818ac08f82bca8 100644
--- a/packages/Catch2/tests/SelfTest/Baselines/teamcity.sw.multi.approved.txt
+++ b/packages/Catch2/tests/SelfTest/Baselines/teamcity.sw.multi.approved.txt
@@ -377,6 +377,8 @@
 ##teamcity[testFinished name='Floating point matchers: double' duration="{duration}"]
 ##teamcity[testStarted name='Floating point matchers: float']
 ##teamcity[testFinished name='Floating point matchers: float' duration="{duration}"]
+##teamcity[testStarted name='GENERATE can combine literals and generators']
+##teamcity[testFinished name='GENERATE can combine literals and generators' duration="{duration}"]
 ##teamcity[testStarted name='Generators -- adapters']
 ##teamcity[testFinished name='Generators -- adapters' duration="{duration}"]
 ##teamcity[testStarted name='Generators -- simple']
@@ -393,6 +395,9 @@
 ##teamcity[testFinished name='Hashing different test cases produces different result' duration="{duration}"]
 ##teamcity[testStarted name='Hashing test case produces same hash across multiple calls']
 ##teamcity[testFinished name='Hashing test case produces same hash across multiple calls' duration="{duration}"]
+##teamcity[testStarted name='INFO and UNSCOPED_INFO can stream multiple arguments']
+##teamcity[testFailed name='INFO and UNSCOPED_INFO can stream multiple arguments' message='Message.tests.cpp:<line number>|n...............................................................................|n|nMessage.tests.cpp:<line number>|nexplicit failure with messages:|n  "This info has multiple parts."|n  "This unscoped info has multiple parts."|n  "Show infos!"']
+##teamcity[testFinished name='INFO and UNSCOPED_INFO can stream multiple arguments' duration="{duration}"]
 ##teamcity[testStarted name='INFO and WARN do not abort tests']
 ##teamcity[testFinished name='INFO and WARN do not abort tests' duration="{duration}"]
 ##teamcity[testStarted name='INFO gets logged on failure']
@@ -405,6 +410,9 @@
 ##teamcity[testStarted name='INFO is reset for each loop']
 ##teamcity[testFailed name='INFO is reset for each loop' message='Message.tests.cpp:<line number>|n...............................................................................|n|nMessage.tests.cpp:<line number>|nexpression failed with messages:|n  "current counter 10"|n  "i := 10"|n  REQUIRE( i < 10 )|nwith expansion:|n  10 < 10|n']
 ##teamcity[testFinished name='INFO is reset for each loop' duration="{duration}"]
+##teamcity[testStarted name='Incomplete AssertionHandler']
+##teamcity[testIgnored name='Incomplete AssertionHandler' message='AssertionHandler.tests.cpp:<line number>|n...............................................................................|n|nAssertionHandler.tests.cpp:<line number>|nunexpected exception with message:|n  "Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE"|n  REQUIRE( Dummy )|nwith expansion:|n  Dummy|n- failure ignore as test marked as |'ok to fail|'|n']
+##teamcity[testFinished name='Incomplete AssertionHandler' duration="{duration}"]
 ##teamcity[testStarted name='Inequality checks that should fail']
 ##teamcity[testIgnored name='Inequality checks that should fail' message='Condition.tests.cpp:<line number>|n...............................................................................|n|nCondition.tests.cpp:<line number>|nexpression failed|n  CHECK( data.int_seven != 7 )|nwith expansion:|n  7 != 7|n- failure ignore as test marked as |'ok to fail|'|n']
 ##teamcity[testIgnored name='Inequality checks that should fail' message='Condition.tests.cpp:<line number>|nexpression failed|n  CHECK( data.float_nine_point_one != Approx( 9.1f ) )|nwith expansion:|n  9.1f != Approx( 9.1000003815 )|n- failure ignore as test marked as |'ok to fail|'|n']
@@ -414,6 +422,10 @@
 ##teamcity[testFinished name='Inequality checks that should fail' duration="{duration}"]
 ##teamcity[testStarted name='Inequality checks that should succeed']
 ##teamcity[testFinished name='Inequality checks that should succeed' duration="{duration}"]
+##teamcity[testStarted name='JsonWriter']
+##teamcity[testFinished name='JsonWriter' duration="{duration}"]
+##teamcity[testStarted name='JsonWriter escapes charaters in strings properly']
+##teamcity[testFinished name='JsonWriter escapes charaters in strings properly' duration="{duration}"]
 ##teamcity[testStarted name='Lambdas in assertions']
 ##teamcity[testFinished name='Lambdas in assertions' duration="{duration}"]
 ##teamcity[testStarted name='Less-than inequalities with different epsilons']
@@ -639,6 +651,12 @@
 ##teamcity[testStarted name='Testing checked-if 3']
 ##teamcity[testIgnored name='Testing checked-if 3' message='Misc.tests.cpp:<line number>|n...............................................................................|n|nMisc.tests.cpp:<line number>|nexplicit failure- failure ignore as test marked as |'ok to fail|'|n']
 ##teamcity[testFinished name='Testing checked-if 3' duration="{duration}"]
+##teamcity[testStarted name='Testing checked-if 4']
+##teamcity[testIgnored name='Testing checked-if 4' message='Misc.tests.cpp:<line number>|n...............................................................................|n|nMisc.tests.cpp:<line number>|nunexpected exception with message:|n  "Uncaught exception should fail!"|n  {Unknown expression after the reported line}|nwith expansion:|n  {Unknown expression after the reported line}|n- failure ignore as test marked as |'ok to fail|'|n']
+##teamcity[testFinished name='Testing checked-if 4' duration="{duration}"]
+##teamcity[testStarted name='Testing checked-if 5']
+##teamcity[testIgnored name='Testing checked-if 5' message='Misc.tests.cpp:<line number>|n...............................................................................|n|nMisc.tests.cpp:<line number>|nunexpected exception with message:|n  "Uncaught exception should fail!"|n  {Unknown expression after the reported line}|nwith expansion:|n  {Unknown expression after the reported line}|n- failure ignore as test marked as |'ok to fail|'|n']
+##teamcity[testFinished name='Testing checked-if 5' duration="{duration}"]
 ##teamcity[testStarted name='The NO_FAIL macro reports a failure but does not fail the test']
 ##teamcity[testFinished name='The NO_FAIL macro reports a failure but does not fail the test' duration="{duration}"]
 ##teamcity[testStarted name='The default listing implementation write to provided stream']
@@ -975,6 +993,8 @@
 ##teamcity[testFinished name='tuple<tuple<int>,tuple<>,float>' duration="{duration}"]
 ##teamcity[testStarted name='uniform samples']
 ##teamcity[testFinished name='uniform samples' duration="{duration}"]
+##teamcity[testStarted name='uniform_integer_distribution can return the bounds']
+##teamcity[testFinished name='uniform_integer_distribution can return the bounds' duration="{duration}"]
 ##teamcity[testStarted name='unique_ptr reimplementation: basic functionality']
 ##teamcity[testFinished name='unique_ptr reimplementation: basic functionality' duration="{duration}"]
 ##teamcity[testStarted name='vec<vec<string,alloc>> -> toString']
diff --git a/packages/Catch2/tests/SelfTest/Baselines/xml.sw.approved.txt b/packages/Catch2/tests/SelfTest/Baselines/xml.sw.approved.txt
index bf9cf2053f7aaca5fe91b2cadcd3e8d5bb3ef82c..be57798bf7068d32d3bab247d36d95afafef01f6 100644
--- a/packages/Catch2/tests/SelfTest/Baselines/xml.sw.approved.txt
+++ b/packages/Catch2/tests/SelfTest/Baselines/xml.sw.approved.txt
@@ -667,7 +667,7 @@ Nor would this
     </Expression>
     <OverallResult success="true" skips="0"/>
   </TestCase>
-  <TestCase name="#2615 - Throwing in constructor generator fails test case but does not abort" tags="[!shouldfail]" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
+  <TestCase name="#2615 - Throwing in constructor generator fails test case but does not abort" tags="[!shouldfail][generators][regression]" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
     <Exception filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
       failure to init
     </Exception>
@@ -2911,13 +2911,13 @@ Nor would this
   </TestCase>
   <TestCase name="CAPTURE can deal with complex expressions involving commas" tags="[capture][messages]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
     <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-      std::vector&lt;int>{1, 2, 3}[0, 1, 2] := 3
+      custom_index_op&lt;int>{1, 2, 3}[0, 1, 2] := 0
     </Info>
     <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-      std::vector&lt;int>{1, 2, 3}[(0, 1)] := 2
+      custom_index_op&lt;int>{1, 2, 3}[(0, 1)] := 0
     </Info>
     <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-      std::vector&lt;int>{1, 2, 3}[0] := 1
+      custom_index_op&lt;int>{1, 2, 3}[0] := 0
     </Info>
     <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       (helper_1436&lt;int, int>{12, -12}) := { 12, -12 }
@@ -5583,6 +5583,41 @@ C
     </Section>
     <OverallResult success="true" skips="0"/>
   </TestCase>
+  <TestCase name="GENERATE can combine literals and generators" tags="[generators]" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
+    <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
+      <Original>
+        i % 2 == 0
+      </Original>
+      <Expanded>
+        0 == 0
+      </Expanded>
+    </Expression>
+    <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
+      <Original>
+        i % 2 == 0
+      </Original>
+      <Expanded>
+        0 == 0
+      </Expanded>
+    </Expression>
+    <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
+      <Original>
+        i % 2 == 0
+      </Original>
+      <Expanded>
+        0 == 0
+      </Expanded>
+    </Expression>
+    <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
+      <Original>
+        i % 2 == 0
+      </Original>
+      <Expanded>
+        0 == 0
+      </Expanded>
+    </Expression>
+    <OverallResult success="true" skips="0"/>
+  </TestCase>
   <TestCase name="Generators -- adapters" tags="[generators][generic]" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
     <Section name="Filtering by predicate" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
       <Section name="Basic usage" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
@@ -8371,6 +8406,18 @@ C
     </Expression>
     <OverallResult success="true" skips="0"/>
   </TestCase>
+  <TestCase name="INFO and UNSCOPED_INFO can stream multiple arguments" tags="[.][failing][info][messages]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
+      This info has multiple parts.
+    </Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
+      This unscoped info has multiple parts.
+    </Info>
+    <Failure filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
+      Show infos!
+    </Failure>
+    <OverallResult success="false" skips="0"/>
+  </TestCase>
   <TestCase name="INFO and WARN do not abort tests" tags="[.][messages]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
     <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this is a message
@@ -8619,6 +8666,20 @@ C
     </Expression>
     <OverallResult success="false" skips="0"/>
   </TestCase>
+  <TestCase name="Incomplete AssertionHandler" tags="[!shouldfail][assertion-handler]" filename="tests/<exe-name>/IntrospectiveTests/AssertionHandler.tests.cpp" >
+    <Expression success="false" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/AssertionHandler.tests.cpp" >
+      <Original>
+        Dummy
+      </Original>
+      <Expanded>
+        Dummy
+      </Expanded>
+      <Exception filename="tests/<exe-name>/IntrospectiveTests/AssertionHandler.tests.cpp" >
+        Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE
+      </Exception>
+    </Expression>
+    <OverallResult success="true" skips="0"/>
+  </TestCase>
   <TestCase name="Inequality checks that should fail" tags="[!shouldfail][.][failing]" filename="tests/<exe-name>/UsageTests/Condition.tests.cpp" >
     <Expression success="false" type="CHECK" filename="tests/<exe-name>/UsageTests/Condition.tests.cpp" >
       <Original>
@@ -8753,6 +8814,277 @@ C
     </Expression>
     <OverallResult success="true" skips="0"/>
   </TestCase>
+  <TestCase name="JsonWriter" tags="[JSON][JsonWriter]" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+    <Section name="Newly constructed JsonWriter does nothing" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str() == ""
+        </Original>
+        <Expanded>
+          "" == ""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Calling writeObject will create an empty pair of braces" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str() == "{\n}"
+        </Original>
+        <Expanded>
+          "{
+}"
+==
+"{
+}"
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Calling writeObject with key will create an object to write the value" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str(), ContainsSubstring( "\"int\": 1," ) &amp;&amp; ContainsSubstring( "\"double\": 1.5," ) &amp;&amp; ContainsSubstring( "\"true\": true," ) &amp;&amp; ContainsSubstring( "\"false\": false," ) &amp;&amp; ContainsSubstring( "\"string\": \"this is a string\"," ) &amp;&amp; ContainsSubstring( "\"array\": [\n    1,\n    2\n  ]\n}" )
+        </Original>
+        <Expanded>
+          "{
+  "int": 1,
+  "double": 1.5,
+  "true": true,
+  "false": false,
+  "string": "this is a string",
+  "array": [
+    1,
+    2
+  ]
+}" ( contains: ""int": 1," and contains: ""double": 1.5," and contains: ""true": true," and contains: ""false": false," and contains: ""string": "this is a string"," and contains: ""array": [
+    1,
+    2
+  ]
+}" )
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="nesting objects" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str(), ContainsSubstring( "\"empty_object\": {\n  }," ) &amp;&amp; ContainsSubstring( "\"fully_object\": {\n    \"key\": 1\n  }" )
+        </Original>
+        <Expanded>
+          "{
+  "empty_object": {
+  },
+  "fully_object": {
+    "key": 1
+  }
+}" ( contains: ""empty_object": {
+  }," and contains: ""fully_object": {
+    "key": 1
+  }" )
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Calling writeArray will create an empty pair of braces" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str() == "[\n]"
+        </Original>
+        <Expanded>
+          "[
+]"
+==
+"[
+]"
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Calling writeArray creates array to write the values to" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str() == "[\n  1,\n  1.5,\n  true,\n  false,\n  \"this is a string\",\n  {\n    \"object\": 42\n  },\n  [\n    \"array\",\n    42.5\n  ]\n]"
+        </Original>
+        <Expanded>
+          "[
+  1,
+  1.5,
+  true,
+  false,
+  "this is a string",
+  {
+    "object": 42
+  },
+  [
+    "array",
+    42.5
+  ]
+]"
+==
+"[
+  1,
+  1.5,
+  true,
+  false,
+  "this is a string",
+  {
+    "object": 42
+  },
+  [
+    "array",
+    42.5
+  ]
+]"
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Moved from JsonObjectWriter shall not insert superfluous brace" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str() == "{\n}"
+        </Original>
+        <Expanded>
+          "{
+}"
+==
+"{
+}"
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Moved from JsonArrayWriter shall not insert superfluous bracket" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str() == "[\n]"
+        </Original>
+        <Expanded>
+          "[
+]"
+==
+"[
+]"
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Custom class shall be quoted" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str() == "\"custom\""
+        </Original>
+        <Expanded>
+          ""custom"" == ""custom""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <OverallResult success="true" skips="0"/>
+  </TestCase>
+  <TestCase name="JsonWriter escapes charaters in strings properly" tags="[JsonWriter]" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+    <Section name="Quote in a string is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\\"\""
+        </Original>
+        <Expanded>
+          ""\""" == ""\"""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Backslash in a string is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\\\\""
+        </Original>
+        <Expanded>
+          ""\\"" == ""\\""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Forward slash in a string is **not** escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"/\""
+        </Original>
+        <Expanded>
+          ""/"" == ""/""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Backspace in a string is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\b\""
+        </Original>
+        <Expanded>
+          ""\b"" == ""\b""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Formfeed in a string is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\f\""
+        </Original>
+        <Expanded>
+          ""\f"" == ""\f""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="linefeed in a string is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\n\""
+        </Original>
+        <Expanded>
+          ""\n"" == ""\n""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="carriage return in a string is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\r\""
+        </Original>
+        <Expanded>
+          ""\r"" == ""\r""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="tab in a string is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\t\""
+        </Original>
+        <Expanded>
+          ""\t"" == ""\t""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="combination of characters is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\\\/\\t\\r\\n\""
+        </Original>
+        <Expanded>
+          ""\\/\t\r\n"" == ""\\/\t\r\n""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <OverallResult success="true" skips="0"/>
+  </TestCase>
   <TestCase name="Lambdas in assertions" filename="tests/<exe-name>/UsageTests/Compilation.tests.cpp" >
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Compilation.tests.cpp" >
       <Original>
@@ -11669,6 +12001,120 @@ C
         !false
       </Expanded>
     </Expression>
+    <Section name="JSON reporter lists tags" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+        Tested reporter: JSON
+      </Info>
+      <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+        <Original>
+          listingString, ContainsSubstring("fakeTag"s)
+        </Original>
+        <Expanded>
+          "{
+  "version": 1,
+  "metadata": {
+    "name": "",
+    "rng-seed": 1234,
+    "catch2-version": "<version>"
+  },
+  "listings": {
+    "tags": [
+      {
+        "aliases": [
+          "fakeTag"
+        ],
+        "count": 1
+      }
+    ]" contains: "fakeTag"
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Expression success="true" type="REQUIRE_FALSE" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+      <Original>
+        !(factories.empty())
+      </Original>
+      <Expanded>
+        !false
+      </Expanded>
+    </Expression>
+    <Section name="JSON reporter lists reporters" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+        Tested reporter: JSON
+      </Info>
+      <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+        <Original>
+          listingString, ContainsSubstring("fake reporter"s)
+        </Original>
+        <Expanded>
+          "{
+  "version": 1,
+  "metadata": {
+    "name": "",
+    "rng-seed": 1234,
+    "catch2-version": "<version>"
+  },
+  "listings": {
+    "reporters": [
+      {
+        "name": "fake reporter",
+        "description": "fake description"
+      }
+    ]" contains: "fake reporter"
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Expression success="true" type="REQUIRE_FALSE" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+      <Original>
+        !(factories.empty())
+      </Original>
+      <Expanded>
+        !false
+      </Expanded>
+    </Expression>
+    <Section name="JSON reporter lists tests" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+        Tested reporter: JSON
+      </Info>
+      <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+        <Original>
+          listingString, ContainsSubstring( "fake test name"s ) &amp;&amp; ContainsSubstring( "fakeTestTag"s )
+        </Original>
+        <Expanded>
+          "{
+  "version": 1,
+  "metadata": {
+    "name": "",
+    "rng-seed": 1234,
+    "catch2-version": "<version>"
+  },
+  "listings": {
+    "tests": [
+      {
+        "name": "fake test name",
+        "class-name": "",
+        "tags": [
+          "fakeTestTag"
+        ],
+        "source-location": {
+          "filename": "fake-file.cpp",
+          "line": 123456789
+        }
+      }
+    ]" ( contains: "fake test name" and contains: "fakeTestTag" )
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Expression success="true" type="REQUIRE_FALSE" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+      <Original>
+        !(factories.empty())
+      </Original>
+      <Expanded>
+        !false
+      </Expanded>
+    </Expression>
     <Section name="JUnit reporter lists tags" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
       <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: JUnit
@@ -14547,6 +14993,50 @@ Message from section two
     <Failure filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" />
     <OverallResult success="true" skips="0"/>
   </TestCase>
+  <TestCase name="Testing checked-if 4" tags="[!shouldfail][checked-if]" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+    <Expression success="true" type="CHECKED_ELSE" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+      <Original>
+        true
+      </Original>
+      <Expanded>
+        true
+      </Expanded>
+    </Expression>
+    <Expression success="false" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+      <Original>
+        {Unknown expression after the reported line}
+      </Original>
+      <Expanded>
+        {Unknown expression after the reported line}
+      </Expanded>
+      <Exception filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+        Uncaught exception should fail!
+      </Exception>
+    </Expression>
+    <OverallResult success="true" skips="0"/>
+  </TestCase>
+  <TestCase name="Testing checked-if 5" tags="[!shouldfail][checked-if]" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+    <Expression success="false" type="CHECKED_ELSE" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+      <Original>
+        false
+      </Original>
+      <Expanded>
+        false
+      </Expanded>
+    </Expression>
+    <Expression success="false" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+      <Original>
+        {Unknown expression after the reported line}
+      </Original>
+      <Expanded>
+        {Unknown expression after the reported line}
+      </Expanded>
+      <Exception filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+        Uncaught exception should fail!
+      </Exception>
+    </Expression>
+    <OverallResult success="true" skips="0"/>
+  </TestCase>
   <TestCase name="The NO_FAIL macro reports a failure but does not fail the test" tags="[messages]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
     <Expression success="false" type="CHECK_NOFAIL" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       <Original>
@@ -20644,6 +21134,25 @@ b1!
     </Expression>
     <OverallResult success="true" skips="0"/>
   </TestCase>
+  <TestCase name="uniform_integer_distribution can return the bounds" tags="[distribution][rng]" filename="tests/<exe-name>/IntrospectiveTests/RandomNumberGeneration.tests.cpp" >
+    <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/RandomNumberGeneration.tests.cpp" >
+      <Original>
+        dist.a() == -10
+      </Original>
+      <Expanded>
+        -10 == -10
+      </Expanded>
+    </Expression>
+    <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/RandomNumberGeneration.tests.cpp" >
+      <Original>
+        dist.b() == 10
+      </Original>
+      <Expanded>
+        10 == 10
+      </Expanded>
+    </Expression>
+    <OverallResult success="true" skips="0"/>
+  </TestCase>
   <TestCase name="unique_ptr reimplementation: basic functionality" tags="[internals][unique-ptr]" filename="tests/<exe-name>/IntrospectiveTests/UniquePtr.tests.cpp" >
     <Section name="Default constructed unique_ptr is empty" filename="tests/<exe-name>/IntrospectiveTests/UniquePtr.tests.cpp" >
       <Expression success="true" type="REQUIRE_FALSE" filename="tests/<exe-name>/IntrospectiveTests/UniquePtr.tests.cpp" >
@@ -21198,6 +21707,6 @@ b1!
     </Section>
     <OverallResult success="true" skips="0"/>
   </TestCase>
-  <OverallResults successes="2048" failures="145" expectedFailures="32" skips="12"/>
-  <OverallResultsCases successes="308" failures="84" expectedFailures="11" skips="6"/>
+  <OverallResults successes="2079" failures="146" expectedFailures="35" skips="12"/>
+  <OverallResultsCases successes="312" failures="85" expectedFailures="14" skips="6"/>
 </Catch2TestRun>
diff --git a/packages/Catch2/tests/SelfTest/Baselines/xml.sw.multi.approved.txt b/packages/Catch2/tests/SelfTest/Baselines/xml.sw.multi.approved.txt
index 41dc8cb31522471f771534ff2fb0a878a870dcad..08ff6c43709ab7cb974583236d7d2155aafc02f5 100644
--- a/packages/Catch2/tests/SelfTest/Baselines/xml.sw.multi.approved.txt
+++ b/packages/Catch2/tests/SelfTest/Baselines/xml.sw.multi.approved.txt
@@ -667,7 +667,7 @@ Nor would this
     </Expression>
     <OverallResult success="true" skips="0"/>
   </TestCase>
-  <TestCase name="#2615 - Throwing in constructor generator fails test case but does not abort" tags="[!shouldfail]" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
+  <TestCase name="#2615 - Throwing in constructor generator fails test case but does not abort" tags="[!shouldfail][generators][regression]" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
     <Exception filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
       failure to init
     </Exception>
@@ -2911,13 +2911,13 @@ Nor would this
   </TestCase>
   <TestCase name="CAPTURE can deal with complex expressions involving commas" tags="[capture][messages]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
     <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-      std::vector&lt;int>{1, 2, 3}[0, 1, 2] := 3
+      custom_index_op&lt;int>{1, 2, 3}[0, 1, 2] := 0
     </Info>
     <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-      std::vector&lt;int>{1, 2, 3}[(0, 1)] := 2
+      custom_index_op&lt;int>{1, 2, 3}[(0, 1)] := 0
     </Info>
     <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-      std::vector&lt;int>{1, 2, 3}[0] := 1
+      custom_index_op&lt;int>{1, 2, 3}[0] := 0
     </Info>
     <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       (helper_1436&lt;int, int>{12, -12}) := { 12, -12 }
@@ -5583,6 +5583,41 @@ C
     </Section>
     <OverallResult success="true" skips="0"/>
   </TestCase>
+  <TestCase name="GENERATE can combine literals and generators" tags="[generators]" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
+    <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
+      <Original>
+        i % 2 == 0
+      </Original>
+      <Expanded>
+        0 == 0
+      </Expanded>
+    </Expression>
+    <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
+      <Original>
+        i % 2 == 0
+      </Original>
+      <Expanded>
+        0 == 0
+      </Expanded>
+    </Expression>
+    <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
+      <Original>
+        i % 2 == 0
+      </Original>
+      <Expanded>
+        0 == 0
+      </Expanded>
+    </Expression>
+    <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
+      <Original>
+        i % 2 == 0
+      </Original>
+      <Expanded>
+        0 == 0
+      </Expanded>
+    </Expression>
+    <OverallResult success="true" skips="0"/>
+  </TestCase>
   <TestCase name="Generators -- adapters" tags="[generators][generic]" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
     <Section name="Filtering by predicate" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
       <Section name="Basic usage" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
@@ -8371,6 +8406,18 @@ C
     </Expression>
     <OverallResult success="true" skips="0"/>
   </TestCase>
+  <TestCase name="INFO and UNSCOPED_INFO can stream multiple arguments" tags="[.][failing][info][messages]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
+      This info has multiple parts.
+    </Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
+      This unscoped info has multiple parts.
+    </Info>
+    <Failure filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
+      Show infos!
+    </Failure>
+    <OverallResult success="false" skips="0"/>
+  </TestCase>
   <TestCase name="INFO and WARN do not abort tests" tags="[.][messages]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
     <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this is a message
@@ -8619,6 +8666,20 @@ C
     </Expression>
     <OverallResult success="false" skips="0"/>
   </TestCase>
+  <TestCase name="Incomplete AssertionHandler" tags="[!shouldfail][assertion-handler]" filename="tests/<exe-name>/IntrospectiveTests/AssertionHandler.tests.cpp" >
+    <Expression success="false" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/AssertionHandler.tests.cpp" >
+      <Original>
+        Dummy
+      </Original>
+      <Expanded>
+        Dummy
+      </Expanded>
+      <Exception filename="tests/<exe-name>/IntrospectiveTests/AssertionHandler.tests.cpp" >
+        Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE
+      </Exception>
+    </Expression>
+    <OverallResult success="true" skips="0"/>
+  </TestCase>
   <TestCase name="Inequality checks that should fail" tags="[!shouldfail][.][failing]" filename="tests/<exe-name>/UsageTests/Condition.tests.cpp" >
     <Expression success="false" type="CHECK" filename="tests/<exe-name>/UsageTests/Condition.tests.cpp" >
       <Original>
@@ -8753,6 +8814,277 @@ C
     </Expression>
     <OverallResult success="true" skips="0"/>
   </TestCase>
+  <TestCase name="JsonWriter" tags="[JSON][JsonWriter]" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+    <Section name="Newly constructed JsonWriter does nothing" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str() == ""
+        </Original>
+        <Expanded>
+          "" == ""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Calling writeObject will create an empty pair of braces" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str() == "{\n}"
+        </Original>
+        <Expanded>
+          "{
+}"
+==
+"{
+}"
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Calling writeObject with key will create an object to write the value" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str(), ContainsSubstring( "\"int\": 1," ) &amp;&amp; ContainsSubstring( "\"double\": 1.5," ) &amp;&amp; ContainsSubstring( "\"true\": true," ) &amp;&amp; ContainsSubstring( "\"false\": false," ) &amp;&amp; ContainsSubstring( "\"string\": \"this is a string\"," ) &amp;&amp; ContainsSubstring( "\"array\": [\n    1,\n    2\n  ]\n}" )
+        </Original>
+        <Expanded>
+          "{
+  "int": 1,
+  "double": 1.5,
+  "true": true,
+  "false": false,
+  "string": "this is a string",
+  "array": [
+    1,
+    2
+  ]
+}" ( contains: ""int": 1," and contains: ""double": 1.5," and contains: ""true": true," and contains: ""false": false," and contains: ""string": "this is a string"," and contains: ""array": [
+    1,
+    2
+  ]
+}" )
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="nesting objects" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str(), ContainsSubstring( "\"empty_object\": {\n  }," ) &amp;&amp; ContainsSubstring( "\"fully_object\": {\n    \"key\": 1\n  }" )
+        </Original>
+        <Expanded>
+          "{
+  "empty_object": {
+  },
+  "fully_object": {
+    "key": 1
+  }
+}" ( contains: ""empty_object": {
+  }," and contains: ""fully_object": {
+    "key": 1
+  }" )
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Calling writeArray will create an empty pair of braces" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str() == "[\n]"
+        </Original>
+        <Expanded>
+          "[
+]"
+==
+"[
+]"
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Calling writeArray creates array to write the values to" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str() == "[\n  1,\n  1.5,\n  true,\n  false,\n  \"this is a string\",\n  {\n    \"object\": 42\n  },\n  [\n    \"array\",\n    42.5\n  ]\n]"
+        </Original>
+        <Expanded>
+          "[
+  1,
+  1.5,
+  true,
+  false,
+  "this is a string",
+  {
+    "object": 42
+  },
+  [
+    "array",
+    42.5
+  ]
+]"
+==
+"[
+  1,
+  1.5,
+  true,
+  false,
+  "this is a string",
+  {
+    "object": 42
+  },
+  [
+    "array",
+    42.5
+  ]
+]"
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Moved from JsonObjectWriter shall not insert superfluous brace" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str() == "{\n}"
+        </Original>
+        <Expanded>
+          "{
+}"
+==
+"{
+}"
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Moved from JsonArrayWriter shall not insert superfluous bracket" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str() == "[\n]"
+        </Original>
+        <Expanded>
+          "[
+]"
+==
+"[
+]"
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Custom class shall be quoted" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str() == "\"custom\""
+        </Original>
+        <Expanded>
+          ""custom"" == ""custom""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <OverallResult success="true" skips="0"/>
+  </TestCase>
+  <TestCase name="JsonWriter escapes charaters in strings properly" tags="[JsonWriter]" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+    <Section name="Quote in a string is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\\"\""
+        </Original>
+        <Expanded>
+          ""\""" == ""\"""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Backslash in a string is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\\\\""
+        </Original>
+        <Expanded>
+          ""\\"" == ""\\""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Forward slash in a string is **not** escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"/\""
+        </Original>
+        <Expanded>
+          ""/"" == ""/""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Backspace in a string is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\b\""
+        </Original>
+        <Expanded>
+          ""\b"" == ""\b""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Formfeed in a string is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\f\""
+        </Original>
+        <Expanded>
+          ""\f"" == ""\f""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="linefeed in a string is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\n\""
+        </Original>
+        <Expanded>
+          ""\n"" == ""\n""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="carriage return in a string is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\r\""
+        </Original>
+        <Expanded>
+          ""\r"" == ""\r""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="tab in a string is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\t\""
+        </Original>
+        <Expanded>
+          ""\t"" == ""\t""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="combination of characters is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\\\/\\t\\r\\n\""
+        </Original>
+        <Expanded>
+          ""\\/\t\r\n"" == ""\\/\t\r\n""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <OverallResult success="true" skips="0"/>
+  </TestCase>
   <TestCase name="Lambdas in assertions" filename="tests/<exe-name>/UsageTests/Compilation.tests.cpp" >
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Compilation.tests.cpp" >
       <Original>
@@ -11669,6 +12001,120 @@ C
         !false
       </Expanded>
     </Expression>
+    <Section name="JSON reporter lists tags" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+        Tested reporter: JSON
+      </Info>
+      <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+        <Original>
+          listingString, ContainsSubstring("fakeTag"s)
+        </Original>
+        <Expanded>
+          "{
+  "version": 1,
+  "metadata": {
+    "name": "",
+    "rng-seed": 1234,
+    "catch2-version": "<version>"
+  },
+  "listings": {
+    "tags": [
+      {
+        "aliases": [
+          "fakeTag"
+        ],
+        "count": 1
+      }
+    ]" contains: "fakeTag"
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Expression success="true" type="REQUIRE_FALSE" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+      <Original>
+        !(factories.empty())
+      </Original>
+      <Expanded>
+        !false
+      </Expanded>
+    </Expression>
+    <Section name="JSON reporter lists reporters" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+        Tested reporter: JSON
+      </Info>
+      <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+        <Original>
+          listingString, ContainsSubstring("fake reporter"s)
+        </Original>
+        <Expanded>
+          "{
+  "version": 1,
+  "metadata": {
+    "name": "",
+    "rng-seed": 1234,
+    "catch2-version": "<version>"
+  },
+  "listings": {
+    "reporters": [
+      {
+        "name": "fake reporter",
+        "description": "fake description"
+      }
+    ]" contains: "fake reporter"
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Expression success="true" type="REQUIRE_FALSE" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+      <Original>
+        !(factories.empty())
+      </Original>
+      <Expanded>
+        !false
+      </Expanded>
+    </Expression>
+    <Section name="JSON reporter lists tests" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+        Tested reporter: JSON
+      </Info>
+      <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+        <Original>
+          listingString, ContainsSubstring( "fake test name"s ) &amp;&amp; ContainsSubstring( "fakeTestTag"s )
+        </Original>
+        <Expanded>
+          "{
+  "version": 1,
+  "metadata": {
+    "name": "",
+    "rng-seed": 1234,
+    "catch2-version": "<version>"
+  },
+  "listings": {
+    "tests": [
+      {
+        "name": "fake test name",
+        "class-name": "",
+        "tags": [
+          "fakeTestTag"
+        ],
+        "source-location": {
+          "filename": "fake-file.cpp",
+          "line": 123456789
+        }
+      }
+    ]" ( contains: "fake test name" and contains: "fakeTestTag" )
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Expression success="true" type="REQUIRE_FALSE" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+      <Original>
+        !(factories.empty())
+      </Original>
+      <Expanded>
+        !false
+      </Expanded>
+    </Expression>
     <Section name="JUnit reporter lists tags" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
       <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: JUnit
@@ -14547,6 +14993,50 @@ Message from section two
     <Failure filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" />
     <OverallResult success="true" skips="0"/>
   </TestCase>
+  <TestCase name="Testing checked-if 4" tags="[!shouldfail][checked-if]" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+    <Expression success="true" type="CHECKED_ELSE" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+      <Original>
+        true
+      </Original>
+      <Expanded>
+        true
+      </Expanded>
+    </Expression>
+    <Expression success="false" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+      <Original>
+        {Unknown expression after the reported line}
+      </Original>
+      <Expanded>
+        {Unknown expression after the reported line}
+      </Expanded>
+      <Exception filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+        Uncaught exception should fail!
+      </Exception>
+    </Expression>
+    <OverallResult success="true" skips="0"/>
+  </TestCase>
+  <TestCase name="Testing checked-if 5" tags="[!shouldfail][checked-if]" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+    <Expression success="false" type="CHECKED_ELSE" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+      <Original>
+        false
+      </Original>
+      <Expanded>
+        false
+      </Expanded>
+    </Expression>
+    <Expression success="false" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+      <Original>
+        {Unknown expression after the reported line}
+      </Original>
+      <Expanded>
+        {Unknown expression after the reported line}
+      </Expanded>
+      <Exception filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+        Uncaught exception should fail!
+      </Exception>
+    </Expression>
+    <OverallResult success="true" skips="0"/>
+  </TestCase>
   <TestCase name="The NO_FAIL macro reports a failure but does not fail the test" tags="[messages]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
     <Expression success="false" type="CHECK_NOFAIL" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       <Original>
@@ -20643,6 +21133,25 @@ b1!
     </Expression>
     <OverallResult success="true" skips="0"/>
   </TestCase>
+  <TestCase name="uniform_integer_distribution can return the bounds" tags="[distribution][rng]" filename="tests/<exe-name>/IntrospectiveTests/RandomNumberGeneration.tests.cpp" >
+    <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/RandomNumberGeneration.tests.cpp" >
+      <Original>
+        dist.a() == -10
+      </Original>
+      <Expanded>
+        -10 == -10
+      </Expanded>
+    </Expression>
+    <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/RandomNumberGeneration.tests.cpp" >
+      <Original>
+        dist.b() == 10
+      </Original>
+      <Expanded>
+        10 == 10
+      </Expanded>
+    </Expression>
+    <OverallResult success="true" skips="0"/>
+  </TestCase>
   <TestCase name="unique_ptr reimplementation: basic functionality" tags="[internals][unique-ptr]" filename="tests/<exe-name>/IntrospectiveTests/UniquePtr.tests.cpp" >
     <Section name="Default constructed unique_ptr is empty" filename="tests/<exe-name>/IntrospectiveTests/UniquePtr.tests.cpp" >
       <Expression success="true" type="REQUIRE_FALSE" filename="tests/<exe-name>/IntrospectiveTests/UniquePtr.tests.cpp" >
@@ -21197,6 +21706,6 @@ b1!
     </Section>
     <OverallResult success="true" skips="0"/>
   </TestCase>
-  <OverallResults successes="2048" failures="145" expectedFailures="32" skips="12"/>
-  <OverallResultsCases successes="308" failures="84" expectedFailures="11" skips="6"/>
+  <OverallResults successes="2079" failures="146" expectedFailures="35" skips="12"/>
+  <OverallResultsCases successes="312" failures="85" expectedFailures="14" skips="6"/>
 </Catch2TestRun>
diff --git a/packages/Catch2/tests/SelfTest/IntrospectiveTests/AssertionHandler.tests.cpp b/packages/Catch2/tests/SelfTest/IntrospectiveTests/AssertionHandler.tests.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ab096074503b067181035fcaba8a0094c430702c
--- /dev/null
+++ b/packages/Catch2/tests/SelfTest/IntrospectiveTests/AssertionHandler.tests.cpp
@@ -0,0 +1,17 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
+#include <catch2/catch_test_macros.hpp>
+
+TEST_CASE( "Incomplete AssertionHandler", "[assertion-handler][!shouldfail]" ) {
+    Catch::AssertionHandler catchAssertionHandler(
+        "REQUIRE"_catch_sr,
+        CATCH_INTERNAL_LINEINFO,
+        "Dummy",
+        Catch::ResultDisposition::Normal );
+}
diff --git a/packages/Catch2/tests/SelfTest/IntrospectiveTests/FloatingPoint.tests.cpp b/packages/Catch2/tests/SelfTest/IntrospectiveTests/FloatingPoint.tests.cpp
index 08a579c9dc5c0aa2dcfbbb602bc56c34dfab164d..d2181702d1a857871de162b436d09391d3545ac9 100644
--- a/packages/Catch2/tests/SelfTest/IntrospectiveTests/FloatingPoint.tests.cpp
+++ b/packages/Catch2/tests/SelfTest/IntrospectiveTests/FloatingPoint.tests.cpp
@@ -9,7 +9,9 @@
 #include <catch2/catch_test_macros.hpp>
 #include <catch2/catch_template_test_macros.hpp>
 #include <catch2/internal/catch_floating_point_helpers.hpp>
+#include <catch2/internal/catch_random_floating_point_helpers.hpp>
 
+#include <limits>
 
 TEST_CASE("convertToBits", "[floating-point][conversion]") {
     using Catch::Detail::convertToBits;
@@ -72,3 +74,66 @@ TEST_CASE("UlpDistance", "[floating-point][ulp][approvals]") {
     CHECK( ulpDistance( 1.f, 2.f ) == 0x80'00'00 );
     CHECK( ulpDistance( -2.f, 2.f ) == 0x80'00'00'00 );
 }
+
+
+
+TEMPLATE_TEST_CASE("gamma", "[approvals][floating-point][ulp][gamma]", float, double) {
+    using Catch::Detail::gamma;
+    using Catch::Detail::directCompare;
+
+    // We need to butcher the equal tests with the directCompare helper,
+    // because the Wfloat-equal triggers in decomposer rather than here,
+    // so we cannot locally disable it. Goddamn GCC.
+    CHECK( directCompare( gamma( TestType( -1. ), TestType( 1. ) ),
+                          gamma( TestType( 0.2332 ), TestType( 1.0 ) ) ) );
+    CHECK( directCompare( gamma( TestType( -2. ), TestType( 0 ) ),
+                          gamma( TestType( 1. ), TestType( 1.5 ) ) ) );
+    CHECK( gamma( TestType( 0. ), TestType( 1.0 ) ) <
+           gamma( TestType( 1.0 ), TestType( 1.5 ) ) );
+    CHECK( gamma( TestType( 0 ), TestType( 1. ) ) <
+           std::numeric_limits<TestType>::epsilon() );
+    CHECK( gamma( TestType( -1. ), TestType( -0. ) ) <
+           std::numeric_limits<TestType>::epsilon() );
+    CHECK( directCompare( gamma( TestType( 1. ), TestType( 2. ) ),
+                          std::numeric_limits<TestType>::epsilon() ) );
+    CHECK( directCompare( gamma( TestType( -2. ), TestType( -1. ) ),
+                          std::numeric_limits<TestType>::epsilon() ) );
+}
+
+TEMPLATE_TEST_CASE("count_equidistant_floats",
+                   "[approvals][floating-point][distance]",
+                   float,
+                   double) {
+    using Catch::Detail::count_equidistant_floats;
+    auto count_steps = []( TestType a, TestType b ) {
+        return count_equidistant_floats( a, b, Catch::Detail::gamma( a, b ) );
+    };
+
+    CHECK( count_steps( TestType( -1. ), TestType( 1. ) ) ==
+           2 * count_steps( TestType( 0. ), TestType( 1. ) ) );
+}
+
+TEST_CASE( "count_equidistant_floats",
+           "[approvals][floating-point][distance]" ) {
+    using Catch::Detail::count_equidistant_floats;
+    auto count_floats_with_scaled_ulp = []( auto a, auto b ) {
+        return count_equidistant_floats( a, b, Catch::Detail::gamma( a, b ) );
+    };
+
+    CHECK( count_floats_with_scaled_ulp( 1., 1.5 ) == 1ull << 51 );
+    CHECK( count_floats_with_scaled_ulp( 1.25, 1.5 ) == 1ull << 50 );
+    CHECK( count_floats_with_scaled_ulp( 1.f, 1.5f ) == 1 << 22 );
+    CHECK( count_floats_with_scaled_ulp( -std::numeric_limits<float>::max(),
+                                         std::numeric_limits<float>::max() ) ==
+           33554430 ); // (1 << 25) - 2 due to not including infinities
+    CHECK( count_floats_with_scaled_ulp( -std::numeric_limits<double>::max(),
+                                         std::numeric_limits<double>::max() ) ==
+           18014398509481982 ); // (1 << 54) - 2 due to not including infinities
+
+    STATIC_REQUIRE( std::is_same<std::uint64_t,
+                                 decltype( count_floats_with_scaled_ulp(
+                                     0., 1. ) )>::value );
+    STATIC_REQUIRE( std::is_same<std::uint32_t,
+                                 decltype( count_floats_with_scaled_ulp(
+                                     0.f, 1.f ) )>::value );
+}
diff --git a/packages/Catch2/tests/SelfTest/IntrospectiveTests/GeneratorsImpl.tests.cpp b/packages/Catch2/tests/SelfTest/IntrospectiveTests/GeneratorsImpl.tests.cpp
index f7b7c57cc354f5bd10ba48c8e08c3bf94cdaf6b7..acfeebed0722b323cdd2dc52a38ec64d82c6768c 100644
--- a/packages/Catch2/tests/SelfTest/IntrospectiveTests/GeneratorsImpl.tests.cpp
+++ b/packages/Catch2/tests/SelfTest/IntrospectiveTests/GeneratorsImpl.tests.cpp
@@ -10,6 +10,8 @@
 #    pragma GCC diagnostic ignored "-Wfloat-equal"
 #endif
 
+#include <helpers/range_test_helpers.hpp>
+
 #include <catch2/catch_approx.hpp>
 #include <catch2/catch_test_macros.hpp>
 #include <catch2/generators/catch_generator_exception.hpp>
@@ -545,3 +547,30 @@ TEST_CASE("Filter generator throws exception for empty generator",
         filter( []( int ) { return false; }, value( 3 ) ),
         Catch::GeneratorException );
 }
+
+TEST_CASE("from_range(container) supports ADL begin/end and arrays", "[generators][from-range][approvals]") {
+    using namespace Catch::Generators;
+
+    SECTION("C array") {
+        int arr[3]{ 5, 6, 7 };
+        auto gen = from_range( arr );
+        REQUIRE( gen.get() == 5 );
+        REQUIRE( gen.next() );
+        REQUIRE( gen.get() == 6 );
+        REQUIRE( gen.next() );
+        REQUIRE( gen.get() == 7 );
+        REQUIRE_FALSE( gen.next() );
+    }
+
+    SECTION( "ADL range" ) {
+        unrelated::needs_ADL_begin<int> range{ 1, 2, 3 };
+        auto gen = from_range( range );
+        REQUIRE( gen.get() == 1 );
+        REQUIRE( gen.next() );
+        REQUIRE( gen.get() == 2 );
+        REQUIRE( gen.next() );
+        REQUIRE( gen.get() == 3 );
+        REQUIRE_FALSE( gen.next() );
+    }
+
+}
diff --git a/packages/Catch2/tests/SelfTest/IntrospectiveTests/Integer.tests.cpp b/packages/Catch2/tests/SelfTest/IntrospectiveTests/Integer.tests.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fd620ebbf17310473ccfb4e7a708c6374291d6f2
--- /dev/null
+++ b/packages/Catch2/tests/SelfTest/IntrospectiveTests/Integer.tests.cpp
@@ -0,0 +1,150 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/internal/catch_random_integer_helpers.hpp>
+
+namespace {
+    template <typename Int>
+    static void
+    CommutativeMultCheck( Int a, Int b, Int upper_result, Int lower_result ) {
+        using Catch::Detail::extendedMult;
+        using Catch::Detail::ExtendedMultResult;
+        CHECK( extendedMult( a, b ) ==
+               ExtendedMultResult<Int>{ upper_result, lower_result } );
+        CHECK( extendedMult( b, a ) ==
+               ExtendedMultResult<Int>{ upper_result, lower_result } );
+    }
+} // namespace
+
+TEST_CASE( "extendedMult 64x64", "[Integer][approvals]" ) {
+    // a x 0 == 0
+    CommutativeMultCheck<uint64_t>( 0x1234'5678'9ABC'DEFF, 0, 0, 0 );
+
+    // bit carried from low half to upper half
+    CommutativeMultCheck<uint64_t>( uint64_t( 1 ) << 63, 2, 1, 0 );
+
+    // bits in upper half on one side, bits in lower half on other side
+    CommutativeMultCheck<uint64_t>( 0xcdcd'dcdc'0000'0000,
+                                    0x0000'0000'aeae'aeae,
+                                    0x0000'0000'8c6e'5a77,
+                                    0x7391'a588'0000'0000 );
+
+    // Some input numbers without interesting patterns
+    CommutativeMultCheck<uint64_t>( 0xaaaa'aaaa'aaaa'aaaa,
+                                    0xbbbb'bbbb'bbbb'bbbb,
+                                    0x7d27'd27d'27d2'7d26,
+                                    0xd82d'82d8'2d82'd82e );
+
+    CommutativeMultCheck<uint64_t>( 0x7d27'd27d'27d2'7d26,
+                                    0xd82d'82d8'2d82'd82e,
+                                    0x69af'd991'8256'b953,
+                                    0x8724'8909'fcb6'8cd4 );
+
+    CommutativeMultCheck<uint64_t>( 0xdead'beef'dead'beef,
+                                    0xfeed'feed'feed'feef,
+                                    0xddbf'680b'2b0c'b558,
+                                    0x7a36'b06f'2ce9'6321 );
+
+    CommutativeMultCheck<uint64_t>( 0xddbf'680b'2b0c'b558,
+                                    0x7a36'b06f'2ce9'6321,
+                                    0x69dc'96c9'294b'fc7f,
+                                    0xd038'39fa'a3dc'6858 );
+
+    CommutativeMultCheck<uint64_t>( 0x61c8'8646'80b5'83eb,
+                                    0x61c8'8646'80b5'83eb,
+                                    0x2559'92d3'8220'8bbe,
+                                    0xdf44'2d22'ce48'59b9 );
+}
+
+TEST_CASE( "SizedUnsignedType helpers", "[integer][approvals]" ) {
+    using Catch::Detail::SizedUnsignedType_t;
+    using Catch::Detail::DoubleWidthUnsignedType_t;
+
+    STATIC_REQUIRE( sizeof( SizedUnsignedType_t<1> ) == 1 );
+    STATIC_REQUIRE( sizeof( SizedUnsignedType_t<2> ) == 2 );
+    STATIC_REQUIRE( sizeof( SizedUnsignedType_t<4> ) == 4 );
+    STATIC_REQUIRE( sizeof( SizedUnsignedType_t<8> ) == 8 );
+
+    STATIC_REQUIRE( sizeof( DoubleWidthUnsignedType_t<std::uint8_t> ) == 2 );
+    STATIC_REQUIRE( std::is_unsigned<DoubleWidthUnsignedType_t<std::uint8_t>>::value );
+    STATIC_REQUIRE( sizeof( DoubleWidthUnsignedType_t<std::uint16_t> ) == 4 );
+    STATIC_REQUIRE( std::is_unsigned<DoubleWidthUnsignedType_t<std::uint16_t>>::value );
+    STATIC_REQUIRE( sizeof( DoubleWidthUnsignedType_t<std::uint32_t> ) == 8 );
+    STATIC_REQUIRE( std::is_unsigned<DoubleWidthUnsignedType_t<std::uint32_t>>::value );
+}
+
+TEST_CASE( "extendedMult 32x32", "[integer][approvals]" ) {
+    // a x 0 == 0
+    CommutativeMultCheck<uint32_t>( 0x1234'5678, 0, 0, 0 );
+
+    // bit carried from low half to upper half
+    CommutativeMultCheck<uint32_t>( uint32_t(1) << 31, 2, 1, 0 );
+
+    // bits in upper half on one side, bits in lower half on other side
+    CommutativeMultCheck<uint32_t>( 0xdcdc'0000, 0x0000'aabb, 0x0000'934b, 0x6cb4'0000 );
+
+    // Some input numbers without interesting patterns
+    CommutativeMultCheck<uint32_t>(
+        0xaaaa'aaaa, 0xbbbb'bbbb, 0x7d27'd27c, 0x2d82'd82e );
+
+    CommutativeMultCheck<uint32_t>(
+        0x7d27'd27c, 0x2d82'd82e, 0x163f'f7e8, 0xc5b8'7248 );
+
+    CommutativeMultCheck<uint32_t>(
+        0xdead'beef, 0xfeed'feed, 0xddbf'6809, 0x6f8d'e543 );
+
+    CommutativeMultCheck<uint32_t>(
+        0xddbf'6809, 0x6f8d'e543, 0x60a0'e71e, 0x751d'475b );
+}
+
+TEST_CASE( "extendedMult 8x8", "[integer][approvals]" ) {
+    // a x 0 == 0
+    CommutativeMultCheck<uint8_t>( 0xcd, 0, 0, 0 );
+
+    // bit carried from low half to upper half
+    CommutativeMultCheck<uint8_t>( uint8_t( 1 ) << 7, 2, 1, 0 );
+
+    // bits in upper half on one side, bits in lower half on other side
+    CommutativeMultCheck<uint8_t>( 0x80, 0x03, 0x01, 0x80 );
+
+    // Some input numbers without interesting patterns
+    CommutativeMultCheck<uint8_t>( 0xaa, 0xbb, 0x7c, 0x2e );
+    CommutativeMultCheck<uint8_t>( 0x7c, 0x2e, 0x16, 0x48 );
+    CommutativeMultCheck<uint8_t>( 0xdc, 0xcd, 0xb0, 0x2c );
+    CommutativeMultCheck<uint8_t>( 0xb0, 0x2c, 0x1e, 0x40 );
+}
+
+
+TEST_CASE( "negative and positive signed integers keep their order after transposeToNaturalOrder",
+                    "[integer][approvals]") {
+    using Catch::Detail::transposeToNaturalOrder;
+    int32_t negative( -1 );
+    int32_t positive( 1 );
+    uint32_t adjusted_negative =
+        transposeToNaturalOrder<int32_t>( static_cast<uint32_t>( negative ) );
+    uint32_t adjusted_positive =
+        transposeToNaturalOrder<int32_t>( static_cast<uint32_t>( positive ) );
+    REQUIRE( adjusted_negative < adjusted_positive );
+    REQUIRE( adjusted_positive - adjusted_negative == 2 );
+
+    // Conversion has to be reversible
+    REQUIRE( negative == static_cast<int32_t>( transposeToNaturalOrder<int32_t>(
+                             adjusted_negative ) ) );
+    REQUIRE( positive == static_cast<int32_t>( transposeToNaturalOrder<int32_t>(
+                             adjusted_positive ) ) );
+}
+
+TEST_CASE( "unsigned integers are unchanged by transposeToNaturalOrder",
+           "[integer][approvals]") {
+    using Catch::Detail::transposeToNaturalOrder;
+    uint32_t max = std::numeric_limits<uint32_t>::max();
+    uint32_t zero = 0;
+    REQUIRE( max == transposeToNaturalOrder<uint32_t>( max ) );
+    REQUIRE( zero == transposeToNaturalOrder<uint32_t>( zero ) );
+}
diff --git a/packages/Catch2/tests/SelfTest/IntrospectiveTests/InternalBenchmark.tests.cpp b/packages/Catch2/tests/SelfTest/IntrospectiveTests/InternalBenchmark.tests.cpp
index 24bfe68cd6dea8eb2bbdfe18fa677335c98d613e..bc8d715b47da2e52a0688d5271afe87ac42a5b30 100644
--- a/packages/Catch2/tests/SelfTest/IntrospectiveTests/InternalBenchmark.tests.cpp
+++ b/packages/Catch2/tests/SelfTest/IntrospectiveTests/InternalBenchmark.tests.cpp
@@ -156,8 +156,12 @@ TEST_CASE("uniform samples", "[benchmark]") {
     std::vector<double> samples(100);
     std::fill(samples.begin(), samples.end(), 23);
 
-    using it = std::vector<double>::iterator;
-    auto e = Catch::Benchmark::Detail::bootstrap(0.95, samples.begin(), samples.end(), samples, [](it a, it b) {
+    auto e = Catch::Benchmark::Detail::bootstrap(
+        0.95,
+        samples.data(),
+        samples.data() + samples.size(),
+        samples,
+        []( double const* a, double const* b ) {
         auto sum = std::accumulate(a, b, 0.);
         return sum / (b - a);
     });
@@ -198,7 +202,7 @@ TEST_CASE("normal_quantile", "[benchmark]") {
 TEST_CASE("mean", "[benchmark]") {
     std::vector<double> x{ 10., 20., 14., 16., 30., 24. };
 
-    auto m = Catch::Benchmark::Detail::mean(x.begin(), x.end());
+    auto m = Catch::Benchmark::Detail::mean(x.data(), x.data() + x.size());
 
     REQUIRE(m == 19.);
 }
@@ -206,9 +210,9 @@ TEST_CASE("mean", "[benchmark]") {
 TEST_CASE("weighted_average_quantile", "[benchmark]") {
     std::vector<double> x{ 10., 20., 14., 16., 30., 24. };
 
-    auto q1 = Catch::Benchmark::Detail::weighted_average_quantile(1, 4, x.begin(), x.end());
-    auto med = Catch::Benchmark::Detail::weighted_average_quantile(1, 2, x.begin(), x.end());
-    auto q3 = Catch::Benchmark::Detail::weighted_average_quantile(3, 4, x.begin(), x.end());
+    auto q1 = Catch::Benchmark::Detail::weighted_average_quantile(1, 4, x.data(), x.data() + x.size());
+    auto med = Catch::Benchmark::Detail::weighted_average_quantile(1, 2, x.data(), x.data() + x.size());
+    auto q3 = Catch::Benchmark::Detail::weighted_average_quantile(3, 4, x.data(), x.data() + x.size());
 
     REQUIRE(q1 == 14.5);
     REQUIRE(med == 18.);
@@ -227,7 +231,8 @@ TEST_CASE("classify_outliers", "[benchmark]") {
     SECTION("none") {
         std::vector<double> x{ 10., 20., 14., 16., 30., 24. };
 
-        auto o = Catch::Benchmark::Detail::classify_outliers(x.begin(), x.end());
+        auto o = Catch::Benchmark::Detail::classify_outliers(
+            x.data(), x.data() + x.size() );
 
         REQUIRE(o.samples_seen == static_cast<int>(x.size()));
         require_outliers(o, 0, 0, 0, 0);
@@ -235,7 +240,8 @@ TEST_CASE("classify_outliers", "[benchmark]") {
     SECTION("low severe") {
         std::vector<double> x{ -12., 20., 14., 16., 30., 24. };
 
-        auto o = Catch::Benchmark::Detail::classify_outliers(x.begin(), x.end());
+        auto o = Catch::Benchmark::Detail::classify_outliers(
+            x.data(), x.data() + x.size() );
 
         REQUIRE(o.samples_seen == static_cast<int>(x.size()));
         require_outliers(o, 1, 0, 0, 0);
@@ -243,7 +249,8 @@ TEST_CASE("classify_outliers", "[benchmark]") {
     SECTION("low mild") {
         std::vector<double> x{ 1., 20., 14., 16., 30., 24. };
 
-        auto o = Catch::Benchmark::Detail::classify_outliers(x.begin(), x.end());
+        auto o = Catch::Benchmark::Detail::classify_outliers(
+            x.data(), x.data() + x.size() );
 
         REQUIRE(o.samples_seen == static_cast<int>(x.size()));
         require_outliers(o, 0, 1, 0, 0);
@@ -251,7 +258,8 @@ TEST_CASE("classify_outliers", "[benchmark]") {
     SECTION("high mild") {
         std::vector<double> x{ 10., 20., 14., 16., 36., 24. };
 
-        auto o = Catch::Benchmark::Detail::classify_outliers(x.begin(), x.end());
+        auto o = Catch::Benchmark::Detail::classify_outliers(
+            x.data(), x.data() + x.size() );
 
         REQUIRE(o.samples_seen == static_cast<int>(x.size()));
         require_outliers(o, 0, 0, 1, 0);
@@ -259,7 +267,8 @@ TEST_CASE("classify_outliers", "[benchmark]") {
     SECTION("high severe") {
         std::vector<double> x{ 10., 20., 14., 16., 49., 24. };
 
-        auto o = Catch::Benchmark::Detail::classify_outliers(x.begin(), x.end());
+        auto o = Catch::Benchmark::Detail::classify_outliers(
+            x.data(), x.data() + x.size() );
 
         REQUIRE(o.samples_seen == static_cast<int>(x.size()));
         require_outliers(o, 0, 0, 0, 1);
@@ -267,7 +276,8 @@ TEST_CASE("classify_outliers", "[benchmark]") {
     SECTION("mixed") {
         std::vector<double> x{ -20., 20., 14., 16., 39., 24. };
 
-        auto o = Catch::Benchmark::Detail::classify_outliers(x.begin(), x.end());
+        auto o = Catch::Benchmark::Detail::classify_outliers(
+            x.data(), x.data() + x.size() );
 
         REQUIRE(o.samples_seen == static_cast<int>(x.size()));
         require_outliers(o, 1, 0, 1, 0);
@@ -282,15 +292,13 @@ TEST_CASE("analyse", "[approvals][benchmark]") {
     data.benchmarkSamples = 99;
     Catch::Config config{data};
 
-    using Duration = Catch::Benchmark::FloatDuration<Catch::Benchmark::default_clock>;
-
-    Catch::Benchmark::Environment<Duration> env;
-    std::vector<Duration> samples(99);
+    using FDuration = Catch::Benchmark::FDuration;
+    std::vector<FDuration> samples(99);
     for (size_t i = 0; i < samples.size(); ++i) {
-        samples[i] = Duration(23 + (i % 3 - 1));
+        samples[i] = FDuration(23 + (i % 3 - 1));
     }
 
-    auto analysis = Catch::Benchmark::Detail::analyse(config, env, samples.begin(), samples.end());
+    auto analysis = Catch::Benchmark::Detail::analyse(config, samples.data(), samples.data() + samples.size());
     CHECK( analysis.mean.point.count() == 23 );
     CHECK( analysis.mean.lower_bound.count() < 23 );
     CHECK(analysis.mean.lower_bound.count() > 22);
@@ -323,15 +331,13 @@ TEST_CASE("analyse no analysis", "[benchmark]") {
     data.benchmarkSamples = 99;
     Catch::Config config{ data };
 
-    using Duration = Catch::Benchmark::FloatDuration<Catch::Benchmark::default_clock>;
-
-    Catch::Benchmark::Environment<Duration> env;
-    std::vector<Duration> samples(99);
+    using FDuration = Catch::Benchmark::FDuration;
+    std::vector<FDuration> samples(99);
     for (size_t i = 0; i < samples.size(); ++i) {
-        samples[i] = Duration(23 + (i % 3 - 1));
+        samples[i] = FDuration(23 + (i % 3 - 1));
     }
 
-    auto analysis = Catch::Benchmark::Detail::analyse(config, env, samples.begin(), samples.end());
+    auto analysis = Catch::Benchmark::Detail::analyse(config, samples.data(), samples.data() + samples.size());
     CHECK(analysis.mean.point.count() == 23);
     CHECK(analysis.mean.lower_bound.count() == 23);
     CHECK(analysis.mean.upper_bound.count() == 23);
diff --git a/packages/Catch2/tests/SelfTest/IntrospectiveTests/Json.tests.cpp b/packages/Catch2/tests/SelfTest/IntrospectiveTests/Json.tests.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8204e3c4b11f668d3dcf105ef5000227b6453ffd
--- /dev/null
+++ b/packages/Catch2/tests/SelfTest/IntrospectiveTests/Json.tests.cpp
@@ -0,0 +1,152 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/internal/catch_jsonwriter.hpp>
+#include <catch2/matchers/catch_matchers_string.hpp>
+
+#include <sstream>
+
+namespace {
+    struct Custom {};
+    static std::ostream& operator<<( std::ostream& os, Custom const& ) {
+        return os << "custom";
+    }
+} // namespace
+
+TEST_CASE( "JsonWriter", "[JSON][JsonWriter]" ) {
+
+    std::stringstream stream;
+    SECTION( "Newly constructed JsonWriter does nothing" ) {
+        Catch::JsonValueWriter writer{ stream };
+        REQUIRE( stream.str() == "" );
+    }
+
+    SECTION( "Calling writeObject will create an empty pair of braces" ) {
+        { auto writer = Catch::JsonValueWriter{ stream }.writeObject(); }
+        REQUIRE( stream.str() == "{\n}" );
+    }
+
+    SECTION( "Calling writeObject with key will create an object to write the "
+             "value" ) {
+        using Catch::Matchers::ContainsSubstring;
+        {
+            auto writer = Catch::JsonValueWriter{ stream }.writeObject();
+            writer.write( "int" ).write( 1 );
+            writer.write( "double" ).write( 1.5 );
+            writer.write( "true" ).write( true );
+            writer.write( "false" ).write( false );
+            writer.write( "string" ).write( "this is a string" );
+            writer.write( "array" ).writeArray().write( 1 ).write( 2 );
+        }
+        REQUIRE_THAT(
+            stream.str(),
+            ContainsSubstring( "\"int\": 1," ) &&
+                ContainsSubstring( "\"double\": 1.5," ) &&
+                ContainsSubstring( "\"true\": true," ) &&
+                ContainsSubstring( "\"false\": false," ) &&
+                ContainsSubstring( "\"string\": \"this is a string\"," ) &&
+                ContainsSubstring( "\"array\": [\n    1,\n    2\n  ]\n}" ) );
+    }
+
+    SECTION( "nesting objects" ) {
+        using Catch::Matchers::ContainsSubstring;
+        {
+            auto writer = Catch::JsonValueWriter{ stream }.writeObject();
+            writer.write( "empty_object" ).writeObject();
+            writer.write( "fully_object" )
+                .writeObject()
+                .write( "key" )
+                .write( 1 );
+        }
+        REQUIRE_THAT( stream.str(),
+                      ContainsSubstring( "\"empty_object\": {\n  }," ) &&
+                          ContainsSubstring(
+                              "\"fully_object\": {\n    \"key\": 1\n  }" ) );
+    }
+
+    SECTION( "Calling writeArray will create an empty pair of braces" ) {
+        { auto writer = Catch::JsonValueWriter{ stream }.writeArray(); }
+        REQUIRE( stream.str() == "[\n]" );
+    }
+
+    SECTION( "Calling writeArray creates array to write the values to" ) {
+        {
+            auto writer = Catch::JsonValueWriter{ stream }.writeArray();
+            writer.write( 1 );
+            writer.write( 1.5 );
+            writer.write( true );
+            writer.write( false );
+            writer.write( "this is a string" );
+            writer.writeObject().write( "object" ).write( 42 );
+            writer.writeArray().write( "array" ).write( 42.5 );
+        }
+        REQUIRE( stream.str() == "[\n  1,\n  1.5,\n  true,\n  false,\n  \"this is a string\",\n  {\n    \"object\": 42\n  },\n  [\n    \"array\",\n    42.5\n  ]\n]" );
+    }
+
+    SECTION(
+        "Moved from JsonObjectWriter shall not insert superfluous brace" ) {
+        {
+            auto writer = Catch::JsonObjectWriter{ stream };
+            auto another_writer = std::move( writer );
+        }
+        REQUIRE( stream.str() == "{\n}" );
+    }
+    SECTION(
+        "Moved from JsonArrayWriter shall not insert superfluous bracket" ) {
+        {
+            auto writer = Catch::JsonArrayWriter{ stream };
+            auto another_writer = std::move( writer );
+        }
+        REQUIRE( stream.str() == "[\n]" );
+    }
+    SECTION( "Custom class shall be quoted" ) {
+        Catch::JsonValueWriter{ stream }.write( Custom{} );
+        REQUIRE( stream.str() == "\"custom\"" );
+    }
+}
+
+TEST_CASE( "JsonWriter escapes charaters in strings properly", "[JsonWriter]" ) {
+    std::stringstream sstream;
+    SECTION( "Quote in a string is escaped" ) {
+        Catch::JsonValueWriter{ sstream }.write( "\"" );
+        REQUIRE( sstream.str() == "\"\\\"\"" );
+    }
+    SECTION("Backslash in a string is escaped") {
+        Catch::JsonValueWriter{ sstream }.write( "\\" );
+        REQUIRE( sstream.str() == "\"\\\\\"" );
+    }
+    SECTION( "Forward slash in a string is **not** escaped" ) {
+        Catch::JsonValueWriter{ sstream }.write( "/" );
+        REQUIRE( sstream.str() == "\"/\"" );
+    }
+    SECTION( "Backspace in a string is escaped" ) {
+        Catch::JsonValueWriter{ sstream }.write( "\b" );
+        REQUIRE( sstream.str() == "\"\\b\"" );
+    }
+    SECTION( "Formfeed in a string is escaped" ) {
+        Catch::JsonValueWriter{ sstream }.write( "\f" );
+        REQUIRE( sstream.str() == "\"\\f\"" );
+    }
+    SECTION( "linefeed in a string is escaped" ) {
+        Catch::JsonValueWriter{ sstream }.write( "\n" );
+        REQUIRE( sstream.str() == "\"\\n\"" );
+    }
+    SECTION( "carriage return in a string is escaped" ) {
+        Catch::JsonValueWriter{ sstream }.write( "\r" );
+        REQUIRE( sstream.str() == "\"\\r\"" );
+    }
+    SECTION( "tab in a string is escaped" ) {
+        Catch::JsonValueWriter{ sstream }.write( "\t" );
+        REQUIRE( sstream.str() == "\"\\t\"" );
+    }
+    SECTION( "combination of characters is escaped" ) {
+        Catch::JsonValueWriter{ sstream }.write( "\\/\t\r\n" );
+        REQUIRE( sstream.str() == "\"\\\\/\\t\\r\\n\"" );
+    }
+}
diff --git a/packages/Catch2/tests/SelfTest/IntrospectiveTests/RandomNumberGeneration.tests.cpp b/packages/Catch2/tests/SelfTest/IntrospectiveTests/RandomNumberGeneration.tests.cpp
index 8018b7eb0d8660cf517ffb82f2dc81c3ae7ea275..03be6c9cad8d76ecbad9090de3021694b211fdba 100644
--- a/packages/Catch2/tests/SelfTest/IntrospectiveTests/RandomNumberGeneration.tests.cpp
+++ b/packages/Catch2/tests/SelfTest/IntrospectiveTests/RandomNumberGeneration.tests.cpp
@@ -7,9 +7,17 @@
 // SPDX-License-Identifier: BSL-1.0
 
 #include <catch2/catch_test_macros.hpp>
+#include <catch2/catch_template_test_macros.hpp>
+#include <catch2/internal/catch_floating_point_helpers.hpp>
+#include <catch2/internal/catch_random_integer_helpers.hpp>
 #include <catch2/internal/catch_random_number_generator.hpp>
 #include <catch2/internal/catch_random_seed_generation.hpp>
+#include <catch2/internal/catch_uniform_floating_point_distribution.hpp>
+#include <catch2/internal/catch_uniform_integer_distribution.hpp>
 #include <catch2/generators/catch_generators.hpp>
+#include <catch2/matchers/catch_matchers_range_equals.hpp>
+
+#include <random>
 
 TEST_CASE("Our PCG implementation provides expected results for known seeds", "[rng]") {
     Catch::SimplePcg32 rng;
@@ -60,3 +68,523 @@ TEST_CASE("Random seed generation accepts known methods", "[rng][seed]") {
 
     REQUIRE_NOTHROW(Catch::generateRandomSeed(method));
 }
+
+TEMPLATE_TEST_CASE("uniform_floating_point_distribution never returns infs from finite range",
+          "[rng][distribution][floating-point][approvals]", float, double) {
+    std::random_device rd{};
+    Catch::SimplePcg32 pcg( rd() );
+    Catch::uniform_floating_point_distribution<TestType> dist(
+        -std::numeric_limits<TestType>::max(),
+        std::numeric_limits<TestType>::max() );
+
+    for (size_t i = 0; i < 10'000; ++i) {
+        auto ret = dist( pcg );
+        REQUIRE_FALSE( std::isinf( ret ) );
+        REQUIRE_FALSE( std::isnan( ret ) );
+    }
+}
+
+TEST_CASE( "fillBitsFrom - shortening and stretching", "[rng][approvals]" ) {
+    using Catch::Detail::fillBitsFrom;
+
+    // The seed is not important, but the numbers below have to be repeatable.
+    // They should also exhibit the same general pattern of being prefixes
+    Catch::SimplePcg32 pcg( 0xaabb'ccdd );
+
+    SECTION( "Shorten to 8 bits" ) {
+        // We cast the result to avoid dealing with char-like type in uint8_t
+        auto shortened = static_cast<uint32_t>( fillBitsFrom<uint8_t>( pcg ) );
+        REQUIRE( shortened == 0xcc );
+    }
+    SECTION( "Shorten to 16 bits" ) {
+        auto shortened = fillBitsFrom<uint16_t>( pcg );
+        REQUIRE( shortened == 0xccbe );
+    }
+    SECTION( "Keep at 32 bits" ) {
+        auto n = fillBitsFrom<uint32_t>( pcg );
+        REQUIRE( n == 0xccbe'5f04 );
+    }
+    SECTION( "Stretch to 64 bits" ) {
+        auto stretched = fillBitsFrom<uint64_t>( pcg );
+        REQUIRE( stretched == 0xccbe'5f04'a424'a486 );
+    }
+}
+
+TEST_CASE("uniform_integer_distribution can return the bounds", "[rng][distribution]") {
+    Catch::uniform_integer_distribution<int32_t> dist( -10, 10 );
+    REQUIRE( dist.a() == -10 );
+    REQUIRE( dist.b() == 10 );
+}
+
+namespace {
+    template <typename T>
+    static void CheckReturnValue(Catch::uniform_integer_distribution<T>& dist,
+                                 Catch::SimplePcg32& rng,
+                                 T target) {
+        REQUIRE( dist.a() == dist.b() );
+        for (int i = 0; i < 1'000; ++i) {
+            REQUIRE( dist( rng ) == target );
+        }
+    }
+}
+
+TEMPLATE_TEST_CASE( "uniform_integer_distribution can handle unit ranges",
+                    "[rng][distribution][approvals]",
+                    unsigned char,
+                    signed char,
+                    char,
+                    uint8_t,
+                    int8_t,
+                    uint16_t,
+                    int16_t,
+                    uint32_t,
+                    int32_t,
+                    uint64_t,
+                    int64_t ) {
+    // We want random seed to sample different parts of the rng state,
+    // the output is predetermined anyway
+    std::random_device rd;
+    auto seed = rd();
+    CAPTURE( seed );
+    Catch::SimplePcg32 pcg( seed );
+
+    // We check unitary ranges of 3 different values, min for type, max for type,
+    // some value inbetween just to make sure
+    SECTION("lowest value") {
+        constexpr auto lowest = std::numeric_limits<TestType>::min();
+        Catch::uniform_integer_distribution<TestType> dist( lowest, lowest );
+        CheckReturnValue( dist, pcg, lowest );
+    }
+    SECTION( "highest value" ) {
+        constexpr auto highest = std::numeric_limits<TestType>::max();
+        Catch::uniform_integer_distribution<TestType> dist( highest, highest );
+        CheckReturnValue( dist, pcg, highest );
+    }
+    SECTION( "some value" ) {
+        constexpr auto some = TestType( 42 );
+        Catch::uniform_integer_distribution<TestType> dist( some, some );
+        CheckReturnValue( dist, pcg, some );
+    }
+}
+
+// Bool needs its own test because it doesn't have a valid "third" value
+TEST_CASE( "uniform_integer_distribution can handle boolean unit ranges",
+           "[rng][distribution][approvals]" ) {
+    // We want random seed to sample different parts of the rng state,
+    // the output is predetermined anyway
+    std::random_device rd;
+    auto seed = rd();
+    CAPTURE( seed );
+    Catch::SimplePcg32 pcg( seed );
+
+    // We check unitary ranges of 3 different values, min for type, max for
+    // type, some value inbetween just to make sure
+    SECTION( "lowest value" ) {
+        Catch::uniform_integer_distribution<bool> dist( false, false );
+        CheckReturnValue( dist, pcg, false );
+    }
+    SECTION( "highest value" ) {
+        Catch::uniform_integer_distribution<bool> dist( true, true );
+        CheckReturnValue( dist, pcg, true );
+    }
+}
+
+TEMPLATE_TEST_CASE( "uniform_integer_distribution can handle full width ranges",
+                    "[rng][distribution][approvals]",
+                    unsigned char,
+                    signed char,
+                    char,
+                    uint8_t,
+                    int8_t,
+                    uint16_t,
+                    int16_t,
+                    uint32_t,
+                    int32_t,
+                    uint64_t,
+                    int64_t ) {
+    // We want random seed to sample different parts of the rng state,
+    // the output is predetermined anyway
+    std::random_device rd;
+    auto seed = rd();
+    CAPTURE( seed );
+    Catch::SimplePcg32 pcg( seed );
+
+    constexpr auto lowest = std::numeric_limits<TestType>::min();
+    constexpr auto highest = std::numeric_limits<TestType>::max();
+    Catch::uniform_integer_distribution<TestType> dist( lowest, highest );
+    STATIC_REQUIRE( std::is_same<TestType, decltype( dist( pcg ) )>::value );
+
+    // We need to do bit operations on the results, so we will have to
+    // cast them to unsigned type.
+    using BitType = std::make_unsigned_t<TestType>;
+    BitType ORs = 0;
+    BitType ANDs = BitType(-1);
+    for (int i = 0; i < 100; ++i) {
+        auto bits = static_cast<BitType>( dist( pcg ) );
+        ORs |= bits;
+        ANDs &= bits;
+    }
+    // Assuming both our RNG and distribution are unbiased, asking for
+    // the full range should essentially give us random bit generator.
+    // Over long run, OR of all the generated values should have all
+    // bits set to 1, while AND should have all bits set to 0.
+    // The chance of this test failing for unbiased pipeline is
+    // 1 / 2**iters, which for 100 iterations is astronomical.
+    REQUIRE( ORs == BitType( -1 ) );
+    REQUIRE( ANDs == 0 );
+}
+
+namespace {
+    template <typename T>
+    struct uniform_integer_test_params;
+
+    template <>
+    struct uniform_integer_test_params<bool> {
+        static constexpr bool lowest = false;
+        static constexpr bool highest = true;
+        //  This seems weird, but it is an artifact of the specific seed
+        static constexpr bool expected[] = { true,
+                                             true,
+                                             true,
+                                             true,
+                                             true,
+                                             true,
+                                             false,
+                                             true,
+                                             true,
+                                             true,
+                                             true,
+                                             true,
+                                             false,
+                                             true,
+                                             true };
+    };
+
+    template <>
+    struct uniform_integer_test_params<char> {
+        static constexpr char lowest = 32;
+        static constexpr char highest = 126;
+        static constexpr char expected[] = { 'k',
+                                             '\\',
+                                             'Z',
+                                             'X',
+                                             '`',
+                                             'Q',
+                                             ';',
+                                             'o',
+                                             ']',
+                                             'T',
+                                             'v',
+                                             'p',
+                                             ':',
+                                             'S',
+                                             't' };
+    };
+
+    template <>
+    struct uniform_integer_test_params<uint8_t> {
+        static constexpr uint8_t lowest = 3;
+        static constexpr uint8_t highest = 123;
+        static constexpr uint8_t expected[] = { 'c',
+                                                'P',
+                                                'M',
+                                                'J',
+                                                'U',
+                                                'A',
+                                                '%',
+                                                'h',
+                                                'Q',
+                                                'F',
+                                                'q',
+                                                'i',
+                                                '$',
+                                                'E',
+                                                'o' };
+    };
+
+    template <>
+    struct uniform_integer_test_params<int8_t> {
+        static constexpr int8_t lowest = -27;
+        static constexpr int8_t highest = 73;
+        static constexpr int8_t expected[] = { '5',
+                                               '%',
+                                               '#',
+                                               ' ',
+                                               '*',
+                                               25,
+                                               2,
+                                               '9',
+                                               '&',
+                                               29,
+                                               'A',
+                                               ':',
+                                               1,
+                                               28,
+                                               '?' };
+    };
+
+    template <>
+    struct uniform_integer_test_params<uint16_t> {
+        static constexpr uint16_t lowest = 123;
+        static constexpr uint16_t highest = 33333;
+        static constexpr uint16_t expected[] = { 26684,
+                                                 21417,
+                                                 20658,
+                                                 19791,
+                                                 22896,
+                                                 17433,
+                                                 9806,
+                                                 27948,
+                                                 21767,
+                                                 18588,
+                                                 30556,
+                                                 28244,
+                                                 9439,
+                                                 18293,
+                                                 29949 };
+    };
+
+    template <>
+    struct uniform_integer_test_params<int16_t> {
+        static constexpr int16_t lowest = -17222;
+        static constexpr int16_t highest = 17222;
+        static constexpr int16_t expected[] = { 10326,
+                                                 4863,
+                                                 4076,
+                                                 3177,
+                                                 6397,
+                                                 731,
+                                                 -7179,
+                                                 11637,
+                                                 5226,
+                                                 1929,
+                                                 14342,
+                                                 11944,
+                                                 -7560,
+                                                 1623,
+                                                 13712 };
+    };
+
+    template <>
+    struct uniform_integer_test_params<uint32_t> {
+        static constexpr uint32_t lowest = 17222;
+        static constexpr uint32_t highest = 234234;
+        static constexpr uint32_t expected[] = { 190784,
+                                                 156367,
+                                                 151409,
+                                                 145743,
+                                                 166032,
+                                                 130337,
+                                                 80501,
+                                                 199046,
+                                                 158654,
+                                                 137883,
+                                                 216091,
+                                                 200981,
+                                                 78099,
+                                                 135954,
+                                                 212120 };
+    };
+
+    template <>
+    struct uniform_integer_test_params<int32_t> {
+        static constexpr int32_t lowest = -237272;
+        static constexpr int32_t highest = 234234;
+        static constexpr int32_t expected[] = { 139829,
+                                                65050,
+                                                54278,
+                                                41969,
+                                                86051,
+                                                8494,
+                                                -99785,
+                                                157781,
+                                                70021,
+                                                24890,
+                                                194815,
+                                                161985,
+                                                -105004,
+                                                20699,
+                                                186186 };
+    };
+
+    template <>
+    struct uniform_integer_test_params<uint64_t> {
+        static constexpr uint64_t lowest = 1234;
+        static constexpr uint64_t highest = 1234567890;
+        static constexpr uint64_t expected[] = { 987382749,
+                                                 763380386,
+                                                 846572137,
+                                                 359990258,
+                                                 804599765,
+                                                 1131353566,
+                                                 346324913,
+                                                 1108760730,
+                                                 1141693933,
+                                                 856999148,
+                                                 879390623,
+                                                 1149485521,
+                                                 900556586,
+                                                 952385958,
+                                                 807916408 };
+    };
+
+    template <>
+    struct uniform_integer_test_params<int64_t> {
+        static constexpr int64_t lowest = -1234567890;
+        static constexpr int64_t highest = 1234567890;
+        static constexpr int64_t expected[] = { 740197113,
+                                                292191940,
+                                                458575608,
+                                                -514589122,
+                                                374630781,
+                                                1028139036,
+                                                -541919840,
+                                                982953318,
+                                                1048819790,
+                                                479429651,
+                                                524212647,
+                                                1064402981,
+                                                566544615,
+                                                670203462,
+                                                381264073 };
+    };
+
+    // We need these definitions for C++14 and earlier, but
+    // GCC will complain about them in newer C++ standards
+#if __cplusplus <= 201402L
+    constexpr bool uniform_integer_test_params<bool>::expected[];
+    constexpr char uniform_integer_test_params<char>::expected[];
+    constexpr uint8_t uniform_integer_test_params<uint8_t>::expected[];
+    constexpr int8_t uniform_integer_test_params<int8_t>::expected[];
+    constexpr uint16_t uniform_integer_test_params<uint16_t>::expected[];
+    constexpr int16_t uniform_integer_test_params<int16_t>::expected[];
+    constexpr uint32_t uniform_integer_test_params<uint32_t>::expected[];
+    constexpr int32_t uniform_integer_test_params<int32_t>::expected[];
+    constexpr uint64_t uniform_integer_test_params<uint64_t>::expected[];
+    constexpr int64_t uniform_integer_test_params<int64_t>::expected[];
+#endif
+
+}
+
+TEMPLATE_TEST_CASE( "uniform_integer_distribution is reproducible",
+                    "[rng][distribution][approvals]",
+                   bool,
+                   char,
+                   uint8_t,
+                   int8_t,
+                   uint16_t,
+                   int16_t,
+                   uint32_t,
+                   int32_t,
+                   uint64_t,
+                   int64_t) {
+    Catch::SimplePcg32 pcg( 0xaabb'ccdd );
+
+    constexpr auto lowest = uniform_integer_test_params<TestType>::lowest;
+    constexpr auto highest = uniform_integer_test_params<TestType>::highest;
+    Catch::uniform_integer_distribution<TestType> dist(lowest, highest);
+
+    constexpr auto iters = 15;
+    std::array<TestType, iters> generated;
+    for (int i = 0; i < iters; ++i) {
+        generated[i] = dist( pcg );
+    }
+
+    REQUIRE_THAT(generated, Catch::Matchers::RangeEquals(uniform_integer_test_params<TestType>::expected));
+}
+
+
+namespace {
+    template <typename T>
+    struct uniform_fp_test_params;
+
+    template<>
+    struct uniform_fp_test_params<float> {
+        // These are exactly representable
+        static constexpr float lowest = -256.125f;
+        static constexpr float highest = 385.125f;
+        // These are just round-trip formatted
+        static constexpr float expected[] = { 92.56961f,
+                                              -23.170044f,
+                                              310.81833f,
+                                              -53.023132f,
+                                              105.03287f,
+                                              198.77591f,
+                                              -172.72931f,
+                                              51.805176f,
+                                              -241.10156f,
+                                              64.66101f,
+                                              212.12509f,
+                                              -49.24292f,
+                                              -177.1399f,
+                                              245.23679f,
+                                              173.22421f };
+    };
+    template <>
+    struct uniform_fp_test_params<double> {
+        // These are exactly representable
+        static constexpr double lowest = -234582.9921875;
+        static constexpr double highest = 261238.015625;
+        // These are just round-trip formatted
+        static constexpr double expected[] = { 35031.207052832615,
+                                               203783.3401838024,
+                                               44667.940405848756,
+                                               -170100.5877224467,
+                                               -222966.7418051684,
+                                               127472.72630072923,
+                                               -173510.88209096913,
+                                               97394.16172239158,
+                                               119123.6921592663,
+                                               22595.741022785165,
+                                               8988.68409120926,
+                                               136906.86520606978,
+                                               33369.19104222473,
+                                               60912.7615841752,
+                                               -149060.05936760217 };
+    };
+
+// We need these definitions for C++14 and earlier, but
+// GCC will complain about them in newer C++ standards
+#if __cplusplus <= 201402L
+    constexpr float uniform_fp_test_params<float>::expected[];
+    constexpr double uniform_fp_test_params<double>::expected[];
+#endif
+} // namespace
+
+TEMPLATE_TEST_CASE( "uniform_floating_point_distribution is reproducible",
+                    "[rng][distribution][floating-point][approvals]",
+                    float,
+                    double ) {
+    Catch::SimplePcg32 pcg( 0xaabb'aabb );
+
+    const auto lowest = uniform_fp_test_params<TestType>::lowest;
+    const auto highest = uniform_fp_test_params<TestType>::highest;
+    Catch::uniform_floating_point_distribution<TestType> dist( lowest, highest );
+
+    constexpr auto iters = 15;
+    std::array<TestType, iters> generated;
+    for ( int i = 0; i < iters; ++i ) {
+        generated[i] = dist( pcg );
+    }
+
+    REQUIRE_THAT( generated, Catch::Matchers::RangeEquals( uniform_fp_test_params<TestType>::expected ) );
+}
+
+TEMPLATE_TEST_CASE( "uniform_floating_point_distribution can handle unitary ranges",
+                    "[rng][distribution][floating-point][approvals]",
+                    float,
+                    double ) {
+    std::random_device rd;
+    auto seed = rd();
+    CAPTURE( seed );
+    Catch::SimplePcg32 pcg( seed );
+
+    const auto highest = uniform_fp_test_params<TestType>::highest;
+    Catch::uniform_floating_point_distribution<TestType> dist( highest,
+                                                               highest );
+
+    constexpr auto iters = 20;
+    for (int i = 0; i < iters; ++i) {
+        REQUIRE( Catch::Detail::directCompare( dist( pcg ), highest ) );
+    }
+}
diff --git a/packages/Catch2/tests/SelfTest/IntrospectiveTests/Reporters.tests.cpp b/packages/Catch2/tests/SelfTest/IntrospectiveTests/Reporters.tests.cpp
index 1568c951692fba61bf3f00171ab689418ed5b547..e5a65bda594425750e5c9b7239c2b7555b0299af 100644
--- a/packages/Catch2/tests/SelfTest/IntrospectiveTests/Reporters.tests.cpp
+++ b/packages/Catch2/tests/SelfTest/IntrospectiveTests/Reporters.tests.cpp
@@ -109,7 +109,9 @@ TEST_CASE( "Reporter's write listings to provided stream", "[reporters]" ) {
         auto sstream = Catch::Detail::make_unique<StringIStream>();
         auto& sstreamRef = *sstream.get();
 
-        Catch::Config config( Catch::ConfigData{} );
+        Catch::ConfigData cfg_data;
+        cfg_data.rngSeed = 1234;
+        Catch::Config config( cfg_data );
         auto reporter = factory.second->create( Catch::ReporterConfig{
             &config, CATCH_MOVE( sstream ), Catch::ColourMode::None, {} } );
 
diff --git a/packages/Catch2/tests/SelfTest/IntrospectiveTests/String.tests.cpp b/packages/Catch2/tests/SelfTest/IntrospectiveTests/String.tests.cpp
index 7a0b3b4ab445ec3b9fd96cc47652e671a5ed3f4e..43c58b49b43064a8196020560b3d4296808323dd 100644
--- a/packages/Catch2/tests/SelfTest/IntrospectiveTests/String.tests.cpp
+++ b/packages/Catch2/tests/SelfTest/IntrospectiveTests/String.tests.cpp
@@ -177,7 +177,7 @@ TEST_CASE("StringRef at compilation time", "[Strings][StringRef][constexpr]") {
         STATIC_REQUIRE_FALSE(sr1.empty());
         STATIC_REQUIRE(sr1.size() == 3);
 
-        using Catch::operator"" _sr;
+        using Catch::operator""_sr;
         constexpr auto sr2 = ""_sr;
         STATIC_REQUIRE(sr2.empty());
         STATIC_REQUIRE(sr2.size() == 0);
diff --git a/packages/Catch2/tests/SelfTest/UsageTests/Generators.tests.cpp b/packages/Catch2/tests/SelfTest/UsageTests/Generators.tests.cpp
index 8e2c387a304e6838e48ac648a93bbeb9a8af3e15..f04cf4f099f6626ab05b35023dcee52e5cc61a52 100644
--- a/packages/Catch2/tests/SelfTest/UsageTests/Generators.tests.cpp
+++ b/packages/Catch2/tests/SelfTest/UsageTests/Generators.tests.cpp
@@ -305,9 +305,19 @@ namespace {
 
 } // namespace
 
-TEST_CASE( "#2615 - Throwing in constructor generator fails test case but does not abort", "[!shouldfail]" ) {
+TEST_CASE( "#2615 - Throwing in constructor generator fails test case but does not abort",
+           "[!shouldfail][regression][generators]" ) {
     // this should fail the test case, but not abort the application
     auto sample = GENERATE( make_test_generator() );
     // this assertion shouldn't trigger
     REQUIRE( sample == 0 );
 }
+
+TEST_CASE( "GENERATE can combine literals and generators", "[generators]" ) {
+    auto i = GENERATE( 2,
+                       4,
+                       take( 2,
+                             filter( []( int val ) { return val % 2 == 0; },
+                                     random( -100, 100 ) ) ) );
+    REQUIRE( i % 2 == 0 );
+}
diff --git a/packages/Catch2/tests/SelfTest/UsageTests/Message.tests.cpp b/packages/Catch2/tests/SelfTest/UsageTests/Message.tests.cpp
index a5e695825797877b42da1badb0e259fd267715ce..6367bf5918612a903a525e07cf693e8192a8d48a 100644
--- a/packages/Catch2/tests/SelfTest/UsageTests/Message.tests.cpp
+++ b/packages/Catch2/tests/SelfTest/UsageTests/Message.tests.cpp
@@ -255,10 +255,24 @@ std::ostream& operator<<(std::ostream& out, helper_1436<T1, T2> const& helper) {
 #pragma clang diagnostic ignored "-Wunused-value"
 #endif
 
+namespace {
+    template <typename T>
+    struct custom_index_op {
+        constexpr custom_index_op( std::initializer_list<T> ) {}
+        constexpr T operator[]( size_t ) { return T{}; }
+#if defined( __cpp_multidimensional_subscript ) && \
+    __cpp_multidimensional_subscript >= 202110L
+        constexpr T operator[]( size_t, size_t, size_t ) const noexcept {
+            return T{};
+        }
+#endif
+    };
+}
+
 TEST_CASE("CAPTURE can deal with complex expressions involving commas", "[messages][capture]") {
-    CAPTURE(std::vector<int>{1, 2, 3}[0, 1, 2],
-            std::vector<int>{1, 2, 3}[(0, 1)],
-            std::vector<int>{1, 2, 3}[0]);
+    CAPTURE(custom_index_op<int>{1, 2, 3}[0, 1, 2],
+            custom_index_op<int>{1, 2, 3}[(0, 1)],
+            custom_index_op<int>{1, 2, 3}[0]);
     CAPTURE((helper_1436<int, int>{12, -12}),
             (helper_1436<int, int>(-12, 12)));
     CAPTURE( (1, 2), (2, 3) );
@@ -285,3 +299,14 @@ TEST_CASE("CAPTURE parses string and character constants", "[messages][capture]"
 #ifdef _MSC_VER
 #pragma warning(pop)
 #endif
+
+TEST_CASE( "INFO and UNSCOPED_INFO can stream multiple arguments",
+           "[messages][info][.failing]" ) {
+    INFO( "This info"
+          << " has multiple"
+          << " parts." );
+    UNSCOPED_INFO( "This unscoped info"
+                   << " has multiple"
+                   << " parts." );
+    FAIL( "Show infos!" );
+}
diff --git a/packages/Catch2/tests/SelfTest/UsageTests/Misc.tests.cpp b/packages/Catch2/tests/SelfTest/UsageTests/Misc.tests.cpp
index 6c1fd68f44ad728f1a861747ade1346274031fea..7f06704b417359160cadc9962ab87583a80202d2 100644
--- a/packages/Catch2/tests/SelfTest/UsageTests/Misc.tests.cpp
+++ b/packages/Catch2/tests/SelfTest/UsageTests/Misc.tests.cpp
@@ -217,6 +217,18 @@ TEST_CASE("Testing checked-if 3", "[checked-if][!shouldfail]") {
     SUCCEED();
 }
 
+[[noreturn]]
+TEST_CASE("Testing checked-if 4", "[checked-if][!shouldfail]") {
+    CHECKED_ELSE(true) {}
+    throw std::runtime_error("Uncaught exception should fail!");
+}
+
+[[noreturn]]
+TEST_CASE("Testing checked-if 5", "[checked-if][!shouldfail]") {
+    CHECKED_ELSE(false) {}
+    throw std::runtime_error("Uncaught exception should fail!");
+}
+
 TEST_CASE( "xmlentitycheck" ) {
     SECTION( "embedded xml: <test>it should be possible to embed xml characters, such as <, \" or &, or even whole <xml>documents</xml> within an attribute</test>" ) {
         SUCCEED(); // We need this here to stop it failing due to no tests
diff --git a/packages/Catch2/tests/SelfTest/UsageTests/ToStringOptional.tests.cpp b/packages/Catch2/tests/SelfTest/UsageTests/ToStringOptional.tests.cpp
index 9fd9d6b45300a41444962568a571df7f670aa1ae..3671771a7612de5786af3c6fabd94445eb21ae88 100644
--- a/packages/Catch2/tests/SelfTest/UsageTests/ToStringOptional.tests.cpp
+++ b/packages/Catch2/tests/SelfTest/UsageTests/ToStringOptional.tests.cpp
@@ -28,4 +28,8 @@ TEST_CASE( "std::vector<std::optional<int> > -> toString", "[toString][optional]
     REQUIRE( "{ 0, { }, 2 }" == ::Catch::Detail::stringify( type{ 0, {}, 2 } ) );
 }
 
+TEST_CASE( "std::nullopt -> toString", "[toString][optional][approvals]" ) {
+    REQUIRE( "{ }" == ::Catch::Detail::stringify( std::nullopt ) );
+}
+
 #endif // CATCH_INTERNAL_CONFIG_CPP17_OPTIONAL
diff --git a/packages/Catch2/tests/TestScripts/DiscoverTests/CMakeLists.txt b/packages/Catch2/tests/TestScripts/DiscoverTests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d19f2f88971839f33a90288fad606e338b5ad229
--- /dev/null
+++ b/packages/Catch2/tests/TestScripts/DiscoverTests/CMakeLists.txt
@@ -0,0 +1,16 @@
+cmake_minimum_required(VERSION 3.10)
+
+project(discover-tests-test
+  LANGUAGES CXX
+)
+
+add_executable(tests
+  register-tests.cpp
+)
+
+add_subdirectory(${CATCH2_PATH} catch2-build)
+target_link_libraries(tests PRIVATE Catch2::Catch2WithMain)
+
+include(CTest)
+include(Catch)
+catch_discover_tests(tests)
diff --git a/packages/Catch2/tests/TestScripts/DiscoverTests/VerifyRegistration.py b/packages/Catch2/tests/TestScripts/DiscoverTests/VerifyRegistration.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ec42f24ca09efd50d4cb2caf39647c6aa0e712b
--- /dev/null
+++ b/packages/Catch2/tests/TestScripts/DiscoverTests/VerifyRegistration.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+
+#              Copyright Catch2 Authors
+# Distributed under the Boost Software License, Version 1.0.
+#   (See accompanying file LICENSE.txt or copy at
+#        https://www.boost.org/LICENSE_1_0.txt)
+
+# SPDX-License-Identifier: BSL-1.0
+
+import os
+import subprocess
+import sys
+
+
+def build_project(sources_dir, output_base_path, catch2_path):
+    build_dir = os.path.join(output_base_path, 'ctest-registration-test')
+    config_cmd = ['cmake',
+                  '-B', build_dir,
+                  '-S', sources_dir,
+                  f'-DCATCH2_PATH={catch2_path}',
+                  '-DCMAKE_BUILD_TYPE=Debug']
+
+    build_cmd = ['cmake',
+                 '--build', build_dir,
+                 '--config', 'Debug']
+
+    try:
+        subprocess.run(config_cmd,
+                       capture_output = True,
+                       check = True,
+                       text = True)
+        subprocess.run(build_cmd,
+                       capture_output = True,
+                       check = True,
+                       text = True)
+    except subprocess.CalledProcessError as err:
+        print('Error when building the test project')
+        print(f'cmd: {err.cmd}')
+        print(f'stderr: {err.stderr}')
+        print(f'stdout: {err.stdout}')
+        exit(3)
+
+    return build_dir
+
+
+
+def get_test_names(build_path):
+    # For now we assume that Windows builds are done using MSBuild under
+    # Debug configuration. This means that we need to add "Debug" folder
+    # to the path when constructing it. On Linux, we don't add anything.
+    config_path = "Debug" if os.name == 'nt' else ""
+    full_path = os.path.join(build_path, config_path, 'tests')
+
+
+    cmd = [full_path, '--reporter', 'xml', '--list-tests']
+    result = subprocess.run(cmd,
+                            capture_output = True,
+                            check = True,
+                            text = True)
+
+    import xml.etree.ElementTree as ET
+    root = ET.fromstring(result.stdout)
+    return [tc.text for tc in root.findall('TestCase/Name')]
+
+
+def list_ctest_tests(build_path):
+    old_path = os.getcwd()
+    os.chdir(build_path)
+
+    cmd = ['ctest', '-C', 'debug', '--show-only=json-v1']
+    result = subprocess.run(cmd,
+                            capture_output = True,
+                            check = True,
+                            text = True)
+    os.chdir(old_path)
+
+    import json
+
+    ctest_response = json.loads(result.stdout)
+    tests = ctest_response['tests']
+    test_names = []
+    for test in tests:
+        test_command = test['command']
+        # First part of the command is the binary, second is the filter.
+        # If there are less, registration has failed. If there are more,
+        # registration has changed and the script needs updating.
+        assert len(test_command) == 2
+        test_names.append(test_command[1])
+        test_name = test_command[1]
+
+    return test_names
+
+def escape_catch2_test_name(name):
+    for char in ('\\', ',', '[', ']'):
+        name = name.replace(char, f"\\{char}")
+    return name
+
+if __name__ == '__main__':
+    if len(sys.argv) != 3:
+        print(f'Usage: {sys.argv[0]} path-to-catch2-cml output-path')
+        exit(2)
+    catch2_path = sys.argv[1]
+    output_base_path = sys.argv[2]
+    sources_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
+
+    build_path = build_project(sources_dir, output_base_path, catch2_path)
+
+    catch_test_names = [escape_catch2_test_name(name) for name in get_test_names(build_path)]
+    ctest_test_names = list_ctest_tests(build_path)
+
+    mismatched = 0
+    for catch_test in catch_test_names:
+        if catch_test not in ctest_test_names:
+            print(f"Catch2 test '{catch_test}' not found in CTest")
+            mismatched += 1
+    for ctest_test in ctest_test_names:
+        if ctest_test not in catch_test_names:
+            print(f"CTest test '{ctest_test}' not found in Catch2")
+            mismatched += 1
+
+    if mismatched:
+        print(f"Found {mismatched} mismatched tests catch test names and ctest test commands!")
+        exit(1)
diff --git a/packages/Catch2/tests/TestScripts/DiscoverTests/register-tests.cpp b/packages/Catch2/tests/TestScripts/DiscoverTests/register-tests.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..aa603df1acd0ea77db01f16ad0da4a97591e3a4d
--- /dev/null
+++ b/packages/Catch2/tests/TestScripts/DiscoverTests/register-tests.cpp
@@ -0,0 +1,16 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
+#include <catch2/catch_test_macros.hpp>
+
+TEST_CASE("@Script[C:\\EPM1A]=x;\"SCALA_ZERO:\"", "[script regressions]"){}
+TEST_CASE("Some test") {}
+TEST_CASE( "Let's have a test case with a long name. Longer. No, even longer. "
+           "Really looooooooooooong. Even longer than that. Multiple lines "
+           "worth of test name. Yep, like this." ) {}
+TEST_CASE( "And now a test case with weird tags.", "[tl;dr][tl;dw][foo,bar]" ) {}
diff --git a/packages/Catch2/tests/meson.build b/packages/Catch2/tests/meson.build
index f525f0412a2f66a78fc4983ec7b5618add36d974..58302b7aa1d08997411cdc11879df541e66bf65a 100644
--- a/packages/Catch2/tests/meson.build
+++ b/packages/Catch2/tests/meson.build
@@ -17,6 +17,7 @@ self_test_sources = files(
   'SelfTest/IntrospectiveTests/Details.tests.cpp',
   'SelfTest/IntrospectiveTests/FloatingPoint.tests.cpp',
   'SelfTest/IntrospectiveTests/GeneratorsImpl.tests.cpp',
+  'SelfTest/IntrospectiveTests/Integer.tests.cpp',
   'SelfTest/IntrospectiveTests/InternalBenchmark.tests.cpp',
   'SelfTest/IntrospectiveTests/Parse.tests.cpp',
   'SelfTest/IntrospectiveTests/PartTracker.tests.cpp',
diff --git a/packages/Catch2/tools/scripts/checkLicense.py b/packages/Catch2/tools/scripts/checkLicense.py
index 9a9497692e79e328b42bdebbb1527be4c30495ac..7078d3ec2b878ee086e7275b3ab37d084a438cae 100755
--- a/packages/Catch2/tools/scripts/checkLicense.py
+++ b/packages/Catch2/tools/scripts/checkLicense.py
@@ -33,7 +33,8 @@ def check_licences_in_path(path: str) -> int:
 
 def check_licences():
     failed = 0
-    roots = ['src/catch2', 'tests']
+    # Add 'extras' after the amalgamted files are regenerated with the new script (past 3.4.0)
+    roots = ['src/catch2', 'tests', 'examples', 'fuzzing']
     for root in roots:
         failed += check_licences_in_path(root)
     
diff --git a/packages/Catch2/tools/scripts/generateAmalgamatedFiles.py b/packages/Catch2/tools/scripts/generateAmalgamatedFiles.py
index 99fc446bf96e8dccbfb927a183fdac1670477e67..e3e86aab98e5dcd6ae37151d4e2d3c65b5905ed8 100755
--- a/packages/Catch2/tools/scripts/generateAmalgamatedFiles.py
+++ b/packages/Catch2/tools/scripts/generateAmalgamatedFiles.py
@@ -1,4 +1,9 @@
 #!/usr/bin/env python3
+#              Copyright Catch2 Authors
+# Distributed under the Boost Software License, Version 1.0.
+#   (See accompanying file LICENSE.txt or copy at
+#        https://www.boost.org/LICENSE_1_0.txt)
+# SPDX-License-Identifier: BSL-1.0
 
 import os
 import re
@@ -12,6 +17,8 @@ starting_header = os.path.join(root_path, 'catch2', 'catch_all.hpp')
 output_header = os.path.join(catchPath, 'extras', 'catch_amalgamated.hpp')
 output_cpp = os.path.join(catchPath, 'extras', 'catch_amalgamated.cpp')
 
+# REUSE-IgnoreStart
+
 # These are the copyright comments in each file, we want to ignore them
 copyright_lines = [
 '//              Copyright Catch2 Authors\n',
@@ -24,6 +31,7 @@ copyright_lines = [
 # The header of the amalgamated file: copyright information + explanation
 # what this file is.
 file_header = '''\
+
 //              Copyright Catch2 Authors
 // Distributed under the Boost Software License, Version 1.0.
 //   (See accompanying file LICENSE.txt or copy at
@@ -39,6 +47,8 @@ file_header = '''\
 //  ----------------------------------------------------------
 '''
 
+# REUSE-IgnoreEnd
+
 # Returns file header with proper version string and generation time
 def formatted_file_header(version):
     return file_header.format(version_string=version.getVersionString(),
diff --git a/packages/Catch2/tools/scripts/releaseCommon.py b/packages/Catch2/tools/scripts/releaseCommon.py
index 0d995eaf7c9acf4846755339cbe094c93328a052..1ff4af291c955da54a4cc75e0929fb1662da3fb8 100644
--- a/packages/Catch2/tools/scripts/releaseCommon.py
+++ b/packages/Catch2/tools/scripts/releaseCommon.py
@@ -114,8 +114,8 @@ def updateVersionDefine(version):
 def updateVersionPlaceholder(filename, version):
     with open(filename, 'rb') as file:
         lines = file.readlines()
-    placeholderRegex = re.compile(b'in Catch[0-9]? X.Y.Z')
-    replacement = 'in Catch2 {}.{}.{}'.format(version.majorVersion, version.minorVersion, version.patchNumber).encode('ascii')
+    placeholderRegex = re.compile(b'Catch[0-9]? X.Y.Z')
+    replacement = 'Catch2 {}.{}.{}'.format(version.majorVersion, version.minorVersion, version.patchNumber).encode('ascii')
     with open(filename, 'wb') as file:
         for line in lines:
             file.write(placeholderRegex.sub(replacement, line))
diff --git a/packages/HighFive/.gitrepo b/packages/HighFive/.gitrepo
index 7d436d3458ed5f6d35f34ecd46f1c0975d9a2352..98988c87ecab65e9d9e9215ea6684fa53407fd3b 100644
--- a/packages/HighFive/.gitrepo
+++ b/packages/HighFive/.gitrepo
@@ -6,7 +6,7 @@
 [subrepo]
 	remote = git@github.com:BlueBrain/HighFive.git
 	branch = master
-	commit = 1299c556a992bea8aa03c3b75cca50560a5078b6
+	commit = d04789128123e1a643c6284e28087be1dafae2ee
 	parent = ce26d02b1f92356362dc4daede0babf8d4ac23b6
 	method = merge
 	cmdver = 0.4.6
diff --git a/packages/HighFive/CHANGELOG.md b/packages/HighFive/CHANGELOG.md
index 9a8cd86139f80bc1523e6f0e33fc049316a0f830..fcd0247e37b370f52587440147d5bc4aa2cf541d 100644
--- a/packages/HighFive/CHANGELOG.md
+++ b/packages/HighFive/CHANGELOG.md
@@ -1,4 +1,24 @@
 # Changes
+## Version 2.9.0 - 2024-01-25
+### New Features
+    - Add named ctors for scalar and null dataspaces. (#899)
+    - Named ctor for empty property lists. (#904)
+
+### Improvements
+    - Enable running tests in parallel. (#849)
+    - Wrap all used HDF5 function calls and always check status codes. (#863)
+    - Utilities for writing tests in a container independent manner. (#871)
+    - Improve test rigour.
+
+### Bug Fix
+    - Log messages were slightly misformatted. (#854)
+    - Fix bug in `enforce_ascii_hack`. (#856)
+    - Fix `create_datatype<bool>()`. (#869)
+    - Guard functionality introduced in 1.10.0. (#905)
+    - `inspector` guard for empty containers. (#913)
+    - Avoid non-collective behaviour. (#912)
+
+
 ## Version 2.8.0 - 2023-11-02
 ### Important Change
     - `Eigen::Matrix` is (by default) stored with column-major index ordering. Under
diff --git a/packages/HighFive/CMakeLists.txt b/packages/HighFive/CMakeLists.txt
index d592f2d66db1e448fbdf94cb24237070a120ee3a..694960090842b6a8c27676a336fe4a40e1266b39 100644
--- a/packages/HighFive/CMakeLists.txt
+++ b/packages/HighFive/CMakeLists.txt
@@ -5,7 +5,7 @@ else()
   cmake_policy(VERSION 3.13)
 endif()
 
-project(HighFive VERSION 2.8.0)
+project(HighFive VERSION 2.9.0)
 
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/include/highfive/H5Version.hpp.in
                ${CMAKE_CURRENT_SOURCE_DIR}/include/highfive/H5Version.hpp)
diff --git a/packages/HighFive/README.md b/packages/HighFive/README.md
index 3ea0680157cdc72040a39c68c2f4ebe2d1569ca1..bc0d2752ed71792b5bfa95116efc7822d3812d66 100644
--- a/packages/HighFive/README.md
+++ b/packages/HighFive/README.md
@@ -1,6 +1,6 @@
 # HighFive - HDF5 header-only C++ Library
 
-[![Doxygen -> gh-pages](https://github.com/BlueBrain/HighFive/workflows/gh-pages/badge.svg)](https://BlueBrain.github.io/HighFive)
+[![Doxygen -> gh-pages](https://github.com/BlueBrain/HighFive/workflows/gh-pages/badge.svg?branch=master)](https://BlueBrain.github.io/HighFive/actions/workflows/gh-pages.yml?query=branch%3Amaster)
 [![codecov](https://codecov.io/gh/BlueBrain/HighFive/branch/master/graph/badge.svg?token=UBKxHEn7RS)](https://codecov.io/gh/BlueBrain/HighFive)
 [![HighFive_Integration_tests](https://github.com/BlueBrain/HighFive-testing/actions/workflows/integration.yml/badge.svg)](https://github.com/BlueBrain/HighFive-testing/actions/workflows/integration.yml)
 
@@ -82,7 +82,8 @@ std::string filename = "/tmp/new_file.h5";
 }
 ```
 
-**Note:** `H5File.hpp` is the top-level header of HighFive core which should be always included.
+**Note:** As of 2.8.0, one can use `highfive/highfive.hpp` to include
+everything HighFive. Prior to 2.8.0 one would include `highfive/H5File.hpp`.
 
 **Note:** For advanced usecases the dataset can be created without immediately
 writing to it. This is common in MPI-IO related patterns, or when growing a
diff --git a/packages/HighFive/deps/catch2 b/packages/HighFive/deps/catch2
deleted file mode 160000
index 3f0283de7a9c43200033da996ff9093be3ac84dc..0000000000000000000000000000000000000000
--- a/packages/HighFive/deps/catch2
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 3f0283de7a9c43200033da996ff9093be3ac84dc
diff --git a/packages/HighFive/doc/Doxyfile b/packages/HighFive/doc/Doxyfile
index 6ebc393ec29919bda054ca1ebf962c23a1f1ceff..d0cf7efb1bc607ad1adbd6628ead1e5a9653a793 100644
--- a/packages/HighFive/doc/Doxyfile
+++ b/packages/HighFive/doc/Doxyfile
@@ -866,6 +866,7 @@ WARN_LOGFILE           =
 
 INPUT                  = @CMAKE_CURRENT_SOURCE_DIR@/../include \
                          @CMAKE_CURRENT_SOURCE_DIR@/installation.md \
+                         @CMAKE_CURRENT_SOURCE_DIR@/migration_guide.md \
                          @CMAKE_CURRENT_SOURCE_DIR@/developer_guide.md \
                          @CMAKE_CURRENT_SOURCE_DIR@/../CHANGELOG.md \
                          @CMAKE_CURRENT_SOURCE_DIR@/../README.md
diff --git a/packages/HighFive/doc/migration_guide.md b/packages/HighFive/doc/migration_guide.md
new file mode 100644
index 0000000000000000000000000000000000000000..e85002b150c5c002a9d896736946c0260429cc58
--- /dev/null
+++ b/packages/HighFive/doc/migration_guide.md
@@ -0,0 +1,16 @@
+# Migration Guide
+A collection of tips for migrating away from deprecated features.
+
+## Deprecation of `FixedLenStringArray`.
+The issue with `FixedLenStringArray` is that it is unable to avoid copies.
+Essentially, this class acts as a means to create a copy of the data in a
+format suitable for writing fixed-length strings. Additionally, the class acts
+as a tag for HighFive to overload on. The support of `std::string` in HighFive
+has improved considerable. Since 2.8.0 we can write/read `std::string` to fixed
+or variable length HDF5 strings.
+
+Therefore, this class serves no purpose anymore. Any occurrence of it can be
+replaced with an `std::vector<std::string>` (for example).
+
+If desired one can silence warnings by replacing `FixedLenStringArray` with
+`deprecated::FixedLenStringArray`.
diff --git a/packages/HighFive/include/highfive/H5DataType.hpp b/packages/HighFive/include/highfive/H5DataType.hpp
index 0d596965fea83fe2a97820be62249254c140af5d..b15f62165afcf5adc4254d79dc41acbb9889404c 100644
--- a/packages/HighFive/include/highfive/H5DataType.hpp
+++ b/packages/HighFive/include/highfive/H5DataType.hpp
@@ -342,6 +342,7 @@ template <typename T>
 DataType create_and_check_datatype();
 
 
+namespace deprecated {
 ///
 /// \brief A structure representing a set of fixed-length strings
 ///
@@ -460,6 +461,11 @@ class FixedLenStringArray {
   private:
     vector_t datavec;
 };
+}  // namespace deprecated
+
+template <size_t N>
+using FixedLenStringArray H5_DEPRECATED_USING("Use 'std::vector<std::string>'.") =
+    deprecated::FixedLenStringArray<N>;
 
 }  // namespace HighFive
 
diff --git a/packages/HighFive/include/highfive/H5Version.hpp b/packages/HighFive/include/highfive/H5Version.hpp
index dc238432cb01d651769c663276ea15a94008f3e9..bca2c3a83066e9e666840473c355c5ade91f1b87 100644
--- a/packages/HighFive/include/highfive/H5Version.hpp
+++ b/packages/HighFive/include/highfive/H5Version.hpp
@@ -9,7 +9,7 @@
 #pragma once
 
 #define HIGHFIVE_VERSION_MAJOR 2
-#define HIGHFIVE_VERSION_MINOR 8
+#define HIGHFIVE_VERSION_MINOR 9
 #define HIGHFIVE_VERSION_PATCH 0
 
 /** \brief Concatenated representation of the HighFive version.
@@ -24,10 +24,10 @@
  *  std::cout << STRINGIFY_VALUE(HIGHFIVE_VERSION) << "\n";
  *  \endcode
  */
-#define HIGHFIVE_VERSION 2.8.0
+#define HIGHFIVE_VERSION 2.9.0
 
 /** \brief String representation of the HighFive version.
  *
  *  \warning This macro only exists from 2.7.1 onwards.
  */
-#define HIGHFIVE_VERSION_STRING "2.8.0"
+#define HIGHFIVE_VERSION_STRING "2.9.0"
diff --git a/packages/HighFive/include/highfive/bits/H5Attribute_misc.hpp b/packages/HighFive/include/highfive/bits/H5Attribute_misc.hpp
index 33295d40e55a0e8c1ac111699114eafaf4fd0b9c..cc235b50074f90f948e8b4b22678e1ee44091274 100644
--- a/packages/HighFive/include/highfive/bits/H5Attribute_misc.hpp
+++ b/packages/HighFive/include/highfive/bits/H5Attribute_misc.hpp
@@ -64,7 +64,7 @@ inline void Attribute::read(T& array) const {
     const details::BufferInfo<T> buffer_info(
         file_datatype,
         [this]() -> std::string { return this->getName(); },
-        details::BufferInfo<T>::read);
+        details::BufferInfo<T>::Operation::read);
 
     if (!details::checkDimensions(mem_space, buffer_info.n_dimensions)) {
         std::ostringstream ss;
@@ -130,7 +130,7 @@ inline void Attribute::write(const T& buffer) {
     const details::BufferInfo<T> buffer_info(
         file_datatype,
         [this]() -> std::string { return this->getName(); },
-        details::BufferInfo<T>::write);
+        details::BufferInfo<T>::Operation::write);
 
     if (!details::checkDimensions(mem_space, buffer_info.n_dimensions)) {
         std::ostringstream ss;
diff --git a/packages/HighFive/include/highfive/bits/H5DataType_misc.hpp b/packages/HighFive/include/highfive/bits/H5DataType_misc.hpp
index e29c99b0ed30c8baf555e685ed5a6dc746c9fa6c..619e51e7189e47db7649100c63c1ef82837b8536 100644
--- a/packages/HighFive/include/highfive/bits/H5DataType_misc.hpp
+++ b/packages/HighFive/include/highfive/bits/H5DataType_misc.hpp
@@ -207,7 +207,7 @@ class AtomicType<char[StrLen]>: public DataType {
 };
 
 template <size_t StrLen>
-class AtomicType<FixedLenStringArray<StrLen>>: public DataType {
+class AtomicType<deprecated::FixedLenStringArray<StrLen>>: public DataType {
   public:
     inline AtomicType()
         : DataType(create_string(StrLen)) {}
@@ -239,8 +239,7 @@ AtomicType<T>::AtomicType() {
 }
 
 
-// class FixedLenStringArray<N>
-
+namespace deprecated {
 template <std::size_t N>
 inline FixedLenStringArray<N>::FixedLenStringArray(const char array[][N], std::size_t length) {
     datavec.resize(length);
@@ -283,6 +282,7 @@ template <std::size_t N>
 inline std::string FixedLenStringArray<N>::getString(std::size_t i) const {
     return std::string(datavec[i].data());
 }
+}  // namespace deprecated
 
 // Internal
 // Reference mapping
diff --git a/packages/HighFive/include/highfive/bits/H5Inspector_misc.hpp b/packages/HighFive/include/highfive/bits/H5Inspector_misc.hpp
index 1613f87c3392df4e4c37d706a0eb211c31e86ed6..7ae90d84f167e3757b5246787971929cbb6a73bd 100644
--- a/packages/HighFive/include/highfive/bits/H5Inspector_misc.hpp
+++ b/packages/HighFive/include/highfive/bits/H5Inspector_misc.hpp
@@ -289,10 +289,10 @@ struct inspector<Reference>: type_helper<Reference> {
 };
 
 template <size_t N>
-struct inspector<FixedLenStringArray<N>> {
-    using type = FixedLenStringArray<N>;
+struct inspector<deprecated::FixedLenStringArray<N>> {
+    using type = deprecated::FixedLenStringArray<N>;
     using value_type = char*;
-    using base_type = FixedLenStringArray<N>;
+    using base_type = deprecated::FixedLenStringArray<N>;
     using hdf5_type = char;
 
     static constexpr size_t ndim = 1;
@@ -591,6 +591,21 @@ struct inspector<T[N]> {
     static constexpr bool is_trivially_copyable = std::is_trivially_copyable<value_type>::value &&
                                                   inspector<value_type>::is_trivially_copyable;
 
+    static void prepare(type& val, const std::vector<size_t>& dims) {
+        if (dims.size() < 1) {
+            throw DataSpaceException("Invalid 'dims', must be at least 1 dimensional.");
+        }
+
+        if (dims[0] != N) {
+            throw DataSpaceException("Dimensions mismatch.");
+        }
+
+        std::vector<size_t> next_dims(dims.begin() + 1, dims.end());
+        for (size_t i = 0; i < dims[0]; ++i) {
+            inspector<value_type>::prepare(val[i], next_dims);
+        }
+    }
+
     static size_t getSizeVal(const type& val) {
         return compute_total_size(getDimensions(val));
     }
@@ -608,6 +623,10 @@ struct inspector<T[N]> {
         return inspector<value_type>::data(val[0]);
     }
 
+    static hdf5_type* data(type& val) {
+        return inspector<value_type>::data(val[0]);
+    }
+
     /* it works because there is only T[][][] currently
        we will fix it one day */
     static void serialize(const type& val, hdf5_type* m) {
diff --git a/packages/HighFive/include/highfive/bits/H5Node_traits.hpp b/packages/HighFive/include/highfive/bits/H5Node_traits.hpp
index 493749beecd0e94203028d071eaffabd44f6f8c0..6f4a93ce6a963d6d3d4fb9e1f1c65fa57380dc12 100644
--- a/packages/HighFive/include/highfive/bits/H5Node_traits.hpp
+++ b/packages/HighFive/include/highfive/bits/H5Node_traits.hpp
@@ -53,20 +53,7 @@ class NodeTraits {
     /// \param accessProps A property list with data set access properties
     /// \param parents Create intermediate groups if needed. Default: true.
     /// \return DataSet Object
-    template <typename T,
-              typename std::enable_if<
-                  std::is_same<typename details::inspector<T>::base_type, details::Boolean>::value,
-                  int>::type* = nullptr>
-    DataSet createDataSet(const std::string& dataset_name,
-                          const DataSpace& space,
-                          const DataSetCreateProps& createProps = DataSetCreateProps::Default(),
-                          const DataSetAccessProps& accessProps = DataSetAccessProps::Default(),
-                          bool parents = true);
-
-    template <typename T,
-              typename std::enable_if<
-                  !std::is_same<typename details::inspector<T>::base_type, details::Boolean>::value,
-                  int>::type* = nullptr>
+    template <typename T>
     DataSet createDataSet(const std::string& dataset_name,
                           const DataSpace& space,
                           const DataSetCreateProps& createProps = DataSetCreateProps::Default(),
@@ -92,8 +79,9 @@ class NodeTraits {
 
 
     template <std::size_t N>
+    H5_DEPRECATED("Use 'std::vector<std::string>'.")
     DataSet createDataSet(const std::string& dataset_name,
-                          const FixedLenStringArray<N>& data,
+                          const deprecated::FixedLenStringArray<N>& data,
                           const DataSetCreateProps& createProps = DataSetCreateProps::Default(),
                           const DataSetAccessProps& accessProps = DataSetAccessProps::Default(),
                           bool parents = true);
diff --git a/packages/HighFive/include/highfive/bits/H5Node_traits_misc.hpp b/packages/HighFive/include/highfive/bits/H5Node_traits_misc.hpp
index b09bc3190d1136d1385748fa2546c9b1e40a2064..a98600598ad7647598633378456230706ff99a5a 100644
--- a/packages/HighFive/include/highfive/bits/H5Node_traits_misc.hpp
+++ b/packages/HighFive/include/highfive/bits/H5Node_traits_misc.hpp
@@ -52,28 +52,7 @@ inline DataSet NodeTraits<Derivate>::createDataSet(const std::string& dataset_na
 }
 
 template <typename Derivate>
-template <typename T,
-          typename std::enable_if<
-              std::is_same<typename details::inspector<T>::base_type, details::Boolean>::value,
-              int>::type*>
-inline DataSet NodeTraits<Derivate>::createDataSet(const std::string& dataset_name,
-                                                   const DataSpace& space,
-                                                   const DataSetCreateProps& createProps,
-                                                   const DataSetAccessProps& accessProps,
-                                                   bool parents) {
-    return createDataSet(dataset_name,
-                         space,
-                         create_and_check_datatype<typename details::inspector<T>::base_type>(),
-                         createProps,
-                         accessProps,
-                         parents);
-}
-
-template <typename Derivate>
-template <typename T,
-          typename std::enable_if<
-              !std::is_same<typename details::inspector<T>::base_type, details::Boolean>::value,
-              int>::type*>
+template <typename T>
 inline DataSet NodeTraits<Derivate>::createDataSet(const std::string& dataset_name,
                                                    const DataSpace& space,
                                                    const DataSetCreateProps& createProps,
@@ -104,7 +83,7 @@ inline DataSet NodeTraits<Derivate>::createDataSet(const std::string& dataset_na
 template <typename Derivate>
 template <std::size_t N>
 inline DataSet NodeTraits<Derivate>::createDataSet(const std::string& dataset_name,
-                                                   const FixedLenStringArray<N>& data,
+                                                   const deprecated::FixedLenStringArray<N>& data,
                                                    const DataSetCreateProps& createProps,
                                                    const DataSetAccessProps& accessProps,
                                                    bool parents) {
diff --git a/packages/HighFive/include/highfive/bits/H5ReadWrite_misc.hpp b/packages/HighFive/include/highfive/bits/H5ReadWrite_misc.hpp
index 4f6f1578851cc2d72b2584e3737559b8be83aa4b..05bb49888cf8d75b14c8fbf41fe27dd067cd0dd8 100644
--- a/packages/HighFive/include/highfive/bits/H5ReadWrite_misc.hpp
+++ b/packages/HighFive/include/highfive/bits/H5ReadWrite_misc.hpp
@@ -51,7 +51,7 @@ struct BufferInfo {
     using char_array_t = typename details::type_char_array<type_no_const>::type;
     static constexpr bool is_char_array = details::type_char_array<type_no_const>::is_char_array;
 
-    enum Operation { read, write };
+    enum class Operation { read, write };
     const Operation op;
 
     template <class F>
@@ -131,29 +131,29 @@ struct string_type_checker<char*> {
 
 template <typename T>
 template <class F>
-BufferInfo<T>::BufferInfo(const DataType& dtype, F getName, Operation _op)
+BufferInfo<T>::BufferInfo(const DataType& file_data_type, F getName, Operation _op)
     : op(_op)
-    , is_fixed_len_string(dtype.isFixedLenStr())
+    , is_fixed_len_string(file_data_type.isFixedLenStr())
     // In case we are using Fixed-len strings we need to subtract one dimension
     , n_dimensions(details::inspector<type_no_const>::recursive_ndim -
                    ((is_fixed_len_string && is_char_array) ? 1 : 0))
-    , data_type(
-          string_type_checker<char_array_t>::getDataType(create_datatype<elem_type>(), dtype)) {
+    , data_type(string_type_checker<char_array_t>::getDataType(create_datatype<elem_type>(),
+                                                               file_data_type)) {
     // We warn. In case they are really not convertible an exception will rise on read/write
-    if (dtype.getClass() != data_type.getClass()) {
+    if (file_data_type.getClass() != data_type.getClass()) {
         HIGHFIVE_LOG_WARN(getName() + "\": data and hdf5 dataset have different types: " +
-                          data_type.string() + " -> " + dtype.string());
-    } else if ((dtype.getClass() & data_type.getClass()) == DataTypeClass::Float) {
+                          data_type.string() + " -> " + file_data_type.string());
+    } else if ((file_data_type.getClass() & data_type.getClass()) == DataTypeClass::Float) {
         HIGHFIVE_LOG_WARN_IF(
-            (op == read) && (dtype.getSize() > data_type.getSize()),
+            (op == Operation::read) && (file_data_type.getSize() > data_type.getSize()),
             getName() + "\": hdf5 dataset has higher floating point precision than data on read: " +
-                dtype.string() + " -> " + data_type.string());
+                file_data_type.string() + " -> " + data_type.string());
 
         HIGHFIVE_LOG_WARN_IF(
-            (op == write) && (dtype.getSize() < data_type.getSize()),
+            (op == Operation::write) && (file_data_type.getSize() < data_type.getSize()),
             getName() +
                 "\": data has higher floating point precision than hdf5 dataset on write: " +
-                data_type.string() + " -> " + dtype.string());
+                data_type.string() + " -> " + file_data_type.string());
     }
 }
 
diff --git a/packages/HighFive/include/highfive/bits/H5Utils.hpp b/packages/HighFive/include/highfive/bits/H5Utils.hpp
index 2d9d24f887174e5719949c7f4a2bb03b52829a94..b3f039e20d29377269a12cfe6126df7bb3027ed6 100644
--- a/packages/HighFive/include/highfive/bits/H5Utils.hpp
+++ b/packages/HighFive/include/highfive/bits/H5Utils.hpp
@@ -25,9 +25,11 @@
 
 namespace HighFive {
 
+namespace deprecated {
 // If ever used, recognize dimensions of FixedLenStringArray
 template <std::size_t N>
 class FixedLenStringArray;
+}  // namespace deprecated
 
 namespace details {
 // converter function for hsize_t -> size_t when hsize_t != size_t
diff --git a/packages/HighFive/include/highfive/bits/H5_definitions.hpp b/packages/HighFive/include/highfive/bits/H5_definitions.hpp
index 746723c8839344ef8cc2dcc55a9170029ce13f47..ad4b95af2a9af52f92e14196eff5247c848ab96e 100644
--- a/packages/HighFive/include/highfive/bits/H5_definitions.hpp
+++ b/packages/HighFive/include/highfive/bits/H5_definitions.hpp
@@ -5,10 +5,17 @@
 #elif defined(_MSC_VER)
 #define H5_DEPRECATED(msg) __declspec(deprecated(#msg))
 #else
-#pragma message("WARNING: Compiler doesnt support deprecation")
+#pragma message("WARNING: Compiler doesn't support deprecation")
 #define H5_DEPRECATED(msg)
 #endif
 
+#if defined(__GNUC__) || defined(__clang__)
+#define H5_DEPRECATED_USING(msg) H5_DEPRECATED((msg))
+#else
+#pragma message("WARNING: Compiler doesn't support deprecating using statements.")
+#define H5_DEPRECATED_USING(msg)
+#endif
+
 
 // Forward declarations
 
@@ -38,8 +45,10 @@ class AtomicType;
 template <typename Derivate>
 class AnnotateTraits;
 
+namespace deprecated {
 template <std::size_t N>
 class FixedLenStringArray;
+}
 
 template <typename Derivate>
 class NodeTraits;
diff --git a/packages/HighFive/src/examples/read_write_fixedlen_string.cpp b/packages/HighFive/src/examples/read_write_fixedlen_string.cpp
deleted file mode 100644
index 60589637ea97e6c526ab8b8c6754026c7191331d..0000000000000000000000000000000000000000
--- a/packages/HighFive/src/examples/read_write_fixedlen_string.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- *  Copyright (c), 2020, Blue Brain Project
- *
- *  Distributed under the Boost Software License, Version 1.0.
- *    (See accompanying file LICENSE_1_0.txt or copy at
- *          http://www.boost.org/LICENSE_1_0.txt)
- *
- */
-#include <iostream>
-#include <string>
-
-#include <highfive/highfive.hpp>
-
-using namespace HighFive;
-
-// This examples shows how compile time constant strings work.
-//
-// Note, that as of version 2.8.0., writing `std::string` as fixed-length
-// strings there's a simpler API.
-int main() {
-    // Create a new file using the default property lists.
-    File file("create_dataset_string_example.h5", File::Truncate);
-    const char strings_fixed[][16] = {"abcabcabcabcabc", "123123123123123"};
-
-    // create a dataset ready to contains strings of the size of the vector
-    file.createDataSet<char[10]>("ds1", DataSpace(2)).write(strings_fixed);
-
-    // Without specific type info this will create an int8 dataset
-    file.createDataSet("ds2", strings_fixed);
-
-    // Now test the new interface type
-    FixedLenStringArray<10> arr{"0000000", "1111111"};
-    auto ds = file.createDataSet("ds3", arr);
-
-    // Read back truncating to 4 chars
-    FixedLenStringArray<4> array_back;
-    ds.read(array_back);
-    std::cout << "First item is '" << array_back[0] << "'\n"
-              << "Second item is '" << array_back[1] << "'\n";
-
-    return 0;
-}
diff --git a/packages/HighFive/tests/unit/CMakeLists.txt b/packages/HighFive/tests/unit/CMakeLists.txt
index b8943067f7bc3055aa871b808ab8cae8d16782f5..2f01bdd81473cbc46e8ca990127e85b1c6d7eb34 100644
--- a/packages/HighFive/tests/unit/CMakeLists.txt
+++ b/packages/HighFive/tests/unit/CMakeLists.txt
@@ -52,3 +52,5 @@ if(HIGHFIVE_TEST_SINGLE_INCLUDES)
         target_link_libraries("tests_include_${CLASS_NAME}" HighFive HighFiveWarnings)
     endforeach()
 endif()
+
+add_subdirectory(deprecated)
diff --git a/packages/HighFive/tests/unit/deprecated/CMakeLists.txt b/packages/HighFive/tests/unit/deprecated/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5e515374bfe5c67779613a6d8e165153ccebb735
--- /dev/null
+++ b/packages/HighFive/tests/unit/deprecated/CMakeLists.txt
@@ -0,0 +1,10 @@
+foreach(test_name test_fixed_len_string_array)
+  add_executable(${test_name} "${test_name}.cpp")
+
+  target_link_libraries(${test_name} HighFive HighFiveWarnings Catch2::Catch2WithMain)
+  catch_discover_tests(${test_name})
+
+  if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU")
+    target_compile_options(${test_name} PRIVATE -Wno-deprecated-declarations)
+  endif()
+endforeach()
diff --git a/packages/HighFive/tests/unit/deprecated/test_fixed_len_string_array.cpp b/packages/HighFive/tests/unit/deprecated/test_fixed_len_string_array.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1d0c33aaa243134c3e99c3c2d769cc3b3902bb4c
--- /dev/null
+++ b/packages/HighFive/tests/unit/deprecated/test_fixed_len_string_array.cpp
@@ -0,0 +1,172 @@
+#include <catch2/catch_test_macros.hpp>
+
+#include <highfive/highfive.hpp>
+#include "../tests_high_five.hpp"
+
+namespace HighFive {
+
+TEST_CASE("HighFiveFixedLenStringArray") {
+    const std::string file_name("fixed_len_string_array.h5");
+
+    // Create a new file using the default property lists.
+    File file(file_name, File::ReadWrite | File::Create | File::Truncate);
+
+    {  // Dedicated FixedLenStringArray (now deprecated).
+        FixedLenStringArray<10> arr{"0000000", "1111111"};
+
+        // More API: test inserting something
+        arr.push_back("2222");
+        auto ds = file.createDataSet("ds7", arr);  // Short syntax ok
+
+        // Recover truncating
+        FixedLenStringArray<4> array_back;
+        ds.read(array_back);
+        CHECK(array_back.size() == 3);
+        CHECK(array_back[0] == std::string("000"));
+        CHECK(array_back[1] == std::string("111"));
+        CHECK(array_back[2] == std::string("222"));
+        CHECK(array_back.getString(1) == "111");
+        CHECK(array_back.front() == std::string("000"));
+        CHECK(array_back.back() == std::string("222"));
+        CHECK(array_back.data() == std::string("000"));
+        array_back.data()[0] = 'x';
+        CHECK(array_back.data() == std::string("x00"));
+
+        for (auto& raw_elem: array_back) {
+            raw_elem[1] = 'y';
+        }
+        CHECK(array_back.getString(1) == "1y1");
+        for (auto iter = array_back.cbegin(); iter != array_back.cend(); ++iter) {
+            CHECK((*iter)[1] == 'y');
+        }
+    }
+}
+
+template <size_t N>
+static void check_fixed_len_string_array_contents(const FixedLenStringArray<N>& array,
+                                                  const std::vector<std::string>& expected) {
+    REQUIRE(array.size() == expected.size());
+
+    for (size_t i = 0; i < array.size(); ++i) {
+        CHECK(array[i] == expected[i]);
+    }
+}
+
+
+TEST_CASE("HighFiveFixedLenStringArrayStructure") {
+    using fixed_array_t = FixedLenStringArray<10>;
+    // increment the characters of a string written in a std::array
+    auto increment_string = [](const fixed_array_t::value_type arr) {
+        fixed_array_t::value_type output(arr);
+        for (auto& c: output) {
+            if (c == 0) {
+                break;
+            }
+            ++c;
+        }
+        return output;
+    };
+
+    SECTION("create from std::vector (onpoint)") {
+        auto expected = std::vector<std::string>{"000", "111"};
+        auto actual = FixedLenStringArray<4>(expected);
+        check_fixed_len_string_array_contents(actual, expected);
+    }
+
+    SECTION("create from std::vector (oversized)") {
+        auto expected = std::vector<std::string>{"000", "111"};
+        auto actual = FixedLenStringArray<8>(expected);
+        check_fixed_len_string_array_contents(actual, expected);
+    }
+
+    SECTION("create from pointers (onpoint)") {
+        auto expected = std::vector<std::string>{"000", "111"};
+        auto actual = FixedLenStringArray<4>(expected.data(), expected.data() + expected.size());
+        check_fixed_len_string_array_contents(actual, expected);
+    }
+
+    SECTION("create from pointers (oversized)") {
+        auto expected = std::vector<std::string>{"000", "111"};
+        auto actual = FixedLenStringArray<8>(expected.data(), expected.data() + expected.size());
+        check_fixed_len_string_array_contents(actual, expected);
+    }
+
+
+    SECTION("create from std::initializer_list (onpoint)") {
+        auto expected = std::vector<std::string>{"000", "111"};
+        auto actual = FixedLenStringArray<4>{"000", "111"};
+        check_fixed_len_string_array_contents(actual, expected);
+    }
+
+    SECTION("create from std::initializer_list (oversized)") {
+        auto expected = std::vector<std::string>{"000", "111"};
+        auto actual = FixedLenStringArray<8>{"000", "111"};
+        check_fixed_len_string_array_contents(actual, expected);
+    }
+
+    // manipulate FixedLenStringArray with std::copy
+    SECTION("compatible with std::copy") {
+        const fixed_array_t arr1{"0000000", "1111111"};
+        fixed_array_t arr2{"0000000", "1111111"};
+        std::copy(arr1.begin(), arr1.end(), std::back_inserter(arr2));
+        CHECK(arr2.size() == 4);
+    }
+
+    SECTION("compatible with std::transform") {
+        fixed_array_t arr;
+        {
+            const fixed_array_t arr1{"0000000", "1111111"};
+            std::transform(arr1.begin(), arr1.end(), std::back_inserter(arr), increment_string);
+        }
+        CHECK(arr.size() == 2);
+        CHECK(arr[0] == std::string("1111111"));
+        CHECK(arr[1] == std::string("2222222"));
+    }
+
+    SECTION("compatible with std::transform (reverse iterator)") {
+        fixed_array_t arr;
+        {
+            const fixed_array_t arr1{"0000000", "1111111"};
+            std::copy(arr1.rbegin(), arr1.rend(), std::back_inserter(arr));
+        }
+        CHECK(arr.size() == 2);
+        CHECK(arr[0] == std::string("1111111"));
+        CHECK(arr[1] == std::string("0000000"));
+    }
+
+    SECTION("compatible with std::remove_copy_if") {
+        fixed_array_t arr2;
+        {
+            const fixed_array_t arr1{"0000000", "1111111"};
+            std::remove_copy_if(arr1.begin(),
+                                arr1.end(),
+                                std::back_inserter(arr2),
+                                [](const fixed_array_t::value_type& s) {
+                                    return std::strncmp(s.data(), "1111111", 7) == 0;
+                                });
+        }
+        CHECK(arr2.size() == 1);
+        CHECK(arr2[0] == std::string("0000000"));
+    }
+}
+
+TEST_CASE("HighFiveFixedLenStringArrayAttribute") {
+    const std::string file_name("fixed_array_attr.h5");
+    // Create a new file using the default property lists.
+    {
+        File file(file_name, File::ReadWrite | File::Create | File::Truncate);
+        FixedLenStringArray<10> arr{"Hello", "world"};
+        file.createAttribute("str", arr);
+    }
+    // Re-read it
+    {
+        File file(file_name);
+        FixedLenStringArray<8> arr;  // notice the output strings can be smaller
+        file.getAttribute("str").read(arr);
+        CHECK(arr.size() == 2);
+        CHECK(arr[0] == std::string("Hello"));
+        CHECK(arr[1] == std::string("world"));
+    }
+}
+
+}  // namespace HighFive
diff --git a/packages/HighFive/tests/unit/tests_high_five_base.cpp b/packages/HighFive/tests/unit/tests_high_five_base.cpp
index 163535b55cdd0875ac7203dfabf39e1ce1f29c69..fefdcdd5586fa3748a99a729b2103e3232ba5739 100644
--- a/packages/HighFive/tests/unit/tests_high_five_base.cpp
+++ b/packages/HighFive/tests/unit/tests_high_five_base.cpp
@@ -2405,36 +2405,6 @@ TEST_CASE("HighFiveFixedString") {
         file.createDataSet<char[10]>("ds6", DataSpace(1)).write(buffer);
     }
 
-    {  // Dedicated FixedLenStringArray
-        FixedLenStringArray<10> arr{"0000000", "1111111"};
-
-        // More API: test inserting something
-        arr.push_back("2222");
-        auto ds = file.createDataSet("ds7", arr);  // Short syntax ok
-
-        // Recover truncating
-        FixedLenStringArray<4> array_back;
-        ds.read(array_back);
-        CHECK(array_back.size() == 3);
-        CHECK(array_back[0] == std::string("000"));
-        CHECK(array_back[1] == std::string("111"));
-        CHECK(array_back[2] == std::string("222"));
-        CHECK(array_back.getString(1) == "111");
-        CHECK(array_back.front() == std::string("000"));
-        CHECK(array_back.back() == std::string("222"));
-        CHECK(array_back.data() == std::string("000"));
-        array_back.data()[0] = 'x';
-        CHECK(array_back.data() == std::string("x00"));
-
-        for (auto& raw_elem: array_back) {
-            raw_elem[1] = 'y';
-        }
-        CHECK(array_back.getString(1) == "1y1");
-        for (auto iter = array_back.cbegin(); iter != array_back.cend(); ++iter) {
-            CHECK((*iter)[1] == 'y');
-        }
-    }
-
     {
         // Direct way of writing `std::string` as a fixed length
         // HDF5 string.
@@ -2492,132 +2462,6 @@ TEST_CASE("HighFiveFixedString") {
     }
 }
 
-template <size_t N>
-static void check_fixed_len_string_array_contents(const FixedLenStringArray<N>& array,
-                                                  const std::vector<std::string>& expected) {
-    REQUIRE(array.size() == expected.size());
-
-    for (size_t i = 0; i < array.size(); ++i) {
-        CHECK(array[i] == expected[i]);
-    }
-}
-
-TEST_CASE("HighFiveFixedLenStringArrayStructure") {
-    using fixed_array_t = FixedLenStringArray<10>;
-    // increment the characters of a string written in a std::array
-    auto increment_string = [](const fixed_array_t::value_type arr) {
-        fixed_array_t::value_type output(arr);
-        for (auto& c: output) {
-            if (c == 0) {
-                break;
-            }
-            ++c;
-        }
-        return output;
-    };
-
-    SECTION("create from std::vector (onpoint)") {
-        auto expected = std::vector<std::string>{"000", "111"};
-        auto actual = FixedLenStringArray<4>(expected);
-        check_fixed_len_string_array_contents(actual, expected);
-    }
-
-    SECTION("create from std::vector (oversized)") {
-        auto expected = std::vector<std::string>{"000", "111"};
-        auto actual = FixedLenStringArray<8>(expected);
-        check_fixed_len_string_array_contents(actual, expected);
-    }
-
-    SECTION("create from pointers (onpoint)") {
-        auto expected = std::vector<std::string>{"000", "111"};
-        auto actual = FixedLenStringArray<4>(expected.data(), expected.data() + expected.size());
-        check_fixed_len_string_array_contents(actual, expected);
-    }
-
-    SECTION("create from pointers (oversized)") {
-        auto expected = std::vector<std::string>{"000", "111"};
-        auto actual = FixedLenStringArray<8>(expected.data(), expected.data() + expected.size());
-        check_fixed_len_string_array_contents(actual, expected);
-    }
-
-
-    SECTION("create from std::initializer_list (onpoint)") {
-        auto expected = std::vector<std::string>{"000", "111"};
-        auto actual = FixedLenStringArray<4>{"000", "111"};
-        check_fixed_len_string_array_contents(actual, expected);
-    }
-
-    SECTION("create from std::initializer_list (oversized)") {
-        auto expected = std::vector<std::string>{"000", "111"};
-        auto actual = FixedLenStringArray<8>{"000", "111"};
-        check_fixed_len_string_array_contents(actual, expected);
-    }
-
-    // manipulate FixedLenStringArray with std::copy
-    SECTION("compatible with std::copy") {
-        const fixed_array_t arr1{"0000000", "1111111"};
-        fixed_array_t arr2{"0000000", "1111111"};
-        std::copy(arr1.begin(), arr1.end(), std::back_inserter(arr2));
-        CHECK(arr2.size() == 4);
-    }
-
-    SECTION("compatible with std::transform") {
-        fixed_array_t arr;
-        {
-            const fixed_array_t arr1{"0000000", "1111111"};
-            std::transform(arr1.begin(), arr1.end(), std::back_inserter(arr), increment_string);
-        }
-        CHECK(arr.size() == 2);
-        CHECK(arr[0] == std::string("1111111"));
-        CHECK(arr[1] == std::string("2222222"));
-    }
-
-    SECTION("compatible with std::transform (reverse iterator)") {
-        fixed_array_t arr;
-        {
-            const fixed_array_t arr1{"0000000", "1111111"};
-            std::copy(arr1.rbegin(), arr1.rend(), std::back_inserter(arr));
-        }
-        CHECK(arr.size() == 2);
-        CHECK(arr[0] == std::string("1111111"));
-        CHECK(arr[1] == std::string("0000000"));
-    }
-
-    SECTION("compatible with std::remove_copy_if") {
-        fixed_array_t arr2;
-        {
-            const fixed_array_t arr1{"0000000", "1111111"};
-            std::remove_copy_if(arr1.begin(),
-                                arr1.end(),
-                                std::back_inserter(arr2),
-                                [](const fixed_array_t::value_type& s) {
-                                    return std::strncmp(s.data(), "1111111", 7) == 0;
-                                });
-        }
-        CHECK(arr2.size() == 1);
-        CHECK(arr2[0] == std::string("0000000"));
-    }
-}
-
-TEST_CASE("HighFiveFixedLenStringArrayAttribute") {
-    const std::string file_name("fixed_array_attr.h5");
-    // Create a new file using the default property lists.
-    {
-        File file(file_name, File::ReadWrite | File::Create | File::Truncate);
-        FixedLenStringArray<10> arr{"Hello", "world"};
-        file.createAttribute("str", arr);
-    }
-    // Re-read it
-    {
-        File file(file_name);
-        FixedLenStringArray<8> arr;  // notice the output strings can be smaller
-        file.getAttribute("str").read(arr);
-        CHECK(arr.size() == 2);
-        CHECK(arr[0] == std::string("Hello"));
-        CHECK(arr[1] == std::string("world"));
-    }
-}
-
 TEST_CASE("HighFiveReference") {
     const std::string file_name("h5_ref_test.h5");
     const std::string dataset1_name("dset1");
diff --git a/packages/kokkos/.github/ISSUE_TEMPLATE/bug_report.md b/packages/kokkos/.github/ISSUE_TEMPLATE/bug_report.md
index 5a259e3a58c9660e01c2981228ac816525aeea63..f7a0a7185ac4e2e526f25fe8e8add12d320ac538 100644
--- a/packages/kokkos/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/packages/kokkos/.github/ISSUE_TEMPLATE/bug_report.md
@@ -1,21 +1,22 @@
 ---
 name: Bug report
-about: Create a report to correct failures and improve our code
+about: Create a report (for github issue tracker) to correct failures
 title: ''
 labels: ''
 assignees: ''
 ---
 **Describe the bug**
-Please provide a concise, clear description of the bug, as well as any available error logs.
 
-**Please also include the following items to support reproducing the bug**
-1. compilers (with versions)
+Please provide a concise, clear description of the bug, as well as any available error logs.  Feel free to contact the Kokkos Slack `# build` channel for further discussion of your issue.
+
+**Please include the following for a minimal reproducer**
+
+1. Compilers (with versions)
 2. Kokkos release or commit used (i.e., the sha1 number)
-3. platform and backend
-4. cmake configure command
-5. output from cmake command
-6. code needed to reproduce the bug
-7. command line needed to reproduce the bug
-7. please also attach the `KokkosCore_config.h` header file (generated during the build);
-**Any additional info**
-Please provide any additional context about the issue here.
+3. Platform, architecture and backend
+4. CMake configure command
+5. Output from CMake configure command
+6. Minimum, complete code needed to reproduce the bug
+7. Command line needed to reproduce the bug
+8. `KokkosCore_config.h` header file (generated during the build)
+9. Please provide any additional relevant error logs
diff --git a/packages/kokkos/.github/workflows/continuous-integration-workflow-32bit.yml b/packages/kokkos/.github/workflows/continuous-integration-workflow-32bit.yml
new file mode 100644
index 0000000000000000000000000000000000000000..68fbdbe8a4764d1e787361f99f3430bb26bbf8f7
--- /dev/null
+++ b/packages/kokkos/.github/workflows/continuous-integration-workflow-32bit.yml
@@ -0,0 +1,45 @@
+name: github-Linux-32bit
+
+on:
+  push:
+    branches:
+      - develop
+  pull_request:
+    paths-ignore:
+    - '**/*.md'
+    types: [ opened, reopened, synchronize ]
+
+concurrency:
+  group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{github.event_name == 'pull_request'}}
+
+jobs:
+  CI-32bit:
+    name: Linux-32bit
+    runs-on: ubuntu-latest
+    container:
+      image: ghcr.io/kokkos/ci-containers/ubuntu:latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+      - name: install_multilib
+        run: sudo apt-get update && sudo apt-get install -y gcc-multilib g++-multilib gfortran-multilib
+      - name: Configure Kokkos
+        run: |
+          cmake -B builddir \
+            -DKokkos_ENABLE_OPENMP=ON \
+            -DKokkos_ENABLE_TESTS=ON \
+            -DKokkos_ENABLE_BENCHMARKS=ON \
+            -DKokkos_ENABLE_EXAMPLES=ON \
+            -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \
+            -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
+            -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
+            -DCMAKE_CXX_FLAGS="-Werror -m32 -DKOKKOS_IMPL_32BIT" \
+            -DCMAKE_CXX_COMPILER=g++ \
+            -DCMAKE_BUILD_TYPE=RelWithDebInfo
+      - name: Build
+        run: |
+          cmake --build builddir --parallel 2
+      - name: Tests
+        working-directory: builddir
+        run: ctest --output-on-failure
diff --git a/packages/kokkos/.github/workflows/continuous-integration-workflow-hpx.yml b/packages/kokkos/.github/workflows/continuous-integration-workflow-hpx.yml
index 35bb5bb2cb2924a5000ab1087457ce6afea6029b..8b39350dc8765f871d389eae56acc4b90a7af149 100644
--- a/packages/kokkos/.github/workflows/continuous-integration-workflow-hpx.yml
+++ b/packages/kokkos/.github/workflows/continuous-integration-workflow-hpx.yml
@@ -1,6 +1,13 @@
 name: github-Linux-hpx
 
-on: [push, pull_request]
+on:
+  push:
+    branches:
+      - develop
+  pull_request:
+    paths-ignore:
+    - '**/*.md'
+    types: [ opened, reopened, synchronize ]
 
 concurrency:
   group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }}
@@ -13,7 +20,7 @@ jobs:
 
     steps:
       - name: checkout code
-        uses: actions/checkout@v2.2.0
+        uses: actions/checkout@v3
         with:
           path: kokkos
       - name: setup hpx dependencies
@@ -26,12 +33,12 @@ jobs:
             libboost-all-dev \
             ninja-build
       - name: checkout hpx
-        uses: actions/checkout@v2.2.0
+        uses: actions/checkout@v3
         with:
           repository: STELLAR-GROUP/hpx
-          ref: 1.7.1
+          ref: 1.8.0
           path: hpx
-      - uses: actions/cache@v2
+      - uses: actions/cache@v3
         id:   cache-hpx
         with:
           path:         ./hpx/install
@@ -69,13 +76,11 @@ jobs:
             -DCMAKE_CXX_COMPILER=clang++ \
             -DCMAKE_CXX_FLAGS="-Werror" \
             -DHPX_ROOT=$PWD/../../hpx/install \
-            -DKokkos_ARCH_NATIVE=ON \
             -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
             -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \
             -DKokkos_ENABLE_EXAMPLES=ON \
             -DKokkos_ENABLE_HPX=ON \
-            -DKokkos_ENABLE_HPX_ASYNC_DISPATCH=ON \
-            -DKokkos_ENABLE_SERIAL=OFF \
+            -DKokkos_ENABLE_SERIAL=ON \
             -DKokkos_ENABLE_TESTS=ON \
             ..
 
diff --git a/packages/kokkos/.github/workflows/continuous-integration-workflow.yml b/packages/kokkos/.github/workflows/continuous-integration-workflow.yml
index 55b881794868bdccc8408870dc025b7c35d98d76..8c226c3766c7c2563d82e6adaace3bcbcd3d1b9b 100644
--- a/packages/kokkos/.github/workflows/continuous-integration-workflow.yml
+++ b/packages/kokkos/.github/workflows/continuous-integration-workflow.yml
@@ -1,5 +1,13 @@
 name: github-Linux
-on: [push, pull_request]
+
+on:
+  push:
+    branches:
+      - develop
+  pull_request:
+    paths-ignore:
+    - '**/*.md'
+    types: [ opened, reopened, synchronize ]
 
 concurrency:
   group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }}
@@ -12,30 +20,31 @@ jobs:
       matrix:
         distro: ['fedora:latest', 'fedora:rawhide', 'ubuntu:latest']
         cxx: ['g++', 'clang++']
+        cxx_extra_flags: ['']
         cmake_build_type: ['Release', 'Debug']
         backend: ['OPENMP']
         clang-tidy: ['']
         include:
           - distro: 'fedora:intel'
             cxx: 'icpc'
+            cxx_extra_flags: '-diag-disable=177,10441'
             cmake_build_type: 'Release'
             backend: 'OPENMP'
-            clang-tidy: ''
           - distro: 'fedora:intel'
             cxx: 'icpc'
+            cxx_extra_flags: '-diag-disable=177,10441'
             cmake_build_type: 'Debug'
             backend: 'OPENMP'
-            clang-tidy: ''
           - distro: 'fedora:intel'
             cxx: 'icpx'
+            cxx_extra_flags: '-fp-model=precise -Wno-pass-failed'
             cmake_build_type: 'Release'
             backend: 'OPENMP'
-            clang-tidy: ''
           - distro: 'fedora:intel'
             cxx: 'icpx'
+            cxx_extra_flags: '-fp-model=precise -Wno-pass-failed'
             cmake_build_type: 'Debug'
             backend: 'OPENMP'
-            clang-tidy: ''
           - distro: 'ubuntu:latest'
             cxx: 'clang++'
             cmake_build_type: 'RelWithDebInfo'
@@ -48,11 +57,9 @@ jobs:
     runs-on: ubuntu-latest
     container:
       image: ghcr.io/kokkos/ci-containers/${{ matrix.distro }}
-      # see https://github.com/actions/virtual-environments/issues/3812
-      options: --security-opt seccomp=unconfined
     steps:
       - name: Checkout desul
-        uses: actions/checkout@v2.2.0
+        uses: actions/checkout@v3
         with:
           repository: desul/desul
           ref: 477da9c8f40f8db369c28dd3f93a67e376d8511b
@@ -67,21 +74,14 @@ jobs:
           cmake -DDESUL_ENABLE_TESTS=OFF -DCMAKE_INSTALL_PREFIX=/usr/desul-install ..
           sudo cmake --build . --target install --parallel 2
       - name: Checkout code
-        uses: actions/checkout@v2.2.0
-      - uses: actions/cache@v2
+        uses: actions/checkout@v3
+      - uses: actions/cache@v3
         with:
-          path: ~/.ccache
-          key: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.cmake_build_type }}-${{ matrix.openmp }}-${github.ref}-${{ github.sha }}
-          restore-keys: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.cmake_build_type }}-${{ matrix.openmp }}-${{github.ref}}
-      - name: maybe_disable_death_tests
-        if: ${{ matrix.distro == 'fedora:rawhide' }}
-        run: echo "GTEST_FILTER=-*DeathTest*" >> $GITHUB_ENV
-# Re-enable when latest is F37+
-#      - name: maybe_use_flang
-#        if: ${{ matrix.cxx == 'clang++' && startsWith(matrix.distro,'fedora:') }}
-#        run: echo "FC=flang" >> $GITHUB_ENV
+          path: ~/.cache/ccache
+          key: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.cmake_build_type }}-${{ matrix.openmp }}-${{ github.ref }}-${{ github.sha }}
+          restore-keys: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.cmake_build_type }}-${{ matrix.openmp }}-${{ github.ref }}
       - name: maybe_use_flang_new
-        if: ${{ matrix.cxx == 'clang++' && startsWith(matrix.distro,'fedora:rawhide') }}
+        if: ${{ matrix.cxx == 'clang++' && startsWith(matrix.distro,'fedora:') }}
         run: echo "FC=flang-new" >> $GITHUB_ENV
       - name: maybe_use_external_gtest
         if: ${{ matrix.distro == 'ubuntu:latest' }}
@@ -95,7 +95,6 @@ jobs:
             -DCMAKE_INSTALL_PREFIX=/usr \
             ${{ matrix.clang-tidy }} \
             -Ddesul_ROOT=/usr/desul-install/ \
-            -DKokkos_ARCH_NATIVE=ON \
             -DKokkos_ENABLE_DESUL_ATOMICS_EXTERNAL=ON \
             -DKokkos_ENABLE_HWLOC=ON \
             -DKokkos_ENABLE_${{ matrix.backend }}=ON \
@@ -104,7 +103,11 @@ jobs:
             -DKokkos_ENABLE_EXAMPLES=ON \
             -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \
             -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
+            -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
+            -DKokkos_ENABLE_IMPL_MDSPAN=ON \
+            -DCMAKE_CXX_FLAGS="-Werror ${{ matrix.cxx_extra_flags }}" \
             -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} \
+            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
             -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }}
       - name: Build
         run: |
@@ -114,6 +117,12 @@ jobs:
       - name: Tests
         working-directory: builddir
         run: ctest --output-on-failure
+      - name: Test linking against build dir
+        working-directory: example/build_cmake_installed
+        run: |
+          cmake -B builddir_buildtree -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} -DKokkos_ROOT=../../builddir
+          cmake --build builddir_buildtree
+          cmake --build builddir_buildtree --target test
       - name: Test DESTDIR Install
         run: DESTDIR=${PWD}/install cmake --build builddir --target install && rm -rf ${PWD}/install/usr && rmdir ${PWD}/install
       - name: Install
diff --git a/packages/kokkos/.github/workflows/osx.yml b/packages/kokkos/.github/workflows/osx.yml
index dae8343f20d6fc2861c6c4221faada46ca84d99a..85b079e56c8c2dc96a7ed7bc502086ffd2370e7a 100644
--- a/packages/kokkos/.github/workflows/osx.yml
+++ b/packages/kokkos/.github/workflows/osx.yml
@@ -1,6 +1,13 @@
 name: github-OSX
 
-on: [push, pull_request]
+on:
+  push:
+    branches:
+      - develop
+  pull_request:
+    paths-ignore:
+    - '**/*.md'
+    types: [ opened, reopened, synchronize ]
 
 concurrency:
   group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }}
@@ -24,14 +31,13 @@ jobs:
             cmake_build_type: "Release"
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: configure
         run:
           cmake -B build .
             -DKokkos_ENABLE_${{ matrix.backend }}=On
             -DCMAKE_CXX_FLAGS="-Werror"
             -DCMAKE_CXX_STANDARD=17
-            -DKokkos_ARCH_NATIVE=ON
             -DKokkos_ENABLE_COMPILER_WARNINGS=ON
             -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF
             -DKokkos_ENABLE_TESTS=On
diff --git a/packages/kokkos/.github/workflows/performance-benchmark.yml b/packages/kokkos/.github/workflows/performance-benchmark.yml
new file mode 100644
index 0000000000000000000000000000000000000000..59eed4f6096fb81cc37c89a3fab8b7dee391c14e
--- /dev/null
+++ b/packages/kokkos/.github/workflows/performance-benchmark.yml
@@ -0,0 +1,64 @@
+name: github-benchmarks
+on:
+  push:
+    branches:
+      - develop
+  pull_request:
+    paths-ignore:
+      - '**/*.md'
+    types: [ opened, reopened, synchronize ]
+
+jobs:
+  CI:
+    continue-on-error: true
+    strategy:
+      matrix:
+        distro: ['ubuntu:latest']
+        cxx: ['g++', 'clang++']
+        backend: ['OPENMP']
+    runs-on: ubuntu-latest
+    container:
+      image: ghcr.io/kokkos/ci-containers/${{ matrix.distro }}
+    env:
+      BUILD_ID: ${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.backend }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+      - uses: actions/cache@v3
+        with:
+          path: ~/.cache/ccache
+          key: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.backend }}-${{ github.ref }}-${{ github.sha }}
+          restore-keys: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.backend }}-${{ github.ref }}
+      - name: Configure Kokkos
+        run: |
+          cmake -B builddir \
+            -DKokkos_ENABLE_HWLOC=ON \
+            -DKokkos_ENABLE_${{ matrix.backend }}=ON \
+            -DKokkos_ENABLE_BENCHMARKS=ON \
+            -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} \
+            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+            -DCMAKE_BUILD_TYPE=Release
+      - name: Build
+        run: |
+          ccache -z
+          NUM_CPU=$(grep -c processor /proc/cpuinfo)
+          cmake --build builddir --parallel ${NUM_CPU}
+          ccache -s
+      - name: Tests
+        working-directory: builddir
+        run: ctest --output-on-failure
+      - name: Gather benchmark results
+        run: |
+          mkdir ${{ env.BUILD_ID }}
+          find builddir/core/perf_test/ -name "*.json" -exec mv {} ${{ env.BUILD_ID }}/  \;
+      - name: Push benchmark results
+        if: ${{ github.ref == 'refs/heads/develop' }}
+        uses: dmnemec/copy_file_to_another_repo_action@main
+        env:
+          API_TOKEN_GITHUB: ${{ secrets.DALG24_PUSH_BENCHMARK_RESULTS }}
+        with:
+          source_file: ${{ env.BUILD_ID }}
+          destination_repo: 'kokkos/kokkos-benchmark-results'
+          destination_branch: 'main'
+          user_email: 'kokkos@users.noreply.github.com'
+          user_name: 'Kokkos Developers'
diff --git a/packages/kokkos/.gitrepo b/packages/kokkos/.gitrepo
index a6de31a618c04db9d43e4fd3cadc16cf74b7bd95..0b83a99d68b0beaff367f2948bc7f17627237051 100644
--- a/packages/kokkos/.gitrepo
+++ b/packages/kokkos/.gitrepo
@@ -6,7 +6,7 @@
 [subrepo]
 	remote = git@github.com:kokkos/kokkos.git
 	branch = master
-	commit = 62d2b6c879b74b6ae7bd06eb3e5e80139c4708e6
-	parent = c1b35ca4d5df2b2052224274c26bea7a8eee8ac5
+	commit = 71a9bcae52543bd065522bf3e41b5bfa467d8015
+	parent = 04fc22a5b0c95070fe40c61fe2db5c6bdac384d5
 	method = merge
 	cmdver = 0.4.6
diff --git a/packages/kokkos/.jenkins b/packages/kokkos/.jenkins
index 1775a57d3b2a1d12843ac38fe5e8d38a5f77e8f7..6f5cf80033fa63b96c432a7f7d7ef2b8c582963c 100644
--- a/packages/kokkos/.jenkins
+++ b/packages/kokkos/.jenkins
@@ -3,7 +3,7 @@ pipeline {
 
     environment {
         CCACHE_DIR = '/tmp/ccache'
-        CCACHE_MAXSIZE = '10G'
+        CCACHE_MAXSIZE = '5G'
         CCACHE_CPP2 = 'true'
     }
 
@@ -17,7 +17,7 @@ pipeline {
                 dockerfile {
                     filename 'Dockerfile.clang'
                     dir 'scripts/docker'
-                    label 'nvidia-docker || docker'
+                    label 'nvidia-docker || rocm-docker || docker'
                     args '-v /tmp/ccache.kokkos:/tmp/ccache'
                 }
             }
@@ -27,7 +27,7 @@ pipeline {
         }
         stage('Build') {
             parallel {
-                stage('OPENACC-NVHPC-CUDA-11.6') {
+                stage('OPENACC-NVHPC-CUDA-12.2') {
                     agent {
                         dockerfile {
                             filename 'Dockerfile.nvhpc'
@@ -37,7 +37,7 @@ pipeline {
                         }
                     }
                     environment {
-                        CUDA_HOME = '/opt/nvidia/hpc_sdk/Linux_x86_64/22.3/cuda/11.6'
+                        NVHPC_CUDA_HOME = '/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2'
                     }
                     steps {
                         sh '''rm -rf build && mkdir -p build && cd build && \
@@ -53,13 +53,12 @@ pipeline {
                               make -j8 && ctest --verbose'''
                     }
                 }
-                stage('CUDA-11.7-NVHPC') {
+                stage('CUDA-12.2-NVHPC') {
                     agent {
                         dockerfile {
                             filename 'Dockerfile.nvhpc'
                             dir 'scripts/docker'
-                            additionalBuildArgs '--build-arg BASE=nvcr.io/nvidia/nvhpc:22.9-devel-cuda11.7-ubuntu20.04'
-                            label 'nvidia-docker && large_images'
+                            label 'nvidia-docker && large_images && volta'
                             args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES'
                         }
                     }
@@ -70,7 +69,7 @@ pipeline {
                         OMP_MAX_ACTIVE_LEVELS = 1
                         OMP_PLACES = 'threads'
                         OMP_PROC_BIND = 'spread'
-                        NVHPC_CUDA_HOME = '/opt/nvidia/hpc_sdk/Linux_x86_64/22.9/cuda/11.7'
+                        NVHPC_CUDA_HOME = '/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2'
                     }
                     steps {
                         sh '''rm -rf build && mkdir -p build && cd build && \
@@ -86,6 +85,8 @@ pipeline {
                                 -DKokkos_ENABLE_CUDA=ON \
                                 -DKokkos_ENABLE_CUDA_LAMBDA=ON \
                                 -DKokkos_ENABLE_OPENMP=ON \
+                                -DKokkos_ENABLE_IMPL_MDSPAN=ON \
+                                -DKokkos_ENABLE_IMPL_NVHPC_AS_DEVICE_COMPILER=ON \
                               .. && \
                               make -j8 && ctest --verbose'''
                     }
@@ -95,20 +96,22 @@ pipeline {
                         dockerfile {
                             filename 'Dockerfile.sycl'
                             dir 'scripts/docker'
-                            label 'nvidia-docker && volta'
+                            label 'nvidia-docker && ampere'
                             args '-v /tmp/ccache.kokkos:/tmp/ccache'
                         }
                     }
                     steps {
                         sh 'ccache --zero-stats'
-                        sh '''rm -rf build && mkdir -p build && cd build && \
+                        sh '''. /opt/intel/oneapi/setvars.sh --include-intel-llvm && \
+                              rm -rf build && mkdir -p build && cd build && \
                               cmake \
                                 -DCMAKE_BUILD_TYPE=Release \
                                 -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-                                -DCMAKE_CXX_COMPILER=clang++ \
-                                -DCMAKE_CXX_FLAGS="-fsycl-device-code-split=per_kernel -Werror -Wno-gnu-zero-variadic-macro-arguments -Wno-linker-warnings" \
+                                -DCMAKE_CXX_COMPILER=/opt/intel/oneapi/compiler/2023.0.0/linux/bin-llvm/clang++ \
+                                -DCMAKE_CXX_FLAGS="-fsycl-device-code-split=per_kernel -Wno-deprecated-declarations -Werror -Wno-gnu-zero-variadic-macro-arguments -Wno-unknown-cuda-version -Wno-sycl-target" \
+                                -DKOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED=0 \
                                 -DKokkos_ARCH_NATIVE=ON \
-                                -DKokkos_ARCH_VOLTA70=ON \
+                                -DKokkos_ARCH_AMPERE80=ON \
                                 -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
                                 -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \
                                 -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
@@ -169,12 +172,12 @@ pipeline {
                         }
                     }
                 }
-                stage('HIP-ROCm-5.2-C++20') {
+                stage('HIP-ROCm-5.6-C++20') {
                     agent {
                         dockerfile {
                             filename 'Dockerfile.hipcc'
                             dir 'scripts/docker'
-                            additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.2'
+                            additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.6'
                             label 'rocm-docker && vega'
                             args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES'
                         }
@@ -237,7 +240,7 @@ pipeline {
                                 -DKokkos_ENABLE_BENCHMARKS=ON \
                                 -DKokkos_ENABLE_OPENMPTARGET=ON \
                                 -DKokkos_ENABLE_OPENMP=ON \
-                                -DKokkos_ARCH_VEGA906=ON \
+                                -DKokkos_ARCH_AMD_GFX906=ON \
                               && \
                               cmake --build build --parallel ${BUILD_JOBS} && \
                               cd build && ctest --output-on-failure
@@ -285,7 +288,7 @@ pipeline {
                         }
                     }
                 }
-                stage('CUDA-10.1-Clang-Tidy') {
+                stage('CUDA-11.0.3-Clang-Tidy') {
                     agent {
                         dockerfile {
                             filename 'Dockerfile.kokkosllvmproject'
@@ -302,7 +305,7 @@ pipeline {
                                 -DCMAKE_CXX_CLANG_TIDY="clang-tidy;-warnings-as-errors=*" \
                                 -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
                                 -DCMAKE_CXX_COMPILER=clang++ \
-                                -DCMAKE_CXX_FLAGS=-Werror \
+                                -DCMAKE_CXX_FLAGS="-Werror -Wno-unknown-cuda-version" \
                                 -DCMAKE_CXX_STANDARD=17 \
                                 -DKokkos_ARCH_NATIVE=ON \
                                 -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
@@ -328,7 +331,7 @@ pipeline {
                         dockerfile {
                             filename 'Dockerfile.nvcc'
                             dir 'scripts/docker'
-                            additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.7.0-devel-ubuntu20.04'
+                            additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.7.1-devel-ubuntu20.04'
                             label 'nvidia-docker && volta'
                             args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES'
                         }
@@ -343,7 +346,7 @@ pipeline {
                                 --with-cuda \
                                 --with-cuda-options=enable_lambda \
                                 --arch=Volta70 \
-                              .. && \
+                              && \
                               make test -j8'''
                     }
                     post {
@@ -390,6 +393,7 @@ pipeline {
                                 -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \
                                 -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \
                                 -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
+                                -DKokkos_ENABLE_IMPL_MDSPAN=ON \
                                 -DCMAKE_INSTALL_PREFIX=${PWD}/../install \
                               .. && \
                               make -j8 install && \
@@ -429,7 +433,7 @@ pipeline {
                         dockerfile {
                             filename 'Dockerfile.nvcc'
                             dir 'scripts/docker'
-                            additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.6.0-devel-ubuntu20.04'
+                            additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.6.2-devel-ubuntu20.04'
                             label 'nvidia-docker'
                             args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES'
                         }
@@ -453,6 +457,8 @@ pipeline {
                                 -DKokkos_ENABLE_CUDA=ON \
                                 -DKokkos_ENABLE_CUDA_LAMBDA=ON \
                                 -DKokkos_ENABLE_LIBDL=OFF \
+                                -DKokkos_ENABLE_IMPL_MDSPAN=ON \
+                                -DKokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC=OFF \
                               .. && \
                               make -j8 && ctest --verbose && \
                               cd ../example/build_cmake_in_tree && \
@@ -487,6 +493,7 @@ pipeline {
                                 -DCMAKE_CXX_FLAGS=-Werror \
                                 -DKokkos_ARCH_NATIVE=ON \
                                 -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
+                                -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \
                                 -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \
                                 -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
                                 -DKokkos_ENABLE_TESTS=ON \
diff --git a/packages/kokkos/.jenkins_nightly b/packages/kokkos/.jenkins_nightly
new file mode 100644
index 0000000000000000000000000000000000000000..5d5858178913b82827a5a3e90dd16e29e3cf4f2a
--- /dev/null
+++ b/packages/kokkos/.jenkins_nightly
@@ -0,0 +1,108 @@
+pipeline {
+    agent none
+
+    options {
+        timeout(time: 6, unit: 'HOURS')
+    }
+
+    stages {
+        stage('Build') {
+            parallel {
+                stage('spack-serial') {
+                    agent {
+                        docker {
+                          image 'ubuntu:22.04'
+                          label 'docker'
+                        }
+                    }
+                    steps {
+                        sh '''
+                          DEBIAN_FRONTEND=noninteractive && \
+                          apt-get update && apt-get upgrade -y && apt-get install -y \
+                          build-essential \
+                          wget \
+                          git \
+                          bc \
+                          python3-dev \
+                          && \
+                          apt-get clean && rm -rf /var/lib/apt/lists/*
+
+                          rm -rf spack && \
+                          git clone https://github.com/spack/spack.git && \
+                          . ./spack/share/spack/setup-env.sh && \
+                          spack install kokkos@develop+tests && \
+                          spack load cmake && \
+                          spack test run kokkos && \
+                          spack test results -l
+                          '''
+                    }      
+                }   
+                stage('spack-cuda') {
+                    agent {
+                        docker {
+                          image 'nvidia/cuda:12.1.0-devel-ubuntu22.04'
+                          label 'nvidia-docker && ampere'
+                        }
+                    }
+                    steps {
+                        sh '''
+                          DEBIAN_FRONTEND=noninteractive && \
+                          apt-get update && apt-get upgrade -y && apt-get install -y \
+                          build-essential \
+                          wget \
+                          git \
+                          bc \
+                          python3-dev \
+                          gfortran \
+                          && \
+                          apt-get clean && rm -rf /var/lib/apt/lists/*
+
+                          rm -rf spack && \
+                          git clone https://github.com/spack/spack.git && \
+                          . ./spack/share/spack/setup-env.sh && \
+                          spack install kokkos@develop+cuda+wrapper+tests cuda_arch=80 ^cuda@12.1.0 && \
+                          spack load cmake  && \
+                          spack load kokkos-nvcc-wrapper && \
+                          spack load cuda && \
+                          spack load kokkos && \
+                          spack test run kokkos && \
+                          spack test results -l
+                          '''
+                    }      
+                }   
+                stage('GCC-13') {
+                    agent {
+                        docker {
+                            image 'gcc:13.1'
+                            label 'docker'
+                        }
+                    }
+                    steps {
+                        sh '''
+                          DEBIAN_FRONTEND=noninteractive && \
+                          apt-get update && apt-get upgrade -y && apt-get install -y \
+                          cmake \
+                          && \
+                          apt-get clean && rm -rf /var/lib/apt/lists/*
+
+                          mkdir -p build && cd build && \
+                          cmake \
+                            -DCMAKE_BUILD_TYPE=Release \
+                            -DCMAKE_CXX_STANDARD=23 \
+                            -DCMAKE_CXX_FLAGS=-Werror \
+                            -DKokkos_ARCH_NATIVE=ON \
+                            -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
+                            -DKokkos_ENABLE_BENCHMARKS=ON \
+                            -DKokkos_ENABLE_EXAMPLES=ON \
+                            -DKokkos_ENABLE_TESTS=ON \
+                            -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
+                            -DKokkos_ENABLE_SERIAL=ON \
+                          .. && \
+                          make -j8 && ctest --verbose
+                          '''
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/packages/kokkos/BUILD.md b/packages/kokkos/BUILD.md
index b0d603e6db0e9c6f76eda03f1343fce4105d2835..f80320e78b1c6285fc30ed48b51034a7cc75f83a 100644
--- a/packages/kokkos/BUILD.md
+++ b/packages/kokkos/BUILD.md
@@ -111,247 +111,4 @@ For dev-build details, consult the kokkos-spack repository [README](https://gith
 
 # Kokkos Keyword Listing
 
-## Device Backends
-Device backends can be enabled by specifying `-DKokkos_ENABLE_X`.
-
-* Kokkos_ENABLE_CUDA
-    * Whether to build CUDA backend
-    * BOOL Default: OFF
-* Kokkos_ENABLE_HPX
-    * Whether to build HPX backend (experimental)
-    * BOOL Default: OFF
-* Kokkos_ENABLE_OPENMP
-    * Whether to build OpenMP backend
-    * BOOL Default: OFF
-* Kokkos_ENABLE_THREADS
-    * Whether to build C++ thread backend
-    * BOOL Default: OFF
-* Kokkos_ENABLE_SERIAL
-    * Whether to build serial backend
-    * BOOL Default: ON
-* Kokkos_ENABLE_HIP (Experimental)
-    * Whether to build HIP backend
-    * BOOL Default: OFF
-* Kokkos_ENABLE_OPENMPTARGET (Experimental)
-    * Whether to build the OpenMP target backend
-    * BOOL Default: OFF
-
-## Enable Options
-Options can be enabled by specifying `-DKokkos_ENABLE_X`.
-
-* Kokkos_ENABLE_AGGRESSIVE_VECTORIZATION
-    * Whether to aggressively vectorize loops
-    * BOOL Default: OFF
-* Kokkos_ENABLE_COMPILER_WARNINGS
-    * Whether to print all compiler warnings
-    * BOOL Default: OFF
-* Kokkos_ENABLE_CUDA_CONSTEXPR
-    * Whether to activate experimental relaxed constexpr functions
-    * BOOL Default: OFF
-* Kokkos_ENABLE_CUDA_LAMBDA
-    * Whether to activate experimental lambda features
-    * BOOL Default: OFF
-* Kokkos_ENABLE_CUDA_LDG_INTRINSIC
-    * Deprecated since 4.0, LDG intrinsics are always enabled.
-    * Whether to use CUDA LDG intrinsics
-    * BOOL Default: OFF
-* Kokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-    * Whether to enable relocatable device code (RDC) for CUDA
-    * BOOL Default: OFF
-* Kokkos_ENABLE_CUDA_UVM
-    * Deprecated since 4.0
-    * Whether to use unified memory (UM) by default for CUDA
-    * BOOL Default: OFF
-* Kokkos_ENABLE_DEBUG
-    * Whether to activate extra debug features - may increase compile times
-    * BOOL Default: OFF
-* Kokkos_ENABLE_DEBUG_BOUNDS_CHECK
-    * Whether to use bounds checking - will increase runtime
-    * BOOL Default: OFF
-* Kokkos_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK
-    * Debug check on dual views
-    * BOOL Default: OFF
-* Kokkos_ENABLE_EXAMPLES
-    * Whether to enable building examples
-    * BOOL Default: OFF
-* Kokkos_ENABLE_HPX_ASYNC_DISPATCH
-    * Whether HPX supports asynchronous dispatch
-    * BOOL Default: OFF
-* Kokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC
-    * Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2). This is an experimental performance feature and currently has issue when using with UCX. See https://github.com/kokkos/kokkos/issues/4228 for more details.
-    * BOOL Default: OFF
-* Kokkos_ENABLE_LARGE_MEM_TESTS
-    * Whether to perform extra large memory tests
-    * BOOL_Default: OFF
-* Kokkos_ENABLE_PROFILING_LOAD_PRINT
-    * Whether to print information about which profiling tools gotloaded
-    * BOOL Default: OFF
-* Kokkos_ENABLE_TESTS
-    * Whether to enable test suite
-    * BOOL Default: OFF
-
-
-## Third-party Libraries (TPLs)
-The following options control enabling TPLs:
-* Kokkos_ENABLE_HPX
-    * Whether to enable the HPX library
-    * BOOL Default: OFF
-* Kokkos_ENABLE_HWLOC
-    * Whether to enable the HWLOC library
-    * BOOL Default: Off
-* Kokkos_ENABLE_LIBNUMA
-    * Whether to enable the LIBNUMA library
-    * BOOL Default: Off
-* Kokkos_ENABLE_MEMKIND
-    * Whether to enable the MEMKIND library
-    * BOOL Default: Off
-* Kokkos_ENABLE_LIBDL
-    * Whether to enable the LIBDL library
-    * BOOL Default: On
-* Kokkos_ENABLE_LIBRT
-    * Whether to enable the LIBRT library
-    * BOOL Default: Off
-
-The following options control finding and configuring non-CMake TPLs:
-* Kokkos_CUDA_DIR or CUDA_ROOT
-    * Location of CUDA install prefix for libraries
-    * PATH Default:
-* Kokkos_HWLOC_DIR or HWLOC_ROOT
-    * Location of HWLOC install prefix
-    * PATH Default:
-* Kokkos_LIBNUMA_DIR or LIBNUMA_ROOT
-    * Location of LIBNUMA install prefix
-    * PATH Default:
-* Kokkos_MEMKIND_DIR or MEMKIND_ROOT
-    * Location of MEMKIND install prefix
-    * PATH Default:
-* Kokkos_LIBDL_DIR or LIBDL_ROOT
-    * Location of LIBDL install prefix
-    * PATH Default:
-* Kokkos_LIBRT_DIR or LIBRT_ROOT
-    * Location of LIBRT install prefix
-    * PATH Default:
-
-The following options control `find_package` paths for CMake-based TPLs:
-* HPX_DIR or HPX_ROOT
-    * Location of HPX prefix (ROOT) or CMake config file (DIR)
-    * PATH Default:
-
-## Architecture Keywords
-Architecture-specific optimizations can be enabled by specifying `-DKokkos_ARCH_X`.
-
-* Kokkos_ARCH_NATIVE
-    * Whether to optimize for the the local CPU architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_AMDAVX
-    * Whether to optimize for the AMDAVX architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_ARMV80
-    * Whether to optimize for the ARMV80 architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_ARMV81
-    * Whether to optimize for the ARMV81 architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_ARMV8_THUNDERX
-    * Whether to optimize for the ARMV8_THUNDERX architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_ARMV8_TX2
-    * Whether to optimize for the ARMV8_TX2 architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_BDW
-    * Whether to optimize for the BDW architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_BGQ
-    * Whether to optimize for the BGQ architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_ZEN
-    * Whether to optimize for the Zen architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_ZEN2
-    * Whether to optimize for the Zen2 architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_ZEN3
-    * Whether to optimize for the Zen3 architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_HSW
-    * Whether to optimize for the HSW architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_KEPLER30
-    * Whether to optimize for the KEPLER30 architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_KEPLER32
-    * Whether to optimize for the KEPLER32 architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_KEPLER35
-    * Whether to optimize for the KEPLER35 architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_KEPLER37
-    * Whether to optimize for the KEPLER37 architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_KNC
-    * Whether to optimize for the KNC architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_KNL
-    * Whether to optimize for the KNL architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_MAXWELL50
-    * Whether to optimize for the MAXWELL50 architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_MAXWELL52
-    * Whether to optimize for the MAXWELL52 architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_MAXWELL53
-    * Whether to optimize for the MAXWELL53 architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_PASCAL60
-    * Whether to optimize for the PASCAL60 architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_PASCAL61
-    * Whether to optimize for the PASCAL61 architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_POWER7
-    * Whether to optimize for the POWER7 architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_POWER8
-    * Whether to optimize for the POWER8 architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_POWER9
-    * Whether to optimize for the POWER9 architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_ICL
-    * Whether to optimize for the ICL architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_ICX
-    * Whether to optimize for the ICX architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_SKL
-    * Whether to optimize for the SKL architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_SKX
-    * Whether to optimize for the SKX architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_SNB
-    * Whether to optimize for the SNB architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_SPR
-    * Whether to optimize for the SPR architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_TURING75
-    * Whether to optimize for the TURING75 architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_VOLTA70
-    * Whether to optimize for the VOLTA70 architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_VOLTA72
-    * Whether to optimize for the VOLTA72 architecture
-    * BOOL Default: OFF
-* Kokkos_ARCH_WSM
-    * Whether to optimize for the WSM architecture
-    * BOOL Default: OFF
-
-##### [LICENSE](https://github.com/kokkos/kokkos/blob/devel/LICENSE)
-
-[![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause)
-
-Under the terms of Contract DE-NA0003525 with NTESS,
-the U.S. Government retains certain rights in this software.
+Please refer to our [wiki](https://kokkos.github.io/kokkos-core-wiki/keywords.html#cmake-keywords).
diff --git a/packages/kokkos/CHANGELOG.md b/packages/kokkos/CHANGELOG.md
index a381f16129fd4846a8f786da0172556ff2cf5034..c6115f4b3d27d187196cd341d74ed7a7d1ad4e5c 100644
--- a/packages/kokkos/CHANGELOG.md
+++ b/packages/kokkos/CHANGELOG.md
@@ -1,4 +1,179 @@
-# Change Log
+# CHANGELOG
+
+## [4.2.00](https://github.com/kokkos/kokkos/tree/4.2.00) (2023-11-06)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/4.1.00...4.2.00)
+
+### Features:
+- SIMD: significant improvements to SIMD support and alignment with C++26 SIMD
+  - add `Kokkos::abs` overload for SIMD types [\#6069](https://github.com/kokkos/kokkos/pull/6069)
+  - add generator constructors [\#6347](https://github.com/kokkos/kokkos/pull/6347)
+  - convert binary operators to hidden friends [\#6320](https://github.com/kokkos/kokkos/pull/6320)
+  - add shift operators [\#6109](https://github.com/kokkos/kokkos/pull/6109)
+  - add `float` support [\#6177](https://github.com/kokkos/kokkos/pull/6177)
+  - add remaining `gather_from` and `scatter_to` overloads [\#6220](https://github.com/kokkos/kokkos/pull/6220)
+  - define simd math function overloads in the Kokkos namespace [\#6465](https://github.com/kokkos/kokkos/pull/6465), [\#6487](https://github.com/kokkos/kokkos/pull/6487)
+  - `Kokkos_ENABLE_NATIVE=ON` autodetects SIMD types supported [\#6188](https://github.com/kokkos/kokkos/pull/6188)
+  - fix AVX2 SIMD support for ZEN2 AMD CPU [\#6238](https://github.com/kokkos/kokkos/pull/6238)
+- `Kokkos::printf` [\#6083](https://github.com/kokkos/kokkos/pull/6083)
+- `Kokkos::sort`: support custom comparator [\#6253](https://github.com/kokkos/kokkos/pull/6253)
+- `half_t` and `bhalf_t` numeric traits [\#5778](https://github.com/kokkos/kokkos/pull/5778)
+- `half_t` and `bhalf_t` mixed comparisons [\#6407](https://github.com/kokkos/kokkos/pull/6407)
+- `half_t` and `bhalf_t` mathematical functions [\#6124](https://github.com/kokkos/kokkos/pull/6124)
+- `TeamThreadRange` `parallel_scan` with return value [\#6090](https://github.com/kokkos/kokkos/pull/6090), [\#6301](https://github.com/kokkos/kokkos/pull/6301), [\#6302](https://github.com/kokkos/kokkos/pull/6302), [\#6303](https://github.com/kokkos/kokkos/pull/6303), [\#6307](https://github.com/kokkos/kokkos/pull/6307)
+- `ThreadVectorRange` `parallel_scan` with return value [\#6235](https://github.com/kokkos/kokkos/pull/6235), [\#6242](https://github.com/kokkos/kokkos/pull/6242), [\#6308](https://github.com/kokkos/kokkos/pull/6308), [\#6305](https://github.com/kokkos/kokkos/pull/6305), [\#6292](https://github.com/kokkos/kokkos/pull/6292)
+- Add team-level std algorithms [\#6200](https://github.com/kokkos/kokkos/pull/6200), [\#6205](https://github.com/kokkos/kokkos/pull/6205), [\#6207](https://github.com/kokkos/kokkos/pull/6207), [\#6208](https://github.com/kokkos/kokkos/pull/6208), [\#6209](https://github.com/kokkos/kokkos/pull/6209), [\#6210](https://github.com/kokkos/kokkos/pull/6210), [\#6211](https://github.com/kokkos/kokkos/pull/6211), [\#6212](https://github.com/kokkos/kokkos/pull/6212), [\#6213](https://github.com/kokkos/kokkos/pull/6213), [\#6256](https://github.com/kokkos/kokkos/pull/6256), [\#6258](https://github.com/kokkos/kokkos/pull/6258), [\#6350](https://github.com/kokkos/kokkos/pull/6350), [\#6351](https://github.com/kokkos/kokkos/pull/6351)
+- Serial: Allow for distinct execution space instances [\#6441](https://github.com/kokkos/kokkos/pull/6441)
+
+### Backend and Architecture Enhancements:
+
+#### CUDA:
+- Fixed potential data race in Cuda `parallel_reduce` [\#6236](https://github.com/kokkos/kokkos/pull/6236)
+- Use `cudaMallocAsync` by default [\#6402](https://github.com/kokkos/kokkos/pull/6402)
+- Bugfix for using Kokkos from a thread of execution [\#6299](https://github.com/kokkos/kokkos/pull/6299)
+
+#### HIP:
+- New naming convention for AMD GPU: VEGA906, VEGA908, VEGA90A, NAVI1030 to AMD_GFX906, AMD_GFX908, AMD_GFX90A, AMD_GFX1030 [\#6266](https://github.com/kokkos/kokkos/pull/6266)
+- Add initial support for gfx942: [\#6358](https://github.com/kokkos/kokkos/pull/6358)
+- Improve reduction performance [\#6229](https://github.com/kokkos/kokkos/pull/6229)
+- Deprecate `HIP(hipStream_t,bool)` constructor [\#6401](https://github.com/kokkos/kokkos/pull/6401)
+- Add support for Graph [\#6370](https://github.com/kokkos/kokkos/pull/6370)
+- Improve reduction performance when using Teams [\#6284](https://github.com/kokkos/kokkos/pull/6284)
+- Fix concurrency calculation [\#6479](https://github.com/kokkos/kokkos/pull/6479)
+- Fix potential data race in HIP `parallel_reduce` [\#6429](https://github.com/kokkos/kokkos/pull/6429)
+
+#### SYCL:
+- Enforce external `sycl::queues` to be in-order [\#6246](https://github.com/kokkos/kokkos/pull/6246)
+- Improve reduction performance: [\#6272](https://github.com/kokkos/kokkos/pull/6272) [\#6271](https://github.com/kokkos/kokkos/pull/6271) [\#6270](https://github.com/kokkos/kokkos/pull/6270) [\#6264](https://github.com/kokkos/kokkos/pull/6264) 
+- Allow using the SYCL execution space on AMD GPUs [\#6321](https://github.com/kokkos/kokkos/pull/6321)
+- Allow sorting via native oneDPL to support Views with stride=1 [\#6322](https://github.com/kokkos/kokkos/pull/6322)
+- Make in-order queues the default via macro [\#6189](https://github.com/kokkos/kokkos/pull/6189)
+
+#### OpenACC:
+- Support Clacc compiler [\#6250](https://github.com/kokkos/kokkos/pull/6250)
+
+### General Enhancements
+- Add missing `is_*_view` traits and `is_*_view_v` helper variable templates for `DynRankView`, `DynamicView`, `OffsetView`, `ScatterView` containers [\#6195](https://github.com/kokkos/kokkos/pull/6195)
+- Make `nvcc_wrapper` and `compiler_launcher` scripts more portable by switching to a `#!/usr/bin/env` shebang [\#6357](https://github.com/kokkos/kokkos/pull/6357)
+- Add an improved `Kokkos::malloc` / `Kokkos::free` performance test [\#6377](https://github.com/kokkos/kokkos/pull/6377)
+- Ensure `Views` with `size==0` can be used with `deep_copy` [\#6273](https://github.com/kokkos/kokkos/pull/6273)
+- `Kokkos::abort` is moved to header `Kokkos_Abort.hpp` [\#6445](https://github.com/kokkos/kokkos/pull/6445)
+- `KOKKOS_ASSERT`, `KOKKOS_EXPECTS`, `KOKKOS_ENSURES` are moved to header `Kokkos_Assert.hpp` [\#6445](https://github.com/kokkos/kokkos/pull/6445)
+- Add a permuted-index mode to the gups benchmark [\#6378](https://github.com/kokkos/kokkos/pull/6378)
+- Check for overflow during backend initialization [\#6159](https://github.com/kokkos/kokkos/pull/6159)
+- Make constraints on `Kokkos::sort` more visible [\#6234](https://github.com/kokkos/kokkos/pull/6234) and cleanup API [\#6239](https://github.com/kokkos/kokkos/pull/6239)
+- Add converting assignment to `DualView`:  [\#6474](https://github.com/kokkos/kokkos/pull/6474)
+
+
+### Build System Changes 
+
+- Export `Kokkos_CXX_COMPILER_VERSION` [\#6282](https://github.com/kokkos/kokkos/pull/6282)
+- Disable default oneDPL support in Trilinos [\#6342](https://github.com/kokkos/kokkos/pull/6342)
+
+### Incompatibilities (i.e. breaking changes)
+ - Ensure that `Kokkos::complex` only gets instantiated for cv-unqualified floating-point types  [\#6251](https://github.com/kokkos/kokkos/pull/6251)
+ - Removed (deprecated-3) support for volatile join operators in reductions [\#6385](https://github.com/kokkos/kokkos/pull/6385)
+ - Enforce `ViewCtorArgs` restrictions for `create_mirror_view` [\#6304](https://github.com/kokkos/kokkos/pull/6304)
+ - SIMD types for ARM NEON are not autodetected anymore but need `Kokkos_ARCH_ARM_NEON` or `Kokkos_ARCH_NATIVE=ON` [\#6394](https://github.com/kokkos/kokkos/pull/6394)
+ - Remove `#include <iostream>` from headers where possible [\#6482](https://github.com/kokkos/kokkos/pull/6482)
+
+### Deprecations
+- Deprecated `Kokkos::vector` [\#6252](https://github.com/kokkos/kokkos/pull/6252)
+- All host allocation mechanisms except for `STD_MALLOC` have been deprecated [\#6341](https://github.com/kokkos/kokkos/pull/6341)
+
+### Bug Fixes
+ - Missing memory fence in `RandomPool::free_state` functions [\#6290](https://github.com/kokkos/kokkos/pull/6290)
+ - Fix for corner case in `Kokkos::Experimental::is_partitioned` algorithm [\#6257](https://github.com/kokkos/kokkos/pull/6257)
+ - Fix initialization of scratch lock variables in the `Cuda` backend [\#6433](https://github.com/kokkos/kokkos/pull/6433)
+ - Fixes for `Kokkos::Array` [\#6372](https://github.com/kokkos/kokkos/pull/6372)
+ - Fixed symlink configure issue for Windows [\#6241](https://github.com/kokkos/kokkos/pull/6241)
+ - OpenMPTarget init-join fix [\#6444](https://github.com/kokkos/kokkos/pull/6444)
+ - Fix atomic operations bug for Min and Max [\#6435](https://github.com/kokkos/kokkos/pull/6435)
+ - Fix implementation for `cyl_bessel_i0` [\#6484](https://github.com/kokkos/kokkos/pull/6484)
+ - Fix various NVCC warnings in `BinSort`, `Array`, and bit manipulation function templates [\#6483](https://github.com/kokkos/kokkos/pull/6483)
+
+## [4.1.00](https://github.com/kokkos/kokkos/tree/4.1.00) (2023-06-16)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/4.0.01...4.1.00)
+
+### Features:
+* Add `<Kokkos_BitManipulation.hpp>` header [\#4577](https://github.com/kokkos/kokkos/pull/4577) [\#5907](https://github.com/kokkos/kokkos/pull/5907) [\#5967](https://github.com/kokkos/kokkos/pull/5967) [\#6101](https://github.com/kokkos/kokkos/pull/6101)
+* Add `UnorderedMapInsertOpTypes` [\#5877](https://github.com/kokkos/kokkos/pull/5877) and documentation [\#350](https://github.com/kokkos/kokkos-core-wiki/pull/350)
+* Add multiple reducers support for team-level parallel reduce [\#5727](https://github.com/kokkos/kokkos/pull/5727)
+
+### Backend and Architecture Enhancements:
+
+#### CUDA:
+
+* Allow NVCC 12 to compile using C++20 flag [\#5977](https://github.com/kokkos/kokkos/pull/5977)
+* Remove ability to disable CMake option `Kokkos_ENABLE_CUDA_LAMBDA` and unconditionally enable CUDA extended lambda support. [\#5964](https://github.com/kokkos/kokkos/pull/5964)
+* Drop unnecessary fences around the memory allocation when using `CudaUVMSpace` in views [\#6008](https://github.com/kokkos/kokkos/pull/6008)
+
+#### HIP:
+* Improve performance for `parallel_reduce`. Use different parameters for `LightWeight` kernels [\#6029](https://github.com/kokkos/kokkos/pull/6029) and [\#6160](https://github.com/kokkos/kokkos/pull/6160)
+
+#### SYCL:
+* Only pass one wrapper object in SYCL reductions [\#6047](https://github.com/kokkos/kokkos/pull/6047)
+* Improve and simplify parallel_scan implementation [\#6064](https://github.com/kokkos/kokkos/pull/6064)
+* Remove workaround for submit_barrier not being enqueued properly [\#5504](https://github.com/kokkos/kokkos/pull/5504)
+* Fix guards for using scratch space with SYCL [\#6003](https://github.com/kokkos/kokkos/pull/6003)
+* Fix compiling SYCL with KOKKOS_IMPL_DO_NOT_USE_PRINTF_USAGE [\#6219](https://github.com/kokkos/kokkos/pull/6219)
+
+#### OpenMPTarget:
+* Improve hierarchical parallelism for Intel architectures [\#6043](https://github.com/kokkos/kokkos/pull/6043)
+* Enable Cray compiler for the OpenMPTarget backend. [\#5889](https://github.com/kokkos/kokkos/pull/5889)
+
+#### HPX:
+* Update HPX backend to use HPX's sender/receiver functionality [\#5628](https://github.com/kokkos/kokkos/pull/5628)
+* Increase minimum required HPX version to 1.8.0 [\#6132](https://github.com/kokkos/kokkos/pull/6132)
+* Implement HPX::in_parallel [\#6143](https://github.com/kokkos/kokkos/pull/6143)
+
+### General Enhancements
+* Export CMake `Kokkos_{CUDA,HIP}_ARCHITECTURES` variables [\#5919](https://github.com/kokkos/kokkos/pull/5919) [\#5925](https://github.com/kokkos/kokkos/pull/5925)
+* Add `Kokkos::Profiling::ScopedRegion` [\#5959](https://github.com/kokkos/kokkos/pull/5959) [\#5972](https://github.com/kokkos/kokkos/pull/5972)
+* Add support for `View::rank[_dynamic]()`[\#5870](https://github.com/kokkos/kokkos/pull/5870)
+* Detect incompatible relocatable device code mode to prevent ODR violations [\#5991](https://github.com/kokkos/kokkos/pull/5991)
+* Add (experimental) support for 32-bit Darwin and PPC [\#5916](https://github.com/kokkos/kokkos/pull/5916)
+* Add missing half and bhalf specialization of the infinity numeric trait [\#6055](https://github.com/kokkos/kokkos/pull/6055)
+* Add `is_dual_view` trait and align further with regular view [\#6120](https://github.com/kokkos/kokkos/pull/6120)
+* Allow templated functors in parallel_for, parallel_reduce and parallel_scan [\#5976](https://github.com/kokkos/kokkos/pull/5976)
+* Define KOKKOS_COMPILER_INTEL_LLVM and only define at most one KOKKOS_COMPILER* macro [\#5906](https://github.com/kokkos/kokkos/pull/5906)
+* Allow linking against build tree [\#6078](https://github.com/kokkos/kokkos/pull/6078)
+* Allow passing a temporary std::vector to partition_space [\#6167](https://github.com/kokkos/kokkos/pull/6167)
+* `Kokkos` can be used as an external dependency in `Trilinos` [\#6142](https://github.com/kokkos/kokkos/pull/6142), [\#6157](https://github.com/kokkos/kokkos/pull/6157) [\#6163](https://github.com/kokkos/kokkos/pull/6163)
+* Left align demangled stacktrace output [\#6191](https://github.com/kokkos/kokkos/pull/6191)
+* Improve OpenMP affinity warning to include MPI concerns [\#6185](https://github.com/kokkos/kokkos/pull/6185)
+
+### Build System Changes
+* Drop `Kokkos_ENABLE_LAUNCH_COMPILER` option which had no effect [\#6148](https://github.com/kokkos/kokkos/pull/6148)
+* Export variables for relevant Kokkos options with cmake[\#6142](https://github.com/kokkos/kokkos/pull/6142)
+
+### Incompatibilities (i.e. breaking changes)
+* Desul atomics always enabled [\#5801](https://github.com/kokkos/kokkos/pull/5801)
+* Drop `KOKKOS_ENABLE_CUDA_ASM*` and `KOKKOS_ENABLE_*_ATOMICS` macros [\#5940](https://github.com/kokkos/kokkos/pull/5940)
+* Drop `KOKKOS_ENABLE_RFO_PREFETCH` macro [\#5944](https://github.com/kokkos/kokkos/pull/5944)
+* Deprecate `Kokkos_ENABLE_CUDA_LAMBDA` configuration option and force it to `ON` [\#5964](https://github.com/kokkos/kokkos/pull/5964)
+* Remove TriBITS Kokkos subpackages [\#6104](https://github.com/kokkos/kokkos/pull/6104)
+* Cuda: Remove unused attach_texture_object [\#6129](https://github.com/kokkos/kokkos/pull/6129)
+* Drop Kokkos_ENABLE_PROFILING_LOAD_PRINT configuration option [\#6150](https://github.com/kokkos/kokkos/pull/6150)
+* Drop pointless Kokkos{Algorithms,Containers}_config.h files [\#6108](https://github.com/kokkos/kokkos/pull/6108)
+
+### Deprecations
+* Deprecate `BinSort`, `BinOp1D`, and `BinOp3D` default constructors [\#6131](https://github.com/kokkos/kokkos/pull/6131)
+
+### Bug Fixes
+* Fix `SYCLTeamMember` to take arguments for scratch sizes as `std::size_t` [\#5981](https://github.com/kokkos/kokkos/pull/5981)
+* Fix Kokkos_SIMD with AVX2 on 64-bit architectures [\#6075](https://github.com/kokkos/kokkos/pull/6075)
+* Fix an incorrectly returning size for SIMD uint64_t in AVX2 [\#6004](https://github.com/kokkos/kokkos/pull/6004)
+* Fix missing avx512 header file with gcc versions before 10 [\#6183](https://github.com/kokkos/kokkos/pull/6183)
+* Fix incorrect results of `parallel_reduce` of types smaller than `int` on CUDA and HIP: [\#5745](https://github.com/kokkos/kokkos/pull/5745)
+* CMake: update package compatibility mode when building within Trilinos [\#6012](https://github.com/kokkos/kokkos/pull/6012)
+* Fix warnings generated from internal uses of `ALL_t` rather than `Kokkos::ALL_t` [\#6028](https://github.com/kokkos/kokkos/pull/6028)
+* Fix bug in `hpcbind` script: check for correct Slurm variable [\#6116](https://github.com/kokkos/kokkos/pull/6116)
+* KokkosTools: Don't call callbacks before backends are initialized [\#6114](https://github.com/kokkos/kokkos/pull/6114)
+* Fix global fence in Kokkos::resize(DynRankView) [\#6184](https://github.com/kokkos/kokkos/pull/6184)
+* Fix `BinSort` support for strided views [\#6081](https://github.com/kokkos/kokkos/pull/6184)
+* Fix missing `is_*_view` traits in containers [\#6195](https://github.com/kokkos/kokkos/pull/6195)
+* Fix broken OpenMP target on NVHPC [\#6171](https://github.com/kokkos/kokkos/pull/6171)
+* Sorting an empty view should exit early and not fail [\#6130](https://github.com/kokkos/kokkos/pull/6130)
 
 ## [4.0.01](https://github.com/kokkos/kokkos/tree/4.0.01) (2023-04-14)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/4.0.00...4.0.01)
@@ -29,8 +204,9 @@
 - Fix an incorrectly returning size for SIMD uint64_t in AVX2 [\#6011](https://github.com/kokkos/kokkos/pull/6011)
 - Desul atomics: wrong value for `desul::Impl::numeric_limits_max<uint64_t>` [\#6018](https://github.com/kokkos/kokkos/pull/6018)
 - Fix warning in some user code when using std::memcpy [\#6000](https://github.com/kokkos/kokkos/pull/6000)
+- Fix excessive build times using Makefile.kokkos [\#6068](https://github.com/kokkos/kokkos/pull/6068)
 
-## [4.0.0](https://github.com/kokkos/kokkos/tree/4.0.0) (2023-02-21)
+## [4.0.0](https://github.com/kokkos/kokkos/tree/4.0.00) (2023-02-21)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/3.7.01...4.0.00)
 
 ### Features:
@@ -38,6 +214,7 @@
 - `parallel_scan` with `View` as result type. [\#5146](https://github.com/kokkos/kokkos/pull/5146)
 - Introduced `SharedSpace`, an alias for a `MemorySpace` that is accessible by every `ExecutionSpace`. The memory is moved and then accessed locally. [\#5289](https://github.com/kokkos/kokkos/pull/5289)
 - Introduced `SharedHostPinnedSpace`, an alias for a `MemorySpace` that is accessible by every `ExecutionSpace`. The memory is pinned to the host and accessed via zero-copy access. [\#5405](https://github.com/kokkos/kokkos/pull/5405)
+- Add team- and thread-level `sort`, `sort_by_key` algorithms. [\#5317](https://github.com/kokkos/kokkos/pull/5317)
 - Groundwork for `MDSpan` integration. [\#4973](https://github.com/kokkos/kokkos/pull/4973) and [\#5304](https://github.com/kokkos/kokkos/pull/5304)
 - Introduced MD version of hierarchical parallelism: `TeamThreadMDRange`, `ThreadVectorMDRange` and `TeamVectorMDRange`. [\#5238](https://github.com/kokkos/kokkos/pull/5238)
 
@@ -121,7 +298,27 @@
 - Add missing `ReductionIdentity<char>` specialization [\#5798](https://github.com/kokkos/kokkos/pull/5798)
 - Don't install standard algorithms headers multiple times [\#5670](https://github.com/kokkos/kokkos/pull/5670)
 - Fix max scratch size calculation for level 0 scratch in CUDA and HIP [\#5718](https://github.com/kokkos/kokkos/pull/5718)
-- Fix excessive build times using Makefile.kokkos [\#6068](https://github.com/kokkos/kokkos/pull/6068)
+
+## [3.7.02](https://github.com/kokkos/kokkos/tree/3.7.02) (2023-05-17)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/3.7.01...3.7.02)
+
+### Backends and Archs Enhancements:
+#### CUDA
+- Add Hopper support and update nvcc_wrapper to work with CUDA-12 [\#5693](https://github.com/kokkos/kokkos/pull/5693)
+### General Enhancements:
+- sprintf -> snprintf [\#5787](https://github.com/kokkos/kokkos/pull/5787)
+### Build System:
+- Add error message when not using `hipcc` and when `CMAKE_CXX_STANDARD` is not set [\#5945](https://github.com/kokkos/kokkos/pull/5945)
+### Bug Fixes:
+- Fix Scratch allocation alignment issues [\#5692](https://github.com/kokkos/kokkos/pull/5692)
+- Fix Intel Classic Compiler ICE [\#5710](https://github.com/kokkos/kokkos/pull/5710)
+- Don't install std algorithm headers multiple times [\#5711](https://github.com/kokkos/kokkos/pull/5711)
+- Fix static init order issue in InitalizationSettings [\#5721](https://github.com/kokkos/kokkos/pull/5721)
+- Fix src/dst Properties in deep_copy(DynamicView,View) [\#5732](https://github.com/kokkos/kokkos/pull/5732)
+- Fix build on Fedora Rawhide [\#5782](https://github.com/kokkos/kokkos/pull/5782)
+- Finalize HIP lock arrays [\#5694](https://github.com/kokkos/kokkos/pull/5694)
+- Fix CUDA lock arrays for current Desul [\#5812](https://github.com/kokkos/kokkos/pull/5812)
+- Set the correct device/context in InterOp tests [\#5701](https://github.com/kokkos/kokkos/pull/5701)
 
 ## [3.7.01](https://github.com/kokkos/kokkos/tree/3.7.01) (2022-12-01)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/3.7.00...3.7.01)
diff --git a/packages/kokkos/CMakeLists.txt b/packages/kokkos/CMakeLists.txt
index aa712f56127aab50c02aa89a6c13a3472036e36f..f6bd81058e9016b9c2f67a50ece0bef9c85e83f3 100644
--- a/packages/kokkos/CMakeLists.txt
+++ b/packages/kokkos/CMakeLists.txt
@@ -5,8 +5,8 @@ if( "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}" )
   message( FATAL_ERROR "FATAL: In-source builds are not allowed. You should create a separate directory for build files and delete CMakeCache.txt." )
 endif()
 
-if (COMMAND TRIBITS_PACKAGE_DECL)
-  TRIBITS_PACKAGE_DECL(Kokkos)
+if (COMMAND TRIBITS_PACKAGE)
+  TRIBITS_PACKAGE(Kokkos)
 endif()
 
 # We want to determine if options are given with the wrong case
@@ -37,6 +37,8 @@ IF(COMMAND TRIBITS_PACKAGE_DECL)
   SET(KOKKOS_HAS_TRILINOS ON)
 ELSE()
   SET(KOKKOS_HAS_TRILINOS OFF)
+  SET(PACKAGE_NAME Kokkos)
+  SET(PACKAGE_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
 ENDIF()
 # Is this build a subdirectory of another project
 GET_DIRECTORY_PROPERTY(HAS_PARENT PARENT_DIRECTORY)
@@ -138,14 +140,20 @@ IF (NOT CMAKE_SIZEOF_VOID_P)
     MESSAGE(FATAL_ERROR "Kokkos did not configure correctly and failed to validate compiler. The most likely cause is linkage errors during CMake compiler validation. Please consult the CMake error log shown below for the exact error during compiler validation")
   ENDIF()
 ELSEIF (NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
-  MESSAGE(FATAL_ERROR "Kokkos assumes a 64-bit build; i.e., 8-byte pointers, but found ${CMAKE_SIZEOF_VOID_P}-byte pointers instead")
+  IF(CMAKE_SIZEOF_VOID_P EQUAL 4)
+    MESSAGE(WARNING "32-bit builds are experimental and not officially supported.")
+    SET(KOKKOS_IMPL_32BIT ON)
+  ELSE()
+    MESSAGE(FATAL_ERROR "Kokkos assumes a 64-bit build, i.e., 8-byte pointers, but found ${CMAKE_SIZEOF_VOID_P}-byte pointers instead;")
+  ENDIF()
 ENDIF()
 
 
 set(Kokkos_VERSION_MAJOR 4)
-set(Kokkos_VERSION_MINOR 0)
-set(Kokkos_VERSION_PATCH 1)
+set(Kokkos_VERSION_MINOR 2)
+set(Kokkos_VERSION_PATCH 0)
 set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
+message(STATUS "Kokkos version: ${Kokkos_VERSION}")
 math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")
 # mathematical expressions below are not stricly necessary but they eliminate
 # the rather aggravating leading 0 in the releases patch version number, and,
@@ -293,10 +301,6 @@ IF (KOKKOS_HAS_TRILINOS)
     $<$<COMPILE_LANGUAGE:CXX>:${KOKKOS_ALL_COMPILE_OPTIONS}>)
 ENDIF()
 
-if (NOT COMMAND TRIBITS_PACKAGE_DECL)
-  KOKKOS_PACKAGE_DECL()
-endif()
-
 
 #------------------------------------------------------------------------------
 #
@@ -310,8 +314,6 @@ KOKKOS_PROCESS_SUBPACKAGES()
 # E) If Kokkos itself is enabled, process the Kokkos package
 #
 
-KOKKOS_PACKAGE_DEF()
-KOKKOS_EXCLUDE_AUTOTOOLS_FILES()
 KOKKOS_PACKAGE_POSTPROCESS()
 KOKKOS_CONFIGURE_CORE()
 
@@ -320,6 +322,8 @@ IF (NOT KOKKOS_HAS_TRILINOS AND NOT Kokkos_INSTALL_TESTING)
   #Make sure in-tree projects can reference this as Kokkos::
   #to match the installed target names
   ADD_LIBRARY(Kokkos::kokkos ALIAS kokkos)
+  # all_libs target is required for TriBITS-compliance
+  ADD_LIBRARY(Kokkos::all_libs ALIAS kokkos)
   TARGET_LINK_LIBRARIES(kokkos INTERFACE ${KOKKOS_COMPONENT_LIBRARIES})
   KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(kokkos)
 ENDIF()
diff --git a/packages/kokkos/Makefile.kokkos b/packages/kokkos/Makefile.kokkos
index 60cef6c7f30eb8fc3525add860d9806fb332a16b..7137ec3936cc3d4d961e9c349a15a2aae7906167 100644
--- a/packages/kokkos/Makefile.kokkos
+++ b/packages/kokkos/Makefile.kokkos
@@ -1,8 +1,8 @@
 # Default settings common options.
 
 KOKKOS_VERSION_MAJOR = 4
-KOKKOS_VERSION_MINOR = 0
-KOKKOS_VERSION_PATCH = 1
+KOKKOS_VERSION_MINOR = 2
+KOKKOS_VERSION_PATCH = 0
 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)
 
 # Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial
@@ -13,7 +13,7 @@ KOKKOS_DEVICES ?= "Threads"
 # NVIDIA:   Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Ada89,Hopper90
 # ARM:      ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
 # IBM:      BGQ,Power7,Power8,Power9
-# AMD-GPUS: Vega906,Vega908,Vega90A,Navi1030
+# AMD-GPUS: GFX906,GFX908,GFX90A,GFX942,GFX1030,GFX1100
 # AMD-CPUS: AMDAVX,Zen,Zen2,Zen3
 # Intel-GPUs: Gen9,Gen11,Gen12LP,DG1,XeHP,PVC
 KOKKOS_ARCH ?= ""
@@ -23,14 +23,14 @@ KOKKOS_DEBUG ?= "no"
 KOKKOS_USE_TPLS ?= ""
 # Options: c++17,c++1z,c++20,c++2a,c++23,c++2b
 KOKKOS_CXX_STANDARD ?= "c++17"
-# Options: aggressive_vectorization,disable_profiling,enable_large_mem_tests,disable_complex_align,disable_deprecated_code,enable_deprecation_warnings,disable_desul_atomics
+# Options: aggressive_vectorization,disable_profiling,enable_large_mem_tests,disable_complex_align,disable_deprecated_code,enable_deprecation_warnings
 KOKKOS_OPTIONS ?= ""
 KOKKOS_CMAKE ?= "no"
 KOKKOS_TRIBITS ?= "no"
 KOKKOS_STANDALONE_CMAKE ?= "no"
 
 # Default settings specific options.
-# Options: force_uvm,use_ldg,rdc,enable_lambda,enable_constexpr
+# Options: force_uvm,use_ldg,rdc,enable_lambda,enable_constexpr,disable_malloc_async
 KOKKOS_CUDA_OPTIONS ?= ""
 
 # Options: rdc
@@ -75,7 +75,6 @@ KOKKOS_INTERNAL_AGGRESSIVE_VECTORIZATION := $(call kokkos_has_string,$(KOKKOS_OP
 KOKKOS_INTERNAL_ENABLE_TUNING := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_tuning)
 KOKKOS_INTERNAL_DISABLE_COMPLEX_ALIGN := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_complex_align)
 KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_dualview_modify_check)
-KOKKOS_INTERNAL_ENABLE_PROFILING_LOAD_PRINT := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_profile_load_print)
 KOKKOS_INTERNAL_ENABLE_LARGE_MEM_TESTS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_large_mem_tests)
 # deprecated
 KOKKOS_INTERNAL_CUDA_USE_LDG := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),use_ldg)
@@ -83,9 +82,11 @@ KOKKOS_INTERNAL_CUDA_USE_UVM := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),
 KOKKOS_INTERNAL_CUDA_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),rdc)
 KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_lambda)
 KOKKOS_INTERNAL_CUDA_USE_CONSTEXPR := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_constexpr)
+KOKKOS_INTERNAL_CUDA_DISABLE_MALLOC_ASYNC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),disable_malloc_async)
 KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH := $(call kokkos_has_string,$(KOKKOS_HPX_OPTIONS),enable_async_dispatch)
 # deprecated
 KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_desul_atomics)
+# deprecated
 KOKKOS_INTERNAL_DISABLE_DESUL_ATOMICS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_desul_atomics)
 KOKKOS_INTERNAL_DISABLE_BUNDLED_MDSPAN := $(call kokkos_has_string,$(KOKKOS_OPTIONS),impl_disable_bundled_mdspan)
 KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_deprecated_code)
@@ -265,15 +266,16 @@ else
     KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
   endif
 endif
-ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
-  #KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -DKOKKOS_BUG_WORKAROUND_IBM_CLANG_OMP45_VIEW_INIT -fopenmp-implicit-declare-target -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp -fopenmp=libomp
-  KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -DKOKKOS_WORKAROUND_OPENMPTARGET_CLANG -fopenmp -fopenmp=libomp -Wno-openmp-mapping
-  KOKKOS_INTERNAL_OPENMPTARGET_LIB := -lomptarget
-else ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL_CLANG), 1)
-  KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -fiopenmp -Wno-openmp-mapping
-else
-  #Assume GCC
-  KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -fopenmp -foffload=nvptx-none
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL_CLANG), 1)
+    KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -fiopenmp -Wno-openmp-mapping
+  else ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1)
+    KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -mp=gpu 
+  else ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 0)
+    #Assume GCC
+    KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -fopenmp -foffload=nvptx-none
+  endif
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1)
@@ -401,10 +403,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN3), 0)
     KOKKOS_INTERNAL_USE_ARCH_ZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen)
   endif
 endif
-KOKKOS_INTERNAL_USE_ARCH_VEGA906 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega906)
-KOKKOS_INTERNAL_USE_ARCH_VEGA908 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega908)
-KOKKOS_INTERNAL_USE_ARCH_VEGA90A := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega90A)
-KOKKOS_INTERNAL_USE_ARCH_NAVI1030 := $(call kokkos_has_string,$(KOKKOS_ARCH),Navi1030)
+KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA906),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX906))
+KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA908),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX908))
+KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA90A),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX90A))
+KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1030),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1030))
+KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1100))
 
 # Any AVX?
 KOKKOS_INTERNAL_USE_ARCH_SSE42      := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM))
@@ -576,10 +579,6 @@ ifeq ($(KOKKOS_INTERNAL_DISABLE_COMPLEX_ALIGN), 0)
   tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_COMPLEX_ALIGN")
 endif
 
-ifeq ($(KOKKOS_INTERNAL_ENABLE_PROFILING_LOAD_PRINT), 1)
-  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_PROFILING_LOAD_PRINT")
-endif
-
 ifeq ($(KOKKOS_INTERNAL_ENABLE_TUNING), 1)
   tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_TUNING")
 endif
@@ -668,15 +667,13 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
     endif
   endif
 
-  ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LAMBDA), 1)
-    ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
-      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LAMBDA")
-      KOKKOS_CXXFLAGS += -expt-extended-lambda
-    endif
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LAMBDA")
+    KOKKOS_CXXFLAGS += -extended-lambda
+  endif
 
-    ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
-      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LAMBDA")
-    endif
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LAMBDA")
   endif
 
   ifeq ($(KOKKOS_INTERNAL_CUDA_USE_CONSTEXPR), 1)
@@ -693,11 +690,17 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
   ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
     tmp := $(call kokkos_append_header,"$H""define KOKKOS_IMPL_CUDA_CLANG_WORKAROUND")
   endif
+
+  ifeq ($(KOKKOS_INTERNAL_CUDA_DISABLE_MALLOC_ASYNC), 0)
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC")
+  else
+    tmp := $(call kokkos_append_header,"/* $H""undef KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC */")
+  endif
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
   ifeq ($(KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HPX_ASYNC_DISPATCH")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH")
   endif
 endif
 
@@ -705,6 +708,7 @@ endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV80), 1)
   tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV80")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARM_NEON")
 
   ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
     KOKKOS_CXXFLAGS +=
@@ -717,6 +721,7 @@ endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV81), 1)
   tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV81")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARM_NEON")
 
   ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
     KOKKOS_CXXFLAGS +=
@@ -729,6 +734,7 @@ endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_A64FX), 1)
   tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_A64FX")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARM_NEON")
 
   KOKKOS_CXXFLAGS += -march=armv8.2-a+sve
   KOKKOS_LDFLAGS += -march=armv8.2-a+sve
@@ -744,7 +750,7 @@ endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN), 1)
   tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_ZEN")
-  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_AVX2")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX2")
 
   ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
     KOKKOS_CXXFLAGS += -mavx2
@@ -757,7 +763,7 @@ endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN2), 1)
   tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_ZEN2")
-  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_AVX2")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX2")
 
   ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
     KOKKOS_CXXFLAGS += -mavx2
@@ -770,7 +776,7 @@ endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN3), 1)
   tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_ZEN3")
-  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_AVX2")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX2")
 
   ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
     KOKKOS_CXXFLAGS += -mavx2
@@ -784,6 +790,7 @@ endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1)
   tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV80")
   tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV8_THUNDERX")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARM_NEON")
 
   ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
     KOKKOS_CXXFLAGS +=
@@ -797,6 +804,7 @@ endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2), 1)
   tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV81")
   tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV8_THUNDERX2")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARM_NEON")
 
   ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
     KOKKOS_CXXFLAGS +=
@@ -973,143 +981,149 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
-  ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
-    KOKKOS_INTERNAL_CUDA_ARCH_FLAG=-fopenmp-targets=nvptx64 -Xopenmp-target -march
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY_CLANG), 1)
+    KOKKOS_INTERNAL_CUDA_ARCH_FLAG=-fopenmp
+    else ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
+      KOKKOS_INTERNAL_CUDA_ARCH_FLAG=-fopenmp --offload-arch
   endif
-  KOKKOS_INTERNAL_USE_CUDA_ARCH = 1
 endif
 
-ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER30")
-    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_30
-  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER32")
-    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_32
-  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER35")
-    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_35
-  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER37")
-    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_37
-  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL50")
-    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_50
-  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL52")
-    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_52
-  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL53")
-    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_53
-  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL60), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL60")
-    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_60
-  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL61")
-    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_61
-  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA70), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA70")
-    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_70
-  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA72), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA72")
-    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_72
-  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_TURING75), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_TURING")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_TURING75")
-    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_75
-  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE80), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE80")
-    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_80
-  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE86), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86")
-    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86
-  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ADA89), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ADA89")
-    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_89
-  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90")
-    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90
-  endif
-
-  ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
-    KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
-
-    ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
-      KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
+# Do not add this flag if its the cray compiler or the nvhpc compiler.
+ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY_CLANG), 0)
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0)
+    # Lets start with adding architecture defines
+    ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1)
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER")
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER30")
+      KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_30
     endif
-    ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
-      ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
-        KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
-      endif
+    ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1)
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER")
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER32")
+      KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_32
+    endif
+    ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1)
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER")
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER35")
+      KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_35
+    endif
+    ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1)
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER")
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER37")
+      KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_37
+    endif
+    ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1)
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL")
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL50")
+      KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_50
+    endif
+    ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1)
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL")
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL52")
+      KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_52
+    endif
+    ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1)
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL")
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL53")
+      KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_53
+    endif
+    ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL60), 1)
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL")
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL60")
+      KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_60
+    endif
+    ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1)
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL")
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL61")
+      KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_61
+    endif
+    ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA70), 1)
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA")
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA70")
+      KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_70
+    endif
+    ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA72), 1)
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA")
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA72")
+      KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_72
+    endif
+    ifeq ($(KOKKOS_INTERNAL_USE_ARCH_TURING75), 1)
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_TURING75")
+      KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_75
+    endif
+    ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE80), 1)
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE")
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE80")
+      KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_80
+    endif
+    ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE86), 1)
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE")
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86")
+      KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86
+    endif
+    ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ADA89), 1)
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ADA89")
+      KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_89
+    endif
+    ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1)
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER")
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90")
+      KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90
     endif
   endif
 endif
 
+ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
 
-# Figure out the architecture flag for ROCm.
-ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
-  # Lets start with adding architecture defines
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA906), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA906")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA")
-    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx906
-  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA908), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA908")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA")
-    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx908
-  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA90A), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA90A")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA")
-    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx90a
-  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NAVI1030), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_NAVI1030")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_NAVI")
-    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx1030
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
+    KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
   endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NAVI1100), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_NAVI1100")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_NAVI")
-    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx1100
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
+    ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
+      KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
+    endif
   endif
+endif
+
 
+# Figure out the architecture flag for ROCm.
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX906")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU")
+  KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx906
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX908")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU")
+  KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx908
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX90A")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU")
+  KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx90a
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX942")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU")
+  KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx942
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX1030")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU")
+  KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx1030
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX1100")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU")
+  KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx1100
+endif
 
+
+ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
   KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.cpp)
+  KOKKOS_SRC += $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp
   KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.hpp)
-  ifeq ($(KOKKOS_INTERNAL_DISABLE_DESUL_ATOMICS), 0)
-    KOKKOS_SRC += $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp
-  endif
 
   KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_HIP_ARCH_FLAG)
   KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_HIP_ARCH_FLAG)
@@ -1182,12 +1196,14 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
   KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_INTEL_ARCH_FLAG)
 endif
 
-ifeq ($(KOKKOS_INTERNAL_DISABLE_DESUL_ATOMICS), 0)
-  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_DESUL_ATOMICS")
-  KOKKOS_CPPFLAGS+=-I$(KOKKOS_PATH)/tpls/desul/include
-else ifeq ($(KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS), 1)
-  $(error Contradictory Desul atomics options: KOKKOS_OPTIONS=$(KOKKOS_OPTIONS) )
+ifeq ($(KOKKOS_INTERNAL_DISABLE_DESUL_ATOMICS), 1)
+  $(warning disable_desul_atomics option has been removed. Desul atomics cannot be disabled.)
+  KOKKOS_INTERNAL_DISABLE_DESUL_ATOMICS := 0
+endif
+ifeq ($(KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS), 1)
+  $(warning enable_desul_atomics option has been removed. Desul atomics are always enabled.)
 endif
+KOKKOS_CPPFLAGS+=-I$(KOKKOS_PATH)/tpls/desul/include
 
 ifeq ($(KOKKOS_INTERNAL_DISABLE_BUNDLED_MDSPAN), 0)
   KOKKOS_CPPFLAGS+=-I$(KOKKOS_PATH)/tpls/mdspan/include
@@ -1229,6 +1245,7 @@ ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0)
   ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
     tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_SYCL.hpp>","KokkosCore_Config_FwdBackend.hpp")
     tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_SYCL.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <setup/Kokkos_Setup_SYCL.hpp>","KokkosCore_Config_SetupBackend.hpp")
   endif
   ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
     tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HIP.hpp>","KokkosCore_Config_FwdBackend.hpp")
@@ -1240,8 +1257,8 @@ ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0)
     tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_OPENMP.hpp>","KokkosCore_Config_DeclareBackend.hpp")
   endif
   ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1)
-    tmp := $(call kokkos_append_config_header,"\#include <fwd/Kokkos_Fwd_OPENACC.hpp>","KokkosCore_Config_FwdBackend.hpp")
-    tmp := $(call kokkos_append_config_header,"\#include <decl/Kokkos_Declare_OPENACC.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_OPENACC.hpp>","KokkosCore_Config_FwdBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_OPENACC.hpp>","KokkosCore_Config_DeclareBackend.hpp")
   endif
   ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1)
     tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_THREADS.hpp>","KokkosCore_Config_FwdBackend.hpp")
@@ -1272,9 +1289,7 @@ KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.cpp)
 
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
   KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cpp)
-  ifeq ($(KOKKOS_INTERNAL_DISABLE_DESUL_ATOMICS), 0)
-    KOKKOS_SRC += $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_CUDA.cpp
-  endif
+  KOKKOS_SRC += $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_CUDA.cpp
   KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
   ifneq ($(CUDA_PATH),)
     KOKKOS_CPPLAGS += -I$(CUDA_PATH)/include
@@ -1390,11 +1405,7 @@ KOKKOS_LIBS := -lkokkos ${KOKKOS_LIBS}
 
 # Generating the <desul/atomics/Config.hpp> header
 DESUL_INTERNAL_CONFIG_TMP=Desul_Config.tmp
-ifeq ($(KOKKOS_INTERNAL_DISABLE_DESUL_ATOMICS), 0)
-  DESUL_CONFIG_HEADER=desul/atomics/Config.hpp
-else
-  DESUL_CONFIG_HEADER=NothingToSeeHereMoveAlong
-endif
+DESUL_CONFIG_HEADER=desul/atomics/Config.hpp
 desul_append_header = $(shell echo $1 >> $(DESUL_INTERNAL_CONFIG_TMP))
 tmp := $(call desul_append_header, "// generated by on-demand build system by crtrott" > $(DESUL_INTERNAL_CONFIG_TMP))
 tmp := $(call desul_append_header, "$H""ifndef DESUL_ATOMICS_CONFIG_HPP_")
@@ -1405,12 +1416,22 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
 else
   tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_CUDA */")
 endif
+ifeq ($(KOKKOS_INTERNAL_CUDA_USE_RELOC), 1)
+  tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION")
+else
+  tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION */")
+endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
   tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_HIP")
 else
   tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_HIP */")
 endif
+ifeq ($(KOKKOS_INTERNAL_HIP_USE_RELOC), 1)
+  tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION")
+else
+  tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION */")
+endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
   tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_SYCL")
diff --git a/packages/kokkos/Makefile.targets b/packages/kokkos/Makefile.targets
index 32b1fab2615dda91ab573de6166a157e3466fd1b..ec8770dd7de048f66333f31b97454fa9f89c3db7 100644
--- a/packages/kokkos/Makefile.targets
+++ b/packages/kokkos/Makefile.targets
@@ -36,6 +36,8 @@ Kokkos_HostSpace_deepcopy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp
 Kokkos_NumericTraits.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_NumericTraits.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_NumericTraits.cpp
+Kokkos_Abort.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Abort.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Abort.cpp
 
 ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
 Kokkos_Serial.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial.cpp
@@ -51,8 +53,6 @@ Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cu
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
 Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
-Kokkos_Cuda_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
 Lock_Array_CUDA.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_CUDA.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_CUDA.cpp
 endif
@@ -77,8 +77,6 @@ Kokkos_HIP_Space.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Space.cpp
 Kokkos_HIP_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Instance.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Instance.cpp
-Kokkos_HIP_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Locks.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Locks.cpp
 Lock_Array_HIP.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp
 endif
@@ -89,6 +87,8 @@ Kokkos_ThreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokk
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+Kokkos_OpenMP.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP.cpp
 Kokkos_OpenMP_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp
 Kokkos_OpenMP_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
diff --git a/packages/kokkos/algorithms/CMakeLists.txt b/packages/kokkos/algorithms/CMakeLists.txt
index f32363dc9acd5c1a972120f7309ea7745776e71d..368984647e9fbe3b3d9b5aa4dfe01457edbbd52c 100644
--- a/packages/kokkos/algorithms/CMakeLists.txt
+++ b/packages/kokkos/algorithms/CMakeLists.txt
@@ -1,15 +1,7 @@
-
-
-KOKKOS_SUBPACKAGE(Algorithms)
-
 IF (NOT Kokkos_INSTALL_TESTING)
   ADD_SUBDIRECTORY(src)
 ENDIF()
 # FIXME_OPENACC: temporarily disabled due to unimplemented features
-IF(NOT ((KOKKOS_ENABLE_OPENMPTARGET OR KOKKOS_ENABLE_OPENACC) AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC))
+IF(NOT ((KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) OR KOKKOS_ENABLE_OPENACC))
   KOKKOS_ADD_TEST_DIRECTORIES(unit_tests)
 ENDIF()
-
-KOKKOS_SUBPACKAGE_POSTPROCESS()
-
-
diff --git a/packages/kokkos/algorithms/cmake/Dependencies.cmake b/packages/kokkos/algorithms/cmake/Dependencies.cmake
deleted file mode 100644
index c36b62523fadb628e970b6eccf57a9caaa317f1e..0000000000000000000000000000000000000000
--- a/packages/kokkos/algorithms/cmake/Dependencies.cmake
+++ /dev/null
@@ -1,5 +0,0 @@
-TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
-  LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers
-  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
-  TEST_OPTIONAL_TPLS CUSPARSE
-  )
diff --git a/packages/kokkos/algorithms/cmake/KokkosAlgorithms_config.h.in b/packages/kokkos/algorithms/cmake/KokkosAlgorithms_config.h.in
deleted file mode 100644
index 67334b70f36b6db55b225f25c91d8a8c4cb3aaab..0000000000000000000000000000000000000000
--- a/packages/kokkos/algorithms/cmake/KokkosAlgorithms_config.h.in
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef KOKKOS_ALGORITHMS_CONFIG_H
-#define KOKKOS_ALGORITHMS_CONFIG_H
-
-#endif
diff --git a/packages/kokkos/algorithms/src/CMakeLists.txt b/packages/kokkos/algorithms/src/CMakeLists.txt
index 606d83d18b589183cb432ff65e576a674ec9dc6a..169577894728a3b75111c3d2398a0c015950fb1f 100644
--- a/packages/kokkos/algorithms/src/CMakeLists.txt
+++ b/packages/kokkos/algorithms/src/CMakeLists.txt
@@ -1,6 +1,3 @@
-
-KOKKOS_CONFIGURE_FILE(${PACKAGE_NAME}_config.h)
-
 #I have to leave these here for tribits
 KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
 KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
@@ -9,7 +6,6 @@ KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
 
 FILE(GLOB ALGO_HEADERS *.hpp)
 FILE(GLOB ALGO_SOURCES *.cpp)
-LIST(APPEND ALGO_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h)
 APPEND_GLOB(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/*.hpp)
 APPEND_GLOB(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/impl/*.hpp)
 
diff --git a/packages/kokkos/algorithms/src/Kokkos_NestedSort.hpp b/packages/kokkos/algorithms/src/Kokkos_NestedSort.hpp
index 4c8be792d816a8032d3e2e3abbc0458b327ec26c..18e0674efea22549fa973e27b524577041fb89f2 100644
--- a/packages/kokkos/algorithms/src/Kokkos_NestedSort.hpp
+++ b/packages/kokkos/algorithms/src/Kokkos_NestedSort.hpp
@@ -14,175 +14,17 @@
 //
 //@HEADER
 
-#ifndef KOKKOS_NESTEDSORT_HPP_
-#define KOKKOS_NESTEDSORT_HPP_
-
-#include <Kokkos_Core.hpp>
-#include <std_algorithms/impl/Kokkos_HelperPredicates.hpp>
-#include <std_algorithms/Kokkos_Swap.hpp>
-
-namespace Kokkos {
-namespace Experimental {
-namespace Impl {
-
-// true for TeamVectorRange, false for ThreadVectorRange
-template <bool teamLevel>
-struct NestedRange {};
-
-// Specialization for team-level
-template <>
-struct NestedRange<true> {
-  template <typename TeamMember, typename SizeType>
-  KOKKOS_FUNCTION static auto create(const TeamMember& t, SizeType len) {
-    return Kokkos::TeamVectorRange(t, len);
-  }
-  template <typename TeamMember>
-  KOKKOS_FUNCTION static void barrier(const TeamMember& t) {
-    t.team_barrier();
-  }
-};
-
-// Specialization for thread-level
-template <>
-struct NestedRange<false> {
-  template <typename TeamMember, typename SizeType>
-  KOKKOS_FUNCTION static auto create(const TeamMember& t, SizeType len) {
-    return Kokkos::ThreadVectorRange(t, len);
-  }
-  // Barrier is no-op, as vector lanes of a thread are implicitly synchronized
-  // after parallel region
-  template <typename TeamMember>
-  KOKKOS_FUNCTION static void barrier(const TeamMember&) {}
-};
-
-// When just doing sort (not sort_by_key), use nullptr_t for ValueViewType.
-// This only takes the NestedRange instance for template arg deduction.
-template <class TeamMember, class KeyViewType, class ValueViewType,
-          class Comparator, bool useTeamLevel>
-KOKKOS_INLINE_FUNCTION void sort_nested_impl(
-    const TeamMember& t, const KeyViewType& keyView,
-    [[maybe_unused]] const ValueViewType& valueView, const Comparator& comp,
-    const NestedRange<useTeamLevel>) {
-  using SizeType  = typename KeyViewType::size_type;
-  using KeyType   = typename KeyViewType::non_const_value_type;
-  using Range     = NestedRange<useTeamLevel>;
-  SizeType n      = keyView.extent(0);
-  SizeType npot   = 1;
-  SizeType levels = 0;
-  // FIXME: ceiling power-of-two is a common thing to need - make it a utility
-  while (npot < n) {
-    levels++;
-    npot <<= 1;
-  }
-  for (SizeType i = 0; i < levels; i++) {
-    for (SizeType j = 0; j <= i; j++) {
-      // n/2 pairs of items are compared in parallel
-      Kokkos::parallel_for(Range::create(t, npot / 2), [=](const SizeType k) {
-        // How big are the brown/pink boxes?
-        // (Terminology comes from Wikipedia diagram)
-        // https://commons.wikimedia.org/wiki/File:BitonicSort.svg#/media/File:BitonicSort.svg
-        SizeType boxSize = SizeType(2) << (i - j);
-        // Which box contains this thread?
-        SizeType boxID     = k >> (i - j);          // k * 2 / boxSize;
-        SizeType boxStart  = boxID << (1 + i - j);  // boxID * boxSize
-        SizeType boxOffset = k - (boxStart >> 1);   // k - boxID * boxSize / 2;
-        SizeType elem1     = boxStart + boxOffset;
-        // In first phase (j == 0, brown box): within a box, compare with the
-        // opposite value in the box.
-        // In later phases (j > 0, pink box): within a box, compare with fixed
-        // distance (boxSize / 2) apart.
-        SizeType elem2 = (j == 0) ? (boxStart + boxSize - 1 - boxOffset)
-                                  : (elem1 + boxSize / 2);
-        if (elem2 < n) {
-          KeyType key1 = keyView(elem1);
-          KeyType key2 = keyView(elem2);
-          if (comp(key2, key1)) {
-            keyView(elem1) = key2;
-            keyView(elem2) = key1;
-            if constexpr (!std::is_same_v<ValueViewType, std::nullptr_t>) {
-              Kokkos::Experimental::swap(valueView(elem1), valueView(elem2));
-            }
-          }
-        }
-      });
-      Range::barrier(t);
-    }
-  }
-}
-
-}  // namespace Impl
-
-template <class TeamMember, class ViewType>
-KOKKOS_INLINE_FUNCTION void sort_team(const TeamMember& t,
-                                      const ViewType& view) {
-  Impl::sort_nested_impl(t, view, nullptr,
-                         Experimental::Impl::StdAlgoLessThanBinaryPredicate<
-                             typename ViewType::non_const_value_type>(),
-                         Impl::NestedRange<true>());
-}
-
-template <class TeamMember, class ViewType, class Comparator>
-KOKKOS_INLINE_FUNCTION void sort_team(const TeamMember& t, const ViewType& view,
-                                      const Comparator& comp) {
-  Impl::sort_nested_impl(t, view, nullptr, comp, Impl::NestedRange<true>());
-}
-
-template <class TeamMember, class KeyViewType, class ValueViewType>
-KOKKOS_INLINE_FUNCTION void sort_by_key_team(const TeamMember& t,
-                                             const KeyViewType& keyView,
-                                             const ValueViewType& valueView) {
-  Impl::sort_nested_impl(t, keyView, valueView,
-                         Experimental::Impl::StdAlgoLessThanBinaryPredicate<
-                             typename KeyViewType::non_const_value_type>(),
-                         Impl::NestedRange<true>());
-}
-
-template <class TeamMember, class KeyViewType, class ValueViewType,
-          class Comparator>
-KOKKOS_INLINE_FUNCTION void sort_by_key_team(const TeamMember& t,
-                                             const KeyViewType& keyView,
-                                             const ValueViewType& valueView,
-                                             const Comparator& comp) {
-  Impl::sort_nested_impl(t, keyView, valueView, comp,
-                         Impl::NestedRange<true>());
-}
-
-template <class TeamMember, class ViewType>
-KOKKOS_INLINE_FUNCTION void sort_thread(const TeamMember& t,
-                                        const ViewType& view) {
-  Impl::sort_nested_impl(t, view, nullptr,
-                         Experimental::Impl::StdAlgoLessThanBinaryPredicate<
-                             typename ViewType::non_const_value_type>(),
-                         Impl::NestedRange<false>());
-}
-
-template <class TeamMember, class ViewType, class Comparator>
-KOKKOS_INLINE_FUNCTION void sort_thread(const TeamMember& t,
-                                        const ViewType& view,
-                                        const Comparator& comp) {
-  Impl::sort_nested_impl(t, view, nullptr, comp, Impl::NestedRange<false>());
-}
-
-template <class TeamMember, class KeyViewType, class ValueViewType>
-KOKKOS_INLINE_FUNCTION void sort_by_key_thread(const TeamMember& t,
-                                               const KeyViewType& keyView,
-                                               const ValueViewType& valueView) {
-  Impl::sort_nested_impl(t, keyView, valueView,
-                         Experimental::Impl::StdAlgoLessThanBinaryPredicate<
-                             typename KeyViewType::non_const_value_type>(),
-                         Impl::NestedRange<false>());
-}
+#ifndef KOKKOS_NESTED_SORT_HPP_
+#define KOKKOS_NESTED_SORT_HPP_
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_NESTED_SORT
+#endif
 
-template <class TeamMember, class KeyViewType, class ValueViewType,
-          class Comparator>
-KOKKOS_INLINE_FUNCTION void sort_by_key_thread(const TeamMember& t,
-                                               const KeyViewType& keyView,
-                                               const ValueViewType& valueView,
-                                               const Comparator& comp) {
-  Impl::sort_nested_impl(t, keyView, valueView, comp,
-                         Impl::NestedRange<false>());
-}
+#include "sorting/Kokkos_NestedSortPublicAPI.hpp"
 
-}  // namespace Experimental
-}  // namespace Kokkos
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_NESTED_SORT
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_NESTED_SORT
+#endif
 #endif
diff --git a/packages/kokkos/algorithms/src/Kokkos_Random.hpp b/packages/kokkos/algorithms/src/Kokkos_Random.hpp
index 91e9ce6fc84bcdeccfb9c40e4fca104d885f4b46..2d7d236d2fc2df0967a49768ba14a97d1a63b917 100644
--- a/packages/kokkos/algorithms/src/Kokkos_Random.hpp
+++ b/packages/kokkos/algorithms/src/Kokkos_Random.hpp
@@ -956,6 +956,8 @@ class Random_XorShift64_Pool {
   KOKKOS_INLINE_FUNCTION
   void free_state(const Random_XorShift64<DeviceType>& state) const {
     state_(state.state_idx_, 0) = state.state_;
+    // Release the lock only after the state has been updated in memory
+    Kokkos::memory_fence();
     locks_(state.state_idx_, 0) = 0;
   }
 };
@@ -1208,7 +1210,9 @@ class Random_XorShift1024_Pool {
   KOKKOS_INLINE_FUNCTION
   void free_state(const Random_XorShift1024<DeviceType>& state) const {
     for (int i = 0; i < 16; i++) state_(state.state_idx_, i) = state.state_[i];
-    p_(state.state_idx_, 0)     = state.p_;
+    p_(state.state_idx_, 0) = state.p_;
+    // Release the lock only after the state has been updated in memory
+    Kokkos::memory_fence();
     locks_(state.state_idx_, 0) = 0;
   }
 };
@@ -1514,7 +1518,7 @@ void fill_random(const ExecutionSpace& exec, ViewType a, RandomPool g,
         "Kokkos::fill_random",
         Kokkos::RangePolicy<ExecutionSpace>(exec, 0, (LDA + 127) / 128),
         Impl::fill_random_functor_begin_end<ViewType, RandomPool, 128,
-                                            ViewType::Rank, IndexType>(
+                                            ViewType::rank, IndexType>(
             a, g, begin, end));
 }
 
diff --git a/packages/kokkos/algorithms/src/Kokkos_Sort.hpp b/packages/kokkos/algorithms/src/Kokkos_Sort.hpp
index 033de221649659353e61c61313233a1805e8fd30..f77484cc5559dedb93b6b3ab65139c80e12180dc 100644
--- a/packages/kokkos/algorithms/src/Kokkos_Sort.hpp
+++ b/packages/kokkos/algorithms/src/Kokkos_Sort.hpp
@@ -21,681 +21,9 @@
 #define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_SORT
 #endif
 
-#include <Kokkos_Core.hpp>
-#include <Kokkos_NestedSort.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <algorithm>
-
-#if defined(KOKKOS_ENABLE_CUDA)
-
-// Workaround for `Instruction 'shfl' without '.sync' is not supported on
-// .target sm_70 and higher from PTX ISA version 6.4`.
-// Also see https://github.com/NVIDIA/cub/pull/170.
-#if !defined(CUB_USE_COOPERATIVE_GROUPS)
-#define CUB_USE_COOPERATIVE_GROUPS
-#endif
-
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wshadow"
-
-#if defined(KOKKOS_COMPILER_CLANG)
-// Some versions of Clang fail to compile Thrust, failing with errors like
-// this:
-//    <snip>/thrust/system/cuda/detail/core/agent_launcher.h:557:11:
-//    error: use of undeclared identifier 'va_printf'
-// The exact combination of versions for Clang and Thrust (or CUDA) for this
-// failure was not investigated, however even very recent version combination
-// (Clang 10.0.0 and Cuda 10.0) demonstrated failure.
-//
-// Defining _CubLog here locally allows us to avoid that code path, however
-// disabling some debugging diagnostics
-#pragma push_macro("_CubLog")
-#ifdef _CubLog
-#undef _CubLog
-#endif
-#define _CubLog
-#include <thrust/device_ptr.h>
-#include <thrust/sort.h>
-#pragma pop_macro("_CubLog")
-#else
-#include <thrust/device_ptr.h>
-#include <thrust/sort.h>
-#endif
-
-#pragma GCC diagnostic pop
-
-#endif
-
-namespace Kokkos {
-
-namespace Impl {
-
-template <class DstViewType, class SrcViewType, int Rank = DstViewType::Rank>
-struct CopyOp;
-
-template <class DstViewType, class SrcViewType>
-struct CopyOp<DstViewType, SrcViewType, 1> {
-  KOKKOS_INLINE_FUNCTION
-  static void copy(DstViewType const& dst, size_t i_dst, SrcViewType const& src,
-                   size_t i_src) {
-    dst(i_dst) = src(i_src);
-  }
-};
-
-template <class DstViewType, class SrcViewType>
-struct CopyOp<DstViewType, SrcViewType, 2> {
-  KOKKOS_INLINE_FUNCTION
-  static void copy(DstViewType const& dst, size_t i_dst, SrcViewType const& src,
-                   size_t i_src) {
-    for (int j = 0; j < (int)dst.extent(1); j++) dst(i_dst, j) = src(i_src, j);
-  }
-};
-
-template <class DstViewType, class SrcViewType>
-struct CopyOp<DstViewType, SrcViewType, 3> {
-  KOKKOS_INLINE_FUNCTION
-  static void copy(DstViewType const& dst, size_t i_dst, SrcViewType const& src,
-                   size_t i_src) {
-    for (int j = 0; j < dst.extent(1); j++)
-      for (int k = 0; k < dst.extent(2); k++)
-        dst(i_dst, j, k) = src(i_src, j, k);
-  }
-};
-}  // namespace Impl
-
-//----------------------------------------------------------------------------
-
-template <class KeyViewType, class BinSortOp,
-          class Space    = typename KeyViewType::device_type,
-          class SizeType = typename KeyViewType::memory_space::size_type>
-class BinSort {
- public:
-  template <class DstViewType, class SrcViewType>
-  struct copy_functor {
-    using src_view_type = typename SrcViewType::const_type;
-
-    using copy_op = Impl::CopyOp<DstViewType, src_view_type>;
-
-    DstViewType dst_values;
-    src_view_type src_values;
-    int dst_offset;
-
-    copy_functor(DstViewType const& dst_values_, int const& dst_offset_,
-                 SrcViewType const& src_values_)
-        : dst_values(dst_values_),
-          src_values(src_values_),
-          dst_offset(dst_offset_) {}
-
-    KOKKOS_INLINE_FUNCTION
-    void operator()(const int& i) const {
-      copy_op::copy(dst_values, i + dst_offset, src_values, i);
-    }
-  };
-
-  template <class DstViewType, class PermuteViewType, class SrcViewType>
-  struct copy_permute_functor {
-    // If a Kokkos::View then can generate constant random access
-    // otherwise can only use the constant type.
-
-    using src_view_type = std::conditional_t<
-        Kokkos::is_view<SrcViewType>::value,
-        Kokkos::View<typename SrcViewType::const_data_type,
-                     typename SrcViewType::array_layout,
-                     typename SrcViewType::device_type,
-                     Kokkos::MemoryTraits<Kokkos::RandomAccess> >,
-        typename SrcViewType::const_type>;
-
-    using perm_view_type = typename PermuteViewType::const_type;
-
-    using copy_op = Impl::CopyOp<DstViewType, src_view_type>;
-
-    DstViewType dst_values;
-    perm_view_type sort_order;
-    src_view_type src_values;
-    int src_offset;
-
-    copy_permute_functor(DstViewType const& dst_values_,
-                         PermuteViewType const& sort_order_,
-                         SrcViewType const& src_values_, int const& src_offset_)
-        : dst_values(dst_values_),
-          sort_order(sort_order_),
-          src_values(src_values_),
-          src_offset(src_offset_) {}
-
-    KOKKOS_INLINE_FUNCTION
-    void operator()(const int& i) const {
-      copy_op::copy(dst_values, i, src_values, src_offset + sort_order(i));
-    }
-  };
-
-  // Naming this alias "execution_space" would be problematic since it would be
-  // considered as execution space for the various functors which might use
-  // another execution space through sort() or create_permute_vector().
-  using exec_space  = typename Space::execution_space;
-  using bin_op_type = BinSortOp;
-
-  struct bin_count_tag {};
-  struct bin_offset_tag {};
-  struct bin_binning_tag {};
-  struct bin_sort_bins_tag {};
-
- public:
-  using size_type  = SizeType;
-  using value_type = size_type;
-
-  using offset_type    = Kokkos::View<size_type*, Space>;
-  using bin_count_type = Kokkos::View<const int*, Space>;
-
-  using const_key_view_type = typename KeyViewType::const_type;
-
-  // If a Kokkos::View then can generate constant random access
-  // otherwise can only use the constant type.
-
-  using const_rnd_key_view_type = std::conditional_t<
-      Kokkos::is_view<KeyViewType>::value,
-      Kokkos::View<typename KeyViewType::const_data_type,
-                   typename KeyViewType::array_layout,
-                   typename KeyViewType::device_type,
-                   Kokkos::MemoryTraits<Kokkos::RandomAccess> >,
-      const_key_view_type>;
-
-  using non_const_key_scalar = typename KeyViewType::non_const_value_type;
-  using const_key_scalar     = typename KeyViewType::const_value_type;
-
-  using bin_count_atomic_type =
-      Kokkos::View<int*, Space, Kokkos::MemoryTraits<Kokkos::Atomic> >;
-
- private:
-  const_key_view_type keys;
-  const_rnd_key_view_type keys_rnd;
-
- public:
-  BinSortOp bin_op;
-  offset_type bin_offsets;
-  bin_count_atomic_type bin_count_atomic;
-  bin_count_type bin_count_const;
-  offset_type sort_order;
-
-  int range_begin;
-  int range_end;
-  bool sort_within_bins;
-
- public:
-  BinSort() = default;
-
-  //----------------------------------------
-  // Constructor: takes the keys, the binning_operator and optionally whether to
-  // sort within bins (default false)
-  template <typename ExecutionSpace>
-  BinSort(const ExecutionSpace& exec, const_key_view_type keys_,
-          int range_begin_, int range_end_, BinSortOp bin_op_,
-          bool sort_within_bins_ = false)
-      : keys(keys_),
-        keys_rnd(keys_),
-        bin_op(bin_op_),
-        bin_offsets(),
-        bin_count_atomic(),
-        bin_count_const(),
-        sort_order(),
-        range_begin(range_begin_),
-        range_end(range_end_),
-        sort_within_bins(sort_within_bins_) {
-    static_assert(
-        Kokkos::SpaceAccessibility<ExecutionSpace,
-                                   typename Space::memory_space>::accessible,
-        "The provided execution space must be able to access the memory space "
-        "BinSort was initialized with!");
-    if (bin_op.max_bins() <= 0)
-      Kokkos::abort(
-          "The number of bins in the BinSortOp object must be greater than 0!");
-    bin_count_atomic = Kokkos::View<int*, Space>(
-        "Kokkos::SortImpl::BinSortFunctor::bin_count", bin_op.max_bins());
-    bin_count_const = bin_count_atomic;
-    bin_offsets =
-        offset_type(view_alloc(exec, WithoutInitializing,
-                               "Kokkos::SortImpl::BinSortFunctor::bin_offsets"),
-                    bin_op.max_bins());
-    sort_order =
-        offset_type(view_alloc(exec, WithoutInitializing,
-                               "Kokkos::SortImpl::BinSortFunctor::sort_order"),
-                    range_end - range_begin);
-  }
-
-  BinSort(const_key_view_type keys_, int range_begin_, int range_end_,
-          BinSortOp bin_op_, bool sort_within_bins_ = false)
-      : BinSort(exec_space{}, keys_, range_begin_, range_end_, bin_op_,
-                sort_within_bins_) {}
-
-  template <typename ExecutionSpace>
-  BinSort(const ExecutionSpace& exec, const_key_view_type keys_,
-          BinSortOp bin_op_, bool sort_within_bins_ = false)
-      : BinSort(exec, keys_, 0, keys_.extent(0), bin_op_, sort_within_bins_) {}
-
-  BinSort(const_key_view_type keys_, BinSortOp bin_op_,
-          bool sort_within_bins_ = false)
-      : BinSort(exec_space{}, keys_, bin_op_, sort_within_bins_) {}
-
-  //----------------------------------------
-  // Create the permutation vector, the bin_offset array and the bin_count
-  // array. Can be called again if keys changed
-  template <class ExecutionSpace>
-  void create_permute_vector(const ExecutionSpace& exec) {
-    static_assert(
-        Kokkos::SpaceAccessibility<ExecutionSpace,
-                                   typename Space::memory_space>::accessible,
-        "The provided execution space must be able to access the memory space "
-        "BinSort was initialized with!");
-
-    const size_t len = range_end - range_begin;
-    Kokkos::parallel_for(
-        "Kokkos::Sort::BinCount",
-        Kokkos::RangePolicy<ExecutionSpace, bin_count_tag>(exec, 0, len),
-        *this);
-    Kokkos::parallel_scan("Kokkos::Sort::BinOffset",
-                          Kokkos::RangePolicy<ExecutionSpace, bin_offset_tag>(
-                              exec, 0, bin_op.max_bins()),
-                          *this);
-
-    Kokkos::deep_copy(exec, bin_count_atomic, 0);
-    Kokkos::parallel_for(
-        "Kokkos::Sort::BinBinning",
-        Kokkos::RangePolicy<ExecutionSpace, bin_binning_tag>(exec, 0, len),
-        *this);
-
-    if (sort_within_bins)
-      Kokkos::parallel_for(
-          "Kokkos::Sort::BinSort",
-          Kokkos::RangePolicy<ExecutionSpace, bin_sort_bins_tag>(
-              exec, 0, bin_op.max_bins()),
-          *this);
-  }
-
-  // Create the permutation vector, the bin_offset array and the bin_count
-  // array. Can be called again if keys changed
-  void create_permute_vector() {
-    Kokkos::fence("Kokkos::Binsort::create_permute_vector: before");
-    exec_space e{};
-    create_permute_vector(e);
-    e.fence("Kokkos::Binsort::create_permute_vector: after");
-  }
-
-  // Sort a subset of a view with respect to the first dimension using the
-  // permutation array
-  template <class ExecutionSpace, class ValuesViewType>
-  void sort(const ExecutionSpace& exec, ValuesViewType const& values,
-            int values_range_begin, int values_range_end) const {
-    static_assert(
-        Kokkos::SpaceAccessibility<ExecutionSpace,
-                                   typename Space::memory_space>::accessible,
-        "The provided execution space must be able to access the memory space "
-        "BinSort was initialized with!");
-    static_assert(
-        Kokkos::SpaceAccessibility<
-            ExecutionSpace, typename ValuesViewType::memory_space>::accessible,
-        "The provided execution space must be able to access the memory space "
-        "of the View argument!");
-
-    using scratch_view_type =
-        Kokkos::View<typename ValuesViewType::data_type,
-                     typename ValuesViewType::array_layout,
-                     typename ValuesViewType::device_type>;
-
-    const size_t len        = range_end - range_begin;
-    const size_t values_len = values_range_end - values_range_begin;
-    if (len != values_len) {
-      Kokkos::abort(
-          "BinSort::sort: values range length != permutation vector length");
-    }
-
-    scratch_view_type sorted_values(
-        view_alloc(exec, WithoutInitializing,
-                   "Kokkos::SortImpl::BinSortFunctor::sorted_values"),
-        values.rank_dynamic > 0 ? len : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
-        values.rank_dynamic > 1 ? values.extent(1)
-                                : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
-        values.rank_dynamic > 2 ? values.extent(2)
-                                : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
-        values.rank_dynamic > 3 ? values.extent(3)
-                                : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
-        values.rank_dynamic > 4 ? values.extent(4)
-                                : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
-        values.rank_dynamic > 5 ? values.extent(5)
-                                : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
-        values.rank_dynamic > 6 ? values.extent(6)
-                                : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
-        values.rank_dynamic > 7 ? values.extent(7)
-                                : KOKKOS_IMPL_CTOR_DEFAULT_ARG);
-
-    {
-      copy_permute_functor<scratch_view_type /* DstViewType */
-                           ,
-                           offset_type /* PermuteViewType */
-                           ,
-                           ValuesViewType /* SrcViewType */
-                           >
-          functor(sorted_values, sort_order, values,
-                  values_range_begin - range_begin);
-
-      parallel_for("Kokkos::Sort::CopyPermute",
-                   Kokkos::RangePolicy<ExecutionSpace>(exec, 0, len), functor);
-    }
-
-    {
-      copy_functor<ValuesViewType, scratch_view_type> functor(
-          values, range_begin, sorted_values);
-
-      parallel_for("Kokkos::Sort::Copy",
-                   Kokkos::RangePolicy<ExecutionSpace>(exec, 0, len), functor);
-    }
-  }
-
-  // Sort a subset of a view with respect to the first dimension using the
-  // permutation array
-  template <class ValuesViewType>
-  void sort(ValuesViewType const& values, int values_range_begin,
-            int values_range_end) const {
-    Kokkos::fence("Kokkos::Binsort::sort: before");
-    exec_space exec;
-    sort(exec, values, values_range_begin, values_range_end);
-    exec.fence("Kokkos::BinSort:sort: after");
-  }
-
-  template <class ExecutionSpace, class ValuesViewType>
-  void sort(ExecutionSpace const& exec, ValuesViewType const& values) const {
-    this->sort(exec, values, 0, /*values.extent(0)*/ range_end - range_begin);
-  }
-
-  template <class ValuesViewType>
-  void sort(ValuesViewType const& values) const {
-    this->sort(values, 0, /*values.extent(0)*/ range_end - range_begin);
-  }
-
-  // Get the permutation vector
-  KOKKOS_INLINE_FUNCTION
-  offset_type get_permute_vector() const { return sort_order; }
-
-  // Get the start offsets for each bin
-  KOKKOS_INLINE_FUNCTION
-  offset_type get_bin_offsets() const { return bin_offsets; }
-
-  // Get the count for each bin
-  KOKKOS_INLINE_FUNCTION
-  bin_count_type get_bin_count() const { return bin_count_const; }
-
- public:
-  KOKKOS_INLINE_FUNCTION
-  void operator()(const bin_count_tag& /*tag*/, const int i) const {
-    const int j = range_begin + i;
-    bin_count_atomic(bin_op.bin(keys, j))++;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(const bin_offset_tag& /*tag*/, const int i,
-                  value_type& offset, const bool& final) const {
-    if (final) {
-      bin_offsets(i) = offset;
-    }
-    offset += bin_count_const(i);
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(const bin_binning_tag& /*tag*/, const int i) const {
-    const int j     = range_begin + i;
-    const int bin   = bin_op.bin(keys, j);
-    const int count = bin_count_atomic(bin)++;
-
-    sort_order(bin_offsets(bin) + count) = j;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(const bin_sort_bins_tag& /*tag*/, const int i) const {
-    auto bin_size = bin_count_const(i);
-    if (bin_size <= 1) return;
-    int upper_bound = bin_offsets(i) + bin_size;
-    bool sorted     = false;
-    while (!sorted) {
-      sorted      = true;
-      int old_idx = sort_order(bin_offsets(i));
-      int new_idx = 0;
-      for (int k = bin_offsets(i) + 1; k < upper_bound; k++) {
-        new_idx = sort_order(k);
-
-        if (!bin_op(keys_rnd, old_idx, new_idx)) {
-          sort_order(k - 1) = new_idx;
-          sort_order(k)     = old_idx;
-          sorted            = false;
-        } else {
-          old_idx = new_idx;
-        }
-      }
-      upper_bound--;
-    }
-  }
-};
-
-//----------------------------------------------------------------------------
-
-template <class KeyViewType>
-struct BinOp1D {
-  int max_bins_ = {};
-  double mul_   = {};
-  double min_   = {};
-
-  BinOp1D() = default;
-
-  // Construct BinOp with number of bins, minimum value and maximum value
-  BinOp1D(int max_bins__, typename KeyViewType::const_value_type min,
-          typename KeyViewType::const_value_type max)
-      : max_bins_(max_bins__ + 1),
-        // Cast to double to avoid possible overflow when using integer
-        mul_(static_cast<double>(max_bins__) /
-             (static_cast<double>(max) - static_cast<double>(min))),
-        min_(static_cast<double>(min)) {
-    // For integral types the number of bins may be larger than the range
-    // in which case we can exactly have one unique value per bin
-    // and then don't need to sort bins.
-    if (std::is_integral<typename KeyViewType::const_value_type>::value &&
-        (static_cast<double>(max) - static_cast<double>(min)) <=
-            static_cast<double>(max_bins__)) {
-      mul_ = 1.;
-    }
-  }
-
-  // Determine bin index from key value
-  template <class ViewType>
-  KOKKOS_INLINE_FUNCTION int bin(ViewType& keys, const int& i) const {
-    return static_cast<int>(mul_ * (static_cast<double>(keys(i)) - min_));
-  }
-
-  // Return maximum bin index + 1
-  KOKKOS_INLINE_FUNCTION
-  int max_bins() const { return max_bins_; }
-
-  // Compare to keys within a bin if true new_val will be put before old_val
-  template <class ViewType, typename iType1, typename iType2>
-  KOKKOS_INLINE_FUNCTION bool operator()(ViewType& keys, iType1& i1,
-                                         iType2& i2) const {
-    return keys(i1) < keys(i2);
-  }
-};
-
-template <class KeyViewType>
-struct BinOp3D {
-  int max_bins_[3] = {};
-  double mul_[3]   = {};
-  double min_[3]   = {};
-
-  BinOp3D() = default;
-
-  BinOp3D(int max_bins__[], typename KeyViewType::const_value_type min[],
-          typename KeyViewType::const_value_type max[]) {
-    max_bins_[0] = max_bins__[0];
-    max_bins_[1] = max_bins__[1];
-    max_bins_[2] = max_bins__[2];
-    mul_[0]      = static_cast<double>(max_bins__[0]) /
-              (static_cast<double>(max[0]) - static_cast<double>(min[0]));
-    mul_[1] = static_cast<double>(max_bins__[1]) /
-              (static_cast<double>(max[1]) - static_cast<double>(min[1]));
-    mul_[2] = static_cast<double>(max_bins__[2]) /
-              (static_cast<double>(max[2]) - static_cast<double>(min[2]));
-    min_[0] = static_cast<double>(min[0]);
-    min_[1] = static_cast<double>(min[1]);
-    min_[2] = static_cast<double>(min[2]);
-  }
-
-  template <class ViewType>
-  KOKKOS_INLINE_FUNCTION int bin(ViewType& keys, const int& i) const {
-    return int((((int(mul_[0] * (keys(i, 0) - min_[0])) * max_bins_[1]) +
-                 int(mul_[1] * (keys(i, 1) - min_[1]))) *
-                max_bins_[2]) +
-               int(mul_[2] * (keys(i, 2) - min_[2])));
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  int max_bins() const { return max_bins_[0] * max_bins_[1] * max_bins_[2]; }
-
-  template <class ViewType, typename iType1, typename iType2>
-  KOKKOS_INLINE_FUNCTION bool operator()(ViewType& keys, iType1& i1,
-                                         iType2& i2) const {
-    if (keys(i1, 0) > keys(i2, 0))
-      return true;
-    else if (keys(i1, 0) == keys(i2, 0)) {
-      if (keys(i1, 1) > keys(i2, 1))
-        return true;
-      else if (keys(i1, 1) == keys(i2, 1)) {
-        if (keys(i1, 2) > keys(i2, 2)) return true;
-      }
-    }
-    return false;
-  }
-};
-
-namespace Impl {
-
-template <class ViewType>
-struct min_max_functor {
-  using minmax_scalar =
-      Kokkos::MinMaxScalar<typename ViewType::non_const_value_type>;
-
-  ViewType view;
-  min_max_functor(const ViewType& view_) : view(view_) {}
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(const size_t& i, minmax_scalar& minmax) const {
-    if (view(i) < minmax.min_val) minmax.min_val = view(i);
-    if (view(i) > minmax.max_val) minmax.max_val = view(i);
-  }
-};
-
-}  // namespace Impl
-
-template <class ExecutionSpace, class DataType, class... Properties>
-std::enable_if_t<(Kokkos::is_execution_space<ExecutionSpace>::value) &&
-                 (!SpaceAccessibility<
-                     HostSpace, typename Kokkos::View<DataType, Properties...>::
-                                    memory_space>::accessible)>
-sort(const ExecutionSpace& exec,
-     const Kokkos::View<DataType, Properties...>& view) {
-  using ViewType = Kokkos::View<DataType, Properties...>;
-  using CompType = BinOp1D<ViewType>;
-
-  Kokkos::MinMaxScalar<typename ViewType::non_const_value_type> result;
-  Kokkos::MinMax<typename ViewType::non_const_value_type> reducer(result);
-  parallel_reduce("Kokkos::Sort::FindExtent",
-                  Kokkos::RangePolicy<typename ViewType::execution_space>(
-                      exec, 0, view.extent(0)),
-                  Impl::min_max_functor<ViewType>(view), reducer);
-  if (result.min_val == result.max_val) return;
-  // For integral types the number of bins may be larger than the range
-  // in which case we can exactly have one unique value per bin
-  // and then don't need to sort bins.
-  bool sort_in_bins = true;
-  // TODO: figure out better max_bins then this ...
-  int64_t max_bins = view.extent(0) / 2;
-  if (std::is_integral<typename ViewType::non_const_value_type>::value) {
-    // Cast to double to avoid possible overflow when using integer
-    auto const max_val = static_cast<double>(result.max_val);
-    auto const min_val = static_cast<double>(result.min_val);
-    // using 10M as the cutoff for special behavior (roughly 40MB for the count
-    // array)
-    if ((max_val - min_val) < 10000000) {
-      max_bins     = max_val - min_val + 1;
-      sort_in_bins = false;
-    }
-  }
-  if (std::is_floating_point<typename ViewType::non_const_value_type>::value) {
-    KOKKOS_ASSERT(std::isfinite(static_cast<double>(result.max_val) -
-                                static_cast<double>(result.min_val)));
-  }
-
-  BinSort<ViewType, CompType> bin_sort(
-      view, CompType(max_bins, result.min_val, result.max_val), sort_in_bins);
-  bin_sort.create_permute_vector(exec);
-  bin_sort.sort(exec, view);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties>
-std::enable_if_t<(Kokkos::is_execution_space<ExecutionSpace>::value) &&
-                 (SpaceAccessibility<
-                     HostSpace, typename Kokkos::View<DataType, Properties...>::
-                                    memory_space>::accessible)>
-sort(const ExecutionSpace&, const Kokkos::View<DataType, Properties...>& view) {
-  auto first = Experimental::begin(view);
-  auto last  = Experimental::end(view);
-  std::sort(first, last);
-}
-
-#if defined(KOKKOS_ENABLE_CUDA)
-template <class DataType, class... Properties>
-void sort(const Cuda& space,
-          const Kokkos::View<DataType, Properties...>& view) {
-  const auto exec = thrust::cuda::par.on(space.cuda_stream());
-  auto first      = Experimental::begin(view);
-  auto last       = Experimental::end(view);
-  thrust::sort(exec, first, last);
-}
-#endif
-
-template <class ViewType>
-void sort(ViewType const& view) {
-  Kokkos::fence("Kokkos::sort: before");
-  typename ViewType::execution_space exec;
-  sort(exec, view);
-  exec.fence("Kokkos::sort: fence after sorting");
-}
-
-template <class ExecutionSpace, class ViewType>
-std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
-    const ExecutionSpace& exec, ViewType view, size_t const begin,
-    size_t const end) {
-  using range_policy = Kokkos::RangePolicy<typename ViewType::execution_space>;
-  using CompType     = BinOp1D<ViewType>;
-
-  Kokkos::MinMaxScalar<typename ViewType::non_const_value_type> result;
-  Kokkos::MinMax<typename ViewType::non_const_value_type> reducer(result);
-
-  parallel_reduce("Kokkos::Sort::FindExtent", range_policy(exec, begin, end),
-                  Impl::min_max_functor<ViewType>(view), reducer);
-
-  if (result.min_val == result.max_val) return;
-
-  BinSort<ViewType, CompType> bin_sort(
-      exec, view, begin, end,
-      CompType((end - begin) / 2, result.min_val, result.max_val), true);
-
-  bin_sort.create_permute_vector(exec);
-  bin_sort.sort(exec, view, begin, end);
-}
-
-template <class ViewType>
-void sort(ViewType view, size_t const begin, size_t const end) {
-  Kokkos::fence("Kokkos::sort: before");
-  typename ViewType::execution_space exec;
-  sort(exec, view, begin, end);
-  exec.fence("Kokkos::Sort: fence after sorting");
-}
-
-}  // namespace Kokkos
+#include "sorting/Kokkos_BinSortPublicAPI.hpp"
+#include "sorting/Kokkos_SortPublicAPI.hpp"
+#include "sorting/Kokkos_NestedSortPublicAPI.hpp"
 
 #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_SORT
 #undef KOKKOS_IMPL_PUBLIC_INCLUDE
diff --git a/packages/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp b/packages/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..73e751f572c5e866f03d93ce54f1b109e50a1ea2
--- /dev/null
+++ b/packages/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp
@@ -0,0 +1,129 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_BIN_OPS_PUBLIC_API_HPP_
+#define KOKKOS_BIN_OPS_PUBLIC_API_HPP_
+
+#include <Kokkos_Macros.hpp>
+#include <type_traits>
+
+namespace Kokkos {
+
+template <class KeyViewType>
+struct BinOp1D {
+  int max_bins_ = {};
+  double mul_   = {};
+  double min_   = {};
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
+  KOKKOS_DEPRECATED BinOp1D() = default;
+#else
+  BinOp1D() = delete;
+#endif
+
+  // Construct BinOp with number of bins, minimum value and maximum value
+  BinOp1D(int max_bins__, typename KeyViewType::const_value_type min,
+          typename KeyViewType::const_value_type max)
+      : max_bins_(max_bins__ + 1),
+        // Cast to double to avoid possible overflow when using integer
+        mul_(static_cast<double>(max_bins__) /
+             (static_cast<double>(max) - static_cast<double>(min))),
+        min_(static_cast<double>(min)) {
+    // For integral types the number of bins may be larger than the range
+    // in which case we can exactly have one unique value per bin
+    // and then don't need to sort bins.
+    if (std::is_integral<typename KeyViewType::const_value_type>::value &&
+        (static_cast<double>(max) - static_cast<double>(min)) <=
+            static_cast<double>(max_bins__)) {
+      mul_ = 1.;
+    }
+  }
+
+  // Determine bin index from key value
+  template <class ViewType>
+  KOKKOS_INLINE_FUNCTION int bin(ViewType& keys, const int& i) const {
+    return static_cast<int>(mul_ * (static_cast<double>(keys(i)) - min_));
+  }
+
+  // Return maximum bin index + 1
+  KOKKOS_INLINE_FUNCTION
+  int max_bins() const { return max_bins_; }
+
+  // Compare to keys within a bin if true new_val will be put before old_val
+  template <class ViewType, typename iType1, typename iType2>
+  KOKKOS_INLINE_FUNCTION bool operator()(ViewType& keys, iType1& i1,
+                                         iType2& i2) const {
+    return keys(i1) < keys(i2);
+  }
+};
+
+template <class KeyViewType>
+struct BinOp3D {
+  int max_bins_[3] = {};
+  double mul_[3]   = {};
+  double min_[3]   = {};
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
+  KOKKOS_DEPRECATED BinOp3D() = default;
+#else
+  BinOp3D() = delete;
+#endif
+
+  BinOp3D(int max_bins__[], typename KeyViewType::const_value_type min[],
+          typename KeyViewType::const_value_type max[]) {
+    max_bins_[0] = max_bins__[0];
+    max_bins_[1] = max_bins__[1];
+    max_bins_[2] = max_bins__[2];
+    mul_[0]      = static_cast<double>(max_bins__[0]) /
+              (static_cast<double>(max[0]) - static_cast<double>(min[0]));
+    mul_[1] = static_cast<double>(max_bins__[1]) /
+              (static_cast<double>(max[1]) - static_cast<double>(min[1]));
+    mul_[2] = static_cast<double>(max_bins__[2]) /
+              (static_cast<double>(max[2]) - static_cast<double>(min[2]));
+    min_[0] = static_cast<double>(min[0]);
+    min_[1] = static_cast<double>(min[1]);
+    min_[2] = static_cast<double>(min[2]);
+  }
+
+  template <class ViewType>
+  KOKKOS_INLINE_FUNCTION int bin(ViewType& keys, const int& i) const {
+    return int((((int(mul_[0] * (keys(i, 0) - min_[0])) * max_bins_[1]) +
+                 int(mul_[1] * (keys(i, 1) - min_[1]))) *
+                max_bins_[2]) +
+               int(mul_[2] * (keys(i, 2) - min_[2])));
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  int max_bins() const { return max_bins_[0] * max_bins_[1] * max_bins_[2]; }
+
+  template <class ViewType, typename iType1, typename iType2>
+  KOKKOS_INLINE_FUNCTION bool operator()(ViewType& keys, iType1& i1,
+                                         iType2& i2) const {
+    if (keys(i1, 0) > keys(i2, 0))
+      return true;
+    else if (keys(i1, 0) == keys(i2, 0)) {
+      if (keys(i1, 1) > keys(i2, 1))
+        return true;
+      else if (keys(i1, 1) == keys(i2, 1)) {
+        if (keys(i1, 2) > keys(i2, 2)) return true;
+      }
+    }
+    return false;
+  }
+};
+
+}  // namespace Kokkos
+#endif
diff --git a/packages/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp b/packages/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c399279fe48ff2b322b19cf2c9cc68d1477f8a7e
--- /dev/null
+++ b/packages/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp
@@ -0,0 +1,410 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_BIN_SORT_PUBLIC_API_HPP_
+#define KOKKOS_BIN_SORT_PUBLIC_API_HPP_
+
+#include "Kokkos_BinOpsPublicAPI.hpp"
+#include "impl/Kokkos_CopyOpsForBinSortImpl.hpp"
+#include <Kokkos_Core.hpp>
+#include <algorithm>
+
+namespace Kokkos {
+
+template <class KeyViewType, class BinSortOp,
+          class Space    = typename KeyViewType::device_type,
+          class SizeType = typename KeyViewType::memory_space::size_type>
+class BinSort {
+ public:
+  template <class DstViewType, class SrcViewType>
+  struct copy_functor {
+    using src_view_type = typename SrcViewType::const_type;
+
+    using copy_op = Impl::CopyOp<DstViewType, src_view_type>;
+
+    DstViewType dst_values;
+    src_view_type src_values;
+    int dst_offset;
+
+    copy_functor(DstViewType const& dst_values_, int const& dst_offset_,
+                 SrcViewType const& src_values_)
+        : dst_values(dst_values_),
+          src_values(src_values_),
+          dst_offset(dst_offset_) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const int& i) const {
+      copy_op::copy(dst_values, i + dst_offset, src_values, i);
+    }
+  };
+
+  template <class DstViewType, class PermuteViewType, class SrcViewType>
+  struct copy_permute_functor {
+    // If a Kokkos::View then can generate constant random access
+    // otherwise can only use the constant type.
+
+    using src_view_type = std::conditional_t<
+        Kokkos::is_view<SrcViewType>::value,
+        Kokkos::View<typename SrcViewType::const_data_type,
+                     typename SrcViewType::array_layout,
+                     typename SrcViewType::device_type
+#if !defined(KOKKOS_COMPILER_NVHPC) || (KOKKOS_COMPILER_NVHPC >= 230700)
+                     ,
+                     Kokkos::MemoryTraits<Kokkos::RandomAccess>
+#endif
+                     >,
+        typename SrcViewType::const_type>;
+
+    using perm_view_type = typename PermuteViewType::const_type;
+
+    using copy_op = Impl::CopyOp<DstViewType, src_view_type>;
+
+    DstViewType dst_values;
+    perm_view_type sort_order;
+    src_view_type src_values;
+    int src_offset;
+
+    copy_permute_functor(DstViewType const& dst_values_,
+                         PermuteViewType const& sort_order_,
+                         SrcViewType const& src_values_, int const& src_offset_)
+        : dst_values(dst_values_),
+          sort_order(sort_order_),
+          src_values(src_values_),
+          src_offset(src_offset_) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const int& i) const {
+      copy_op::copy(dst_values, i, src_values, src_offset + sort_order(i));
+    }
+  };
+
+  // Naming this alias "execution_space" would be problematic since it would be
+  // considered as execution space for the various functors which might use
+  // another execution space through sort() or create_permute_vector().
+  using exec_space  = typename Space::execution_space;
+  using bin_op_type = BinSortOp;
+
+  struct bin_count_tag {};
+  struct bin_offset_tag {};
+  struct bin_binning_tag {};
+  struct bin_sort_bins_tag {};
+
+ public:
+  using size_type  = SizeType;
+  using value_type = size_type;
+
+  using offset_type    = Kokkos::View<size_type*, Space>;
+  using bin_count_type = Kokkos::View<const int*, Space>;
+
+  using const_key_view_type = typename KeyViewType::const_type;
+
+  // If a Kokkos::View then can generate constant random access
+  // otherwise can only use the constant type.
+
+  using const_rnd_key_view_type = std::conditional_t<
+      Kokkos::is_view<KeyViewType>::value,
+      Kokkos::View<typename KeyViewType::const_data_type,
+                   typename KeyViewType::array_layout,
+                   typename KeyViewType::device_type,
+                   Kokkos::MemoryTraits<Kokkos::RandomAccess> >,
+      const_key_view_type>;
+
+  using non_const_key_scalar = typename KeyViewType::non_const_value_type;
+  using const_key_scalar     = typename KeyViewType::const_value_type;
+
+  using bin_count_atomic_type =
+      Kokkos::View<int*, Space, Kokkos::MemoryTraits<Kokkos::Atomic> >;
+
+ private:
+  const_key_view_type keys;
+  const_rnd_key_view_type keys_rnd;
+
+ public:
+  BinSortOp bin_op;
+  offset_type bin_offsets;
+  bin_count_atomic_type bin_count_atomic;
+  bin_count_type bin_count_const;
+  offset_type sort_order;
+
+  int range_begin;
+  int range_end;
+  bool sort_within_bins;
+
+ public:
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
+  KOKKOS_DEPRECATED BinSort() = default;
+#else
+  BinSort() = delete;
+#endif
+
+  //----------------------------------------
+  // Constructor: takes the keys, the binning_operator and optionally whether to
+  // sort within bins (default false)
+  template <typename ExecutionSpace>
+  BinSort(const ExecutionSpace& exec, const_key_view_type keys_,
+          int range_begin_, int range_end_, BinSortOp bin_op_,
+          bool sort_within_bins_ = false)
+      : keys(keys_),
+        keys_rnd(keys_),
+        bin_op(bin_op_),
+        bin_offsets(),
+        bin_count_atomic(),
+        bin_count_const(),
+        sort_order(),
+        range_begin(range_begin_),
+        range_end(range_end_),
+        sort_within_bins(sort_within_bins_) {
+    static_assert(
+        Kokkos::SpaceAccessibility<ExecutionSpace,
+                                   typename Space::memory_space>::accessible,
+        "The provided execution space must be able to access the memory space "
+        "BinSort was initialized with!");
+    if (bin_op.max_bins() <= 0)
+      Kokkos::abort(
+          "The number of bins in the BinSortOp object must be greater than 0!");
+    bin_count_atomic = Kokkos::View<int*, Space>(
+        "Kokkos::SortImpl::BinSortFunctor::bin_count", bin_op.max_bins());
+    bin_count_const = bin_count_atomic;
+    bin_offsets =
+        offset_type(view_alloc(exec, WithoutInitializing,
+                               "Kokkos::SortImpl::BinSortFunctor::bin_offsets"),
+                    bin_op.max_bins());
+    sort_order =
+        offset_type(view_alloc(exec, WithoutInitializing,
+                               "Kokkos::SortImpl::BinSortFunctor::sort_order"),
+                    range_end - range_begin);
+  }
+
+  BinSort(const_key_view_type keys_, int range_begin_, int range_end_,
+          BinSortOp bin_op_, bool sort_within_bins_ = false)
+      : BinSort(exec_space{}, keys_, range_begin_, range_end_, bin_op_,
+                sort_within_bins_) {}
+
+  template <typename ExecutionSpace>
+  BinSort(const ExecutionSpace& exec, const_key_view_type keys_,
+          BinSortOp bin_op_, bool sort_within_bins_ = false)
+      : BinSort(exec, keys_, 0, keys_.extent(0), bin_op_, sort_within_bins_) {}
+
+  BinSort(const_key_view_type keys_, BinSortOp bin_op_,
+          bool sort_within_bins_ = false)
+      : BinSort(exec_space{}, keys_, bin_op_, sort_within_bins_) {}
+
+  //----------------------------------------
+  // Create the permutation vector, the bin_offset array and the bin_count
+  // array. Can be called again if keys changed
+  template <class ExecutionSpace>
+  void create_permute_vector(const ExecutionSpace& exec) {
+    static_assert(
+        Kokkos::SpaceAccessibility<ExecutionSpace,
+                                   typename Space::memory_space>::accessible,
+        "The provided execution space must be able to access the memory space "
+        "BinSort was initialized with!");
+
+    const size_t len = range_end - range_begin;
+    Kokkos::parallel_for(
+        "Kokkos::Sort::BinCount",
+        Kokkos::RangePolicy<ExecutionSpace, bin_count_tag>(exec, 0, len),
+        *this);
+    Kokkos::parallel_scan("Kokkos::Sort::BinOffset",
+                          Kokkos::RangePolicy<ExecutionSpace, bin_offset_tag>(
+                              exec, 0, bin_op.max_bins()),
+                          *this);
+
+    Kokkos::deep_copy(exec, bin_count_atomic, 0);
+    Kokkos::parallel_for(
+        "Kokkos::Sort::BinBinning",
+        Kokkos::RangePolicy<ExecutionSpace, bin_binning_tag>(exec, 0, len),
+        *this);
+
+    if (sort_within_bins)
+      Kokkos::parallel_for(
+          "Kokkos::Sort::BinSort",
+          Kokkos::RangePolicy<ExecutionSpace, bin_sort_bins_tag>(
+              exec, 0, bin_op.max_bins()),
+          *this);
+  }
+
+  // Create the permutation vector, the bin_offset array and the bin_count
+  // array. Can be called again if keys changed
+  void create_permute_vector() {
+    Kokkos::fence("Kokkos::Binsort::create_permute_vector: before");
+    exec_space e{};
+    create_permute_vector(e);
+    e.fence("Kokkos::Binsort::create_permute_vector: after");
+  }
+
+  // Sort a subset of a view with respect to the first dimension using the
+  // permutation array
+  template <class ExecutionSpace, class ValuesViewType>
+  void sort(const ExecutionSpace& exec, ValuesViewType const& values,
+            int values_range_begin, int values_range_end) const {
+    if (values.extent(0) == 0) {
+      return;
+    }
+
+    static_assert(
+        Kokkos::SpaceAccessibility<ExecutionSpace,
+                                   typename Space::memory_space>::accessible,
+        "The provided execution space must be able to access the memory space "
+        "BinSort was initialized with!");
+    static_assert(
+        Kokkos::SpaceAccessibility<
+            ExecutionSpace, typename ValuesViewType::memory_space>::accessible,
+        "The provided execution space must be able to access the memory space "
+        "of the View argument!");
+
+    const size_t len        = range_end - range_begin;
+    const size_t values_len = values_range_end - values_range_begin;
+    if (len != values_len) {
+      Kokkos::abort(
+          "BinSort::sort: values range length != permutation vector length");
+    }
+
+    using scratch_view_type =
+        Kokkos::View<typename ValuesViewType::data_type,
+                     typename ValuesViewType::device_type>;
+    scratch_view_type sorted_values(
+        view_alloc(exec, WithoutInitializing,
+                   "Kokkos::SortImpl::BinSortFunctor::sorted_values"),
+        values.rank_dynamic > 0 ? len : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+        values.rank_dynamic > 1 ? values.extent(1)
+                                : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+        values.rank_dynamic > 2 ? values.extent(2)
+                                : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+        values.rank_dynamic > 3 ? values.extent(3)
+                                : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+        values.rank_dynamic > 4 ? values.extent(4)
+                                : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+        values.rank_dynamic > 5 ? values.extent(5)
+                                : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+        values.rank_dynamic > 6 ? values.extent(6)
+                                : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+        values.rank_dynamic > 7 ? values.extent(7)
+                                : KOKKOS_IMPL_CTOR_DEFAULT_ARG);
+
+    {
+      copy_permute_functor<scratch_view_type /* DstViewType */
+                           ,
+                           offset_type /* PermuteViewType */
+                           ,
+                           ValuesViewType /* SrcViewType */
+                           >
+          functor(sorted_values, sort_order, values,
+                  values_range_begin - range_begin);
+
+      parallel_for("Kokkos::Sort::CopyPermute",
+                   Kokkos::RangePolicy<ExecutionSpace>(exec, 0, len), functor);
+    }
+
+    {
+      copy_functor<ValuesViewType, scratch_view_type> functor(
+          values, range_begin, sorted_values);
+
+      parallel_for("Kokkos::Sort::Copy",
+                   Kokkos::RangePolicy<ExecutionSpace>(exec, 0, len), functor);
+    }
+  }
+
+  // Sort a subset of a view with respect to the first dimension using the
+  // permutation array
+  template <class ValuesViewType>
+  void sort(ValuesViewType const& values, int values_range_begin,
+            int values_range_end) const {
+    Kokkos::fence("Kokkos::Binsort::sort: before");
+    exec_space exec;
+    sort(exec, values, values_range_begin, values_range_end);
+    exec.fence("Kokkos::BinSort:sort: after");
+  }
+
+  template <class ExecutionSpace, class ValuesViewType>
+  void sort(ExecutionSpace const& exec, ValuesViewType const& values) const {
+    this->sort(exec, values, 0, /*values.extent(0)*/ range_end - range_begin);
+  }
+
+  template <class ValuesViewType>
+  void sort(ValuesViewType const& values) const {
+    this->sort(values, 0, /*values.extent(0)*/ range_end - range_begin);
+  }
+
+  // Get the permutation vector
+  KOKKOS_INLINE_FUNCTION
+  offset_type get_permute_vector() const { return sort_order; }
+
+  // Get the start offsets for each bin
+  KOKKOS_INLINE_FUNCTION
+  offset_type get_bin_offsets() const { return bin_offsets; }
+
+  // Get the count for each bin
+  KOKKOS_INLINE_FUNCTION
+  bin_count_type get_bin_count() const { return bin_count_const; }
+
+ public:
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const bin_count_tag& /*tag*/, const int i) const {
+    const int j = range_begin + i;
+    bin_count_atomic(bin_op.bin(keys, j))++;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const bin_offset_tag& /*tag*/, const int i,
+                  value_type& offset, const bool& final) const {
+    if (final) {
+      bin_offsets(i) = offset;
+    }
+    offset += bin_count_const(i);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const bin_binning_tag& /*tag*/, const int i) const {
+    const int j     = range_begin + i;
+    const int bin   = bin_op.bin(keys, j);
+    const int count = bin_count_atomic(bin)++;
+
+    sort_order(bin_offsets(bin) + count) = j;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const bin_sort_bins_tag& /*tag*/, const int i) const {
+    auto bin_size = bin_count_const(i);
+    if (bin_size <= 1) return;
+    constexpr bool use_std_sort =
+        std::is_same_v<typename exec_space::memory_space, HostSpace>;
+    int lower_bound = bin_offsets(i);
+    int upper_bound = lower_bound + bin_size;
+    // Switching to std::sort for more than 10 elements has been found
+    // reasonable experimentally.
+    if (use_std_sort && bin_size > 10) {
+      KOKKOS_IF_ON_HOST(
+          (std::sort(&sort_order(lower_bound), &sort_order(upper_bound),
+                     [this](int p, int q) { return bin_op(keys_rnd, p, q); });))
+    } else {
+      for (int k = lower_bound + 1; k < upper_bound; ++k) {
+        int old_idx = sort_order(k);
+        int j       = k - 1;
+        while (j >= lower_bound) {
+          int new_idx = sort_order(j);
+          if (!bin_op(keys_rnd, old_idx, new_idx)) break;
+          sort_order(j + 1) = new_idx;
+          --j;
+        }
+        sort_order(j + 1) = old_idx;
+      }
+    }
+  }
+};
+
+}  // namespace Kokkos
+#endif
diff --git a/packages/kokkos/algorithms/src/sorting/Kokkos_NestedSortPublicAPI.hpp b/packages/kokkos/algorithms/src/sorting/Kokkos_NestedSortPublicAPI.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..dd468e07342c3ae6f3278321c0486c143ea83517
--- /dev/null
+++ b/packages/kokkos/algorithms/src/sorting/Kokkos_NestedSortPublicAPI.hpp
@@ -0,0 +1,100 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_NESTED_SORT_PUBLIC_API_HPP_
+#define KOKKOS_NESTED_SORT_PUBLIC_API_HPP_
+
+#include "impl/Kokkos_NestedSortImpl.hpp"
+#include <Kokkos_Core.hpp>
+#include <std_algorithms/impl/Kokkos_HelperPredicates.hpp>
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class TeamMember, class ViewType>
+KOKKOS_INLINE_FUNCTION void sort_team(const TeamMember& t,
+                                      const ViewType& view) {
+  Impl::sort_nested_impl(t, view, nullptr,
+                         Experimental::Impl::StdAlgoLessThanBinaryPredicate<
+                             typename ViewType::non_const_value_type>(),
+                         Impl::NestedRange<true>());
+}
+
+template <class TeamMember, class ViewType, class Comparator>
+KOKKOS_INLINE_FUNCTION void sort_team(const TeamMember& t, const ViewType& view,
+                                      const Comparator& comp) {
+  Impl::sort_nested_impl(t, view, nullptr, comp, Impl::NestedRange<true>());
+}
+
+template <class TeamMember, class KeyViewType, class ValueViewType>
+KOKKOS_INLINE_FUNCTION void sort_by_key_team(const TeamMember& t,
+                                             const KeyViewType& keyView,
+                                             const ValueViewType& valueView) {
+  Impl::sort_nested_impl(t, keyView, valueView,
+                         Experimental::Impl::StdAlgoLessThanBinaryPredicate<
+                             typename KeyViewType::non_const_value_type>(),
+                         Impl::NestedRange<true>());
+}
+
+template <class TeamMember, class KeyViewType, class ValueViewType,
+          class Comparator>
+KOKKOS_INLINE_FUNCTION void sort_by_key_team(const TeamMember& t,
+                                             const KeyViewType& keyView,
+                                             const ValueViewType& valueView,
+                                             const Comparator& comp) {
+  Impl::sort_nested_impl(t, keyView, valueView, comp,
+                         Impl::NestedRange<true>());
+}
+
+template <class TeamMember, class ViewType>
+KOKKOS_INLINE_FUNCTION void sort_thread(const TeamMember& t,
+                                        const ViewType& view) {
+  Impl::sort_nested_impl(t, view, nullptr,
+                         Experimental::Impl::StdAlgoLessThanBinaryPredicate<
+                             typename ViewType::non_const_value_type>(),
+                         Impl::NestedRange<false>());
+}
+
+template <class TeamMember, class ViewType, class Comparator>
+KOKKOS_INLINE_FUNCTION void sort_thread(const TeamMember& t,
+                                        const ViewType& view,
+                                        const Comparator& comp) {
+  Impl::sort_nested_impl(t, view, nullptr, comp, Impl::NestedRange<false>());
+}
+
+template <class TeamMember, class KeyViewType, class ValueViewType>
+KOKKOS_INLINE_FUNCTION void sort_by_key_thread(const TeamMember& t,
+                                               const KeyViewType& keyView,
+                                               const ValueViewType& valueView) {
+  Impl::sort_nested_impl(t, keyView, valueView,
+                         Experimental::Impl::StdAlgoLessThanBinaryPredicate<
+                             typename KeyViewType::non_const_value_type>(),
+                         Impl::NestedRange<false>());
+}
+
+template <class TeamMember, class KeyViewType, class ValueViewType,
+          class Comparator>
+KOKKOS_INLINE_FUNCTION void sort_by_key_thread(const TeamMember& t,
+                                               const KeyViewType& keyView,
+                                               const ValueViewType& valueView,
+                                               const Comparator& comp) {
+  Impl::sort_nested_impl(t, keyView, valueView, comp,
+                         Impl::NestedRange<false>());
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+#endif
diff --git a/packages/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp b/packages/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a763c41e580701aff17417be77c153577e984c80
--- /dev/null
+++ b/packages/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp
@@ -0,0 +1,194 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_SORT_PUBLIC_API_HPP_
+#define KOKKOS_SORT_PUBLIC_API_HPP_
+
+#include "./impl/Kokkos_SortImpl.hpp"
+#include <std_algorithms/Kokkos_BeginEnd.hpp>
+#include <Kokkos_Core.hpp>
+#include <algorithm>
+
+namespace Kokkos {
+
+// ---------------------------------------------------------------
+// basic overloads
+// ---------------------------------------------------------------
+
+template <class ExecutionSpace, class DataType, class... Properties>
+void sort([[maybe_unused]] const ExecutionSpace& exec,
+          const Kokkos::View<DataType, Properties...>& view) {
+  // constraints
+  using ViewType = Kokkos::View<DataType, Properties...>;
+  using MemSpace = typename ViewType::memory_space;
+  static_assert(
+      ViewType::rank == 1 &&
+          (std::is_same_v<typename ViewType::array_layout, LayoutRight> ||
+           std::is_same_v<typename ViewType::array_layout, LayoutLeft> ||
+           std::is_same_v<typename ViewType::array_layout, LayoutStride>),
+      "Kokkos::sort without comparator: supports 1D Views with LayoutRight, "
+      "LayoutLeft or LayoutStride.");
+
+  static_assert(SpaceAccessibility<ExecutionSpace, MemSpace>::accessible,
+                "Kokkos::sort: execution space instance is not able to access "
+                "the memory space of the "
+                "View argument!");
+
+  if (view.extent(0) <= 1) {
+    return;
+  }
+
+  if constexpr (Impl::better_off_calling_std_sort_v<ExecutionSpace>) {
+    auto first = ::Kokkos::Experimental::begin(view);
+    auto last  = ::Kokkos::Experimental::end(view);
+    std::sort(first, last);
+  } else {
+    Impl::sort_device_view_without_comparator(exec, view);
+  }
+}
+
+template <class DataType, class... Properties>
+void sort(const Kokkos::View<DataType, Properties...>& view) {
+  using ViewType = Kokkos::View<DataType, Properties...>;
+  static_assert(ViewType::rank == 1,
+                "Kokkos::sort: currently only supports rank-1 Views.");
+
+  Kokkos::fence("Kokkos::sort: before");
+
+  if (view.extent(0) <= 1) {
+    return;
+  }
+
+  typename ViewType::execution_space exec;
+  sort(exec, view);
+  exec.fence("Kokkos::sort: fence after sorting");
+}
+
+// ---------------------------------------------------------------
+// overloads supporting a custom comparator
+// ---------------------------------------------------------------
+template <class ExecutionSpace, class ComparatorType, class DataType,
+          class... Properties>
+void sort([[maybe_unused]] const ExecutionSpace& exec,
+          const Kokkos::View<DataType, Properties...>& view,
+          const ComparatorType& comparator) {
+  // constraints
+  using ViewType = Kokkos::View<DataType, Properties...>;
+  using MemSpace = typename ViewType::memory_space;
+  static_assert(
+      ViewType::rank == 1 &&
+          (std::is_same_v<typename ViewType::array_layout, LayoutRight> ||
+           std::is_same_v<typename ViewType::array_layout, LayoutLeft> ||
+           std::is_same_v<typename ViewType::array_layout, LayoutStride>),
+      "Kokkos::sort with comparator: supports 1D Views with LayoutRight, "
+      "LayoutLeft or LayoutStride.");
+
+  static_assert(SpaceAccessibility<ExecutionSpace, MemSpace>::accessible,
+                "Kokkos::sort: execution space instance is not able to access "
+                "the memory space of the View argument!");
+
+  if (view.extent(0) <= 1) {
+    return;
+  }
+
+  if constexpr (Impl::better_off_calling_std_sort_v<ExecutionSpace>) {
+    auto first = ::Kokkos::Experimental::begin(view);
+    auto last  = ::Kokkos::Experimental::end(view);
+    std::sort(first, last, comparator);
+  } else {
+    Impl::sort_device_view_with_comparator(exec, view, comparator);
+  }
+}
+
+template <class ComparatorType, class DataType, class... Properties>
+void sort(const Kokkos::View<DataType, Properties...>& view,
+          const ComparatorType& comparator) {
+  using ViewType = Kokkos::View<DataType, Properties...>;
+  static_assert(
+      ViewType::rank == 1 &&
+          (std::is_same_v<typename ViewType::array_layout, LayoutRight> ||
+           std::is_same_v<typename ViewType::array_layout, LayoutLeft> ||
+           std::is_same_v<typename ViewType::array_layout, LayoutStride>),
+      "Kokkos::sort with comparator: supports 1D Views with LayoutRight, "
+      "LayoutLeft or LayoutStride.");
+
+  Kokkos::fence("Kokkos::sort with comparator: before");
+
+  if (view.extent(0) <= 1) {
+    return;
+  }
+
+  typename ViewType::execution_space exec;
+  sort(exec, view, comparator);
+  exec.fence("Kokkos::sort with comparator: fence after sorting");
+}
+
+// ---------------------------------------------------------------
+// overloads for sorting a view with a subrange
+// specified via integers begin, end
+// ---------------------------------------------------------------
+
+template <class ExecutionSpace, class ViewType>
+std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
+    const ExecutionSpace& exec, ViewType view, size_t const begin,
+    size_t const end) {
+  // view must be rank-1 because the Impl::min_max_functor
+  // used below only works for rank-1 views for now
+  static_assert(ViewType::rank == 1,
+                "Kokkos::sort: currently only supports rank-1 Views.");
+
+  if (view.extent(0) <= 1) {
+    return;
+  }
+
+  using range_policy = Kokkos::RangePolicy<typename ViewType::execution_space>;
+  using CompType     = BinOp1D<ViewType>;
+
+  Kokkos::MinMaxScalar<typename ViewType::non_const_value_type> result;
+  Kokkos::MinMax<typename ViewType::non_const_value_type> reducer(result);
+
+  parallel_reduce("Kokkos::Sort::FindExtent", range_policy(exec, begin, end),
+                  Impl::min_max_functor<ViewType>(view), reducer);
+
+  if (result.min_val == result.max_val) return;
+
+  BinSort<ViewType, CompType> bin_sort(
+      exec, view, begin, end,
+      CompType((end - begin) / 2, result.min_val, result.max_val), true);
+
+  bin_sort.create_permute_vector(exec);
+  bin_sort.sort(exec, view, begin, end);
+}
+
+template <class ViewType>
+void sort(ViewType view, size_t const begin, size_t const end) {
+  // same constraints as the overload above which this gets dispatched to
+  static_assert(ViewType::rank == 1,
+                "Kokkos::sort: currently only supports rank-1 Views.");
+
+  Kokkos::fence("Kokkos::sort: before");
+
+  if (view.extent(0) <= 1) {
+    return;
+  }
+
+  typename ViewType::execution_space exec;
+  sort(exec, view, begin, end);
+  exec.fence("Kokkos::Sort: fence after sorting");
+}
+
+}  // namespace Kokkos
+#endif
diff --git a/packages/kokkos/algorithms/src/sorting/impl/Kokkos_CopyOpsForBinSortImpl.hpp b/packages/kokkos/algorithms/src/sorting/impl/Kokkos_CopyOpsForBinSortImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..07f5926d82ad907f4e91066f112f54842ed6c283
--- /dev/null
+++ b/packages/kokkos/algorithms/src/sorting/impl/Kokkos_CopyOpsForBinSortImpl.hpp
@@ -0,0 +1,61 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_COPY_OPS_FOR_BINSORT_IMPL_HPP_
+#define KOKKOS_COPY_OPS_FOR_BINSORT_IMPL_HPP_
+
+#include <Kokkos_Macros.hpp>
+#include <cstddef>
+
+namespace Kokkos {
+namespace Impl {
+
+template <class DstViewType, class SrcViewType, int Rank = DstViewType::rank>
+struct CopyOp;
+
+template <class DstViewType, class SrcViewType>
+struct CopyOp<DstViewType, SrcViewType, 1> {
+  KOKKOS_INLINE_FUNCTION
+  static void copy(DstViewType const& dst, size_t i_dst, SrcViewType const& src,
+                   size_t i_src) {
+    dst(i_dst) = src(i_src);
+  }
+};
+
+template <class DstViewType, class SrcViewType>
+struct CopyOp<DstViewType, SrcViewType, 2> {
+  KOKKOS_INLINE_FUNCTION
+  static void copy(DstViewType const& dst, size_t i_dst, SrcViewType const& src,
+                   size_t i_src) {
+    for (int j = 0; j < (int)dst.extent(1); j++) dst(i_dst, j) = src(i_src, j);
+  }
+};
+
+template <class DstViewType, class SrcViewType>
+struct CopyOp<DstViewType, SrcViewType, 3> {
+  KOKKOS_INLINE_FUNCTION
+  static void copy(DstViewType const& dst, size_t i_dst, SrcViewType const& src,
+                   size_t i_src) {
+    for (int j = 0; j < dst.extent(1); j++)
+      for (int k = 0; k < dst.extent(2); k++)
+        dst(i_dst, j, k) = src(i_src, j, k);
+  }
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp b/packages/kokkos/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..50ac82331957f186de8aaa1135ee34bb37ece83e
--- /dev/null
+++ b/packages/kokkos/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp
@@ -0,0 +1,115 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_NESTED_SORT_IMPL_HPP_
+#define KOKKOS_NESTED_SORT_IMPL_HPP_
+
+#include <Kokkos_Core.hpp>
+#include <std_algorithms/Kokkos_Swap.hpp>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+// true for TeamVectorRange, false for ThreadVectorRange
+template <bool teamLevel>
+struct NestedRange {};
+
+// Specialization for team-level
+template <>
+struct NestedRange<true> {
+  template <typename TeamMember, typename SizeType>
+  KOKKOS_FUNCTION static auto create(const TeamMember& t, SizeType len) {
+    return Kokkos::TeamVectorRange(t, len);
+  }
+  template <typename TeamMember>
+  KOKKOS_FUNCTION static void barrier(const TeamMember& t) {
+    t.team_barrier();
+  }
+};
+
+// Specialization for thread-level
+template <>
+struct NestedRange<false> {
+  template <typename TeamMember, typename SizeType>
+  KOKKOS_FUNCTION static auto create(const TeamMember& t, SizeType len) {
+    return Kokkos::ThreadVectorRange(t, len);
+  }
+  // Barrier is no-op, as vector lanes of a thread are implicitly synchronized
+  // after parallel region
+  template <typename TeamMember>
+  KOKKOS_FUNCTION static void barrier(const TeamMember&) {}
+};
+
+// When just doing sort (not sort_by_key), use nullptr_t for ValueViewType.
+// This only takes the NestedRange instance for template arg deduction.
+template <class TeamMember, class KeyViewType, class ValueViewType,
+          class Comparator, bool useTeamLevel>
+KOKKOS_INLINE_FUNCTION void sort_nested_impl(
+    const TeamMember& t, const KeyViewType& keyView,
+    [[maybe_unused]] const ValueViewType& valueView, const Comparator& comp,
+    const NestedRange<useTeamLevel>) {
+  using SizeType  = typename KeyViewType::size_type;
+  using KeyType   = typename KeyViewType::non_const_value_type;
+  using Range     = NestedRange<useTeamLevel>;
+  SizeType n      = keyView.extent(0);
+  SizeType npot   = 1;
+  SizeType levels = 0;
+  // FIXME: ceiling power-of-two is a common thing to need - make it a utility
+  while (npot < n) {
+    levels++;
+    npot <<= 1;
+  }
+  for (SizeType i = 0; i < levels; i++) {
+    for (SizeType j = 0; j <= i; j++) {
+      // n/2 pairs of items are compared in parallel
+      Kokkos::parallel_for(Range::create(t, npot / 2), [=](const SizeType k) {
+        // How big are the brown/pink boxes?
+        // (Terminology comes from Wikipedia diagram)
+        // https://commons.wikimedia.org/wiki/File:BitonicSort.svg#/media/File:BitonicSort.svg
+        SizeType boxSize = SizeType(2) << (i - j);
+        // Which box contains this thread?
+        SizeType boxID     = k >> (i - j);          // k * 2 / boxSize;
+        SizeType boxStart  = boxID << (1 + i - j);  // boxID * boxSize
+        SizeType boxOffset = k - (boxStart >> 1);   // k - boxID * boxSize / 2;
+        SizeType elem1     = boxStart + boxOffset;
+        // In first phase (j == 0, brown box): within a box, compare with the
+        // opposite value in the box.
+        // In later phases (j > 0, pink box): within a box, compare with fixed
+        // distance (boxSize / 2) apart.
+        SizeType elem2 = (j == 0) ? (boxStart + boxSize - 1 - boxOffset)
+                                  : (elem1 + boxSize / 2);
+        if (elem2 < n) {
+          KeyType key1 = keyView(elem1);
+          KeyType key2 = keyView(elem2);
+          if (comp(key2, key1)) {
+            keyView(elem1) = key2;
+            keyView(elem2) = key1;
+            if constexpr (!std::is_same_v<ValueViewType, std::nullptr_t>) {
+              Kokkos::Experimental::swap(valueView(elem1), valueView(elem2));
+            }
+          }
+        }
+      });
+      Range::barrier(t);
+    }
+  }
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+#endif
diff --git a/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp b/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d87ab09e7724b87ceaa1cace387e57952bf5ddd6
--- /dev/null
+++ b/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp
@@ -0,0 +1,369 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_SORT_FREE_FUNCS_IMPL_HPP_
+#define KOKKOS_SORT_FREE_FUNCS_IMPL_HPP_
+
+#include "../Kokkos_BinOpsPublicAPI.hpp"
+#include "../Kokkos_BinSortPublicAPI.hpp"
+#include <std_algorithms/Kokkos_BeginEnd.hpp>
+#include <std_algorithms/Kokkos_Copy.hpp>
+#include <Kokkos_Core.hpp>
+
+#if defined(KOKKOS_ENABLE_CUDA)
+
+// Workaround for `Instruction 'shfl' without '.sync' is not supported on
+// .target sm_70 and higher from PTX ISA version 6.4`.
+// Also see https://github.com/NVIDIA/cub/pull/170.
+#if !defined(CUB_USE_COOPERATIVE_GROUPS)
+#define CUB_USE_COOPERATIVE_GROUPS
+#endif
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wshadow"
+
+#if defined(KOKKOS_COMPILER_CLANG)
+// Some versions of Clang fail to compile Thrust, failing with errors like
+// this:
+//    <snip>/thrust/system/cuda/detail/core/agent_launcher.h:557:11:
+//    error: use of undeclared identifier 'va_printf'
+// The exact combination of versions for Clang and Thrust (or CUDA) for this
+// failure was not investigated, however even very recent version combination
+// (Clang 10.0.0 and Cuda 10.0) demonstrated failure.
+//
+// Defining _CubLog here locally allows us to avoid that code path, however
+// disabling some debugging diagnostics
+#pragma push_macro("_CubLog")
+#ifdef _CubLog
+#undef _CubLog
+#endif
+#define _CubLog
+#include <thrust/device_ptr.h>
+#include <thrust/sort.h>
+#pragma pop_macro("_CubLog")
+#else
+#include <thrust/device_ptr.h>
+#include <thrust/sort.h>
+#endif
+
+#pragma GCC diagnostic pop
+
+#endif
+
+#if defined(KOKKOS_ENABLE_ONEDPL)
+#include <oneapi/dpl/execution>
+#include <oneapi/dpl/algorithm>
+#endif
+
+namespace Kokkos {
+namespace Impl {
+
+template <class ExecutionSpace>
+struct better_off_calling_std_sort : std::false_type {};
+
+#if defined KOKKOS_ENABLE_SERIAL
+template <>
+struct better_off_calling_std_sort<Kokkos::Serial> : std::true_type {};
+#endif
+
+#if defined KOKKOS_ENABLE_OPENMP
+template <>
+struct better_off_calling_std_sort<Kokkos::OpenMP> : std::true_type {};
+#endif
+
+#if defined KOKKOS_ENABLE_THREADS
+template <>
+struct better_off_calling_std_sort<Kokkos::Threads> : std::true_type {};
+#endif
+
+#if defined KOKKOS_ENABLE_HPX
+template <>
+struct better_off_calling_std_sort<Kokkos::Experimental::HPX> : std::true_type {
+};
+#endif
+
+template <class T>
+inline constexpr bool better_off_calling_std_sort_v =
+    better_off_calling_std_sort<T>::value;
+
+template <class ViewType>
+struct min_max_functor {
+  using minmax_scalar =
+      Kokkos::MinMaxScalar<typename ViewType::non_const_value_type>;
+
+  ViewType view;
+  min_max_functor(const ViewType& view_) : view(view_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const size_t& i, minmax_scalar& minmax) const {
+    if (view(i) < minmax.min_val) minmax.min_val = view(i);
+    if (view(i) > minmax.max_val) minmax.max_val = view(i);
+  }
+};
+
+template <class ExecutionSpace, class DataType, class... Properties>
+void sort_via_binsort(const ExecutionSpace& exec,
+                      const Kokkos::View<DataType, Properties...>& view) {
+  // Although we are using BinSort below, which could work on rank-2 views,
+  // for now view must be rank-1 because the min_max_functor
+  // used below only works for rank-1 views
+  using ViewType = Kokkos::View<DataType, Properties...>;
+  static_assert(ViewType::rank == 1,
+                "Kokkos::sort: currently only supports rank-1 Views.");
+
+  if (view.extent(0) <= 1) {
+    return;
+  }
+
+  Kokkos::MinMaxScalar<typename ViewType::non_const_value_type> result;
+  Kokkos::MinMax<typename ViewType::non_const_value_type> reducer(result);
+  parallel_reduce("Kokkos::Sort::FindExtent",
+                  Kokkos::RangePolicy<typename ViewType::execution_space>(
+                      exec, 0, view.extent(0)),
+                  min_max_functor<ViewType>(view), reducer);
+  if (result.min_val == result.max_val) return;
+  // For integral types the number of bins may be larger than the range
+  // in which case we can exactly have one unique value per bin
+  // and then don't need to sort bins.
+  bool sort_in_bins = true;
+  // TODO: figure out better max_bins then this ...
+  int64_t max_bins = view.extent(0) / 2;
+  if (std::is_integral<typename ViewType::non_const_value_type>::value) {
+    // Cast to double to avoid possible overflow when using integer
+    auto const max_val = static_cast<double>(result.max_val);
+    auto const min_val = static_cast<double>(result.min_val);
+    // using 10M as the cutoff for special behavior (roughly 40MB for the count
+    // array)
+    if ((max_val - min_val) < 10000000) {
+      max_bins     = max_val - min_val + 1;
+      sort_in_bins = false;
+    }
+  }
+  if (std::is_floating_point<typename ViewType::non_const_value_type>::value) {
+    KOKKOS_ASSERT(std::isfinite(static_cast<double>(result.max_val) -
+                                static_cast<double>(result.min_val)));
+  }
+
+  using CompType = BinOp1D<ViewType>;
+  BinSort<ViewType, CompType> bin_sort(
+      view, CompType(max_bins, result.min_val, result.max_val), sort_in_bins);
+  bin_sort.create_permute_vector(exec);
+  bin_sort.sort(exec, view);
+}
+
+#if defined(KOKKOS_ENABLE_CUDA)
+template <class DataType, class... Properties, class... MaybeComparator>
+void sort_cudathrust(const Cuda& space,
+                     const Kokkos::View<DataType, Properties...>& view,
+                     MaybeComparator&&... maybeComparator) {
+  using ViewType = Kokkos::View<DataType, Properties...>;
+  static_assert(ViewType::rank == 1,
+                "Kokkos::sort: currently only supports rank-1 Views.");
+
+  if (view.extent(0) <= 1) {
+    return;
+  }
+  const auto exec = thrust::cuda::par.on(space.cuda_stream());
+  auto first      = ::Kokkos::Experimental::begin(view);
+  auto last       = ::Kokkos::Experimental::end(view);
+  thrust::sort(exec, first, last,
+               std::forward<MaybeComparator>(maybeComparator)...);
+}
+#endif
+
+#if defined(KOKKOS_ENABLE_ONEDPL)
+template <class DataType, class... Properties, class... MaybeComparator>
+void sort_onedpl(const Kokkos::Experimental::SYCL& space,
+                 const Kokkos::View<DataType, Properties...>& view,
+                 MaybeComparator&&... maybeComparator) {
+  using ViewType = Kokkos::View<DataType, Properties...>;
+  static_assert(SpaceAccessibility<Kokkos::Experimental::SYCL,
+                                   typename ViewType::memory_space>::accessible,
+                "SYCL execution space is not able to access the memory space "
+                "of the View argument!");
+
+  static_assert(
+      (ViewType::rank == 1) &&
+          (std::is_same_v<typename ViewType::array_layout, LayoutRight> ||
+           std::is_same_v<typename ViewType::array_layout, LayoutLeft> ||
+           std::is_same_v<typename ViewType::array_layout, LayoutStride>),
+      "SYCL sort only supports contiguous rank-1 Views with LayoutLeft, "
+      "LayoutRight or LayoutStride"
+      "For the latter, this means the View must have stride(0) = 1, enforced "
+      "at runtime.");
+
+  if (view.stride(0) != 1) {
+    Kokkos::abort("SYCL sort only supports rank-1 Views with stride(0) = 1.");
+  }
+
+  if (view.extent(0) <= 1) {
+    return;
+  }
+
+  // Can't use Experimental::begin/end here since the oneDPL then assumes that
+  // the data is on the host.
+  auto queue  = space.sycl_queue();
+  auto policy = oneapi::dpl::execution::make_device_policy(queue);
+  const int n = view.extent(0);
+  oneapi::dpl::sort(policy, view.data(), view.data() + n,
+                    std::forward<MaybeComparator>(maybeComparator)...);
+}
+#endif
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class... MaybeComparator>
+void copy_to_host_run_stdsort_copy_back(
+    const ExecutionSpace& exec,
+    const Kokkos::View<DataType, Properties...>& view,
+    MaybeComparator&&... maybeComparator) {
+  namespace KE = ::Kokkos::Experimental;
+
+  using ViewType = Kokkos::View<DataType, Properties...>;
+  using layout   = typename ViewType::array_layout;
+  if constexpr (std::is_same_v<LayoutStride, layout>) {
+    // for strided views we cannot just deep_copy from device to host,
+    // so we need to do a few more jumps
+    using view_value_type      = typename ViewType::non_const_value_type;
+    using view_exespace        = typename ViewType::execution_space;
+    using view_deep_copyable_t = Kokkos::View<view_value_type*, view_exespace>;
+    view_deep_copyable_t view_dc("view_dc", view.extent(0));
+    KE::copy(exec, view, view_dc);
+
+    // run sort on the mirror of view_dc
+    auto mv_h  = create_mirror_view_and_copy(Kokkos::HostSpace(), view_dc);
+    auto first = KE::begin(mv_h);
+    auto last  = KE::end(mv_h);
+    std::sort(first, last, std::forward<MaybeComparator>(maybeComparator)...);
+    Kokkos::deep_copy(exec, view_dc, mv_h);
+
+    // copy back to argument view
+    KE::copy(exec, KE::cbegin(view_dc), KE::cend(view_dc), KE::begin(view));
+  } else {
+    auto view_h = create_mirror_view_and_copy(Kokkos::HostSpace(), view);
+    auto first  = KE::begin(view_h);
+    auto last   = KE::end(view_h);
+    std::sort(first, last, std::forward<MaybeComparator>(maybeComparator)...);
+    Kokkos::deep_copy(exec, view, view_h);
+  }
+}
+
+// --------------------------------------------------
+//
+// specialize cases for sorting without comparator
+//
+// --------------------------------------------------
+
+#if defined(KOKKOS_ENABLE_CUDA)
+template <class DataType, class... Properties>
+void sort_device_view_without_comparator(
+    const Cuda& exec, const Kokkos::View<DataType, Properties...>& view) {
+  sort_cudathrust(exec, view);
+}
+#endif
+
+#if defined(KOKKOS_ENABLE_ONEDPL)
+template <class DataType, class... Properties>
+void sort_device_view_without_comparator(
+    const Kokkos::Experimental::SYCL& exec,
+    const Kokkos::View<DataType, Properties...>& view) {
+  using ViewType = Kokkos::View<DataType, Properties...>;
+  static_assert(
+      (ViewType::rank == 1) &&
+          (std::is_same_v<typename ViewType::array_layout, LayoutRight> ||
+           std::is_same_v<typename ViewType::array_layout, LayoutLeft> ||
+           std::is_same_v<typename ViewType::array_layout, LayoutStride>),
+      "sort_device_view_without_comparator: supports rank-1 Views "
+      "with LayoutLeft, LayoutRight or LayoutStride");
+
+  if (view.stride(0) == 1) {
+    sort_onedpl(exec, view);
+  } else {
+    copy_to_host_run_stdsort_copy_back(exec, view);
+  }
+}
+#endif
+
+// fallback case
+template <class ExecutionSpace, class DataType, class... Properties>
+std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value>
+sort_device_view_without_comparator(
+    const ExecutionSpace& exec,
+    const Kokkos::View<DataType, Properties...>& view) {
+  sort_via_binsort(exec, view);
+}
+
+// --------------------------------------------------
+//
+// specialize cases for sorting with comparator
+//
+// --------------------------------------------------
+
+#if defined(KOKKOS_ENABLE_CUDA)
+template <class ComparatorType, class DataType, class... Properties>
+void sort_device_view_with_comparator(
+    const Cuda& exec, const Kokkos::View<DataType, Properties...>& view,
+    const ComparatorType& comparator) {
+  sort_cudathrust(exec, view, comparator);
+}
+#endif
+
+#if defined(KOKKOS_ENABLE_ONEDPL)
+template <class ComparatorType, class DataType, class... Properties>
+void sort_device_view_with_comparator(
+    const Kokkos::Experimental::SYCL& exec,
+    const Kokkos::View<DataType, Properties...>& view,
+    const ComparatorType& comparator) {
+  using ViewType = Kokkos::View<DataType, Properties...>;
+  static_assert(
+      (ViewType::rank == 1) &&
+          (std::is_same_v<typename ViewType::array_layout, LayoutRight> ||
+           std::is_same_v<typename ViewType::array_layout, LayoutLeft> ||
+           std::is_same_v<typename ViewType::array_layout, LayoutStride>),
+      "sort_device_view_with_comparator: supports rank-1 Views "
+      "with LayoutLeft, LayoutRight or LayoutStride");
+
+  if (view.stride(0) == 1) {
+    sort_onedpl(exec, view, comparator);
+  } else {
+    copy_to_host_run_stdsort_copy_back(exec, view, comparator);
+  }
+}
+#endif
+
+template <class ExecutionSpace, class ComparatorType, class DataType,
+          class... Properties>
+std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value>
+sort_device_view_with_comparator(
+    const ExecutionSpace& exec,
+    const Kokkos::View<DataType, Properties...>& view,
+    const ComparatorType& comparator) {
+  // This is a fallback case if a more specialized overload does not exist:
+  // for now, this fallback copies data to host, runs std::sort
+  // and then copies data back. Potentially, this can later be changed
+  // with a better solution like our own quicksort on device or similar.
+
+  using ViewType = Kokkos::View<DataType, Properties...>;
+  using MemSpace = typename ViewType::memory_space;
+  static_assert(!SpaceAccessibility<HostSpace, MemSpace>::accessible,
+                "Impl::sort_device_view_with_comparator: should not be called "
+                "on a view that is already accessible on the host");
+
+  copy_to_host_run_stdsort_copy_back(exec, view, comparator);
+}
+
+}  // namespace Impl
+}  // namespace Kokkos
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_AdjacentDifference.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_AdjacentDifference.hpp
index 38dcd1a6743610fcd6859164133a04ebc56add4a..f254686dbaf0aa5434d9133614ee9b25dbed1597 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_AdjacentDifference.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_AdjacentDifference.hpp
@@ -23,64 +23,85 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType>
-std::enable_if_t<!::Kokkos::is_view<InputIteratorType>::value,
-                 OutputIteratorType>
-adjacent_difference(const ExecutionSpace& ex, InputIteratorType first_from,
-                    InputIteratorType last_from,
-                    OutputIteratorType first_dest) {
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename InputIteratorType,
+    typename OutputIteratorType,
+    std::enable_if_t<!::Kokkos::is_view<InputIteratorType>::value &&
+                         ::Kokkos::is_execution_space<ExecutionSpace>::value,
+                     int> = 0>
+OutputIteratorType adjacent_difference(const ExecutionSpace& ex,
+                                       InputIteratorType first_from,
+                                       InputIteratorType last_from,
+                                       OutputIteratorType first_dest) {
   using value_type1 = typename InputIteratorType::value_type;
   using value_type2 = typename OutputIteratorType::value_type;
   using binary_op =
       Impl::StdAdjacentDifferenceDefaultBinaryOpFunctor<value_type1,
                                                         value_type2>;
-  return Impl::adjacent_difference_impl(
+  return Impl::adjacent_difference_exespace_impl(
       "Kokkos::adjacent_difference_iterator_api", ex, first_from, last_from,
       first_dest, binary_op());
 }
 
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class BinaryOp>
-std::enable_if_t<!::Kokkos::is_view<InputIteratorType>::value,
-                 OutputIteratorType>
-adjacent_difference(const ExecutionSpace& ex, InputIteratorType first_from,
-                    InputIteratorType last_from, OutputIteratorType first_dest,
-                    BinaryOp bin_op) {
-  return Impl::adjacent_difference_impl(
+template <
+    typename ExecutionSpace, typename InputIteratorType,
+    typename OutputIteratorType, typename BinaryOp,
+    std::enable_if_t<!::Kokkos::is_view<InputIteratorType>::value &&
+                         ::Kokkos::is_execution_space<ExecutionSpace>::value,
+                     int> = 0>
+OutputIteratorType adjacent_difference(const ExecutionSpace& ex,
+                                       InputIteratorType first_from,
+                                       InputIteratorType last_from,
+                                       OutputIteratorType first_dest,
+                                       BinaryOp bin_op) {
+  return Impl::adjacent_difference_exespace_impl(
       "Kokkos::adjacent_difference_iterator_api", ex, first_from, last_from,
       first_dest, bin_op);
 }
 
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType>
-std::enable_if_t<!::Kokkos::is_view<InputIteratorType>::value,
-                 OutputIteratorType>
-adjacent_difference(const std::string& label, const ExecutionSpace& ex,
-                    InputIteratorType first_from, InputIteratorType last_from,
-                    OutputIteratorType first_dest) {
+template <
+    typename ExecutionSpace, typename InputIteratorType,
+    typename OutputIteratorType,
+    std::enable_if_t<!::Kokkos::is_view<InputIteratorType>::value &&
+                         ::Kokkos::is_execution_space<ExecutionSpace>::value,
+                     int> = 0>
+OutputIteratorType adjacent_difference(const std::string& label,
+                                       const ExecutionSpace& ex,
+                                       InputIteratorType first_from,
+                                       InputIteratorType last_from,
+                                       OutputIteratorType first_dest) {
   using value_type1 = typename InputIteratorType::value_type;
   using value_type2 = typename OutputIteratorType::value_type;
   using binary_op =
       Impl::StdAdjacentDifferenceDefaultBinaryOpFunctor<value_type1,
                                                         value_type2>;
-  return Impl::adjacent_difference_impl(label, ex, first_from, last_from,
-                                        first_dest, binary_op());
+  return Impl::adjacent_difference_exespace_impl(
+      label, ex, first_from, last_from, first_dest, binary_op());
 }
 
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class BinaryOp>
-std::enable_if_t<!::Kokkos::is_view<InputIteratorType>::value,
-                 OutputIteratorType>
-adjacent_difference(const std::string& label, const ExecutionSpace& ex,
-                    InputIteratorType first_from, InputIteratorType last_from,
-                    OutputIteratorType first_dest, BinaryOp bin_op) {
-  return Impl::adjacent_difference_impl(label, ex, first_from, last_from,
-                                        first_dest, bin_op);
+template <
+    typename ExecutionSpace, typename InputIteratorType,
+    typename OutputIteratorType, typename BinaryOp,
+    std::enable_if_t<!::Kokkos::is_view<InputIteratorType>::value &&
+                         ::Kokkos::is_execution_space<ExecutionSpace>::value,
+                     int> = 0>
+OutputIteratorType adjacent_difference(const std::string& label,
+                                       const ExecutionSpace& ex,
+                                       InputIteratorType first_from,
+                                       InputIteratorType last_from,
+                                       OutputIteratorType first_dest,
+                                       BinaryOp bin_op) {
+  return Impl::adjacent_difference_exespace_impl(label, ex, first_from,
+                                                 last_from, first_dest, bin_op);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <typename ExecutionSpace, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2,
+          std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
 auto adjacent_difference(
     const ExecutionSpace& ex,
     const ::Kokkos::View<DataType1, Properties1...>& view_from,
@@ -96,13 +117,15 @@ auto adjacent_difference(
   using binary_op =
       Impl::StdAdjacentDifferenceDefaultBinaryOpFunctor<value_type1,
                                                         value_type2>;
-  return Impl::adjacent_difference_impl(
+  return Impl::adjacent_difference_exespace_impl(
       "Kokkos::adjacent_difference_view_api", ex, KE::cbegin(view_from),
       KE::cend(view_from), KE::begin(view_dest), binary_op());
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryOp>
+template <typename ExecutionSpace, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2, typename BinaryOp,
+          std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
 auto adjacent_difference(
     const ExecutionSpace& ex,
     const ::Kokkos::View<DataType1, Properties1...>& view_from,
@@ -111,13 +134,15 @@ auto adjacent_difference(
   namespace KE = ::Kokkos::Experimental;
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-  return Impl::adjacent_difference_impl(
+  return Impl::adjacent_difference_exespace_impl(
       "Kokkos::adjacent_difference_view_api", ex, KE::cbegin(view_from),
       KE::cend(view_from), KE::begin(view_dest), bin_op);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <typename ExecutionSpace, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2,
+          std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
 auto adjacent_difference(
     const std::string& label, const ExecutionSpace& ex,
     const ::Kokkos::View<DataType1, Properties1...>& view_from,
@@ -134,13 +159,15 @@ auto adjacent_difference(
       Impl::StdAdjacentDifferenceDefaultBinaryOpFunctor<value_type1,
                                                         value_type2>;
 
-  return Impl::adjacent_difference_impl(label, ex, KE::cbegin(view_from),
-                                        KE::cend(view_from),
-                                        KE::begin(view_dest), binary_op());
+  return Impl::adjacent_difference_exespace_impl(
+      label, ex, KE::cbegin(view_from), KE::cend(view_from),
+      KE::begin(view_dest), binary_op());
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryOp>
+template <typename ExecutionSpace, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2, typename BinaryOp,
+          std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
 auto adjacent_difference(
     const std::string& label, const ExecutionSpace& ex,
     const ::Kokkos::View<DataType1, Properties1...>& view_from,
@@ -149,9 +176,85 @@ auto adjacent_difference(
   namespace KE = ::Kokkos::Experimental;
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-  return Impl::adjacent_difference_impl(label, ex, KE::cbegin(view_from),
-                                        KE::cend(view_from),
-                                        KE::begin(view_dest), bin_op);
+  return Impl::adjacent_difference_exespace_impl(
+      label, ex, KE::cbegin(view_from), KE::cend(view_from),
+      KE::begin(view_dest), bin_op);
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename InputIteratorType,
+          typename OutputIteratorType,
+          std::enable_if_t<!::Kokkos::is_view<InputIteratorType>::value &&
+                               ::Kokkos::is_team_handle<TeamHandleType>::value,
+                           int> = 0>
+KOKKOS_FUNCTION OutputIteratorType adjacent_difference(
+    const TeamHandleType& teamHandle, InputIteratorType first_from,
+    InputIteratorType last_from, OutputIteratorType first_dest) {
+  using value_type1 = typename InputIteratorType::value_type;
+  using value_type2 = typename OutputIteratorType::value_type;
+  using binary_op =
+      Impl::StdAdjacentDifferenceDefaultBinaryOpFunctor<value_type1,
+                                                        value_type2>;
+  return Impl::adjacent_difference_team_impl(teamHandle, first_from, last_from,
+                                             first_dest, binary_op());
+}
+
+template <typename TeamHandleType, typename InputIteratorType,
+          typename OutputIteratorType, typename BinaryOp,
+          std::enable_if_t<!::Kokkos::is_view<InputIteratorType>::value &&
+                               ::Kokkos::is_team_handle<TeamHandleType>::value,
+                           int> = 0>
+KOKKOS_FUNCTION OutputIteratorType
+adjacent_difference(const TeamHandleType& teamHandle,
+                    InputIteratorType first_from, InputIteratorType last_from,
+                    OutputIteratorType first_dest, BinaryOp bin_op) {
+  return Impl::adjacent_difference_team_impl(teamHandle, first_from, last_from,
+                                             first_dest, bin_op);
+}
+
+template <
+    typename TeamHandleType, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2,
+    std::enable_if_t<::Kokkos::is_team_handle<TeamHandleType>::value, int> = 0>
+KOKKOS_FUNCTION auto adjacent_difference(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+    const ::Kokkos::View<DataType2, Properties2...>& view_dest) {
+  namespace KE = ::Kokkos::Experimental;
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+
+  using view_type1  = ::Kokkos::View<DataType1, Properties1...>;
+  using view_type2  = ::Kokkos::View<DataType2, Properties2...>;
+  using value_type1 = typename view_type1::value_type;
+  using value_type2 = typename view_type2::value_type;
+  using binary_op =
+      Impl::StdAdjacentDifferenceDefaultBinaryOpFunctor<value_type1,
+                                                        value_type2>;
+  return Impl::adjacent_difference_team_impl(teamHandle, KE::cbegin(view_from),
+                                             KE::cend(view_from),
+                                             KE::begin(view_dest), binary_op());
+}
+
+template <
+    typename TeamHandleType, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename BinaryOp,
+    std::enable_if_t<::Kokkos::is_team_handle<TeamHandleType>::value, int> = 0>
+KOKKOS_FUNCTION auto adjacent_difference(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+    BinaryOp bin_op) {
+  namespace KE = ::Kokkos::Experimental;
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  return Impl::adjacent_difference_team_impl(teamHandle, KE::cbegin(view_from),
+                                             KE::cend(view_from),
+                                             KE::begin(view_dest), bin_op);
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_AdjacentFind.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_AdjacentFind.hpp
index 43c2b660107b6d27223f7004fb5d7c52babc3592..ac476ca5bfac4a3256fabef4e80c84fd002b48d1 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_AdjacentFind.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_AdjacentFind.hpp
@@ -23,71 +23,144 @@
 namespace Kokkos {
 namespace Experimental {
 
+//
+// overload set accepting execution space
+//
+
 // overload set1
-template <class ExecutionSpace, class IteratorType>
+template <
+    typename ExecutionSpace, typename IteratorType,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType adjacent_find(const ExecutionSpace& ex, IteratorType first,
                            IteratorType last) {
-  return Impl::adjacent_find_impl("Kokkos::adjacent_find_iterator_api_default",
-                                  ex, first, last);
+  return Impl::adjacent_find_exespace_impl(
+      "Kokkos::adjacent_find_iterator_api_default", ex, first, last);
 }
 
-template <class ExecutionSpace, class IteratorType>
+template <
+    typename ExecutionSpace, typename IteratorType,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType adjacent_find(const std::string& label, const ExecutionSpace& ex,
                            IteratorType first, IteratorType last) {
-  return Impl::adjacent_find_impl(label, ex, first, last);
+  return Impl::adjacent_find_exespace_impl(label, ex, first, last);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto adjacent_find(const ExecutionSpace& ex,
                    const ::Kokkos::View<DataType, Properties...>& v) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
   namespace KE = ::Kokkos::Experimental;
-  return Impl::adjacent_find_impl("Kokkos::adjacent_find_view_api_default", ex,
-                                  KE::begin(v), KE::end(v));
+  return Impl::adjacent_find_exespace_impl(
+      "Kokkos::adjacent_find_view_api_default", ex, KE::begin(v), KE::end(v));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto adjacent_find(const std::string& label, const ExecutionSpace& ex,
                    const ::Kokkos::View<DataType, Properties...>& v) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
   namespace KE = ::Kokkos::Experimental;
-  return Impl::adjacent_find_impl(label, ex, KE::begin(v), KE::end(v));
+  return Impl::adjacent_find_exespace_impl(label, ex, KE::begin(v), KE::end(v));
 }
 
 // overload set2
-template <class ExecutionSpace, class IteratorType, class BinaryPredicateType>
+template <
+    typename ExecutionSpace, typename IteratorType,
+    typename BinaryPredicateType,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType adjacent_find(const ExecutionSpace& ex, IteratorType first,
                            IteratorType last, BinaryPredicateType pred) {
-  return Impl::adjacent_find_impl("Kokkos::adjacent_find_iterator_api_default",
-                                  ex, first, last, pred);
+  return Impl::adjacent_find_exespace_impl(
+      "Kokkos::adjacent_find_iterator_api_default", ex, first, last, pred);
 }
 
-template <class ExecutionSpace, class IteratorType, class BinaryPredicateType>
+template <
+    typename ExecutionSpace, typename IteratorType,
+    typename BinaryPredicateType,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType adjacent_find(const std::string& label, const ExecutionSpace& ex,
                            IteratorType first, IteratorType last,
                            BinaryPredicateType pred) {
-  return Impl::adjacent_find_impl(label, ex, first, last, pred);
+  return Impl::adjacent_find_exespace_impl(label, ex, first, last, pred);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class BinaryPredicateType>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    typename BinaryPredicateType,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto adjacent_find(const ExecutionSpace& ex,
                    const ::Kokkos::View<DataType, Properties...>& v,
                    BinaryPredicateType pred) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
   namespace KE = ::Kokkos::Experimental;
-  return Impl::adjacent_find_impl("Kokkos::adjacent_find_view_api_default", ex,
-                                  KE::begin(v), KE::end(v), pred);
+  return Impl::adjacent_find_exespace_impl(
+      "Kokkos::adjacent_find_view_api_default", ex, KE::begin(v), KE::end(v),
+      pred);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class BinaryPredicateType>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    typename BinaryPredicateType,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto adjacent_find(const std::string& label, const ExecutionSpace& ex,
                    const ::Kokkos::View<DataType, Properties...>& v,
                    BinaryPredicateType pred) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
   namespace KE = ::Kokkos::Experimental;
-  return Impl::adjacent_find_impl(label, ex, KE::begin(v), KE::end(v), pred);
+  return Impl::adjacent_find_exespace_impl(label, ex, KE::begin(v), KE::end(v),
+                                           pred);
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+
+// overload set1
+template <typename TeamHandleType, typename IteratorType,
+          std::enable_if_t<Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION IteratorType adjacent_find(const TeamHandleType& teamHandle,
+                                           IteratorType first,
+                                           IteratorType last) {
+  return Impl::adjacent_find_team_impl(teamHandle, first, last);
+}
+
+template <typename TeamHandleType, typename DataType, typename... Properties,
+          std::enable_if_t<Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto adjacent_find(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType, Properties...>& v) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::adjacent_find_team_impl(teamHandle, KE::begin(v), KE::end(v));
+}
+
+// overload set2
+template <typename TeamHandleType, typename IteratorType,
+          typename BinaryPredicateType,
+          std::enable_if_t<Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION IteratorType adjacent_find(const TeamHandleType& teamHandle,
+                                           IteratorType first,
+                                           IteratorType last,
+                                           BinaryPredicateType pred) {
+  return Impl::adjacent_find_team_impl(teamHandle, first, last, pred);
+}
+
+template <typename TeamHandleType, typename DataType, typename... Properties,
+          typename BinaryPredicateType,
+          std::enable_if_t<Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto adjacent_find(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType, Properties...>& v,
+    BinaryPredicateType pred) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::adjacent_find_team_impl(teamHandle, KE::begin(v), KE::end(v),
+                                       pred);
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_AllOf.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_AllOf.hpp
index 2ffec7e144a0c504e666757778b1e2960dd988ad..d6ed4c4a7e0c2d8400671d1dd300fbc5339b819c 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_AllOf.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_AllOf.hpp
@@ -23,41 +23,79 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class InputIterator, class Predicate>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename InputIterator, typename Predicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool all_of(const ExecutionSpace& ex, InputIterator first, InputIterator last,
             Predicate predicate) {
-  return Impl::all_of_impl("Kokkos::all_of_iterator_api_default", ex, first,
-                           last, predicate);
+  return Impl::all_of_exespace_impl("Kokkos::all_of_iterator_api_default", ex,
+                                    first, last, predicate);
 }
 
-template <class ExecutionSpace, class InputIterator, class Predicate>
+template <
+    typename ExecutionSpace, typename InputIterator, typename Predicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool all_of(const std::string& label, const ExecutionSpace& ex,
             InputIterator first, InputIterator last, Predicate predicate) {
-  return Impl::all_of_impl(label, ex, first, last, predicate);
+  return Impl::all_of_exespace_impl(label, ex, first, last, predicate);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class Predicate>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    typename Predicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool all_of(const ExecutionSpace& ex,
             const ::Kokkos::View<DataType, Properties...>& v,
             Predicate predicate) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::all_of_impl("Kokkos::all_of_view_api_default", ex, KE::cbegin(v),
-                           KE::cend(v), std::move(predicate));
+  return Impl::all_of_exespace_impl("Kokkos::all_of_view_api_default", ex,
+                                    KE::cbegin(v), KE::cend(v),
+                                    std::move(predicate));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class Predicate>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    typename Predicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool all_of(const std::string& label, const ExecutionSpace& ex,
             const ::Kokkos::View<DataType, Properties...>& v,
             Predicate predicate) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::all_of_impl(label, ex, KE::cbegin(v), KE::cend(v),
-                           std::move(predicate));
+  return Impl::all_of_exespace_impl(label, ex, KE::cbegin(v), KE::cend(v),
+                                    std::move(predicate));
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename InputIterator, typename Predicate,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION bool all_of(const TeamHandleType& teamHandle,
+                            InputIterator first, InputIterator last,
+                            Predicate predicate) {
+  return Impl::all_of_team_impl(teamHandle, first, last, predicate);
+}
+
+template <typename TeamHandleType, typename DataType, typename... Properties,
+          typename Predicate,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION bool all_of(const TeamHandleType& teamHandle,
+                            const ::Kokkos::View<DataType, Properties...>& v,
+                            Predicate predicate) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::all_of_team_impl(teamHandle, KE::cbegin(v), KE::cend(v),
+                                std::move(predicate));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_AnyOf.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_AnyOf.hpp
index 019c466c6d2f3701b827e1a566e1ef1daaf71a2f..82356e65982ef21c3102f580033c1d17e91509d1 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_AnyOf.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_AnyOf.hpp
@@ -23,41 +23,79 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class InputIterator, class Predicate>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename InputIterator, typename Predicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool any_of(const ExecutionSpace& ex, InputIterator first, InputIterator last,
             Predicate predicate) {
-  return Impl::any_of_impl("Kokkos::any_of_view_api_default", ex, first, last,
-                           predicate);
+  return Impl::any_of_exespace_impl("Kokkos::any_of_view_api_default", ex,
+                                    first, last, predicate);
 }
 
-template <class ExecutionSpace, class InputIterator, class Predicate>
+template <
+    typename ExecutionSpace, typename InputIterator, typename Predicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool any_of(const std::string& label, const ExecutionSpace& ex,
             InputIterator first, InputIterator last, Predicate predicate) {
-  return Impl::any_of_impl(label, ex, first, last, predicate);
+  return Impl::any_of_exespace_impl(label, ex, first, last, predicate);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class Predicate>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    typename Predicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool any_of(const ExecutionSpace& ex,
             const ::Kokkos::View<DataType, Properties...>& v,
             Predicate predicate) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::any_of_impl("Kokkos::any_of_view_api_default", ex, KE::cbegin(v),
-                           KE::cend(v), std::move(predicate));
+  return Impl::any_of_exespace_impl("Kokkos::any_of_view_api_default", ex,
+                                    KE::cbegin(v), KE::cend(v),
+                                    std::move(predicate));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class Predicate>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    typename Predicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool any_of(const std::string& label, const ExecutionSpace& ex,
             const ::Kokkos::View<DataType, Properties...>& v,
             Predicate predicate) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::any_of_impl(label, ex, KE::cbegin(v), KE::cend(v),
-                           std::move(predicate));
+  return Impl::any_of_exespace_impl(label, ex, KE::cbegin(v), KE::cend(v),
+                                    std::move(predicate));
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename InputIterator, typename Predicate,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION bool any_of(const TeamHandleType& teamHandle,
+                            InputIterator first, InputIterator last,
+                            Predicate predicate) {
+  return Impl::any_of_team_impl(teamHandle, first, last, predicate);
+}
+
+template <typename TeamHandleType, typename DataType, typename... Properties,
+          typename Predicate,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION bool any_of(const TeamHandleType& teamHandle,
+                            const ::Kokkos::View<DataType, Properties...>& v,
+                            Predicate predicate) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::any_of_team_impl(teamHandle, KE::cbegin(v), KE::cend(v),
+                                std::move(predicate));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Copy.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Copy.hpp
index 028f3b66b2dce3fd61a823361eb60c899718cc57..b7ce1ba5edb335d2b66ec4d2f5abb0f2fb4ef552 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Copy.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Copy.hpp
@@ -23,22 +23,31 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class InputIterator, class OutputIterator>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename InputIterator, typename OutputIterator,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 OutputIterator copy(const ExecutionSpace& ex, InputIterator first,
                     InputIterator last, OutputIterator d_first) {
-  return Impl::copy_impl("Kokkos::copy_iterator_api_default", ex, first, last,
-                         d_first);
+  return Impl::copy_exespace_impl("Kokkos::copy_iterator_api_default", ex,
+                                  first, last, d_first);
 }
 
-template <class ExecutionSpace, class InputIterator, class OutputIterator>
+template <
+    typename ExecutionSpace, typename InputIterator, typename OutputIterator,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 OutputIterator copy(const std::string& label, const ExecutionSpace& ex,
                     InputIterator first, InputIterator last,
                     OutputIterator d_first) {
-  return Impl::copy_impl(label, ex, first, last, d_first);
+  return Impl::copy_exespace_impl(label, ex, first, last, d_first);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto copy(const ExecutionSpace& ex,
           const ::Kokkos::View<DataType1, Properties1...>& source,
           ::Kokkos::View<DataType2, Properties2...>& dest) {
@@ -46,12 +55,15 @@ auto copy(const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::copy_impl("Kokkos::copy_view_api_default", ex,
-                         KE::cbegin(source), KE::cend(source), KE::begin(dest));
+  return Impl::copy_exespace_impl("Kokkos::copy_view_api_default", ex,
+                                  KE::cbegin(source), KE::cend(source),
+                                  KE::begin(dest));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto copy(const std::string& label, const ExecutionSpace& ex,
           const ::Kokkos::View<DataType1, Properties1...>& source,
           ::Kokkos::View<DataType2, Properties2...>& dest) {
@@ -59,8 +71,35 @@ auto copy(const std::string& label, const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::copy_impl(label, ex, KE::cbegin(source), KE::cend(source),
-                         KE::begin(dest));
+  return Impl::copy_exespace_impl(label, ex, KE::cbegin(source),
+                                  KE::cend(source), KE::begin(dest));
+}
+
+//
+// overload set accepting team handle
+//
+template <typename TeamHandleType, typename InputIterator,
+          typename OutputIterator,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION OutputIterator copy(const TeamHandleType& teamHandle,
+                                    InputIterator first, InputIterator last,
+                                    OutputIterator d_first) {
+  return Impl::copy_team_impl(teamHandle, first, last, d_first);
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto copy(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& source,
+    ::Kokkos::View<DataType2, Properties2...>& dest) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::copy_team_impl(teamHandle, KE::cbegin(source), KE::cend(source),
+                              KE::begin(dest));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp
index deff6baf9a523dcdd880796483d94a5a5e506416..8f9e0f19b80837ac12efa84f6ebbad75fd08871c 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp
@@ -23,42 +23,81 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename IteratorType1, typename IteratorType2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType2 copy_backward(const ExecutionSpace& ex, IteratorType1 first,
                             IteratorType1 last, IteratorType2 d_last) {
-  return Impl::copy_backward_impl("Kokkos::copy_backward_iterator_api_default",
-                                  ex, first, last, d_last);
+  return Impl::copy_backward_exespace_impl(
+      "Kokkos::copy_backward_iterator_api_default", ex, first, last, d_last);
 }
 
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+template <
+    typename ExecutionSpace, typename IteratorType1, typename IteratorType2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType2 copy_backward(const std::string& label, const ExecutionSpace& ex,
                             IteratorType1 first, IteratorType1 last,
                             IteratorType2 d_last) {
-  return Impl::copy_backward_impl(label, ex, first, last, d_last);
+  return Impl::copy_backward_exespace_impl(label, ex, first, last, d_last);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto copy_backward(const ExecutionSpace& ex,
                    const ::Kokkos::View<DataType1, Properties1...>& source,
                    ::Kokkos::View<DataType2, Properties2...>& dest) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
 
-  return Impl::copy_backward_impl("Kokkos::copy_backward_view_api_default", ex,
-                                  cbegin(source), cend(source), end(dest));
+  return Impl::copy_backward_exespace_impl(
+      "Kokkos::copy_backward_view_api_default", ex, cbegin(source),
+      cend(source), end(dest));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto copy_backward(const std::string& label, const ExecutionSpace& ex,
                    const ::Kokkos::View<DataType1, Properties1...>& source,
                    ::Kokkos::View<DataType2, Properties2...>& dest) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
 
-  return Impl::copy_backward_impl(label, ex, cbegin(source), cend(source),
-                                  end(dest));
+  return Impl::copy_backward_exespace_impl(label, ex, cbegin(source),
+                                           cend(source), end(dest));
+}
+
+//
+// overload set accepting team handle
+//
+template <typename TeamHandleType, typename IteratorType1,
+          typename IteratorType2,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION IteratorType2 copy_backward(const TeamHandleType& teamHandle,
+                                            IteratorType1 first,
+                                            IteratorType1 last,
+                                            IteratorType2 d_last) {
+  return Impl::copy_backward_team_impl(teamHandle, first, last, d_last);
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto copy_backward(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& source,
+    ::Kokkos::View<DataType2, Properties2...>& dest) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  return Impl::copy_backward_team_impl(teamHandle, cbegin(source), cend(source),
+                                       end(dest));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp
index 3db2fc074f73bc5173cae7583717be9e96eb0152..ba18bc76b93682131e082e8dd90fad97607b14c6 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp
@@ -23,46 +23,85 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class Predicate>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename InputIterator, typename OutputIterator,
+    typename Predicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 OutputIterator copy_if(const ExecutionSpace& ex, InputIterator first,
                        InputIterator last, OutputIterator d_first,
                        Predicate pred) {
-  return Impl::copy_if_impl("Kokkos::copy_if_iterator_api_default", ex, first,
-                            last, d_first, std::move(pred));
+  return Impl::copy_if_exespace_impl("Kokkos::copy_if_iterator_api_default", ex,
+                                     first, last, d_first, std::move(pred));
 }
 
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class Predicate>
+template <
+    typename ExecutionSpace, typename InputIterator, typename OutputIterator,
+    typename Predicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 OutputIterator copy_if(const std::string& label, const ExecutionSpace& ex,
                        InputIterator first, InputIterator last,
                        OutputIterator d_first, Predicate pred) {
-  return Impl::copy_if_impl(label, ex, first, last, d_first, std::move(pred));
+  return Impl::copy_if_exespace_impl(label, ex, first, last, d_first,
+                                     std::move(pred));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class Predicate>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename Predicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto copy_if(const ExecutionSpace& ex,
              const ::Kokkos::View<DataType1, Properties1...>& source,
              ::Kokkos::View<DataType2, Properties2...>& dest, Predicate pred) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
 
-  return Impl::copy_if_impl("Kokkos::copy_if_view_api_default", ex,
-                            cbegin(source), cend(source), begin(dest),
-                            std::move(pred));
+  return Impl::copy_if_exespace_impl("Kokkos::copy_if_view_api_default", ex,
+                                     cbegin(source), cend(source), begin(dest),
+                                     std::move(pred));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class Predicate>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename Predicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto copy_if(const std::string& label, const ExecutionSpace& ex,
              const ::Kokkos::View<DataType1, Properties1...>& source,
              ::Kokkos::View<DataType2, Properties2...>& dest, Predicate pred) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
 
-  return Impl::copy_if_impl(label, ex, cbegin(source), cend(source),
-                            begin(dest), std::move(pred));
+  return Impl::copy_if_exespace_impl(label, ex, cbegin(source), cend(source),
+                                     begin(dest), std::move(pred));
+}
+
+//
+// overload set accepting team handle
+//
+template <typename TeamHandleType, typename InputIterator,
+          typename OutputIterator, typename Predicate,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION OutputIterator copy_if(const TeamHandleType& teamHandle,
+                                       InputIterator first, InputIterator last,
+                                       OutputIterator d_first, Predicate pred) {
+  return Impl::copy_if_team_impl(teamHandle, first, last, d_first,
+                                 std::move(pred));
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2, typename Predicate,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto copy_if(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& source,
+    ::Kokkos::View<DataType2, Properties2...>& dest, Predicate pred) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  return Impl::copy_if_team_impl(teamHandle, cbegin(source), cend(source),
+                                 begin(dest), std::move(pred));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyN.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyN.hpp
index a64f99b5c01d2001f8c0151e842831ac7749e739..43c91204837e6e695229ea90173e868a5d125d69 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyN.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyN.hpp
@@ -23,23 +23,32 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class InputIterator, class Size,
-          class OutputIterator>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename InputIterator, typename Size,
+    typename OutputIterator,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 OutputIterator copy_n(const ExecutionSpace& ex, InputIterator first, Size count,
                       OutputIterator result) {
-  return Impl::copy_n_impl("Kokkos::copy_n_iterator_api_default", ex, first,
-                           count, result);
+  return Impl::copy_n_exespace_impl("Kokkos::copy_n_iterator_api_default", ex,
+                                    first, count, result);
 }
 
-template <class ExecutionSpace, class InputIterator, class Size,
-          class OutputIterator>
+template <
+    typename ExecutionSpace, typename InputIterator, typename Size,
+    typename OutputIterator,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 OutputIterator copy_n(const std::string& label, const ExecutionSpace& ex,
                       InputIterator first, Size count, OutputIterator result) {
-  return Impl::copy_n_impl(label, ex, first, count, result);
+  return Impl::copy_n_exespace_impl(label, ex, first, count, result);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class Size, class DataType2, class... Properties2>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename Size, typename DataType2, typename... Properties2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto copy_n(const ExecutionSpace& ex,
             const ::Kokkos::View<DataType1, Properties1...>& source, Size count,
             ::Kokkos::View<DataType2, Properties2...>& dest) {
@@ -47,12 +56,14 @@ auto copy_n(const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::copy_n_impl("Kokkos::copy_n_view_api_default", ex,
-                           KE::cbegin(source), count, KE::begin(dest));
+  return Impl::copy_n_exespace_impl("Kokkos::copy_n_view_api_default", ex,
+                                    KE::cbegin(source), count, KE::begin(dest));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class Size, class DataType2, class... Properties2>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename Size, typename DataType2, typename... Properties2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto copy_n(const std::string& label, const ExecutionSpace& ex,
             const ::Kokkos::View<DataType1, Properties1...>& source, Size count,
             ::Kokkos::View<DataType2, Properties2...>& dest) {
@@ -60,8 +71,35 @@ auto copy_n(const std::string& label, const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::copy_n_impl(label, ex, KE::cbegin(source), count,
-                           KE::begin(dest));
+  return Impl::copy_n_exespace_impl(label, ex, KE::cbegin(source), count,
+                                    KE::begin(dest));
+}
+
+//
+// overload set accepting team handle
+//
+template <typename TeamHandleType, typename InputIterator, typename Size,
+          typename OutputIterator,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION OutputIterator copy_n(const TeamHandleType& teamHandle,
+                                      InputIterator first, Size count,
+                                      OutputIterator result) {
+  return Impl::copy_n_team_impl(teamHandle, first, count, result);
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename Size, typename DataType2, typename... Properties2,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto copy_n(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& source, Size count,
+    ::Kokkos::View<DataType2, Properties2...>& dest) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::copy_n_team_impl(teamHandle, KE::cbegin(source), count,
+                                KE::begin(dest));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Count.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Count.hpp
index 3ac63467ec9a0d1e50382952f093b4cf18289480..f179e88babad2f55d8777518a3543ecd9525aab6 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Count.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Count.hpp
@@ -23,41 +23,81 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class IteratorType, class T>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename IteratorType, typename T,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 typename IteratorType::difference_type count(const ExecutionSpace& ex,
                                              IteratorType first,
                                              IteratorType last,
                                              const T& value) {
-  return Impl::count_impl("Kokkos::count_iterator_api_default", ex, first, last,
-                          value);
+  return Impl::count_exespace_impl("Kokkos::count_iterator_api_default", ex,
+                                   first, last, value);
 }
 
-template <class ExecutionSpace, class IteratorType, class T>
+template <
+    typename ExecutionSpace, typename IteratorType, typename T,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 typename IteratorType::difference_type count(const std::string& label,
                                              const ExecutionSpace& ex,
                                              IteratorType first,
                                              IteratorType last,
                                              const T& value) {
-  return Impl::count_impl(label, ex, first, last, value);
+  return Impl::count_exespace_impl(label, ex, first, last, value);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties, class T>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    typename T,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto count(const ExecutionSpace& ex,
            const ::Kokkos::View<DataType, Properties...>& v, const T& value) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::count_impl("Kokkos::count_view_api_default", ex, KE::cbegin(v),
-                          KE::cend(v), value);
+  return Impl::count_exespace_impl("Kokkos::count_view_api_default", ex,
+                                   KE::cbegin(v), KE::cend(v), value);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties, class T>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    typename T,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto count(const std::string& label, const ExecutionSpace& ex,
            const ::Kokkos::View<DataType, Properties...>& v, const T& value) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::count_impl(label, ex, KE::cbegin(v), KE::cend(v), value);
+  return Impl::count_exespace_impl(label, ex, KE::cbegin(v), KE::cend(v),
+                                   value);
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+
+template <typename TeamHandleType, typename IteratorType, typename T,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION typename IteratorType::difference_type count(
+    const TeamHandleType& teamHandle, IteratorType first, IteratorType last,
+    const T& value) {
+  return Impl::count_team_impl(teamHandle, first, last, value);
+}
+
+template <typename TeamHandleType, typename DataType, typename... Properties,
+          typename T,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto count(const TeamHandleType& teamHandle,
+                           const ::Kokkos::View<DataType, Properties...>& v,
+                           const T& value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::count_team_impl(teamHandle, KE::cbegin(v), KE::cend(v), value);
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CountIf.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CountIf.hpp
index b9731d378a594a6177f83dbae51585dd59c053db..967cf75e7a4793f8e0175d325f9691ca64eed5e2 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CountIf.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CountIf.hpp
@@ -23,46 +23,84 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class IteratorType, class Predicate>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename IteratorType, typename Predicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 typename IteratorType::difference_type count_if(const ExecutionSpace& ex,
                                                 IteratorType first,
                                                 IteratorType last,
                                                 Predicate predicate) {
-  return Impl::count_if_impl("Kokkos::count_if_iterator_api_default", ex, first,
-                             last, std::move(predicate));
+  return Impl::count_if_exespace_impl("Kokkos::count_if_iterator_api_default",
+                                      ex, first, last, std::move(predicate));
 }
 
-template <class ExecutionSpace, class IteratorType, class Predicate>
+template <
+    typename ExecutionSpace, typename IteratorType, typename Predicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 typename IteratorType::difference_type count_if(const std::string& label,
                                                 const ExecutionSpace& ex,
                                                 IteratorType first,
                                                 IteratorType last,
                                                 Predicate predicate) {
-  return Impl::count_if_impl(label, ex, first, last, std::move(predicate));
+  return Impl::count_if_exespace_impl(label, ex, first, last,
+                                      std::move(predicate));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class Predicate>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    typename Predicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto count_if(const ExecutionSpace& ex,
               const ::Kokkos::View<DataType, Properties...>& v,
               Predicate predicate) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::count_if_impl("Kokkos::count_if_view_api_default", ex,
-                             KE::cbegin(v), KE::cend(v), std::move(predicate));
+  return Impl::count_if_exespace_impl("Kokkos::count_if_view_api_default", ex,
+                                      KE::cbegin(v), KE::cend(v),
+                                      std::move(predicate));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class Predicate>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    typename Predicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto count_if(const std::string& label, const ExecutionSpace& ex,
               const ::Kokkos::View<DataType, Properties...>& v,
               Predicate predicate) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::count_if_impl(label, ex, KE::cbegin(v), KE::cend(v),
-                             std::move(predicate));
+  return Impl::count_if_exespace_impl(label, ex, KE::cbegin(v), KE::cend(v),
+                                      std::move(predicate));
+}
+
+//
+// overload set accepting team handle
+//
+template <typename TeamHandleType, typename IteratorType, typename Predicate,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION typename IteratorType::difference_type count_if(
+    const TeamHandleType& teamHandle, IteratorType first, IteratorType last,
+    Predicate predicate) {
+  return Impl::count_if_team_impl(teamHandle, first, last,
+                                  std::move(predicate));
+}
+
+template <typename TeamHandleType, typename DataType, typename... Properties,
+          typename Predicate,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto count_if(const TeamHandleType& teamHandle,
+                              const ::Kokkos::View<DataType, Properties...>& v,
+                              Predicate predicate) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::count_if_team_impl(teamHandle, KE::cbegin(v), KE::cend(v),
+                                  std::move(predicate));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Equal.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Equal.hpp
index 37c0d75ef5bfb751883b8e3d0c086b4e732ea8b8..a72a49cc22b827c24f9c2f3e92c6507b2f1c0508 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Equal.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Equal.hpp
@@ -23,50 +23,61 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      IteratorType1, IteratorType2>::value,
-                  bool>
-equal(const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1,
-      IteratorType2 first2) {
-  return Impl::equal_impl("Kokkos::equal_iterator_api_default", ex, first1,
-                          last1, first2);
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      IteratorType1, IteratorType2>::value,
-                  bool>
-equal(const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
-      IteratorType1 last1, IteratorType2 first2) {
-  return Impl::equal_impl(label, ex, first1, last1, first2);
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      IteratorType1, IteratorType2>::value,
-                  bool>
-equal(const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1,
-      IteratorType2 first2, BinaryPredicateType predicate) {
-  return Impl::equal_impl("Kokkos::equal_iterator_api_default", ex, first1,
-                          last1, first2, std::move(predicate));
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      IteratorType1, IteratorType2>::value,
-                  bool>
-equal(const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
-      IteratorType1 last1, IteratorType2 first2,
-      BinaryPredicateType predicate) {
-  return Impl::equal_impl(label, ex, first1, last1, first2,
-                          std::move(predicate));
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+//
+// overload set accepting execution space
+//
+template <typename ExecutionSpace, typename IteratorType1,
+          typename IteratorType2,
+          std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators_v<
+                               IteratorType1, IteratorType2> &&
+                               Kokkos::is_execution_space_v<ExecutionSpace>,
+                           int> = 0>
+bool equal(const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1,
+           IteratorType2 first2) {
+  return Impl::equal_exespace_impl("Kokkos::equal_iterator_api_default", ex,
+                                   first1, last1, first2);
+}
+
+template <typename ExecutionSpace, typename IteratorType1,
+          typename IteratorType2,
+          std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators_v<
+                               IteratorType1, IteratorType2>&& ::Kokkos::
+                               is_execution_space_v<ExecutionSpace>,
+                           int> = 0>
+bool equal(const std::string& label, const ExecutionSpace& ex,
+           IteratorType1 first1, IteratorType1 last1, IteratorType2 first2) {
+  return Impl::equal_exespace_impl(label, ex, first1, last1, first2);
+}
+
+template <typename ExecutionSpace, typename IteratorType1,
+          typename IteratorType2, typename BinaryPredicateType,
+          std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators_v<
+                               IteratorType1, IteratorType2>&& ::Kokkos::
+                               is_execution_space_v<ExecutionSpace>,
+                           int> = 0>
+bool equal(const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1,
+           IteratorType2 first2, BinaryPredicateType predicate) {
+  return Impl::equal_exespace_impl("Kokkos::equal_iterator_api_default", ex,
+                                   first1, last1, first2, std::move(predicate));
+}
+
+template <typename ExecutionSpace, typename IteratorType1,
+          typename IteratorType2, typename BinaryPredicateType,
+          std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators_v<
+                               IteratorType1, IteratorType2>&& ::Kokkos::
+                               is_execution_space_v<ExecutionSpace>,
+                           int> = 0>
+bool equal(const std::string& label, const ExecutionSpace& ex,
+           IteratorType1 first1, IteratorType1 last1, IteratorType2 first2,
+           BinaryPredicateType predicate) {
+  return Impl::equal_exespace_impl(label, ex, first1, last1, first2,
+                                   std::move(predicate));
+}
+
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool equal(const ExecutionSpace& ex,
            const ::Kokkos::View<DataType1, Properties1...>& view1,
            ::Kokkos::View<DataType2, Properties2...>& view2) {
@@ -74,13 +85,15 @@ bool equal(const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::equal_impl("Kokkos::equal_view_api_default", ex,
-                          KE::cbegin(view1), KE::cend(view1),
-                          KE::cbegin(view2));
+  return Impl::equal_exespace_impl("Kokkos::equal_view_api_default", ex,
+                                   KE::cbegin(view1), KE::cend(view1),
+                                   KE::cbegin(view2));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool equal(const std::string& label, const ExecutionSpace& ex,
            const ::Kokkos::View<DataType1, Properties1...>& view1,
            ::Kokkos::View<DataType2, Properties2...>& view2) {
@@ -88,12 +101,14 @@ bool equal(const std::string& label, const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::equal_impl(label, ex, KE::cbegin(view1), KE::cend(view1),
-                          KE::cbegin(view2));
+  return Impl::equal_exespace_impl(label, ex, KE::cbegin(view1),
+                                   KE::cend(view1), KE::cbegin(view2));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryPredicateType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename BinaryPredicateType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool equal(const ExecutionSpace& ex,
            const ::Kokkos::View<DataType1, Properties1...>& view1,
            ::Kokkos::View<DataType2, Properties2...>& view2,
@@ -102,13 +117,15 @@ bool equal(const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::equal_impl("Kokkos::equal_view_api_default", ex,
-                          KE::cbegin(view1), KE::cend(view1), KE::cbegin(view2),
-                          std::move(predicate));
+  return Impl::equal_exespace_impl("Kokkos::equal_view_api_default", ex,
+                                   KE::cbegin(view1), KE::cend(view1),
+                                   KE::cbegin(view2), std::move(predicate));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryPredicateType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename BinaryPredicateType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool equal(const std::string& label, const ExecutionSpace& ex,
            const ::Kokkos::View<DataType1, Properties1...>& view1,
            ::Kokkos::View<DataType2, Properties2...>& view2,
@@ -117,51 +134,149 @@ bool equal(const std::string& label, const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::equal_impl(label, ex, KE::cbegin(view1), KE::cend(view1),
-                          KE::cbegin(view2), std::move(predicate));
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      IteratorType1, IteratorType2>::value,
-                  bool>
-equal(const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1,
-      IteratorType2 first2, IteratorType2 last2) {
-  return Impl::equal_impl("Kokkos::equal_iterator_api_default", ex, first1,
-                          last1, first2, last2);
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      IteratorType1, IteratorType2>::value,
-                  bool>
-equal(const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
-      IteratorType1 last1, IteratorType2 first2, IteratorType2 last2) {
-  return Impl::equal_impl(label, ex, first1, last1, first2, last2);
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      IteratorType1, IteratorType2>::value,
-                  bool>
-equal(const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1,
-      IteratorType2 first2, IteratorType2 last2,
-      BinaryPredicateType predicate) {
-  return Impl::equal_impl("Kokkos::equal_iterator_api_default", ex, first1,
-                          last1, first2, last2, std::move(predicate));
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      IteratorType1, IteratorType2>::value,
-                  bool>
-equal(const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
-      IteratorType1 last1, IteratorType2 first2, IteratorType2 last2,
-      BinaryPredicateType predicate) {
-  return Impl::equal_impl(label, ex, first1, last1, first2, last2,
-                          std::move(predicate));
+  return Impl::equal_exespace_impl(label, ex, KE::cbegin(view1),
+                                   KE::cend(view1), KE::cbegin(view2),
+                                   std::move(predicate));
+}
+
+template <typename ExecutionSpace, typename IteratorType1,
+          typename IteratorType2,
+          std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators_v<
+                               IteratorType1, IteratorType2>&& ::Kokkos::
+                               is_execution_space_v<ExecutionSpace>,
+                           int> = 0>
+bool equal(const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1,
+           IteratorType2 first2, IteratorType2 last2) {
+  return Impl::equal_exespace_impl("Kokkos::equal_iterator_api_default", ex,
+                                   first1, last1, first2, last2);
+}
+
+template <typename ExecutionSpace, typename IteratorType1,
+          typename IteratorType2,
+          std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators_v<
+                               IteratorType1, IteratorType2>&& ::Kokkos::
+                               is_execution_space_v<ExecutionSpace>,
+                           int> = 0>
+bool equal(const std::string& label, const ExecutionSpace& ex,
+           IteratorType1 first1, IteratorType1 last1, IteratorType2 first2,
+           IteratorType2 last2) {
+  return Impl::equal_exespace_impl(label, ex, first1, last1, first2, last2);
+}
+
+template <typename ExecutionSpace, typename IteratorType1,
+          typename IteratorType2, typename BinaryPredicateType,
+          std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators_v<
+                               IteratorType1, IteratorType2>&& ::Kokkos::
+                               is_execution_space_v<ExecutionSpace>,
+                           int> = 0>
+bool equal(const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1,
+           IteratorType2 first2, IteratorType2 last2,
+           BinaryPredicateType predicate) {
+  return Impl::equal_exespace_impl("Kokkos::equal_iterator_api_default", ex,
+                                   first1, last1, first2, last2,
+                                   std::move(predicate));
+}
+
+template <typename ExecutionSpace, typename IteratorType1,
+          typename IteratorType2, typename BinaryPredicateType,
+          std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators_v<
+                               IteratorType1, IteratorType2>&& ::Kokkos::
+                               is_execution_space_v<ExecutionSpace>,
+                           int> = 0>
+bool equal(const std::string& label, const ExecutionSpace& ex,
+           IteratorType1 first1, IteratorType1 last1, IteratorType2 first2,
+           IteratorType2 last2, BinaryPredicateType predicate) {
+  return Impl::equal_exespace_impl(label, ex, first1, last1, first2, last2,
+                                   std::move(predicate));
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename IteratorType1,
+          typename IteratorType2,
+          std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators_v<
+                               IteratorType1, IteratorType2>&& ::Kokkos::
+                               is_team_handle_v<TeamHandleType>,
+                           int> = 0>
+KOKKOS_FUNCTION bool equal(const TeamHandleType& teamHandle,
+                           IteratorType1 first1, IteratorType1 last1,
+                           IteratorType2 first2) {
+  return Impl::equal_team_impl(teamHandle, first1, last1, first2);
+}
+
+template <typename TeamHandleType, typename IteratorType1,
+          typename IteratorType2, typename BinaryPredicateType,
+          std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators_v<
+                               IteratorType1, IteratorType2>&& ::Kokkos::
+                               is_team_handle_v<TeamHandleType>,
+                           int> = 0>
+KOKKOS_FUNCTION bool equal(const TeamHandleType& teamHandle,
+                           IteratorType1 first1, IteratorType1 last1,
+                           IteratorType2 first2,
+                           BinaryPredicateType predicate) {
+  return Impl::equal_team_impl(teamHandle, first1, last1, first2,
+                               std::move(predicate));
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION bool equal(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& view1,
+    ::Kokkos::View<DataType2, Properties2...>& view2) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::equal_team_impl(teamHandle, KE::cbegin(view1), KE::cend(view1),
+                               KE::cbegin(view2));
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2,
+          typename BinaryPredicateType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION bool equal(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& view1,
+    ::Kokkos::View<DataType2, Properties2...>& view2,
+    BinaryPredicateType predicate) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::equal_team_impl(teamHandle, KE::cbegin(view1), KE::cend(view1),
+                               KE::cbegin(view2), std::move(predicate));
+}
+
+template <typename TeamHandleType, typename IteratorType1,
+          typename IteratorType2,
+          std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators_v<
+                               IteratorType1, IteratorType2>&& ::Kokkos::
+                               is_team_handle_v<TeamHandleType>,
+                           int> = 0>
+KOKKOS_FUNCTION bool equal(const TeamHandleType& teamHandle,
+                           IteratorType1 first1, IteratorType1 last1,
+                           IteratorType2 first2, IteratorType2 last2) {
+  return Impl::equal_team_impl(teamHandle, first1, last1, first2, last2);
+}
+
+template <typename TeamHandleType, typename IteratorType1,
+          typename IteratorType2, typename BinaryPredicateType,
+          std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators_v<
+                               IteratorType1, IteratorType2>&& ::Kokkos::
+                               is_team_handle_v<TeamHandleType>,
+                           int> = 0>
+KOKKOS_FUNCTION bool equal(const TeamHandleType& teamHandle,
+                           IteratorType1 first1, IteratorType1 last1,
+                           IteratorType2 first2, IteratorType2 last2,
+                           BinaryPredicateType predicate) {
+  return Impl::equal_team_impl(teamHandle, first1, last1, first2, last2,
+                               std::move(predicate));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ExclusiveScan.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ExclusiveScan.hpp
index 4e05676c2c1971832de2ad55de6f66c6fe582642..ee3a10512645ab53fcccb8f1338bd9f5f67667f0 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ExclusiveScan.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ExclusiveScan.hpp
@@ -23,105 +23,130 @@
 namespace Kokkos {
 namespace Experimental {
 
+//
+// overload set accepting execution space
+//
+
 // overload set 1
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class ValueType>
-std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
-                     InputIteratorType, OutputIteratorType>::value,
-                 OutputIteratorType>
-exclusive_scan(const ExecutionSpace& ex, InputIteratorType first,
-               InputIteratorType last, OutputIteratorType first_dest,
-               ValueType init_value) {
-  static_assert(std::is_move_constructible<ValueType>::value,
+template <typename ExecutionSpace, typename InputIteratorType,
+          typename OutputIteratorType, typename ValueType,
+          std::enable_if_t<
+              Impl::are_iterators_v<InputIteratorType, OutputIteratorType>&& ::
+                  Kokkos::is_execution_space_v<ExecutionSpace>,
+              int> = 0>
+OutputIteratorType exclusive_scan(const ExecutionSpace& ex,
+                                  InputIteratorType first,
+                                  InputIteratorType last,
+                                  OutputIteratorType first_dest,
+                                  ValueType init_value) {
+  static_assert(std::is_move_constructible_v<ValueType>,
                 "ValueType must be move constructible.");
-  return Impl::exclusive_scan_default_op_impl(
+  return Impl::exclusive_scan_default_op_exespace_impl(
       "Kokkos::exclusive_scan_default_functors_iterator_api", ex, first, last,
-      first_dest, init_value);
+      first_dest, std::move(init_value));
 }
 
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class ValueType>
-std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
-                     InputIteratorType, OutputIteratorType>::value,
-                 OutputIteratorType>
-exclusive_scan(const std::string& label, const ExecutionSpace& ex,
-               InputIteratorType first, InputIteratorType last,
-               OutputIteratorType first_dest, ValueType init_value) {
-  static_assert(std::is_move_constructible<ValueType>::value,
+template <typename ExecutionSpace, typename InputIteratorType,
+          typename OutputIteratorType, typename ValueType,
+          std::enable_if_t<
+              Impl::are_iterators_v<InputIteratorType, OutputIteratorType>&& ::
+                  Kokkos::is_execution_space_v<ExecutionSpace>,
+              int> = 0>
+OutputIteratorType exclusive_scan(const std::string& label,
+                                  const ExecutionSpace& ex,
+                                  InputIteratorType first,
+                                  InputIteratorType last,
+                                  OutputIteratorType first_dest,
+                                  ValueType init_value) {
+  static_assert(std::is_move_constructible_v<ValueType>,
                 "ValueType must be move constructible.");
-  return Impl::exclusive_scan_default_op_impl(label, ex, first, last,
-                                              first_dest, init_value);
+  return Impl::exclusive_scan_default_op_exespace_impl(
+      label, ex, first, last, first_dest, std::move(init_value));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class ValueType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto exclusive_scan(const ExecutionSpace& ex,
                     const ::Kokkos::View<DataType1, Properties1...>& view_from,
                     const ::Kokkos::View<DataType2, Properties2...>& view_dest,
                     ValueType init_value) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-  static_assert(std::is_move_constructible<ValueType>::value,
+  static_assert(std::is_move_constructible_v<ValueType>,
                 "ValueType must be move constructible.");
   namespace KE = ::Kokkos::Experimental;
-  return Impl::exclusive_scan_default_op_impl(
+  return Impl::exclusive_scan_default_op_exespace_impl(
       "Kokkos::exclusive_scan_default_functors_view_api", ex,
       KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest),
-      init_value);
+      std::move(init_value));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class ValueType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto exclusive_scan(const std::string& label, const ExecutionSpace& ex,
                     const ::Kokkos::View<DataType1, Properties1...>& view_from,
                     const ::Kokkos::View<DataType2, Properties2...>& view_dest,
                     ValueType init_value) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-  static_assert(std::is_move_constructible<ValueType>::value,
+  static_assert(std::is_move_constructible_v<ValueType>,
                 "ValueType must be move constructible.");
   namespace KE = ::Kokkos::Experimental;
-  return Impl::exclusive_scan_default_op_impl(label, ex, KE::cbegin(view_from),
-                                              KE::cend(view_from),
-                                              KE::begin(view_dest), init_value);
+  return Impl::exclusive_scan_default_op_exespace_impl(
+      label, ex, KE::cbegin(view_from), KE::cend(view_from),
+      KE::begin(view_dest), std::move(init_value));
 }
 
 // overload set 2
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class ValueType, class BinaryOpType>
-std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
-                     InputIteratorType, OutputIteratorType>::value,
-                 OutputIteratorType>
-exclusive_scan(const ExecutionSpace& ex, InputIteratorType first,
-               InputIteratorType last, OutputIteratorType first_dest,
-               ValueType init_value, BinaryOpType bop) {
+template <typename ExecutionSpace, typename InputIteratorType,
+          typename OutputIteratorType, typename ValueType,
+          typename BinaryOpType,
+          std::enable_if_t<
+              Impl::are_iterators_v<InputIteratorType, OutputIteratorType>&& ::
+                  Kokkos::is_execution_space_v<ExecutionSpace>,
+              int> = 0>
+OutputIteratorType exclusive_scan(const ExecutionSpace& ex,
+                                  InputIteratorType first,
+                                  InputIteratorType last,
+                                  OutputIteratorType first_dest,
+                                  ValueType init_value, BinaryOpType bop) {
   Impl::static_assert_is_not_openmptarget(ex);
-  static_assert(std::is_move_constructible<ValueType>::value,
+  static_assert(std::is_move_constructible_v<ValueType>,
                 "ValueType must be move constructible.");
-  return Impl::exclusive_scan_custom_op_impl(
+  return Impl::exclusive_scan_custom_op_exespace_impl(
       "Kokkos::exclusive_scan_custom_functors_iterator_api", ex, first, last,
-      first_dest, init_value, bop);
+      first_dest, std::move(init_value), bop);
 }
 
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class ValueType, class BinaryOpType>
-std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
-                     InputIteratorType, OutputIteratorType>::value,
-                 OutputIteratorType>
-exclusive_scan(const std::string& label, const ExecutionSpace& ex,
-               InputIteratorType first, InputIteratorType last,
-               OutputIteratorType first_dest, ValueType init_value,
-               BinaryOpType bop) {
+template <typename ExecutionSpace, typename InputIteratorType,
+          typename OutputIteratorType, typename ValueType,
+          typename BinaryOpType,
+          std::enable_if_t<
+              Impl::are_iterators_v<InputIteratorType, OutputIteratorType>&& ::
+                  Kokkos::is_execution_space_v<ExecutionSpace>,
+              int> = 0>
+OutputIteratorType exclusive_scan(const std::string& label,
+                                  const ExecutionSpace& ex,
+                                  InputIteratorType first,
+                                  InputIteratorType last,
+                                  OutputIteratorType first_dest,
+                                  ValueType init_value, BinaryOpType bop) {
   Impl::static_assert_is_not_openmptarget(ex);
-  static_assert(std::is_move_constructible<ValueType>::value,
+  static_assert(std::is_move_constructible_v<ValueType>,
                 "ValueType must be move constructible.");
-  return Impl::exclusive_scan_custom_op_impl(label, ex, first, last, first_dest,
-                                             init_value, bop);
+  return Impl::exclusive_scan_custom_op_exespace_impl(
+      label, ex, first, last, first_dest, std::move(init_value), bop);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class ValueType,
-          class BinaryOpType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename ValueType,
+    typename BinaryOpType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto exclusive_scan(const ExecutionSpace& ex,
                     const ::Kokkos::View<DataType1, Properties1...>& view_from,
                     const ::Kokkos::View<DataType2, Properties2...>& view_dest,
@@ -129,18 +154,20 @@ auto exclusive_scan(const ExecutionSpace& ex,
   Impl::static_assert_is_not_openmptarget(ex);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-  static_assert(std::is_move_constructible<ValueType>::value,
+  static_assert(std::is_move_constructible_v<ValueType>,
                 "ValueType must be move constructible.");
   namespace KE = ::Kokkos::Experimental;
-  return Impl::exclusive_scan_custom_op_impl(
+  return Impl::exclusive_scan_custom_op_exespace_impl(
       "Kokkos::exclusive_scan_custom_functors_view_api", ex,
       KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest),
-      init_value, bop);
+      std::move(init_value), bop);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class ValueType,
-          class BinaryOpType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename ValueType,
+    typename BinaryOpType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto exclusive_scan(const std::string& label, const ExecutionSpace& ex,
                     const ::Kokkos::View<DataType1, Properties1...>& view_from,
                     const ::Kokkos::View<DataType2, Properties2...>& view_dest,
@@ -148,12 +175,92 @@ auto exclusive_scan(const std::string& label, const ExecutionSpace& ex,
   Impl::static_assert_is_not_openmptarget(ex);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-  static_assert(std::is_move_constructible<ValueType>::value,
+  static_assert(std::is_move_constructible_v<ValueType>,
                 "ValueType must be move constructible.");
   namespace KE = ::Kokkos::Experimental;
-  return Impl::exclusive_scan_custom_op_impl(
+  return Impl::exclusive_scan_custom_op_exespace_impl(
       label, ex, KE::cbegin(view_from), KE::cend(view_from),
-      KE::begin(view_dest), init_value, bop);
+      KE::begin(view_dest), std::move(init_value), bop);
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+
+// overload set 1
+template <typename TeamHandleType, typename InputIteratorType,
+          typename OutputIteratorType, typename ValueType,
+          std::enable_if_t<
+              Impl::are_iterators_v<InputIteratorType, OutputIteratorType> &&
+                  Kokkos::is_team_handle_v<TeamHandleType>,
+              int> = 0>
+KOKKOS_FUNCTION OutputIteratorType
+exclusive_scan(const TeamHandleType& teamHandle, InputIteratorType first,
+               InputIteratorType last, OutputIteratorType first_dest,
+               ValueType init_value) {
+  static_assert(std::is_move_constructible_v<ValueType>,
+                "ValueType must be move constructible.");
+  return Impl::exclusive_scan_default_op_team_impl(
+      teamHandle, first, last, first_dest, std::move(init_value));
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2, typename ValueType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto exclusive_scan(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+    ValueType init_value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  static_assert(std::is_move_constructible_v<ValueType>,
+                "ValueType must be move constructible.");
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::exclusive_scan_default_op_team_impl(
+      teamHandle, KE::cbegin(view_from), KE::cend(view_from),
+      KE::begin(view_dest), std::move(init_value));
+}
+
+// overload set 2
+template <typename TeamHandleType, typename InputIteratorType,
+          typename OutputIteratorType, typename ValueType,
+          typename BinaryOpType,
+          std::enable_if_t<
+              Impl::are_iterators_v<InputIteratorType, OutputIteratorType> &&
+                  Kokkos::is_team_handle_v<TeamHandleType>,
+              int> = 0>
+KOKKOS_FUNCTION OutputIteratorType
+exclusive_scan(const TeamHandleType& teamHandle, InputIteratorType first,
+               InputIteratorType last, OutputIteratorType first_dest,
+               ValueType init_value, BinaryOpType bop) {
+  Impl::static_assert_is_not_openmptarget(teamHandle);
+  static_assert(std::is_move_constructible_v<ValueType>,
+                "ValueType must be move constructible.");
+  return Impl::exclusive_scan_custom_op_team_impl(
+      teamHandle, first, last, first_dest, std::move(init_value), bop);
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2, typename ValueType,
+          typename BinaryOpType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto exclusive_scan(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+    ValueType init_value, BinaryOpType bop) {
+  Impl::static_assert_is_not_openmptarget(teamHandle);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  static_assert(std::is_move_constructible_v<ValueType>,
+                "ValueType must be move constructible.");
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::exclusive_scan_custom_op_team_impl(
+      teamHandle, KE::cbegin(view_from), KE::cend(view_from),
+      KE::begin(view_dest), std::move(init_value), bop);
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Fill.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Fill.hpp
index 1e300a4c2081694786ae021e04958a9d2e757c88..6d805ba1bed1b1a39318c0f03525a207ac2be8dd 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Fill.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Fill.hpp
@@ -23,33 +23,67 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class IteratorType, class T>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename IteratorType, typename T,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 void fill(const ExecutionSpace& ex, IteratorType first, IteratorType last,
           const T& value) {
-  Impl::fill_impl("Kokkos::fill_iterator_api_default", ex, first, last, value);
+  Impl::fill_exespace_impl("Kokkos::fill_iterator_api_default", ex, first, last,
+                           value);
 }
 
-template <class ExecutionSpace, class IteratorType, class T>
+template <
+    typename ExecutionSpace, typename IteratorType, typename T,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 void fill(const std::string& label, const ExecutionSpace& ex,
           IteratorType first, IteratorType last, const T& value) {
-  Impl::fill_impl(label, ex, first, last, value);
+  Impl::fill_exespace_impl(label, ex, first, last, value);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties, class T>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    typename T,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 void fill(const ExecutionSpace& ex,
           const ::Kokkos::View<DataType, Properties...>& view, const T& value) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-
-  Impl::fill_impl("Kokkos::fill_view_api_default", ex, begin(view), end(view),
-                  value);
+  Impl::fill_exespace_impl("Kokkos::fill_view_api_default", ex, begin(view),
+                           end(view), value);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties, class T>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    typename T,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 void fill(const std::string& label, const ExecutionSpace& ex,
           const ::Kokkos::View<DataType, Properties...>& view, const T& value) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  Impl::fill_exespace_impl(label, ex, begin(view), end(view), value);
+}
 
-  Impl::fill_impl(label, ex, begin(view), end(view), value);
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename IteratorType, typename T,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION void fill(const TeamHandleType& th, IteratorType first,
+                          IteratorType last, const T& value) {
+  Impl::fill_team_impl(th, first, last, value);
+}
+
+template <typename TeamHandleType, typename DataType, typename... Properties,
+          typename T,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION void fill(const TeamHandleType& th,
+                          const ::Kokkos::View<DataType, Properties...>& view,
+                          const T& value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  Impl::fill_team_impl(th, begin(view), end(view), value);
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FillN.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FillN.hpp
index 02503dfd14324ea0d7b1043e453bbdc0f35f5672..66b8cd66cc5c797f8d06a4d7561ac47eb5ceb96f 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FillN.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FillN.hpp
@@ -23,38 +23,72 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class IteratorType, class SizeType, class T>
+template <
+    typename ExecutionSpace, typename IteratorType, typename SizeType,
+    typename T,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType fill_n(const ExecutionSpace& ex, IteratorType first, SizeType n,
                     const T& value) {
-  return Impl::fill_n_impl("Kokkos::fill_n_iterator_api_default", ex, first, n,
-                           value);
+  return Impl::fill_n_exespace_impl("Kokkos::fill_n_iterator_api_default", ex,
+                                    first, n, value);
 }
 
-template <class ExecutionSpace, class IteratorType, class SizeType, class T>
+template <
+    typename ExecutionSpace, typename IteratorType, typename SizeType,
+    typename T,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType fill_n(const std::string& label, const ExecutionSpace& ex,
                     IteratorType first, SizeType n, const T& value) {
-  return Impl::fill_n_impl(label, ex, first, n, value);
+  return Impl::fill_n_exespace_impl(label, ex, first, n, value);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class SizeType, class T>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    typename SizeType, typename T,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto fill_n(const ExecutionSpace& ex,
             const ::Kokkos::View<DataType, Properties...>& view, SizeType n,
             const T& value) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
 
-  return Impl::fill_n_impl("Kokkos::fill_n_view_api_default", ex, begin(view),
-                           n, value);
+  return Impl::fill_n_exespace_impl("Kokkos::fill_n_view_api_default", ex,
+                                    begin(view), n, value);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class SizeType, class T>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    typename SizeType, typename T,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto fill_n(const std::string& label, const ExecutionSpace& ex,
             const ::Kokkos::View<DataType, Properties...>& view, SizeType n,
             const T& value) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
 
-  return Impl::fill_n_impl(label, ex, begin(view), n, value);
+  return Impl::fill_n_exespace_impl(label, ex, begin(view), n, value);
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename IteratorType, typename SizeType,
+          typename T,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION IteratorType fill_n(const TeamHandleType& th,
+                                    IteratorType first, SizeType n,
+                                    const T& value) {
+  return Impl::fill_n_team_impl(th, first, n, value);
+}
+
+template <typename TeamHandleType, typename DataType, typename... Properties,
+          typename SizeType, typename T,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto fill_n(const TeamHandleType& th,
+                            const ::Kokkos::View<DataType, Properties...>& view,
+                            SizeType n, const T& value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  return Impl::fill_n_team_impl(th, begin(view), n, value);
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Find.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Find.hpp
index 65b68cf931d28934d00560964586967d8264e93e..e5e2b0e2b05e268c141e73468e0bd390149b6b26 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Find.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Find.hpp
@@ -23,36 +23,76 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class InputIterator, class T>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename InputIterator, typename T,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 InputIterator find(const ExecutionSpace& ex, InputIterator first,
                    InputIterator last, const T& value) {
-  return Impl::find_impl("Kokkos::find_iterator_api_default", ex, first, last,
-                         value);
+  return Impl::find_exespace_impl("Kokkos::find_iterator_api_default", ex,
+                                  first, last, value);
 }
 
-template <class ExecutionSpace, class InputIterator, class T>
+template <
+    typename ExecutionSpace, typename InputIterator, typename T,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 InputIterator find(const std::string& label, const ExecutionSpace& ex,
                    InputIterator first, InputIterator last, const T& value) {
-  return Impl::find_impl(label, ex, first, last, value);
+  return Impl::find_exespace_impl(label, ex, first, last, value);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties, class T>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    typename T,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto find(const ExecutionSpace& ex,
           const ::Kokkos::View<DataType, Properties...>& view, const T& value) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::find_impl("Kokkos::find_view_api_default", ex, KE::begin(view),
-                         KE::end(view), value);
+  return Impl::find_exespace_impl("Kokkos::find_view_api_default", ex,
+                                  KE::begin(view), KE::end(view), value);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties, class T>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    typename T,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto find(const std::string& label, const ExecutionSpace& ex,
           const ::Kokkos::View<DataType, Properties...>& view, const T& value) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::find_impl(label, ex, KE::begin(view), KE::end(view), value);
+  return Impl::find_exespace_impl(label, ex, KE::begin(view), KE::end(view),
+                                  value);
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename InputIterator, typename T,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION InputIterator find(const TeamHandleType& teamHandle,
+                                   InputIterator first, InputIterator last,
+                                   const T& value) {
+  return Impl::find_team_impl(teamHandle, first, last, value);
+}
+
+template <typename TeamHandleType, typename DataType, typename... Properties,
+          typename T,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto find(const TeamHandleType& teamHandle,
+                          const ::Kokkos::View<DataType, Properties...>& view,
+                          const T& value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::find_team_impl(teamHandle, KE::begin(view), KE::end(view),
+                              value);
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindEnd.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindEnd.hpp
index f6a38855ebbcfb5c8db06ce961b5e5c1001f4060..a4ec735fd5953e1b31f072d59f7e778fef1ec479 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindEnd.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindEnd.hpp
@@ -24,24 +24,34 @@
 namespace Kokkos {
 namespace Experimental {
 
+//
+// overload set accepting execution space
+//
+
 // overload set 1: no binary predicate passed
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+template <
+    typename ExecutionSpace, typename IteratorType1, typename IteratorType2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType1 find_end(const ExecutionSpace& ex, IteratorType1 first,
                        IteratorType1 last, IteratorType2 s_first,
                        IteratorType2 s_last) {
-  return Impl::find_end_impl("Kokkos::find_end_iterator_api_default", ex, first,
-                             last, s_first, s_last);
+  return Impl::find_end_exespace_impl("Kokkos::find_end_iterator_api_default",
+                                      ex, first, last, s_first, s_last);
 }
 
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+template <
+    typename ExecutionSpace, typename IteratorType1, typename IteratorType2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType1 find_end(const std::string& label, const ExecutionSpace& ex,
                        IteratorType1 first, IteratorType1 last,
                        IteratorType2 s_first, IteratorType2 s_last) {
-  return Impl::find_end_impl(label, ex, first, last, s_first, s_last);
+  return Impl::find_end_exespace_impl(label, ex, first, last, s_first, s_last);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto find_end(const ExecutionSpace& ex,
               const ::Kokkos::View<DataType1, Properties1...>& view,
               const ::Kokkos::View<DataType2, Properties2...>& s_view) {
@@ -49,13 +59,15 @@ auto find_end(const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::find_end_impl("Kokkos::find_end_view_api_default", ex,
-                             KE::begin(view), KE::end(view), KE::begin(s_view),
-                             KE::end(s_view));
+  return Impl::find_end_exespace_impl("Kokkos::find_end_view_api_default", ex,
+                                      KE::begin(view), KE::end(view),
+                                      KE::begin(s_view), KE::end(s_view));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto find_end(const std::string& label, const ExecutionSpace& ex,
               const ::Kokkos::View<DataType1, Properties1...>& view,
               const ::Kokkos::View<DataType2, Properties2...>& s_view) {
@@ -63,31 +75,38 @@ auto find_end(const std::string& label, const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::find_end_impl(label, ex, KE::begin(view), KE::end(view),
-                             KE::begin(s_view), KE::end(s_view));
+  return Impl::find_end_exespace_impl(label, ex, KE::begin(view), KE::end(view),
+                                      KE::begin(s_view), KE::end(s_view));
 }
 
 // overload set 2: binary predicate passed
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
+template <
+    typename ExecutionSpace, typename IteratorType1, typename IteratorType2,
+    typename BinaryPredicateType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType1 find_end(const ExecutionSpace& ex, IteratorType1 first,
                        IteratorType1 last, IteratorType2 s_first,
                        IteratorType2 s_last, const BinaryPredicateType& pred) {
-  return Impl::find_end_impl("Kokkos::find_end_iterator_api_default", ex, first,
-                             last, s_first, s_last, pred);
+  return Impl::find_end_exespace_impl("Kokkos::find_end_iterator_api_default",
+                                      ex, first, last, s_first, s_last, pred);
 }
 
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
+template <
+    typename ExecutionSpace, typename IteratorType1, typename IteratorType2,
+    typename BinaryPredicateType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType1 find_end(const std::string& label, const ExecutionSpace& ex,
                        IteratorType1 first, IteratorType1 last,
                        IteratorType2 s_first, IteratorType2 s_last,
                        const BinaryPredicateType& pred) {
-  return Impl::find_end_impl(label, ex, first, last, s_first, s_last, pred);
+  return Impl::find_end_exespace_impl(label, ex, first, last, s_first, s_last,
+                                      pred);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryPredicateType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename BinaryPredicateType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto find_end(const ExecutionSpace& ex,
               const ::Kokkos::View<DataType1, Properties1...>& view,
               const ::Kokkos::View<DataType2, Properties2...>& s_view,
@@ -96,13 +115,15 @@ auto find_end(const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::find_end_impl("Kokkos::find_end_view_api_default", ex,
-                             KE::begin(view), KE::end(view), KE::begin(s_view),
-                             KE::end(s_view), pred);
+  return Impl::find_end_exespace_impl("Kokkos::find_end_view_api_default", ex,
+                                      KE::begin(view), KE::end(view),
+                                      KE::begin(s_view), KE::end(s_view), pred);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryPredicateType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename BinaryPredicateType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto find_end(const std::string& label, const ExecutionSpace& ex,
               const ::Kokkos::View<DataType1, Properties1...>& view,
               const ::Kokkos::View<DataType2, Properties2...>& s_view,
@@ -111,8 +132,71 @@ auto find_end(const std::string& label, const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::find_end_impl(label, ex, KE::begin(view), KE::end(view),
-                             KE::begin(s_view), KE::end(s_view), pred);
+  return Impl::find_end_exespace_impl(label, ex, KE::begin(view), KE::end(view),
+                                      KE::begin(s_view), KE::end(s_view), pred);
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+
+// overload set 1: no binary predicate passed
+template <typename TeamHandleType, typename IteratorType1,
+          typename IteratorType2,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION IteratorType1 find_end(const TeamHandleType& teamHandle,
+                                       IteratorType1 first, IteratorType1 last,
+                                       IteratorType2 s_first,
+                                       IteratorType2 s_last) {
+  return Impl::find_end_team_impl(teamHandle, first, last, s_first, s_last);
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto find_end(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& view,
+    const ::Kokkos::View<DataType2, Properties2...>& s_view) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::find_end_team_impl(teamHandle, KE::begin(view), KE::end(view),
+                                  KE::begin(s_view), KE::end(s_view));
+}
+
+// overload set 2: binary predicate passed
+template <typename TeamHandleType, typename IteratorType1,
+          typename IteratorType2, typename BinaryPredicateType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+
+KOKKOS_FUNCTION IteratorType1 find_end(const TeamHandleType& teamHandle,
+                                       IteratorType1 first, IteratorType1 last,
+                                       IteratorType2 s_first,
+                                       IteratorType2 s_last,
+                                       const BinaryPredicateType& pred) {
+  return Impl::find_end_team_impl(teamHandle, first, last, s_first, s_last,
+                                  pred);
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2,
+          typename BinaryPredicateType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto find_end(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& view,
+    const ::Kokkos::View<DataType2, Properties2...>& s_view,
+    const BinaryPredicateType& pred) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::find_end_team_impl(teamHandle, KE::begin(view), KE::end(view),
+                                  KE::begin(s_view), KE::end(s_view), pred);
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindFirstOf.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindFirstOf.hpp
index 6b0e4993ee2461179f6ca467fbda060934dd32ae..341a70e2f256032f548a0f126bf1858e0b147b98 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindFirstOf.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindFirstOf.hpp
@@ -23,24 +23,36 @@
 namespace Kokkos {
 namespace Experimental {
 
+//
+// overload set accepting execution space
+//
+
 // overload set 1: no binary predicate passed
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+template <
+    typename ExecutionSpace, typename IteratorType1, typename IteratorType2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType1 find_first_of(const ExecutionSpace& ex, IteratorType1 first,
                             IteratorType1 last, IteratorType2 s_first,
                             IteratorType2 s_last) {
-  return Impl::find_first_of_impl("Kokkos::find_first_of_iterator_api_default",
-                                  ex, first, last, s_first, s_last);
+  return Impl::find_first_of_exespace_impl(
+      "Kokkos::find_first_of_iterator_api_default", ex, first, last, s_first,
+      s_last);
 }
 
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+template <
+    typename ExecutionSpace, typename IteratorType1, typename IteratorType2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType1 find_first_of(const std::string& label, const ExecutionSpace& ex,
                             IteratorType1 first, IteratorType1 last,
                             IteratorType2 s_first, IteratorType2 s_last) {
-  return Impl::find_first_of_impl(label, ex, first, last, s_first, s_last);
+  return Impl::find_first_of_exespace_impl(label, ex, first, last, s_first,
+                                           s_last);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto find_first_of(const ExecutionSpace& ex,
                    const ::Kokkos::View<DataType1, Properties1...>& view,
                    const ::Kokkos::View<DataType2, Properties2...>& s_view) {
@@ -48,13 +60,15 @@ auto find_first_of(const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::find_first_of_impl("Kokkos::find_first_of_view_api_default", ex,
-                                  KE::begin(view), KE::end(view),
-                                  KE::begin(s_view), KE::end(s_view));
+  return Impl::find_first_of_exespace_impl(
+      "Kokkos::find_first_of_view_api_default", ex, KE::begin(view),
+      KE::end(view), KE::begin(s_view), KE::end(s_view));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto find_first_of(const std::string& label, const ExecutionSpace& ex,
                    const ::Kokkos::View<DataType1, Properties1...>& view,
                    const ::Kokkos::View<DataType2, Properties2...>& s_view) {
@@ -62,33 +76,41 @@ auto find_first_of(const std::string& label, const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::find_first_of_impl(label, ex, KE::begin(view), KE::end(view),
-                                  KE::begin(s_view), KE::end(s_view));
+  return Impl::find_first_of_exespace_impl(label, ex, KE::begin(view),
+                                           KE::end(view), KE::begin(s_view),
+                                           KE::end(s_view));
 }
 
 // overload set 2: binary predicate passed
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
+template <
+    typename ExecutionSpace, typename IteratorType1, typename IteratorType2,
+    typename BinaryPredicateType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType1 find_first_of(const ExecutionSpace& ex, IteratorType1 first,
                             IteratorType1 last, IteratorType2 s_first,
                             IteratorType2 s_last,
                             const BinaryPredicateType& pred) {
-  return Impl::find_first_of_impl("Kokkos::find_first_of_iterator_api_default",
-                                  ex, first, last, s_first, s_last, pred);
+  return Impl::find_first_of_exespace_impl(
+      "Kokkos::find_first_of_iterator_api_default", ex, first, last, s_first,
+      s_last, pred);
 }
 
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
+template <
+    typename ExecutionSpace, typename IteratorType1, typename IteratorType2,
+    typename BinaryPredicateType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType1 find_first_of(const std::string& label, const ExecutionSpace& ex,
                             IteratorType1 first, IteratorType1 last,
                             IteratorType2 s_first, IteratorType2 s_last,
                             const BinaryPredicateType& pred) {
-  return Impl::find_first_of_impl(label, ex, first, last, s_first, s_last,
-                                  pred);
+  return Impl::find_first_of_exespace_impl(label, ex, first, last, s_first,
+                                           s_last, pred);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryPredicateType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename BinaryPredicateType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto find_first_of(const ExecutionSpace& ex,
                    const ::Kokkos::View<DataType1, Properties1...>& view,
                    const ::Kokkos::View<DataType2, Properties2...>& s_view,
@@ -97,13 +119,15 @@ auto find_first_of(const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::find_first_of_impl("Kokkos::find_first_of_view_api_default", ex,
-                                  KE::begin(view), KE::end(view),
-                                  KE::begin(s_view), KE::end(s_view), pred);
+  return Impl::find_first_of_exespace_impl(
+      "Kokkos::find_first_of_view_api_default", ex, KE::begin(view),
+      KE::end(view), KE::begin(s_view), KE::end(s_view), pred);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryPredicateType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename BinaryPredicateType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto find_first_of(const std::string& label, const ExecutionSpace& ex,
                    const ::Kokkos::View<DataType1, Properties1...>& view,
                    const ::Kokkos::View<DataType2, Properties2...>& s_view,
@@ -112,8 +136,77 @@ auto find_first_of(const std::string& label, const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::find_first_of_impl(label, ex, KE::begin(view), KE::end(view),
-                                  KE::begin(s_view), KE::end(s_view), pred);
+  return Impl::find_first_of_exespace_impl(label, ex, KE::begin(view),
+                                           KE::end(view), KE::begin(s_view),
+                                           KE::end(s_view), pred);
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+
+// overload set 1: no binary predicate passed
+template <typename TeamHandleType, typename IteratorType1,
+          typename IteratorType2,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION IteratorType1 find_first_of(const TeamHandleType& teamHandle,
+                                            IteratorType1 first,
+                                            IteratorType1 last,
+                                            IteratorType2 s_first,
+                                            IteratorType2 s_last) {
+  return Impl::find_first_of_team_impl(teamHandle, first, last, s_first,
+                                       s_last);
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto find_first_of(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& view,
+    const ::Kokkos::View<DataType2, Properties2...>& s_view) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::find_first_of_team_impl(teamHandle, KE::begin(view),
+                                       KE::end(view), KE::begin(s_view),
+                                       KE::end(s_view));
+}
+
+// overload set 2: binary predicate passed
+template <typename TeamHandleType, typename IteratorType1,
+          typename IteratorType2, typename BinaryPredicateType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+
+KOKKOS_FUNCTION IteratorType1 find_first_of(const TeamHandleType& teamHandle,
+                                            IteratorType1 first,
+                                            IteratorType1 last,
+                                            IteratorType2 s_first,
+                                            IteratorType2 s_last,
+                                            const BinaryPredicateType& pred) {
+  return Impl::find_first_of_team_impl(teamHandle, first, last, s_first, s_last,
+                                       pred);
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2,
+          typename BinaryPredicateType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto find_first_of(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& view,
+    const ::Kokkos::View<DataType2, Properties2...>& s_view,
+    const BinaryPredicateType& pred) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::find_first_of_team_impl(teamHandle, KE::begin(view),
+                                       KE::end(view), KE::begin(s_view),
+                                       KE::end(s_view), pred);
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindIf.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindIf.hpp
index 911316a668dd33f52da9ccc105b7a77923e709e8..283fab7617f1a3d8201a6a28f45d7e4752eebf1b 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindIf.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindIf.hpp
@@ -23,42 +23,82 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class IteratorType, class PredicateType>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename IteratorType, typename PredicateType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType find_if(const ExecutionSpace& ex, IteratorType first,
                      IteratorType last, PredicateType predicate) {
-  return Impl::find_if_or_not_impl<true>("Kokkos::find_if_iterator_api_default",
-                                         ex, first, last, std::move(predicate));
+  return Impl::find_if_or_not_exespace_impl<true>(
+      "Kokkos::find_if_iterator_api_default", ex, first, last,
+      std::move(predicate));
 }
 
-template <class ExecutionSpace, class IteratorType, class PredicateType>
+template <
+    typename ExecutionSpace, typename IteratorType, typename PredicateType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType find_if(const std::string& label, const ExecutionSpace& ex,
                      IteratorType first, IteratorType last,
                      PredicateType predicate) {
-  return Impl::find_if_or_not_impl<true>(label, ex, first, last,
-                                         std::move(predicate));
+  return Impl::find_if_or_not_exespace_impl<true>(label, ex, first, last,
+                                                  std::move(predicate));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class Predicate>
+template <typename ExecutionSpace, typename DataType, typename... Properties,
+          typename Predicate,
+          std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
 auto find_if(const ExecutionSpace& ex,
              const ::Kokkos::View<DataType, Properties...>& v,
              Predicate predicate) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
   namespace KE = ::Kokkos::Experimental;
-  return Impl::find_if_or_not_impl<true>("Kokkos::find_if_view_api_default", ex,
-                                         KE::begin(v), KE::end(v),
-                                         std::move(predicate));
+  return Impl::find_if_or_not_exespace_impl<true>(
+      "Kokkos::find_if_view_api_default", ex, KE::begin(v), KE::end(v),
+      std::move(predicate));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class Predicate>
+template <typename ExecutionSpace, typename DataType, typename... Properties,
+          typename Predicate,
+          std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
 auto find_if(const std::string& label, const ExecutionSpace& ex,
              const ::Kokkos::View<DataType, Properties...>& v,
              Predicate predicate) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
   namespace KE = ::Kokkos::Experimental;
-  return Impl::find_if_or_not_impl<true>(label, ex, KE::begin(v), KE::end(v),
-                                         std::move(predicate));
+  return Impl::find_if_or_not_exespace_impl<true>(
+      label, ex, KE::begin(v), KE::end(v), std::move(predicate));
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename IteratorType,
+          typename PredicateType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION IteratorType find_if(const TeamHandleType& teamHandle,
+                                     IteratorType first, IteratorType last,
+                                     PredicateType predicate) {
+  return Impl::find_if_or_not_team_impl<true>(teamHandle, first, last,
+                                              std::move(predicate));
+}
+
+template <
+    typename TeamHandleType, typename DataType, typename... Properties,
+    typename Predicate,
+    std::enable_if_t<::Kokkos::is_team_handle<TeamHandleType>::value, int> = 0>
+KOKKOS_FUNCTION auto find_if(const TeamHandleType& teamHandle,
+                             const ::Kokkos::View<DataType, Properties...>& v,
+                             Predicate predicate) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::find_if_or_not_team_impl<true>(teamHandle, KE::begin(v),
+                                              KE::end(v), std::move(predicate));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindIfNot.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindIfNot.hpp
index 18294d7b7df963b96eb122fb09544905d6203741..5e17a6f539b44b6b8723bc579f4578a9118e79c1 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindIfNot.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindIfNot.hpp
@@ -23,45 +23,84 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class IteratorType, class Predicate>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename IteratorType, typename Predicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType find_if_not(const ExecutionSpace& ex, IteratorType first,
                          IteratorType last, Predicate predicate) {
-  return Impl::find_if_or_not_impl<false>(
+  return Impl::find_if_or_not_exespace_impl<false>(
       "Kokkos::find_if_not_iterator_api_default", ex, first, last,
       std::move(predicate));
 }
 
-template <class ExecutionSpace, class IteratorType, class Predicate>
+template <
+    typename ExecutionSpace, typename IteratorType, typename Predicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType find_if_not(const std::string& label, const ExecutionSpace& ex,
                          IteratorType first, IteratorType last,
                          Predicate predicate) {
-  return Impl::find_if_or_not_impl<false>(label, ex, first, last,
-                                          std::move(predicate));
+  return Impl::find_if_or_not_exespace_impl<false>(label, ex, first, last,
+                                                   std::move(predicate));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class Predicate>
+template <typename ExecutionSpace, typename DataType, typename... Properties,
+          typename Predicate,
+          std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
 auto find_if_not(const ExecutionSpace& ex,
                  const ::Kokkos::View<DataType, Properties...>& v,
                  Predicate predicate) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::find_if_or_not_impl<false>(
+  return Impl::find_if_or_not_exespace_impl<false>(
       "Kokkos::find_if_not_view_api_default", ex, KE::begin(v), KE::end(v),
       std::move(predicate));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class Predicate>
+template <typename ExecutionSpace, typename DataType, typename... Properties,
+          typename Predicate,
+          std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
 auto find_if_not(const std::string& label, const ExecutionSpace& ex,
                  const ::Kokkos::View<DataType, Properties...>& v,
                  Predicate predicate) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::find_if_or_not_impl<false>(label, ex, KE::begin(v), KE::end(v),
-                                          std::move(predicate));
+  return Impl::find_if_or_not_exespace_impl<false>(
+      label, ex, KE::begin(v), KE::end(v), std::move(predicate));
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename IteratorType, typename Predicate,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION IteratorType find_if_not(const TeamHandleType& teamHandle,
+                                         IteratorType first, IteratorType last,
+                                         Predicate predicate) {
+  return Impl::find_if_or_not_team_impl<false>(teamHandle, first, last,
+                                               std::move(predicate));
+}
+
+template <
+    typename TeamHandleType, typename DataType, typename... Properties,
+    typename Predicate,
+    std::enable_if_t<::Kokkos::is_team_handle<TeamHandleType>::value, int> = 0>
+KOKKOS_FUNCTION auto find_if_not(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType, Properties...>& v, Predicate predicate) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::find_if_or_not_team_impl<false>(
+      teamHandle, KE::begin(v), KE::end(v), std::move(predicate));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ForEach.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ForEach.hpp
index d7b08e4842658cab72a73ed835ff3bba50780768..6215b325afc79c92dcd0355cd05ca0856943846f 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ForEach.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ForEach.hpp
@@ -23,42 +23,83 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class IteratorType, class UnaryFunctorType>
+//
+// overload set accepting execution space
+//
+template <
+    class ExecutionSpace, class IteratorType, class UnaryFunctorType,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 UnaryFunctorType for_each(const std::string& label, const ExecutionSpace& ex,
                           IteratorType first, IteratorType last,
                           UnaryFunctorType functor) {
-  return Impl::for_each_impl(label, ex, first, last, std::move(functor));
+  return Impl::for_each_exespace_impl(label, ex, first, last,
+                                      std::move(functor));
 }
 
-template <class ExecutionSpace, class IteratorType, class UnaryFunctorType>
+template <
+    class ExecutionSpace, class IteratorType, class UnaryFunctorType,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 UnaryFunctorType for_each(const ExecutionSpace& ex, IteratorType first,
                           IteratorType last, UnaryFunctorType functor) {
-  return Impl::for_each_impl("Kokkos::for_each_iterator_api_default", ex, first,
-                             last, std::move(functor));
+  return Impl::for_each_exespace_impl("Kokkos::for_each_iterator_api_default",
+                                      ex, first, last, std::move(functor));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class UnaryFunctorType>
+template <
+    class ExecutionSpace, class DataType, class... Properties,
+    class UnaryFunctorType,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 UnaryFunctorType for_each(const std::string& label, const ExecutionSpace& ex,
                           const ::Kokkos::View<DataType, Properties...>& v,
                           UnaryFunctorType functor) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::for_each_impl(label, ex, KE::begin(v), KE::end(v),
-                             std::move(functor));
+  return Impl::for_each_exespace_impl(label, ex, KE::begin(v), KE::end(v),
+                                      std::move(functor));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class UnaryFunctorType>
+template <
+    class ExecutionSpace, class DataType, class... Properties,
+    class UnaryFunctorType,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 UnaryFunctorType for_each(const ExecutionSpace& ex,
                           const ::Kokkos::View<DataType, Properties...>& v,
                           UnaryFunctorType functor) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::for_each_impl("Kokkos::for_each_view_api_default", ex,
-                             KE::begin(v), KE::end(v), std::move(functor));
+  return Impl::for_each_exespace_impl("Kokkos::for_each_view_api_default", ex,
+                                      KE::begin(v), KE::end(v),
+                                      std::move(functor));
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+
+template <class TeamHandleType, class IteratorType, class UnaryFunctorType,
+          std::enable_if_t<Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION UnaryFunctorType for_each(const TeamHandleType& teamHandle,
+                                          IteratorType first, IteratorType last,
+                                          UnaryFunctorType functor) {
+  return Impl::for_each_team_impl(teamHandle, first, last, std::move(functor));
+}
+
+template <class TeamHandleType, class DataType, class... Properties,
+          class UnaryFunctorType,
+          std::enable_if_t<Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION UnaryFunctorType
+for_each(const TeamHandleType& teamHandle,
+         const ::Kokkos::View<DataType, Properties...>& v,
+         UnaryFunctorType functor) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::for_each_team_impl(teamHandle, KE::begin(v), KE::end(v),
+                                  std::move(functor));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ForEachN.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ForEachN.hpp
index f1769da05bd6aa90641433fd708c4910d59c325f..e6fbcad891e173dae3b9d165f11ca96fe87b5f90 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ForEachN.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ForEachN.hpp
@@ -23,43 +23,87 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class IteratorType, class SizeType,
-          class UnaryFunctorType>
+//
+// overload set accepting execution space
+//
+template <
+    class ExecutionSpace, class IteratorType, class SizeType,
+    class UnaryFunctorType,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType for_each_n(const std::string& label, const ExecutionSpace& ex,
                         IteratorType first, SizeType n,
                         UnaryFunctorType functor) {
-  return Impl::for_each_n_impl(label, ex, first, n, std::move(functor));
+  return Impl::for_each_n_exespace_impl(label, ex, first, n,
+                                        std::move(functor));
 }
 
-template <class ExecutionSpace, class IteratorType, class SizeType,
-          class UnaryFunctorType>
+template <
+    class ExecutionSpace, class IteratorType, class SizeType,
+    class UnaryFunctorType,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType for_each_n(const ExecutionSpace& ex, IteratorType first,
                         SizeType n, UnaryFunctorType functor) {
-  return Impl::for_each_n_impl("Kokkos::for_each_n_iterator_api_default", ex,
-                               first, n, std::move(functor));
+  return Impl::for_each_n_exespace_impl(
+      "Kokkos::for_each_n_iterator_api_default", ex, first, n,
+      std::move(functor));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class SizeType, class UnaryFunctorType>
+template <
+    class ExecutionSpace, class DataType, class... Properties, class SizeType,
+    class UnaryFunctorType,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto for_each_n(const std::string& label, const ExecutionSpace& ex,
                 const ::Kokkos::View<DataType, Properties...>& v, SizeType n,
                 UnaryFunctorType functor) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::for_each_n_impl(label, ex, KE::begin(v), n, std::move(functor));
+  return Impl::for_each_n_exespace_impl(label, ex, KE::begin(v), n,
+                                        std::move(functor));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class SizeType, class UnaryFunctorType>
+template <
+    class ExecutionSpace, class DataType, class... Properties, class SizeType,
+    class UnaryFunctorType,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto for_each_n(const ExecutionSpace& ex,
                 const ::Kokkos::View<DataType, Properties...>& v, SizeType n,
                 UnaryFunctorType functor) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::for_each_n_impl("Kokkos::for_each_n_view_api_default", ex,
-                               KE::begin(v), n, std::move(functor));
+  return Impl::for_each_n_exespace_impl("Kokkos::for_each_n_view_api_default",
+                                        ex, KE::begin(v), n,
+                                        std::move(functor));
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+
+template <class TeamHandleType, class IteratorType, class SizeType,
+          class UnaryFunctorType,
+          std::enable_if_t<Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION IteratorType for_each_n(const TeamHandleType& teamHandle,
+                                        IteratorType first, SizeType n,
+                                        UnaryFunctorType functor) {
+  return Impl::for_each_n_team_impl(teamHandle, first, n, std::move(functor));
+}
+
+template <class TeamHandleType, class DataType, class... Properties,
+          class SizeType, class UnaryFunctorType,
+          std::enable_if_t<Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto for_each_n(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType, Properties...>& v, SizeType n,
+    UnaryFunctorType functor) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::for_each_n_team_impl(teamHandle, KE::begin(v), n,
+                                    std::move(functor));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Generate.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Generate.hpp
index 13e12783e0999ca5ff2fea384697989a6a21a56f..a3295084eeb955a1174907fc99202b8f10f1b52f 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Generate.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Generate.hpp
@@ -23,38 +23,68 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class IteratorType, class Generator>
+//
+// overload set accepting execution space
+//
+template <typename ExecutionSpace, typename IteratorType, typename Generator,
+          std::enable_if_t<is_execution_space_v<ExecutionSpace>, int> = 0>
 void generate(const ExecutionSpace& ex, IteratorType first, IteratorType last,
               Generator g) {
-  Impl::generate_impl("Kokkos::generate_iterator_api_default", ex, first, last,
-                      std::move(g));
+  Impl::generate_exespace_impl("Kokkos::generate_iterator_api_default", ex,
+                               first, last, std::move(g));
 }
 
-template <class ExecutionSpace, class IteratorType, class Generator>
+template <typename ExecutionSpace, typename IteratorType, typename Generator,
+          std::enable_if_t<is_execution_space_v<ExecutionSpace>, int> = 0>
 void generate(const std::string& label, const ExecutionSpace& ex,
               IteratorType first, IteratorType last, Generator g) {
-  Impl::generate_impl(label, ex, first, last, std::move(g));
+  Impl::generate_exespace_impl(label, ex, first, last, std::move(g));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class Generator>
+template <typename ExecutionSpace, typename DataType, typename... Properties,
+          typename Generator,
+          std::enable_if_t<is_execution_space_v<ExecutionSpace>, int> = 0>
 void generate(const ExecutionSpace& ex,
               const ::Kokkos::View<DataType, Properties...>& view,
               Generator g) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
 
-  Impl::generate_impl("Kokkos::generate_view_api_default", ex, begin(view),
-                      end(view), std::move(g));
+  Impl::generate_exespace_impl("Kokkos::generate_view_api_default", ex,
+                               begin(view), end(view), std::move(g));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class Generator>
+template <typename ExecutionSpace, typename DataType, typename... Properties,
+          typename Generator,
+          std::enable_if_t<is_execution_space_v<ExecutionSpace>, int> = 0>
 void generate(const std::string& label, const ExecutionSpace& ex,
               const ::Kokkos::View<DataType, Properties...>& view,
               Generator g) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
 
-  Impl::generate_impl(label, ex, begin(view), end(view), std::move(g));
+  Impl::generate_exespace_impl(label, ex, begin(view), end(view), std::move(g));
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename IteratorType, typename Generator,
+          std::enable_if_t<is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION void generate(const TeamHandleType& teamHandle,
+                              IteratorType first, IteratorType last,
+                              Generator g) {
+  Impl::generate_team_impl(teamHandle, first, last, std::move(g));
+}
+
+template <typename TeamHandleType, typename DataType, typename... Properties,
+          typename Generator,
+          std::enable_if_t<is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION void generate(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType, Properties...>& view, Generator g) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  Impl::generate_team_impl(teamHandle, begin(view), end(view), std::move(g));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_GenerateN.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_GenerateN.hpp
index 4d17512228590195c74f9993bd52952943f4bb35..e480062c2368eaa68f83aff07beed3db13ea65dd 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_GenerateN.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_GenerateN.hpp
@@ -23,40 +23,75 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class IteratorType, class Size, class Generator>
+//
+// overload set accepting execution space
+//
+template <typename ExecutionSpace, typename IteratorType, typename Size,
+          typename Generator,
+          std::enable_if_t<is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType generate_n(const ExecutionSpace& ex, IteratorType first,
                         Size count, Generator g) {
-  Impl::generate_n_impl("Kokkos::generate_n_iterator_api_default", ex, first,
-                        count, std::move(g));
-  return first + count;
+  return Impl::generate_n_exespace_impl(
+      "Kokkos::generate_n_iterator_api_default", ex, first, count,
+      std::move(g));
 }
 
-template <class ExecutionSpace, class IteratorType, class Size, class Generator>
+template <typename ExecutionSpace, typename IteratorType, typename Size,
+          typename Generator,
+          std::enable_if_t<is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType generate_n(const std::string& label, const ExecutionSpace& ex,
                         IteratorType first, Size count, Generator g) {
-  Impl::generate_n_impl(label, ex, first, count, std::move(g));
-  return first + count;
+  return Impl::generate_n_exespace_impl(label, ex, first, count, std::move(g));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties, class Size,
-          class Generator>
+template <typename ExecutionSpace, typename DataType, typename... Properties,
+          typename Size, typename Generator,
+          std::enable_if_t<is_execution_space_v<ExecutionSpace>, int> = 0>
 auto generate_n(const ExecutionSpace& ex,
                 const ::Kokkos::View<DataType, Properties...>& view, Size count,
                 Generator g) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
 
-  return Impl::generate_n_impl("Kokkos::generate_n_view_api_default", ex,
-                               begin(view), count, std::move(g));
+  return Impl::generate_n_exespace_impl("Kokkos::generate_n_view_api_default",
+                                        ex, begin(view), count, std::move(g));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties, class Size,
-          class Generator>
+template <typename ExecutionSpace, typename DataType, typename... Properties,
+          typename Size, typename Generator,
+          std::enable_if_t<is_execution_space_v<ExecutionSpace>, int> = 0>
 auto generate_n(const std::string& label, const ExecutionSpace& ex,
                 const ::Kokkos::View<DataType, Properties...>& view, Size count,
                 Generator g) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
 
-  return Impl::generate_n_impl(label, ex, begin(view), count, std::move(g));
+  return Impl::generate_n_exespace_impl(label, ex, begin(view), count,
+                                        std::move(g));
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename IteratorType, typename Size,
+          typename Generator,
+          std::enable_if_t<is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION IteratorType generate_n(const TeamHandleType& teamHandle,
+                                        IteratorType first, Size count,
+                                        Generator g) {
+  return Impl::generate_n_team_impl(teamHandle, first, count, std::move(g));
+}
+
+template <typename TeamHandleType, typename DataType, typename... Properties,
+          typename Size, typename Generator,
+          std::enable_if_t<is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto generate_n(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType, Properties...>& view, Size count,
+    Generator g) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  return Impl::generate_n_team_impl(teamHandle, begin(view), count,
+                                    std::move(g));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_InclusiveScan.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_InclusiveScan.hpp
index bcd731b850aa0d510ea2b2b4b2fece15f9a08970..a0e540b5e7aa79451ef10c7479232a73a26d0a64 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_InclusiveScan.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_InclusiveScan.hpp
@@ -23,33 +23,45 @@
 namespace Kokkos {
 namespace Experimental {
 
+//
+// overload set accepting execution space
+//
+
 // overload set 1
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType>
-std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
-                     InputIteratorType, OutputIteratorType>::value,
-                 OutputIteratorType>
-inclusive_scan(const ExecutionSpace& ex, InputIteratorType first,
-               InputIteratorType last, OutputIteratorType first_dest) {
-  return Impl::inclusive_scan_default_op_impl(
+template <typename ExecutionSpace, typename InputIteratorType,
+          typename OutputIteratorType,
+          std::enable_if_t<
+              Impl::are_iterators_v<InputIteratorType, OutputIteratorType>&& ::
+                  Kokkos::is_execution_space_v<ExecutionSpace>,
+              int> = 0>
+OutputIteratorType inclusive_scan(const ExecutionSpace& ex,
+                                  InputIteratorType first,
+                                  InputIteratorType last,
+                                  OutputIteratorType first_dest) {
+  return Impl::inclusive_scan_default_op_exespace_impl(
       "Kokkos::inclusive_scan_default_functors_iterator_api", ex, first, last,
       first_dest);
 }
 
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType>
-std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
-                     InputIteratorType, OutputIteratorType>::value,
-                 OutputIteratorType>
-inclusive_scan(const std::string& label, const ExecutionSpace& ex,
-               InputIteratorType first, InputIteratorType last,
-               OutputIteratorType first_dest) {
-  return Impl::inclusive_scan_default_op_impl(label, ex, first, last,
-                                              first_dest);
+template <typename ExecutionSpace, typename InputIteratorType,
+          typename OutputIteratorType,
+          std::enable_if_t<
+              Impl::are_iterators_v<InputIteratorType, OutputIteratorType>&& ::
+                  Kokkos::is_execution_space_v<ExecutionSpace>,
+              int> = 0>
+OutputIteratorType inclusive_scan(const std::string& label,
+                                  const ExecutionSpace& ex,
+                                  InputIteratorType first,
+                                  InputIteratorType last,
+                                  OutputIteratorType first_dest) {
+  return Impl::inclusive_scan_default_op_exespace_impl(label, ex, first, last,
+                                                       first_dest);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto inclusive_scan(
     const ExecutionSpace& ex,
     const ::Kokkos::View<DataType1, Properties1...>& view_from,
@@ -57,13 +69,15 @@ auto inclusive_scan(
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
   namespace KE = ::Kokkos::Experimental;
-  return Impl::inclusive_scan_default_op_impl(
+  return Impl::inclusive_scan_default_op_exespace_impl(
       "Kokkos::inclusive_scan_default_functors_view_api", ex,
       KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto inclusive_scan(
     const std::string& label, const ExecutionSpace& ex,
     const ::Kokkos::View<DataType1, Properties1...>& view_from,
@@ -71,39 +85,45 @@ auto inclusive_scan(
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
   namespace KE = ::Kokkos::Experimental;
-  return Impl::inclusive_scan_default_op_impl(label, ex, KE::cbegin(view_from),
-                                              KE::cend(view_from),
-                                              KE::begin(view_dest));
+  return Impl::inclusive_scan_default_op_exespace_impl(
+      label, ex, KE::cbegin(view_from), KE::cend(view_from),
+      KE::begin(view_dest));
 }
 
 // overload set 2 (accepting custom binary op)
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class BinaryOp>
-std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
-                     InputIteratorType, OutputIteratorType>::value,
-                 OutputIteratorType>
-inclusive_scan(const ExecutionSpace& ex, InputIteratorType first,
-               InputIteratorType last, OutputIteratorType first_dest,
-               BinaryOp binary_op) {
-  return Impl::inclusive_scan_custom_binary_op_impl(
+template <typename ExecutionSpace, typename InputIteratorType,
+          typename OutputIteratorType, typename BinaryOp,
+          std::enable_if_t<
+              Impl::are_iterators_v<InputIteratorType, OutputIteratorType>&& ::
+                  Kokkos::is_execution_space_v<ExecutionSpace>,
+              int> = 0>
+OutputIteratorType inclusive_scan(const ExecutionSpace& ex,
+                                  InputIteratorType first,
+                                  InputIteratorType last,
+                                  OutputIteratorType first_dest,
+                                  BinaryOp binary_op) {
+  return Impl::inclusive_scan_custom_binary_op_exespace_impl(
       "Kokkos::inclusive_scan_custom_functors_iterator_api", ex, first, last,
       first_dest, binary_op);
 }
 
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class BinaryOp>
-std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
-                     InputIteratorType, OutputIteratorType>::value,
-                 OutputIteratorType>
-inclusive_scan(const std::string& label, const ExecutionSpace& ex,
-               InputIteratorType first, InputIteratorType last,
-               OutputIteratorType first_dest, BinaryOp binary_op) {
-  return Impl::inclusive_scan_custom_binary_op_impl(label, ex, first, last,
-                                                    first_dest, binary_op);
+template <typename ExecutionSpace, typename InputIteratorType,
+          typename OutputIteratorType, typename BinaryOp,
+          std::enable_if_t<
+              Impl::are_iterators_v<InputIteratorType, OutputIteratorType>&& ::
+                  Kokkos::is_execution_space_v<ExecutionSpace>,
+              int> = 0>
+OutputIteratorType inclusive_scan(
+    const std::string& label, const ExecutionSpace& ex, InputIteratorType first,
+    InputIteratorType last, OutputIteratorType first_dest, BinaryOp binary_op) {
+  return Impl::inclusive_scan_custom_binary_op_exespace_impl(
+      label, ex, first, last, first_dest, binary_op);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryOp>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename BinaryOp,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto inclusive_scan(const ExecutionSpace& ex,
                     const ::Kokkos::View<DataType1, Properties1...>& view_from,
                     const ::Kokkos::View<DataType2, Properties2...>& view_dest,
@@ -111,14 +131,16 @@ auto inclusive_scan(const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
   namespace KE = ::Kokkos::Experimental;
-  return Impl::inclusive_scan_custom_binary_op_impl(
+  return Impl::inclusive_scan_custom_binary_op_exespace_impl(
       "Kokkos::inclusive_scan_custom_functors_view_api", ex,
       KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest),
       binary_op);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryOp>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename BinaryOp,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto inclusive_scan(const std::string& label, const ExecutionSpace& ex,
                     const ::Kokkos::View<DataType1, Properties1...>& view_from,
                     const ::Kokkos::View<DataType2, Properties2...>& view_dest,
@@ -126,67 +148,192 @@ auto inclusive_scan(const std::string& label, const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
   namespace KE = ::Kokkos::Experimental;
-  return Impl::inclusive_scan_custom_binary_op_impl(
+  return Impl::inclusive_scan_custom_binary_op_exespace_impl(
       label, ex, KE::cbegin(view_from), KE::cend(view_from),
       KE::begin(view_dest), binary_op);
 }
 
 // overload set 3
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class BinaryOp, class ValueType>
-std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
-                     InputIteratorType, OutputIteratorType>::value,
-                 OutputIteratorType>
-inclusive_scan(const ExecutionSpace& ex, InputIteratorType first,
-               InputIteratorType last, OutputIteratorType first_dest,
-               BinaryOp binary_op, ValueType init_value) {
-  return Impl::inclusive_scan_custom_binary_op_impl(
+template <typename ExecutionSpace, typename InputIteratorType,
+          typename OutputIteratorType, typename BinaryOp, typename ValueType,
+          std::enable_if_t<
+              Impl::are_iterators_v<InputIteratorType, OutputIteratorType>&& ::
+                  Kokkos::is_execution_space_v<ExecutionSpace>,
+              int> = 0>
+OutputIteratorType inclusive_scan(const ExecutionSpace& ex,
+                                  InputIteratorType first,
+                                  InputIteratorType last,
+                                  OutputIteratorType first_dest,
+                                  BinaryOp binary_op, ValueType init_value) {
+  static_assert(std::is_move_constructible_v<ValueType>,
+                "ValueType must be move constructible.");
+
+  return Impl::inclusive_scan_custom_binary_op_exespace_impl(
       "Kokkos::inclusive_scan_custom_functors_iterator_api", ex, first, last,
-      first_dest, binary_op, init_value);
+      first_dest, binary_op, std::move(init_value));
 }
 
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class BinaryOp, class ValueType>
-std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
-                     InputIteratorType, OutputIteratorType>::value,
-                 OutputIteratorType>
-inclusive_scan(const std::string& label, const ExecutionSpace& ex,
-               InputIteratorType first, InputIteratorType last,
-               OutputIteratorType first_dest, BinaryOp binary_op,
-               ValueType init_value) {
-  return Impl::inclusive_scan_custom_binary_op_impl(
-      label, ex, first, last, first_dest, binary_op, init_value);
+template <typename ExecutionSpace, typename InputIteratorType,
+          typename OutputIteratorType, typename BinaryOp, typename ValueType,
+          std::enable_if_t<
+              Impl::are_iterators_v<InputIteratorType, OutputIteratorType>&& ::
+                  Kokkos::is_execution_space_v<ExecutionSpace>,
+              int> = 0>
+OutputIteratorType inclusive_scan(const std::string& label,
+                                  const ExecutionSpace& ex,
+                                  InputIteratorType first,
+                                  InputIteratorType last,
+                                  OutputIteratorType first_dest,
+                                  BinaryOp binary_op, ValueType init_value) {
+  static_assert(std::is_move_constructible_v<ValueType>,
+                "ValueType must be move constructible.");
+
+  return Impl::inclusive_scan_custom_binary_op_exespace_impl(
+      label, ex, first, last, first_dest, binary_op, std::move(init_value));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryOp,
-          class ValueType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename BinaryOp,
+    typename ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto inclusive_scan(const ExecutionSpace& ex,
                     const ::Kokkos::View<DataType1, Properties1...>& view_from,
                     const ::Kokkos::View<DataType2, Properties2...>& view_dest,
                     BinaryOp binary_op, ValueType init_value) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  static_assert(std::is_move_constructible_v<ValueType>,
+                "ValueType must be move constructible.");
+
   namespace KE = ::Kokkos::Experimental;
-  return Impl::inclusive_scan_custom_binary_op_impl(
+  return Impl::inclusive_scan_custom_binary_op_exespace_impl(
       "Kokkos::inclusive_scan_custom_functors_view_api", ex,
       KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest),
-      binary_op, init_value);
+      binary_op, std::move(init_value));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryOp,
-          class ValueType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename BinaryOp,
+    typename ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto inclusive_scan(const std::string& label, const ExecutionSpace& ex,
                     const ::Kokkos::View<DataType1, Properties1...>& view_from,
                     const ::Kokkos::View<DataType2, Properties2...>& view_dest,
                     BinaryOp binary_op, ValueType init_value) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  static_assert(std::is_move_constructible_v<ValueType>,
+                "ValueType must be move constructible.");
+
   namespace KE = ::Kokkos::Experimental;
-  return Impl::inclusive_scan_custom_binary_op_impl(
+  return Impl::inclusive_scan_custom_binary_op_exespace_impl(
       label, ex, KE::cbegin(view_from), KE::cend(view_from),
-      KE::begin(view_dest), binary_op, init_value);
+      KE::begin(view_dest), binary_op, std::move(init_value));
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+
+// overload set 1
+template <typename TeamHandleType, typename InputIteratorType,
+          typename OutputIteratorType,
+          std::enable_if_t<
+              Impl::are_iterators_v<InputIteratorType, OutputIteratorType>&& ::
+                  Kokkos::is_team_handle_v<TeamHandleType>,
+              int> = 0>
+KOKKOS_FUNCTION OutputIteratorType
+inclusive_scan(const TeamHandleType& teamHandle, InputIteratorType first,
+               InputIteratorType last, OutputIteratorType first_dest) {
+  return Impl::inclusive_scan_default_op_team_impl(teamHandle, first, last,
+                                                   first_dest);
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto inclusive_scan(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+    const ::Kokkos::View<DataType2, Properties2...>& view_dest) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::inclusive_scan_default_op_team_impl(
+      teamHandle, KE::cbegin(view_from), KE::cend(view_from),
+      KE::begin(view_dest));
+}
+
+// overload set 2 (accepting custom binary op)
+template <typename TeamHandleType, typename InputIteratorType,
+          typename OutputIteratorType, typename BinaryOp,
+          std::enable_if_t<
+              Impl::are_iterators_v<InputIteratorType, OutputIteratorType>&& ::
+                  Kokkos::is_team_handle_v<TeamHandleType>,
+              int> = 0>
+KOKKOS_FUNCTION OutputIteratorType inclusive_scan(
+    const TeamHandleType& teamHandle, InputIteratorType first,
+    InputIteratorType last, OutputIteratorType first_dest, BinaryOp binary_op) {
+  return Impl::inclusive_scan_custom_binary_op_team_impl(
+      teamHandle, first, last, first_dest, binary_op);
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2, typename BinaryOp,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto inclusive_scan(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+    BinaryOp binary_op) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::inclusive_scan_custom_binary_op_team_impl(
+      teamHandle, KE::cbegin(view_from), KE::cend(view_from),
+      KE::begin(view_dest), binary_op);
+}
+
+// overload set 3
+template <typename TeamHandleType, typename InputIteratorType,
+          typename OutputIteratorType, typename BinaryOp, typename ValueType,
+          std::enable_if_t<
+              Impl::are_iterators_v<InputIteratorType, OutputIteratorType>&& ::
+                  Kokkos::is_team_handle_v<TeamHandleType>,
+              int> = 0>
+
+KOKKOS_FUNCTION OutputIteratorType
+inclusive_scan(const TeamHandleType& teamHandle, InputIteratorType first,
+               InputIteratorType last, OutputIteratorType first_dest,
+               BinaryOp binary_op, ValueType init_value) {
+  static_assert(std::is_move_constructible_v<ValueType>,
+                "ValueType must be move constructible.");
+  return Impl::inclusive_scan_custom_binary_op_team_impl(
+      teamHandle, first, last, first_dest, binary_op, std::move(init_value));
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2, typename BinaryOp,
+          typename ValueType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto inclusive_scan(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+    BinaryOp binary_op, ValueType init_value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  static_assert(std::is_move_constructible_v<ValueType>,
+                "ValueType must be move constructible.");
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::inclusive_scan_custom_binary_op_team_impl(
+      teamHandle, KE::cbegin(view_from), KE::cend(view_from),
+      KE::begin(view_dest), binary_op, std::move(init_value));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IsPartitioned.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IsPartitioned.hpp
index 29d6be9e8b769e0982f06645b48f77ee27713a3c..42f20bc4ecb6548ffb3bdbc685051030be6b0ce0 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IsPartitioned.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IsPartitioned.hpp
@@ -23,39 +23,78 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class IteratorType, class PredicateType>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename IteratorType, typename PredicateType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool is_partitioned(const ExecutionSpace& ex, IteratorType first,
                     IteratorType last, PredicateType p) {
-  return Impl::is_partitioned_impl(
+  return Impl::is_partitioned_exespace_impl(
       "Kokkos::is_partitioned_iterator_api_default", ex, first, last,
       std::move(p));
 }
 
-template <class ExecutionSpace, class IteratorType, class PredicateType>
+template <
+    typename ExecutionSpace, typename IteratorType, typename PredicateType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool is_partitioned(const std::string& label, const ExecutionSpace& ex,
                     IteratorType first, IteratorType last, PredicateType p) {
-  return Impl::is_partitioned_impl(label, ex, first, last, std::move(p));
+  return Impl::is_partitioned_exespace_impl(label, ex, first, last,
+                                            std::move(p));
 }
 
-template <class ExecutionSpace, class PredicateType, class DataType,
-          class... Properties>
+template <
+    typename ExecutionSpace, typename PredicateType, typename DataType,
+    typename... Properties,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool is_partitioned(const ExecutionSpace& ex,
                     const ::Kokkos::View<DataType, Properties...>& v,
                     PredicateType p) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
 
-  return Impl::is_partitioned_impl("Kokkos::is_partitioned_view_api_default",
-                                   ex, cbegin(v), cend(v), std::move(p));
+  return Impl::is_partitioned_exespace_impl(
+      "Kokkos::is_partitioned_view_api_default", ex, cbegin(v), cend(v),
+      std::move(p));
 }
 
-template <class ExecutionSpace, class PredicateType, class DataType,
-          class... Properties>
+template <
+    typename ExecutionSpace, typename PredicateType, typename DataType,
+    typename... Properties,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool is_partitioned(const std::string& label, const ExecutionSpace& ex,
                     const ::Kokkos::View<DataType, Properties...>& v,
                     PredicateType p) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
 
-  return Impl::is_partitioned_impl(label, ex, cbegin(v), cend(v), std::move(p));
+  return Impl::is_partitioned_exespace_impl(label, ex, cbegin(v), cend(v),
+                                            std::move(p));
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename IteratorType,
+          typename PredicateType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION bool is_partitioned(const TeamHandleType& teamHandle,
+                                    IteratorType first, IteratorType last,
+                                    PredicateType p) {
+  return Impl::is_partitioned_team_impl(teamHandle, first, last, std::move(p));
+}
+
+template <typename TeamHandleType, typename PredicateType, typename DataType,
+          typename... Properties,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION bool is_partitioned(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType, Properties...>& v, PredicateType p) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+  return Impl::is_partitioned_team_impl(teamHandle, cbegin(v), cend(v),
+                                        std::move(p));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IsSorted.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IsSorted.hpp
index f036254a02cf5280b22493869efa81e2dd4033dc..2c676c3ff34e49a4e54f3d2d7ed2bee8c5b8bb69 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IsSorted.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IsSorted.hpp
@@ -23,55 +23,73 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class IteratorType>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename IteratorType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool is_sorted(const ExecutionSpace& ex, IteratorType first,
                IteratorType last) {
-  return Impl::is_sorted_impl("Kokkos::is_sorted_iterator_api_default", ex,
-                              first, last);
+  return Impl::is_sorted_exespace_impl("Kokkos::is_sorted_iterator_api_default",
+                                       ex, first, last);
 }
 
-template <class ExecutionSpace, class IteratorType>
+template <
+    typename ExecutionSpace, typename IteratorType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool is_sorted(const std::string& label, const ExecutionSpace& ex,
                IteratorType first, IteratorType last) {
-  return Impl::is_sorted_impl(label, ex, first, last);
+  return Impl::is_sorted_exespace_impl(label, ex, first, last);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool is_sorted(const ExecutionSpace& ex,
                const ::Kokkos::View<DataType, Properties...>& view) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::is_sorted_impl("Kokkos::is_sorted_view_api_default", ex,
-                              KE::cbegin(view), KE::cend(view));
+  return Impl::is_sorted_exespace_impl("Kokkos::is_sorted_view_api_default", ex,
+                                       KE::cbegin(view), KE::cend(view));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool is_sorted(const std::string& label, const ExecutionSpace& ex,
                const ::Kokkos::View<DataType, Properties...>& view) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::is_sorted_impl(label, ex, KE::cbegin(view), KE::cend(view));
+  return Impl::is_sorted_exespace_impl(label, ex, KE::cbegin(view),
+                                       KE::cend(view));
 }
 
-template <class ExecutionSpace, class IteratorType, class ComparatorType>
+template <
+    typename ExecutionSpace, typename IteratorType, typename ComparatorType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool is_sorted(const ExecutionSpace& ex, IteratorType first, IteratorType last,
                ComparatorType comp) {
   Impl::static_assert_is_not_openmptarget(ex);
-  return Impl::is_sorted_impl("Kokkos::is_sorted_iterator_api_default", ex,
-                              first, last, std::move(comp));
+  return Impl::is_sorted_exespace_impl("Kokkos::is_sorted_iterator_api_default",
+                                       ex, first, last, std::move(comp));
 }
 
-template <class ExecutionSpace, class IteratorType, class ComparatorType>
+template <
+    typename ExecutionSpace, typename IteratorType, typename ComparatorType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool is_sorted(const std::string& label, const ExecutionSpace& ex,
                IteratorType first, IteratorType last, ComparatorType comp) {
   Impl::static_assert_is_not_openmptarget(ex);
-  return Impl::is_sorted_impl(label, ex, first, last, std::move(comp));
+  return Impl::is_sorted_exespace_impl(label, ex, first, last, std::move(comp));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class ComparatorType>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    typename ComparatorType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool is_sorted(const ExecutionSpace& ex,
                const ::Kokkos::View<DataType, Properties...>& view,
                ComparatorType comp) {
@@ -79,13 +97,15 @@ bool is_sorted(const ExecutionSpace& ex,
   Impl::static_assert_is_not_openmptarget(ex);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::is_sorted_impl("Kokkos::is_sorted_view_api_default", ex,
-                              KE::cbegin(view), KE::cend(view),
-                              std::move(comp));
+  return Impl::is_sorted_exespace_impl("Kokkos::is_sorted_view_api_default", ex,
+                                       KE::cbegin(view), KE::cend(view),
+                                       std::move(comp));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class ComparatorType>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    typename ComparatorType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool is_sorted(const std::string& label, const ExecutionSpace& ex,
                const ::Kokkos::View<DataType, Properties...>& view,
                ComparatorType comp) {
@@ -93,8 +113,56 @@ bool is_sorted(const std::string& label, const ExecutionSpace& ex,
   Impl::static_assert_is_not_openmptarget(ex);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::is_sorted_impl(label, ex, KE::cbegin(view), KE::cend(view),
-                              std::move(comp));
+  return Impl::is_sorted_exespace_impl(label, ex, KE::cbegin(view),
+                                       KE::cend(view), std::move(comp));
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename IteratorType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION bool is_sorted(const TeamHandleType& teamHandle,
+                               IteratorType first, IteratorType last) {
+  return Impl::is_sorted_team_impl(teamHandle, first, last);
+}
+
+template <typename TeamHandleType, typename DataType, typename... Properties,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION bool is_sorted(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType, Properties...>& view) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::is_sorted_team_impl(teamHandle, KE::cbegin(view),
+                                   KE::cend(view));
+}
+
+template <typename TeamHandleType, typename IteratorType,
+          typename ComparatorType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION bool is_sorted(const TeamHandleType& teamHandle,
+                               IteratorType first, IteratorType last,
+                               ComparatorType comp) {
+  Impl::static_assert_is_not_openmptarget(teamHandle);
+  return Impl::is_sorted_team_impl(teamHandle, first, last, std::move(comp));
+}
+
+template <typename TeamHandleType, typename DataType, typename... Properties,
+          typename ComparatorType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION bool is_sorted(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType, Properties...>& view, ComparatorType comp) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  Impl::static_assert_is_not_openmptarget(teamHandle);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::is_sorted_team_impl(teamHandle, KE::cbegin(view), KE::cend(view),
+                                   std::move(comp));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IsSortedUntil.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IsSortedUntil.hpp
index 276b3bb884269d43c87e38ef79aa38fae94b704c..96a17b67852cefbdde163a105038e8e4cff6c6b4 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IsSortedUntil.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IsSortedUntil.hpp
@@ -23,58 +23,78 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class IteratorType>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename IteratorType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType is_sorted_until(const ExecutionSpace& ex, IteratorType first,
                              IteratorType last) {
-  return Impl::is_sorted_until_impl(
+  return Impl::is_sorted_until_exespace_impl(
       "Kokkos::is_sorted_until_iterator_api_default", ex, first, last);
 }
 
-template <class ExecutionSpace, class IteratorType>
+template <
+    typename ExecutionSpace, typename IteratorType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType is_sorted_until(const std::string& label, const ExecutionSpace& ex,
                              IteratorType first, IteratorType last) {
-  return Impl::is_sorted_until_impl(label, ex, first, last);
+  return Impl::is_sorted_until_exespace_impl(label, ex, first, last);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto is_sorted_until(const ExecutionSpace& ex,
                      const ::Kokkos::View<DataType, Properties...>& view) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::is_sorted_until_impl("Kokkos::is_sorted_until_view_api_default",
-                                    ex, KE::begin(view), KE::end(view));
+  return Impl::is_sorted_until_exespace_impl(
+      "Kokkos::is_sorted_until_view_api_default", ex, KE::begin(view),
+      KE::end(view));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto is_sorted_until(const std::string& label, const ExecutionSpace& ex,
                      const ::Kokkos::View<DataType, Properties...>& view) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::is_sorted_until_impl(label, ex, KE::begin(view), KE::end(view));
+  return Impl::is_sorted_until_exespace_impl(label, ex, KE::begin(view),
+                                             KE::end(view));
 }
 
-template <class ExecutionSpace, class IteratorType, class ComparatorType>
+template <
+    typename ExecutionSpace, typename IteratorType, typename ComparatorType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType is_sorted_until(const ExecutionSpace& ex, IteratorType first,
                              IteratorType last, ComparatorType comp) {
   Impl::static_assert_is_not_openmptarget(ex);
-  return Impl::is_sorted_until_impl(
+  return Impl::is_sorted_until_exespace_impl(
       "Kokkos::is_sorted_until_iterator_api_default", ex, first, last,
       std::move(comp));
 }
 
-template <class ExecutionSpace, class IteratorType, class ComparatorType>
+template <
+    typename ExecutionSpace, typename IteratorType, typename ComparatorType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType is_sorted_until(const std::string& label, const ExecutionSpace& ex,
                              IteratorType first, IteratorType last,
                              ComparatorType comp) {
   Impl::static_assert_is_not_openmptarget(ex);
 
-  return Impl::is_sorted_until_impl(label, ex, first, last, std::move(comp));
+  return Impl::is_sorted_until_exespace_impl(label, ex, first, last,
+                                             std::move(comp));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class ComparatorType>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    typename ComparatorType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto is_sorted_until(const ExecutionSpace& ex,
                      const ::Kokkos::View<DataType, Properties...>& view,
                      ComparatorType comp) {
@@ -82,13 +102,15 @@ auto is_sorted_until(const ExecutionSpace& ex,
   Impl::static_assert_is_not_openmptarget(ex);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::is_sorted_until_impl("Kokkos::is_sorted_until_view_api_default",
-                                    ex, KE::begin(view), KE::end(view),
-                                    std::move(comp));
+  return Impl::is_sorted_until_exespace_impl(
+      "Kokkos::is_sorted_until_view_api_default", ex, KE::begin(view),
+      KE::end(view), std::move(comp));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class ComparatorType>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    typename ComparatorType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto is_sorted_until(const std::string& label, const ExecutionSpace& ex,
                      const ::Kokkos::View<DataType, Properties...>& view,
                      ComparatorType comp) {
@@ -96,8 +118,57 @@ auto is_sorted_until(const std::string& label, const ExecutionSpace& ex,
   Impl::static_assert_is_not_openmptarget(ex);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::is_sorted_until_impl(label, ex, KE::begin(view), KE::end(view),
-                                    std::move(comp));
+  return Impl::is_sorted_until_exespace_impl(label, ex, KE::begin(view),
+                                             KE::end(view), std::move(comp));
+}
+
+//
+// overload set accepting team handle
+//
+template <typename TeamHandleType, typename IteratorType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION IteratorType is_sorted_until(const TeamHandleType& teamHandle,
+                                             IteratorType first,
+                                             IteratorType last) {
+  return Impl::is_sorted_until_team_impl(teamHandle, first, last);
+}
+
+template <typename TeamHandleType, typename DataType, typename... Properties,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto is_sorted_until(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType, Properties...>& view) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::is_sorted_until_team_impl(teamHandle, KE::begin(view),
+                                         KE::end(view));
+}
+
+template <typename TeamHandleType, typename IteratorType,
+          typename ComparatorType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION IteratorType is_sorted_until(const TeamHandleType& teamHandle,
+                                             IteratorType first,
+                                             IteratorType last,
+                                             ComparatorType comp) {
+  Impl::static_assert_is_not_openmptarget(teamHandle);
+  return Impl::is_sorted_until_team_impl(teamHandle, first, last,
+                                         std::move(comp));
+}
+
+template <typename TeamHandleType, typename DataType, typename... Properties,
+          typename ComparatorType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto is_sorted_until(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType, Properties...>& view, ComparatorType comp) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  Impl::static_assert_is_not_openmptarget(teamHandle);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::is_sorted_until_team_impl(teamHandle, KE::begin(view),
+                                         KE::end(view), std::move(comp));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp
index 0a77ef629f7baf3c25a35789fdf345c31ef939e5..4b5c69df4512e5f514b5b72aaaf1da431cfa77b4 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp
@@ -23,25 +23,34 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+//
+// overload set accepting execution space
+//
+template <
+    class ExecutionSpace, class IteratorType1, class IteratorType2,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool lexicographical_compare(const ExecutionSpace& ex, IteratorType1 first1,
                              IteratorType1 last1, IteratorType2 first2,
                              IteratorType2 last2) {
-  return Impl::lexicographical_compare_impl(
+  return Impl::lexicographical_compare_exespace_impl(
       "Kokkos::lexicographical_compare_iterator_api_default", ex, first1, last1,
       first2, last2);
 }
 
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+template <
+    class ExecutionSpace, class IteratorType1, class IteratorType2,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool lexicographical_compare(const std::string& label, const ExecutionSpace& ex,
                              IteratorType1 first1, IteratorType1 last1,
                              IteratorType2 first2, IteratorType2 last2) {
-  return Impl::lexicographical_compare_impl(label, ex, first1, last1, first2,
-                                            last2);
+  return Impl::lexicographical_compare_exespace_impl(label, ex, first1, last1,
+                                                     first2, last2);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <
+    class ExecutionSpace, class DataType1, class... Properties1,
+    class DataType2, class... Properties2,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool lexicographical_compare(
     const ExecutionSpace& ex,
     const ::Kokkos::View<DataType1, Properties1...>& view1,
@@ -50,13 +59,15 @@ bool lexicographical_compare(
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::lexicographical_compare_impl(
+  return Impl::lexicographical_compare_exespace_impl(
       "Kokkos::lexicographical_compare_view_api_default", ex, KE::cbegin(view1),
       KE::cend(view1), KE::cbegin(view2), KE::cend(view2));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <
+    class ExecutionSpace, class DataType1, class... Properties1,
+    class DataType2, class... Properties2,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool lexicographical_compare(
     const std::string& label, const ExecutionSpace& ex,
     const ::Kokkos::View<DataType1, Properties1...>& view1,
@@ -65,33 +76,39 @@ bool lexicographical_compare(
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::lexicographical_compare_impl(label, ex, KE::cbegin(view1),
-                                            KE::cend(view1), KE::cbegin(view2),
-                                            KE::cend(view2));
+  return Impl::lexicographical_compare_exespace_impl(
+      label, ex, KE::cbegin(view1), KE::cend(view1), KE::cbegin(view2),
+      KE::cend(view2));
 }
 
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class ComparatorType>
+template <
+    class ExecutionSpace, class IteratorType1, class IteratorType2,
+    class ComparatorType,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool lexicographical_compare(const ExecutionSpace& ex, IteratorType1 first1,
                              IteratorType1 last1, IteratorType2 first2,
                              IteratorType2 last2, ComparatorType comp) {
-  return Impl::lexicographical_compare_impl(
+  return Impl::lexicographical_compare_exespace_impl(
       "Kokkos::lexicographical_compare_iterator_api_default", ex, first1, last1,
       first2, last2, comp);
 }
 
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class ComparatorType>
+template <
+    class ExecutionSpace, class IteratorType1, class IteratorType2,
+    class ComparatorType,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool lexicographical_compare(const std::string& label, const ExecutionSpace& ex,
                              IteratorType1 first1, IteratorType1 last1,
                              IteratorType2 first2, IteratorType2 last2,
                              ComparatorType comp) {
-  return Impl::lexicographical_compare_impl(label, ex, first1, last1, first2,
-                                            last2, comp);
+  return Impl::lexicographical_compare_exespace_impl(label, ex, first1, last1,
+                                                     first2, last2, comp);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class ComparatorType>
+template <
+    class ExecutionSpace, class DataType1, class... Properties1,
+    class DataType2, class... Properties2, class ComparatorType,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool lexicographical_compare(
     const ExecutionSpace& ex,
     const ::Kokkos::View<DataType1, Properties1...>& view1,
@@ -100,13 +117,15 @@ bool lexicographical_compare(
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::lexicographical_compare_impl(
+  return Impl::lexicographical_compare_exespace_impl(
       "Kokkos::lexicographical_compare_view_api_default", ex, KE::cbegin(view1),
       KE::cend(view1), KE::cbegin(view2), KE::cend(view2), comp);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class ComparatorType>
+template <
+    class ExecutionSpace, class DataType1, class... Properties1,
+    class DataType2, class... Properties2, class ComparatorType,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool lexicographical_compare(
     const std::string& label, const ExecutionSpace& ex,
     const ::Kokkos::View<DataType1, Properties1...>& view1,
@@ -115,9 +134,67 @@ bool lexicographical_compare(
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::lexicographical_compare_impl(label, ex, KE::cbegin(view1),
-                                            KE::cend(view1), KE::cbegin(view2),
-                                            KE::cend(view2), comp);
+  return Impl::lexicographical_compare_exespace_impl(
+      label, ex, KE::cbegin(view1), KE::cend(view1), KE::cbegin(view2),
+      KE::cend(view2), comp);
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <class TeamHandleType, class IteratorType1, class IteratorType2,
+          std::enable_if_t<Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION bool lexicographical_compare(const TeamHandleType& teamHandle,
+                                             IteratorType1 first1,
+                                             IteratorType1 last1,
+                                             IteratorType2 first2,
+                                             IteratorType2 last2) {
+  return Impl::lexicographical_compare_team_impl(teamHandle, first1, last1,
+                                                 first2, last2);
+}
+
+template <class TeamHandleType, class DataType1, class... Properties1,
+          class DataType2, class... Properties2,
+          std::enable_if_t<Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION bool lexicographical_compare(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& view1,
+    ::Kokkos::View<DataType2, Properties2...>& view2) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::lexicographical_compare_team_impl(
+      teamHandle, KE::cbegin(view1), KE::cend(view1), KE::cbegin(view2),
+      KE::cend(view2));
+}
+
+template <class TeamHandleType, class IteratorType1, class IteratorType2,
+          class ComparatorType,
+          std::enable_if_t<Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION bool lexicographical_compare(
+    const TeamHandleType& teamHandle, IteratorType1 first1, IteratorType1 last1,
+    IteratorType2 first2, IteratorType2 last2, ComparatorType comp) {
+  return Impl::lexicographical_compare_team_impl(teamHandle, first1, last1,
+                                                 first2, last2, comp);
+}
+
+template <class TeamHandleType, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class ComparatorType,
+          std::enable_if_t<Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION bool lexicographical_compare(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& view1,
+    ::Kokkos::View<DataType2, Properties2...>& view2, ComparatorType comp) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::lexicographical_compare_team_impl(
+      teamHandle, KE::cbegin(view1), KE::cend(view1), KE::cbegin(view2),
+      KE::cend(view2), comp);
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MaxElement.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MaxElement.hpp
index 2c1374f700764c7761bc3c81da45534b9bd24b8c..d16bac5bfc352683a4ae18aac311827039908fb8 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MaxElement.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MaxElement.hpp
@@ -23,81 +23,148 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class IteratorType>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename IteratorType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto max_element(const ExecutionSpace& ex, IteratorType first,
                  IteratorType last) {
-  return Impl::min_or_max_element_impl<MaxFirstLoc>(
+  return Impl::min_or_max_element_exespace_impl<MaxFirstLoc>(
       "Kokkos::max_element_iterator_api_default", ex, first, last);
 }
 
-template <class ExecutionSpace, class IteratorType>
+template <
+    typename ExecutionSpace, typename IteratorType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto max_element(const std::string& label, const ExecutionSpace& ex,
                  IteratorType first, IteratorType last) {
-  return Impl::min_or_max_element_impl<MaxFirstLoc>(label, ex, first, last);
+  return Impl::min_or_max_element_exespace_impl<MaxFirstLoc>(label, ex, first,
+                                                             last);
 }
 
-template <class ExecutionSpace, class IteratorType, class ComparatorType>
+template <
+    typename ExecutionSpace, typename IteratorType, typename ComparatorType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto max_element(const ExecutionSpace& ex, IteratorType first,
                  IteratorType last, ComparatorType comp) {
   Impl::static_assert_is_not_openmptarget(ex);
 
-  return Impl::min_or_max_element_impl<MaxFirstLocCustomComparator>(
+  return Impl::min_or_max_element_exespace_impl<MaxFirstLocCustomComparator>(
       "Kokkos::max_element_iterator_api_default", ex, first, last,
       std::move(comp));
 }
 
-template <class ExecutionSpace, class IteratorType, class ComparatorType>
+template <
+    typename ExecutionSpace, typename IteratorType, typename ComparatorType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto max_element(const std::string& label, const ExecutionSpace& ex,
                  IteratorType first, IteratorType last, ComparatorType comp) {
   Impl::static_assert_is_not_openmptarget(ex);
 
-  return Impl::min_or_max_element_impl<MaxFirstLocCustomComparator>(
+  return Impl::min_or_max_element_exespace_impl<MaxFirstLocCustomComparator>(
       label, ex, first, last, std::move(comp));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto max_element(const ExecutionSpace& ex,
                  const ::Kokkos::View<DataType, Properties...>& v) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
 
-  return Impl::min_or_max_element_impl<MaxFirstLoc>(
+  return Impl::min_or_max_element_exespace_impl<MaxFirstLoc>(
       "Kokkos::max_element_view_api_default", ex, begin(v), end(v));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto max_element(const std::string& label, const ExecutionSpace& ex,
                  const ::Kokkos::View<DataType, Properties...>& v) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
 
-  return Impl::min_or_max_element_impl<MaxFirstLoc>(label, ex, begin(v),
-                                                    end(v));
+  return Impl::min_or_max_element_exespace_impl<MaxFirstLoc>(label, ex,
+                                                             begin(v), end(v));
 }
 
-template <class ExecutionSpace, class DataType, class ComparatorType,
-          class... Properties>
+template <
+    typename ExecutionSpace, typename DataType, typename ComparatorType,
+    typename... Properties,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto max_element(const ExecutionSpace& ex,
                  const ::Kokkos::View<DataType, Properties...>& v,
                  ComparatorType comp) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
   Impl::static_assert_is_not_openmptarget(ex);
 
-  return Impl::min_or_max_element_impl<MaxFirstLocCustomComparator>(
+  return Impl::min_or_max_element_exespace_impl<MaxFirstLocCustomComparator>(
       "Kokkos::max_element_view_api_default", ex, begin(v), end(v),
       std::move(comp));
 }
 
-template <class ExecutionSpace, class DataType, class ComparatorType,
-          class... Properties>
+template <
+    typename ExecutionSpace, typename DataType, typename ComparatorType,
+    typename... Properties,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto max_element(const std::string& label, const ExecutionSpace& ex,
                  const ::Kokkos::View<DataType, Properties...>& v,
                  ComparatorType comp) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
   Impl::static_assert_is_not_openmptarget(ex);
 
-  return Impl::min_or_max_element_impl<MaxFirstLocCustomComparator>(
+  return Impl::min_or_max_element_exespace_impl<MaxFirstLocCustomComparator>(
       label, ex, begin(v), end(v), std::move(comp));
 }
 
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename IteratorType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto max_element(const TeamHandleType& teamHandle,
+                                 IteratorType first, IteratorType last) {
+  return Impl::min_or_max_element_team_impl<MaxFirstLoc>(teamHandle, first,
+                                                         last);
+}
+
+template <typename TeamHandleType, typename DataType, typename... Properties,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto max_element(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType, Properties...>& v) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  return Impl::min_or_max_element_team_impl<MaxFirstLoc>(teamHandle, begin(v),
+                                                         end(v));
+}
+
+template <typename TeamHandleType, typename IteratorType,
+          typename ComparatorType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto max_element(const TeamHandleType& teamHandle,
+                                 IteratorType first, IteratorType last,
+                                 ComparatorType comp) {
+  Impl::static_assert_is_not_openmptarget(teamHandle);
+  return Impl::min_or_max_element_team_impl<MaxFirstLocCustomComparator>(
+      teamHandle, first, last, std::move(comp));
+}
+
+template <typename TeamHandleType, typename DataType, typename ComparatorType,
+          typename... Properties,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto max_element(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType, Properties...>& v, ComparatorType comp) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+  Impl::static_assert_is_not_openmptarget(teamHandle);
+  return Impl::min_or_max_element_team_impl<MaxFirstLocCustomComparator>(
+      teamHandle, begin(v), end(v), std::move(comp));
+}
+
 }  // namespace Experimental
 }  // namespace Kokkos
 
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MinElement.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MinElement.hpp
index 1d03b7c962f21df9f7523dcf7c7222ddb88dbb74..2a53fce3e24e4ec6b2583598191647bd69883257 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MinElement.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MinElement.hpp
@@ -23,81 +23,148 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class IteratorType>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename IteratorType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto min_element(const ExecutionSpace& ex, IteratorType first,
                  IteratorType last) {
-  return Impl::min_or_max_element_impl<MinFirstLoc>(
+  return Impl::min_or_max_element_exespace_impl<MinFirstLoc>(
       "Kokkos::min_element_iterator_api_default", ex, first, last);
 }
 
-template <class ExecutionSpace, class IteratorType>
+template <
+    typename ExecutionSpace, typename IteratorType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto min_element(const std::string& label, const ExecutionSpace& ex,
                  IteratorType first, IteratorType last) {
-  return Impl::min_or_max_element_impl<MinFirstLoc>(label, ex, first, last);
+  return Impl::min_or_max_element_exespace_impl<MinFirstLoc>(label, ex, first,
+                                                             last);
 }
 
-template <class ExecutionSpace, class IteratorType, class ComparatorType>
+template <
+    typename ExecutionSpace, typename IteratorType, typename ComparatorType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto min_element(const ExecutionSpace& ex, IteratorType first,
                  IteratorType last, ComparatorType comp) {
   Impl::static_assert_is_not_openmptarget(ex);
 
-  return Impl::min_or_max_element_impl<MinFirstLocCustomComparator>(
+  return Impl::min_or_max_element_exespace_impl<MinFirstLocCustomComparator>(
       "Kokkos::min_element_iterator_api_default", ex, first, last,
       std::move(comp));
 }
 
-template <class ExecutionSpace, class IteratorType, class ComparatorType>
+template <
+    typename ExecutionSpace, typename IteratorType, typename ComparatorType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto min_element(const std::string& label, const ExecutionSpace& ex,
                  IteratorType first, IteratorType last, ComparatorType comp) {
   Impl::static_assert_is_not_openmptarget(ex);
 
-  return Impl::min_or_max_element_impl<MinFirstLocCustomComparator>(
+  return Impl::min_or_max_element_exespace_impl<MinFirstLocCustomComparator>(
       label, ex, first, last, std::move(comp));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto min_element(const ExecutionSpace& ex,
                  const ::Kokkos::View<DataType, Properties...>& v) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
 
-  return Impl::min_or_max_element_impl<MinFirstLoc>(
+  return Impl::min_or_max_element_exespace_impl<MinFirstLoc>(
       "Kokkos::min_element_view_api_default", ex, begin(v), end(v));
 }
 
-template <class ExecutionSpace, class DataType, class ComparatorType,
-          class... Properties>
+template <
+    typename ExecutionSpace, typename DataType, typename ComparatorType,
+    typename... Properties,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto min_element(const ExecutionSpace& ex,
                  const ::Kokkos::View<DataType, Properties...>& v,
                  ComparatorType comp) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
   Impl::static_assert_is_not_openmptarget(ex);
 
-  return Impl::min_or_max_element_impl<MinFirstLocCustomComparator>(
+  return Impl::min_or_max_element_exespace_impl<MinFirstLocCustomComparator>(
       "Kokkos::min_element_view_api_default", ex, begin(v), end(v),
       std::move(comp));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto min_element(const std::string& label, const ExecutionSpace& ex,
                  const ::Kokkos::View<DataType, Properties...>& v) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
 
-  return Impl::min_or_max_element_impl<MinFirstLoc>(label, ex, begin(v),
-                                                    end(v));
+  return Impl::min_or_max_element_exespace_impl<MinFirstLoc>(label, ex,
+                                                             begin(v), end(v));
 }
 
-template <class ExecutionSpace, class DataType, class ComparatorType,
-          class... Properties>
+template <
+    typename ExecutionSpace, typename DataType, typename ComparatorType,
+    typename... Properties,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto min_element(const std::string& label, const ExecutionSpace& ex,
                  const ::Kokkos::View<DataType, Properties...>& v,
                  ComparatorType comp) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
   Impl::static_assert_is_not_openmptarget(ex);
 
-  return Impl::min_or_max_element_impl<MinFirstLocCustomComparator>(
+  return Impl::min_or_max_element_exespace_impl<MinFirstLocCustomComparator>(
       label, ex, begin(v), end(v), std::move(comp));
 }
 
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename IteratorType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto min_element(const TeamHandleType& teamHandle,
+                                 IteratorType first, IteratorType last) {
+  return Impl::min_or_max_element_team_impl<MinFirstLoc>(teamHandle, first,
+                                                         last);
+}
+
+template <typename TeamHandleType, typename DataType, typename... Properties,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto min_element(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType, Properties...>& v) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  return Impl::min_or_max_element_team_impl<MinFirstLoc>(teamHandle, begin(v),
+                                                         end(v));
+}
+
+template <typename TeamHandleType, typename IteratorType,
+          typename ComparatorType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto min_element(const TeamHandleType& teamHandle,
+                                 IteratorType first, IteratorType last,
+                                 ComparatorType comp) {
+  Impl::static_assert_is_not_openmptarget(teamHandle);
+  return Impl::min_or_max_element_team_impl<MinFirstLocCustomComparator>(
+      teamHandle, first, last, std::move(comp));
+}
+
+template <typename TeamHandleType, typename DataType, typename ComparatorType,
+          typename... Properties,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto min_element(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType, Properties...>& v, ComparatorType comp) {
+  Impl::static_assert_is_not_openmptarget(teamHandle);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+  return Impl::min_or_max_element_team_impl<MinFirstLocCustomComparator>(
+      teamHandle, begin(v), end(v), std::move(comp));
+}
+
 }  // namespace Experimental
 }  // namespace Kokkos
 
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MinMaxElement.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MinMaxElement.hpp
index d481b499cc9932cd55a8f7eeb4d8487cbb9e2401..c3a1f73ef69f4c702800413e5c8a4ca2704ebb53 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MinMaxElement.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MinMaxElement.hpp
@@ -23,82 +23,151 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class IteratorType>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename IteratorType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto minmax_element(const ExecutionSpace& ex, IteratorType first,
                     IteratorType last) {
-  return Impl::minmax_element_impl<MinMaxFirstLastLoc>(
+  return Impl::minmax_element_exespace_impl<MinMaxFirstLastLoc>(
       "Kokkos::minmax_element_iterator_api_default", ex, first, last);
 }
 
-template <class ExecutionSpace, class IteratorType>
+template <
+    typename ExecutionSpace, typename IteratorType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto minmax_element(const std::string& label, const ExecutionSpace& ex,
                     IteratorType first, IteratorType last) {
-  return Impl::minmax_element_impl<MinMaxFirstLastLoc>(label, ex, first, last);
+  return Impl::minmax_element_exespace_impl<MinMaxFirstLastLoc>(label, ex,
+                                                                first, last);
 }
 
-template <class ExecutionSpace, class IteratorType, class ComparatorType>
+template <
+    typename ExecutionSpace, typename IteratorType, typename ComparatorType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto minmax_element(const ExecutionSpace& ex, IteratorType first,
                     IteratorType last, ComparatorType comp) {
   Impl::static_assert_is_not_openmptarget(ex);
 
-  return Impl::minmax_element_impl<MinMaxFirstLastLocCustomComparator>(
+  return Impl::minmax_element_exespace_impl<MinMaxFirstLastLocCustomComparator>(
       "Kokkos::minmax_element_iterator_api_default", ex, first, last,
       std::move(comp));
 }
 
-template <class ExecutionSpace, class IteratorType, class ComparatorType>
+template <
+    typename ExecutionSpace, typename IteratorType, typename ComparatorType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto minmax_element(const std::string& label, const ExecutionSpace& ex,
                     IteratorType first, IteratorType last,
                     ComparatorType comp) {
   Impl::static_assert_is_not_openmptarget(ex);
 
-  return Impl::minmax_element_impl<MinMaxFirstLastLocCustomComparator>(
+  return Impl::minmax_element_exespace_impl<MinMaxFirstLastLocCustomComparator>(
       label, ex, first, last, std::move(comp));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto minmax_element(const ExecutionSpace& ex,
                     const ::Kokkos::View<DataType, Properties...>& v) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
 
-  return Impl::minmax_element_impl<MinMaxFirstLastLoc>(
+  return Impl::minmax_element_exespace_impl<MinMaxFirstLastLoc>(
       "Kokkos::minmax_element_view_api_default", ex, begin(v), end(v));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto minmax_element(const std::string& label, const ExecutionSpace& ex,
                     const ::Kokkos::View<DataType, Properties...>& v) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
 
-  return Impl::minmax_element_impl<MinMaxFirstLastLoc>(label, ex, begin(v),
-                                                       end(v));
+  return Impl::minmax_element_exespace_impl<MinMaxFirstLastLoc>(
+      label, ex, begin(v), end(v));
 }
 
-template <class ExecutionSpace, class DataType, class ComparatorType,
-          class... Properties>
+template <
+    typename ExecutionSpace, typename DataType, typename ComparatorType,
+    typename... Properties,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto minmax_element(const ExecutionSpace& ex,
                     const ::Kokkos::View<DataType, Properties...>& v,
                     ComparatorType comp) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
   Impl::static_assert_is_not_openmptarget(ex);
 
-  return Impl::minmax_element_impl<MinMaxFirstLastLocCustomComparator>(
+  return Impl::minmax_element_exespace_impl<MinMaxFirstLastLocCustomComparator>(
       "Kokkos::minmax_element_view_api_default", ex, begin(v), end(v),
       std::move(comp));
 }
 
-template <class ExecutionSpace, class DataType, class ComparatorType,
-          class... Properties>
+template <
+    typename ExecutionSpace, typename DataType, typename ComparatorType,
+    typename... Properties,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto minmax_element(const std::string& label, const ExecutionSpace& ex,
                     const ::Kokkos::View<DataType, Properties...>& v,
                     ComparatorType comp) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
   Impl::static_assert_is_not_openmptarget(ex);
 
-  return Impl::minmax_element_impl<MinMaxFirstLastLocCustomComparator>(
+  return Impl::minmax_element_exespace_impl<MinMaxFirstLastLocCustomComparator>(
       label, ex, begin(v), end(v), std::move(comp));
 }
 
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename IteratorType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto minmax_element(const TeamHandleType& teamHandle,
+                                    IteratorType first, IteratorType last) {
+  return Impl::minmax_element_team_impl<MinMaxFirstLastLoc>(teamHandle, first,
+                                                            last);
+}
+
+template <typename TeamHandleType, typename IteratorType,
+          typename ComparatorType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto minmax_element(const TeamHandleType& teamHandle,
+                                    IteratorType first, IteratorType last,
+                                    ComparatorType comp) {
+  Impl::static_assert_is_not_openmptarget(teamHandle);
+
+  return Impl::minmax_element_team_impl<MinMaxFirstLastLocCustomComparator>(
+      teamHandle, first, last, std::move(comp));
+}
+
+template <typename TeamHandleType, typename DataType, typename... Properties,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto minmax_element(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType, Properties...>& v) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  return Impl::minmax_element_team_impl<MinMaxFirstLastLoc>(teamHandle,
+                                                            begin(v), end(v));
+}
+
+template <typename TeamHandleType, typename DataType, typename ComparatorType,
+          typename... Properties,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto minmax_element(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType, Properties...>& v, ComparatorType comp) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+  Impl::static_assert_is_not_openmptarget(teamHandle);
+
+  return Impl::minmax_element_team_impl<MinMaxFirstLastLocCustomComparator>(
+      teamHandle, begin(v), end(v), std::move(comp));
+}
+
 }  // namespace Experimental
 }  // namespace Kokkos
 
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Mismatch.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Mismatch.hpp
index 13c994ca90b19a21956251f5748bb65cc1598080..090afe69e3775c466765a77e8afcea22ab219bba 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Mismatch.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Mismatch.hpp
@@ -30,46 +30,60 @@ namespace Experimental {
 //
 // makes API ambiguous (with the overload accepting views).
 
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+//
+// overload set accepting execution space
+//
+template <
+    class ExecutionSpace, class IteratorType1, class IteratorType2,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 ::Kokkos::pair<IteratorType1, IteratorType2> mismatch(const ExecutionSpace& ex,
                                                       IteratorType1 first1,
                                                       IteratorType1 last1,
                                                       IteratorType2 first2,
                                                       IteratorType2 last2) {
-  return Impl::mismatch_impl("Kokkos::mismatch_iterator_api_default", ex,
-                             first1, last1, first2, last2);
+  return Impl::mismatch_exespace_impl("Kokkos::mismatch_iterator_api_default",
+                                      ex, first1, last1, first2, last2);
 }
 
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
+template <
+    class ExecutionSpace, class IteratorType1, class IteratorType2,
+    class BinaryPredicateType,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 ::Kokkos::pair<IteratorType1, IteratorType2> mismatch(
     const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1,
     IteratorType2 first2, IteratorType2 last2,
     BinaryPredicateType&& predicate) {
-  return Impl::mismatch_impl("Kokkos::mismatch_iterator_api_default", ex,
-                             first1, last1, first2, last2,
-                             std::forward<BinaryPredicateType>(predicate));
+  return Impl::mismatch_exespace_impl(
+      "Kokkos::mismatch_iterator_api_default", ex, first1, last1, first2, last2,
+      std::forward<BinaryPredicateType>(predicate));
 }
 
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+template <
+    class ExecutionSpace, class IteratorType1, class IteratorType2,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 ::Kokkos::pair<IteratorType1, IteratorType2> mismatch(
     const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
     IteratorType1 last1, IteratorType2 first2, IteratorType2 last2) {
-  return Impl::mismatch_impl(label, ex, first1, last1, first2, last2);
+  return Impl::mismatch_exespace_impl(label, ex, first1, last1, first2, last2);
 }
 
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
+template <
+    class ExecutionSpace, class IteratorType1, class IteratorType2,
+    class BinaryPredicateType,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 ::Kokkos::pair<IteratorType1, IteratorType2> mismatch(
     const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
     IteratorType1 last1, IteratorType2 first2, IteratorType2 last2,
     BinaryPredicateType&& predicate) {
-  return Impl::mismatch_impl(label, ex, first1, last1, first2, last2,
-                             std::forward<BinaryPredicateType>(predicate));
+  return Impl::mismatch_exespace_impl(
+      label, ex, first1, last1, first2, last2,
+      std::forward<BinaryPredicateType>(predicate));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <
+    class ExecutionSpace, class DataType1, class... Properties1,
+    class DataType2, class... Properties2,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto mismatch(const ExecutionSpace& ex,
               const ::Kokkos::View<DataType1, Properties1...>& view1,
               const ::Kokkos::View<DataType2, Properties2...>& view2) {
@@ -77,13 +91,15 @@ auto mismatch(const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::mismatch_impl("Kokkos::mismatch_view_api_default", ex,
-                             KE::begin(view1), KE::end(view1), KE::begin(view2),
-                             KE::end(view2));
+  return Impl::mismatch_exespace_impl("Kokkos::mismatch_view_api_default", ex,
+                                      KE::begin(view1), KE::end(view1),
+                                      KE::begin(view2), KE::end(view2));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryPredicateType>
+template <
+    class ExecutionSpace, class DataType1, class... Properties1,
+    class DataType2, class... Properties2, class BinaryPredicateType,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto mismatch(const ExecutionSpace& ex,
               const ::Kokkos::View<DataType1, Properties1...>& view1,
               const ::Kokkos::View<DataType2, Properties2...>& view2,
@@ -92,14 +108,16 @@ auto mismatch(const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::mismatch_impl("Kokkos::mismatch_view_api_default", ex,
-                             KE::begin(view1), KE::end(view1), KE::begin(view2),
-                             KE::end(view2),
-                             std::forward<BinaryPredicateType>(predicate));
+  return Impl::mismatch_exespace_impl(
+      "Kokkos::mismatch_view_api_default", ex, KE::begin(view1), KE::end(view1),
+      KE::begin(view2), KE::end(view2),
+      std::forward<BinaryPredicateType>(predicate));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <
+    class ExecutionSpace, class DataType1, class... Properties1,
+    class DataType2, class... Properties2,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto mismatch(const std::string& label, const ExecutionSpace& ex,
               const ::Kokkos::View<DataType1, Properties1...>& view1,
               const ::Kokkos::View<DataType2, Properties2...>& view2) {
@@ -107,12 +125,15 @@ auto mismatch(const std::string& label, const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::mismatch_impl(label, ex, KE::begin(view1), KE::end(view1),
-                             KE::begin(view2), KE::end(view2));
+  return Impl::mismatch_exespace_impl(label, ex, KE::begin(view1),
+                                      KE::end(view1), KE::begin(view2),
+                                      KE::end(view2));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryPredicateType>
+template <
+    class ExecutionSpace, class DataType1, class... Properties1,
+    class DataType2, class... Properties2, class BinaryPredicateType,
+    std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto mismatch(const std::string& label, const ExecutionSpace& ex,
               const ::Kokkos::View<DataType1, Properties1...>& view1,
               const ::Kokkos::View<DataType2, Properties2...>& view2,
@@ -121,9 +142,65 @@ auto mismatch(const std::string& label, const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::mismatch_impl(label, ex, KE::begin(view1), KE::end(view1),
-                             KE::begin(view2), KE::end(view2),
-                             std::forward<BinaryPredicateType>(predicate));
+  return Impl::mismatch_exespace_impl(
+      label, ex, KE::begin(view1), KE::end(view1), KE::begin(view2),
+      KE::end(view2), std::forward<BinaryPredicateType>(predicate));
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <class TeamHandleType, class IteratorType1, class IteratorType2,
+          std::enable_if_t<Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION ::Kokkos::pair<IteratorType1, IteratorType2> mismatch(
+    const TeamHandleType& teamHandle, IteratorType1 first1, IteratorType1 last1,
+    IteratorType2 first2, IteratorType2 last2) {
+  return Impl::mismatch_team_impl(teamHandle, first1, last1, first2, last2);
+}
+
+template <class TeamHandleType, class IteratorType1, class IteratorType2,
+          class BinaryPredicateType,
+          std::enable_if_t<Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION ::Kokkos::pair<IteratorType1, IteratorType2> mismatch(
+    const TeamHandleType& teamHandle, IteratorType1 first1, IteratorType1 last1,
+    IteratorType2 first2, IteratorType2 last2,
+    BinaryPredicateType&& predicate) {
+  return Impl::mismatch_team_impl(teamHandle, first1, last1, first2, last2,
+                                  std::forward<BinaryPredicateType>(predicate));
+}
+
+template <class TeamHandleType, class DataType1, class... Properties1,
+          class DataType2, class... Properties2,
+          std::enable_if_t<Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto mismatch(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& view1,
+    const ::Kokkos::View<DataType2, Properties2...>& view2) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::mismatch_team_impl(teamHandle, KE::begin(view1), KE::end(view1),
+                                  KE::begin(view2), KE::end(view2));
+}
+
+template <class TeamHandleType, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class BinaryPredicateType,
+          std::enable_if_t<Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto mismatch(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& view1,
+    const ::Kokkos::View<DataType2, Properties2...>& view2,
+    BinaryPredicateType&& predicate) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::mismatch_team_impl(teamHandle, KE::begin(view1), KE::end(view1),
+                                  KE::begin(view2), KE::end(view2),
+                                  std::forward<BinaryPredicateType>(predicate));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Move.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Move.hpp
index d49acd9f702d05897576f532b290dc6b965b220c..f04ea12ba88a92c6f4bab3412e4733c9a210133c 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Move.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Move.hpp
@@ -23,41 +23,81 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class InputIterator, class OutputIterator>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename InputIterator, typename OutputIterator,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 OutputIterator move(const ExecutionSpace& ex, InputIterator first,
                     InputIterator last, OutputIterator d_first) {
-  return Impl::move_impl("Kokkos::move_iterator_api_default", ex, first, last,
-                         d_first);
+  return Impl::move_exespace_impl("Kokkos::move_iterator_api_default", ex,
+                                  first, last, d_first);
 }
 
-template <class ExecutionSpace, class InputIterator, class OutputIterator>
+template <
+    typename ExecutionSpace, typename InputIterator, typename OutputIterator,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 OutputIterator move(const std::string& label, const ExecutionSpace& ex,
                     InputIterator first, InputIterator last,
                     OutputIterator d_first) {
-  return Impl::move_impl(label, ex, first, last, d_first);
+  return Impl::move_exespace_impl(label, ex, first, last, d_first);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto move(const ExecutionSpace& ex,
           const ::Kokkos::View<DataType1, Properties1...>& source,
           ::Kokkos::View<DataType2, Properties2...>& dest) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
 
-  return Impl::move_impl("Kokkos::move_view_api_default", ex, begin(source),
-                         end(source), begin(dest));
+  return Impl::move_exespace_impl("Kokkos::move_view_api_default", ex,
+                                  begin(source), end(source), begin(dest));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto move(const std::string& label, const ExecutionSpace& ex,
           const ::Kokkos::View<DataType1, Properties1...>& source,
           ::Kokkos::View<DataType2, Properties2...>& dest) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
 
-  return Impl::move_impl(label, ex, begin(source), end(source), begin(dest));
+  return Impl::move_exespace_impl(label, ex, begin(source), end(source),
+                                  begin(dest));
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename InputIterator,
+          typename OutputIterator,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION OutputIterator move(const TeamHandleType& teamHandle,
+                                    InputIterator first, InputIterator last,
+                                    OutputIterator d_first) {
+  return Impl::move_team_impl(teamHandle, first, last, d_first);
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto move(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& source,
+    ::Kokkos::View<DataType2, Properties2...>& dest) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  return Impl::move_team_impl(teamHandle, begin(source), end(source),
+                              begin(dest));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp
index 60d50fa881a812cd68111f1b7f8b69607a57e0a5..375474ca57f956504fea1f10a08c3dbfd0a47615 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp
@@ -23,42 +23,83 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename IteratorType1, typename IteratorType2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType2 move_backward(const ExecutionSpace& ex, IteratorType1 first,
                             IteratorType1 last, IteratorType2 d_last) {
-  return Impl::move_backward_impl("Kokkos::move_backward_iterator_api_default",
-                                  ex, first, last, d_last);
+  return Impl::move_backward_exespace_impl(
+      "Kokkos::move_backward_iterator_api_default", ex, first, last, d_last);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto move_backward(const ExecutionSpace& ex,
                    const ::Kokkos::View<DataType1, Properties1...>& source,
                    ::Kokkos::View<DataType2, Properties2...>& dest) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
 
-  return Impl::move_backward_impl("Kokkos::move_backward_view_api_default", ex,
-                                  begin(source), end(source), end(dest));
+  return Impl::move_backward_exespace_impl(
+      "Kokkos::move_backward_view_api_default", ex, begin(source), end(source),
+      end(dest));
 }
 
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+template <
+    typename ExecutionSpace, typename IteratorType1, typename IteratorType2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType2 move_backward(const std::string& label, const ExecutionSpace& ex,
                             IteratorType1 first, IteratorType1 last,
                             IteratorType2 d_last) {
-  return Impl::move_backward_impl(label, ex, first, last, d_last);
+  return Impl::move_backward_exespace_impl(label, ex, first, last, d_last);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto move_backward(const std::string& label, const ExecutionSpace& ex,
                    const ::Kokkos::View<DataType1, Properties1...>& source,
                    ::Kokkos::View<DataType2, Properties2...>& dest) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
 
-  return Impl::move_backward_impl(label, ex, begin(source), end(source),
-                                  end(dest));
+  return Impl::move_backward_exespace_impl(label, ex, begin(source),
+                                           end(source), end(dest));
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename IteratorType1,
+          typename IteratorType2,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION IteratorType2 move_backward(const TeamHandleType& teamHandle,
+                                            IteratorType1 first,
+                                            IteratorType1 last,
+                                            IteratorType2 d_last) {
+  return Impl::move_backward_team_impl(teamHandle, first, last, d_last);
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto move_backward(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& source,
+    ::Kokkos::View<DataType2, Properties2...>& dest) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  return Impl::move_backward_team_impl(teamHandle, begin(source), end(source),
+                                       end(dest));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_NoneOf.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_NoneOf.hpp
index cf5de3b72b961f306c7d670ce92b5f9c88395724..f7baab3fc0f9b1e0ca9054942a7475754369ade2 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_NoneOf.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_NoneOf.hpp
@@ -23,41 +23,80 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class IteratorType, class Predicate>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename IteratorType, typename Predicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool none_of(const ExecutionSpace& ex, IteratorType first, IteratorType last,
              Predicate predicate) {
-  return Impl::none_of_impl("Kokkos::none_of_iterator_api_default", ex, first,
-                            last, predicate);
+  return Impl::none_of_exespace_impl("Kokkos::none_of_iterator_api_default", ex,
+                                     first, last, predicate);
 }
 
-template <class ExecutionSpace, class IteratorType, class Predicate>
+template <
+    typename ExecutionSpace, typename IteratorType, typename Predicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool none_of(const std::string& label, const ExecutionSpace& ex,
              IteratorType first, IteratorType last, Predicate predicate) {
-  return Impl::none_of_impl(label, ex, first, last, predicate);
+  return Impl::none_of_exespace_impl(label, ex, first, last, predicate);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class Predicate>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    typename Predicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool none_of(const ExecutionSpace& ex,
              const ::Kokkos::View<DataType, Properties...>& v,
              Predicate predicate) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::none_of_impl("Kokkos::none_of_view_api_default", ex,
-                            KE::cbegin(v), KE::cend(v), std::move(predicate));
+  return Impl::none_of_exespace_impl("Kokkos::none_of_view_api_default", ex,
+                                     KE::cbegin(v), KE::cend(v),
+                                     std::move(predicate));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class Predicate>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    typename Predicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 bool none_of(const std::string& label, const ExecutionSpace& ex,
              const ::Kokkos::View<DataType, Properties...>& v,
              Predicate predicate) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::none_of_impl(label, ex, KE::cbegin(v), KE::cend(v),
-                            std::move(predicate));
+  return Impl::none_of_exespace_impl(label, ex, KE::cbegin(v), KE::cend(v),
+                                     std::move(predicate));
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename IteratorType, typename Predicate>
+KOKKOS_FUNCTION
+    std::enable_if_t<::Kokkos::is_team_handle<TeamHandleType>::value, bool>
+    none_of(const TeamHandleType& teamHandle, IteratorType first,
+            IteratorType last, Predicate predicate) {
+  return Impl::none_of_team_impl(teamHandle, first, last, predicate);
+}
+
+template <typename TeamHandleType, typename DataType, typename... Properties,
+          typename Predicate>
+KOKKOS_FUNCTION
+    std::enable_if_t<::Kokkos::is_team_handle<TeamHandleType>::value, bool>
+    none_of(const TeamHandleType& teamHandle,
+            const ::Kokkos::View<DataType, Properties...>& v,
+            Predicate predicate) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::none_of_team_impl(teamHandle, KE::cbegin(v), KE::cend(v),
+                                 std::move(predicate));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_PartitionCopy.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_PartitionCopy.hpp
index 38c0a35b6208e43b9bc78f0cba4cb05d1a3766bf..a1feee8d6d7ac362ac4d07b41992fc385113505f 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_PartitionCopy.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_PartitionCopy.hpp
@@ -23,57 +23,103 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorTrueType, class OutputIteratorFalseType,
-          class PredicateType>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename InputIteratorType,
+    typename OutputIteratorTrueType, typename OutputIteratorFalseType,
+    typename PredicateType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 ::Kokkos::pair<OutputIteratorTrueType, OutputIteratorFalseType> partition_copy(
     const ExecutionSpace& ex, InputIteratorType from_first,
     InputIteratorType from_last, OutputIteratorTrueType to_first_true,
     OutputIteratorFalseType to_first_false, PredicateType p) {
-  return Impl::partition_copy_impl(
+  return Impl::partition_copy_exespace_impl(
       "Kokkos::partition_copy_iterator_api_default", ex, from_first, from_last,
       to_first_true, to_first_false, std::move(p));
 }
 
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorTrueType, class OutputIteratorFalseType,
-          class PredicateType>
+template <
+    typename ExecutionSpace, typename InputIteratorType,
+    typename OutputIteratorTrueType, typename OutputIteratorFalseType,
+    typename PredicateType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 ::Kokkos::pair<OutputIteratorTrueType, OutputIteratorFalseType> partition_copy(
     const std::string& label, const ExecutionSpace& ex,
     InputIteratorType from_first, InputIteratorType from_last,
     OutputIteratorTrueType to_first_true,
     OutputIteratorFalseType to_first_false, PredicateType p) {
-  return Impl::partition_copy_impl(label, ex, from_first, from_last,
-                                   to_first_true, to_first_false, std::move(p));
+  return Impl::partition_copy_exespace_impl(label, ex, from_first, from_last,
+                                            to_first_true, to_first_false,
+                                            std::move(p));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class DataType3,
-          class... Properties3, class PredicateType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename DataType3,
+    typename... Properties3, typename PredicateType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto partition_copy(
     const ExecutionSpace& ex,
     const ::Kokkos::View<DataType1, Properties1...>& view_from,
     const ::Kokkos::View<DataType2, Properties2...>& view_dest_true,
     const ::Kokkos::View<DataType3, Properties3...>& view_dest_false,
     PredicateType p) {
-  return Impl::partition_copy_impl("Kokkos::partition_copy_view_api_default",
-                                   ex, cbegin(view_from), cend(view_from),
-                                   begin(view_dest_true),
-                                   begin(view_dest_false), std::move(p));
+  return Impl::partition_copy_exespace_impl(
+      "Kokkos::partition_copy_view_api_default", ex, cbegin(view_from),
+      cend(view_from), begin(view_dest_true), begin(view_dest_false),
+      std::move(p));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class DataType3,
-          class... Properties3, class PredicateType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename DataType3,
+    typename... Properties3, typename PredicateType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto partition_copy(
     const std::string& label, const ExecutionSpace& ex,
     const ::Kokkos::View<DataType1, Properties1...>& view_from,
     const ::Kokkos::View<DataType2, Properties2...>& view_dest_true,
     const ::Kokkos::View<DataType3, Properties3...>& view_dest_false,
     PredicateType p) {
-  return Impl::partition_copy_impl(label, ex, cbegin(view_from),
-                                   cend(view_from), begin(view_dest_true),
-                                   begin(view_dest_false), std::move(p));
+  return Impl::partition_copy_exespace_impl(
+      label, ex, cbegin(view_from), cend(view_from), begin(view_dest_true),
+      begin(view_dest_false), std::move(p));
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename InputIteratorType,
+          typename OutputIteratorTrueType, typename OutputIteratorFalseType,
+          typename PredicateType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION ::Kokkos::pair<OutputIteratorTrueType, OutputIteratorFalseType>
+partition_copy(const TeamHandleType& teamHandle, InputIteratorType from_first,
+               InputIteratorType from_last,
+               OutputIteratorTrueType to_first_true,
+               OutputIteratorFalseType to_first_false, PredicateType p) {
+  return Impl::partition_copy_team_impl(teamHandle, from_first, from_last,
+                                        to_first_true, to_first_false,
+                                        std::move(p));
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2, typename DataType3,
+          typename... Properties3, typename PredicateType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto partition_copy(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+    const ::Kokkos::View<DataType2, Properties2...>& view_dest_true,
+    const ::Kokkos::View<DataType3, Properties3...>& view_dest_false,
+    PredicateType p) {
+  return Impl::partition_copy_team_impl(teamHandle, cbegin(view_from),
+                                        cend(view_from), begin(view_dest_true),
+                                        begin(view_dest_false), std::move(p));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_PartitionPoint.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_PartitionPoint.hpp
index 24798e377ef194c397f09b933e20988234ae0ea9..60cbeeda8754fd2077e249581c8ac693a940aab3 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_PartitionPoint.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_PartitionPoint.hpp
@@ -23,38 +23,78 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class IteratorType, class UnaryPredicate>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename IteratorType, typename UnaryPredicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType partition_point(const ExecutionSpace& ex, IteratorType first,
                              IteratorType last, UnaryPredicate p) {
-  return Impl::partition_point_impl(
+  return Impl::partition_point_exespace_impl(
       "Kokkos::partitioned_point_iterator_api_default", ex, first, last,
       std::move(p));
 }
 
-template <class ExecutionSpace, class IteratorType, class UnaryPredicate>
+template <
+    typename ExecutionSpace, typename IteratorType, typename UnaryPredicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType partition_point(const std::string& label, const ExecutionSpace& ex,
                              IteratorType first, IteratorType last,
                              UnaryPredicate p) {
-  return Impl::partition_point_impl(label, ex, first, last, std::move(p));
+  return Impl::partition_point_exespace_impl(label, ex, first, last,
+                                             std::move(p));
 }
 
-template <class ExecutionSpace, class UnaryPredicate, class DataType,
-          class... Properties>
+template <
+    typename ExecutionSpace, typename UnaryPredicate, typename DataType,
+    typename... Properties,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto partition_point(const std::string& label, const ExecutionSpace& ex,
                      const ::Kokkos::View<DataType, Properties...>& v,
                      UnaryPredicate p) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-  return Impl::partition_point_impl(label, ex, begin(v), end(v), std::move(p));
+  return Impl::partition_point_exespace_impl(label, ex, begin(v), end(v),
+                                             std::move(p));
 }
 
-template <class ExecutionSpace, class UnaryPredicate, class DataType,
-          class... Properties>
+template <
+    typename ExecutionSpace, typename UnaryPredicate, typename DataType,
+    typename... Properties,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto partition_point(const ExecutionSpace& ex,
                      const ::Kokkos::View<DataType, Properties...>& v,
                      UnaryPredicate p) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-  return Impl::partition_point_impl("Kokkos::partition_point_view_api_default",
-                                    ex, begin(v), end(v), std::move(p));
+  return Impl::partition_point_exespace_impl(
+      "Kokkos::partition_point_view_api_default", ex, begin(v), end(v),
+      std::move(p));
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename IteratorType,
+          typename UnaryPredicate,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION IteratorType partition_point(const TeamHandleType& teamHandle,
+                                             IteratorType first,
+                                             IteratorType last,
+                                             UnaryPredicate p) {
+  return Impl::partition_point_team_impl(teamHandle, first, last, std::move(p));
+}
+
+template <typename TeamHandleType, typename UnaryPredicate, typename DataType,
+          typename... Properties,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto partition_point(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType, Properties...>& v, UnaryPredicate p) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+  return Impl::partition_point_team_impl(teamHandle, begin(v), end(v),
+                                         std::move(p));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp
index a31fa1497ab1690b14fb8d89c81b6ece3b939ec4..b84f00f8bb500f23e53326ffb1b460ead9eb8276 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp
@@ -23,28 +23,38 @@
 namespace Kokkos {
 namespace Experimental {
 
+//
+// overload set accepting execution space
+//
+
 //
 // overload set 1
 //
-template <class ExecutionSpace, class IteratorType>
+template <typename ExecutionSpace, typename IteratorType,
+          std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
 typename IteratorType::value_type reduce(const ExecutionSpace& ex,
                                          IteratorType first,
                                          IteratorType last) {
-  return Impl::reduce_default_functors_impl(
+  return Impl::reduce_default_functors_exespace_impl(
       "Kokkos::reduce_default_functors_iterator_api", ex, first, last,
       typename IteratorType::value_type());
 }
 
-template <class ExecutionSpace, class IteratorType>
+template <typename ExecutionSpace, typename IteratorType,
+          std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
 typename IteratorType::value_type reduce(const std::string& label,
                                          const ExecutionSpace& ex,
                                          IteratorType first,
                                          IteratorType last) {
-  return Impl::reduce_default_functors_impl(
+  return Impl::reduce_default_functors_exespace_impl(
       label, ex, first, last, typename IteratorType::value_type());
 }
 
-template <class ExecutionSpace, class DataType, class... Properties>
+template <typename ExecutionSpace, typename DataType, typename... Properties,
+          std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
 auto reduce(const ExecutionSpace& ex,
             const ::Kokkos::View<DataType, Properties...>& view) {
   namespace KE = ::Kokkos::Experimental;
@@ -53,12 +63,14 @@ auto reduce(const ExecutionSpace& ex,
   using view_type  = ::Kokkos::View<DataType, Properties...>;
   using value_type = typename view_type::value_type;
 
-  return Impl::reduce_default_functors_impl(
+  return Impl::reduce_default_functors_exespace_impl(
       "Kokkos::reduce_default_functors_view_api", ex, KE::cbegin(view),
       KE::cend(view), value_type());
 }
 
-template <class ExecutionSpace, class DataType, class... Properties>
+template <typename ExecutionSpace, typename DataType, typename... Properties,
+          std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
 auto reduce(const std::string& label, const ExecutionSpace& ex,
             const ::Kokkos::View<DataType, Properties...>& view) {
   namespace KE = ::Kokkos::Experimental;
@@ -67,37 +79,43 @@ auto reduce(const std::string& label, const ExecutionSpace& ex,
   using view_type  = ::Kokkos::View<DataType, Properties...>;
   using value_type = typename view_type::value_type;
 
-  return Impl::reduce_default_functors_impl(label, ex, KE::cbegin(view),
-                                            KE::cend(view), value_type());
+  return Impl::reduce_default_functors_exespace_impl(
+      label, ex, KE::cbegin(view), KE::cend(view), value_type());
 }
 
 //
 // overload set2:
 //
-template <class ExecutionSpace, class IteratorType, class ValueType>
+template <typename ExecutionSpace, typename IteratorType, typename ValueType,
+          std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
 ValueType reduce(const ExecutionSpace& ex, IteratorType first,
                  IteratorType last, ValueType init_reduction_value) {
   static_assert(std::is_move_constructible<ValueType>::value,
                 "ValueType must be move constructible.");
 
-  return Impl::reduce_default_functors_impl(
+  return Impl::reduce_default_functors_exespace_impl(
       "Kokkos::reduce_default_functors_iterator_api", ex, first, last,
       init_reduction_value);
 }
 
-template <class ExecutionSpace, class IteratorType, class ValueType>
+template <typename ExecutionSpace, typename IteratorType, typename ValueType,
+          std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
 ValueType reduce(const std::string& label, const ExecutionSpace& ex,
                  IteratorType first, IteratorType last,
                  ValueType init_reduction_value) {
   static_assert(std::is_move_constructible<ValueType>::value,
                 "ValueType must be move constructible.");
 
-  return Impl::reduce_default_functors_impl(label, ex, first, last,
-                                            init_reduction_value);
+  return Impl::reduce_default_functors_exespace_impl(label, ex, first, last,
+                                                     init_reduction_value);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class ValueType>
+template <typename ExecutionSpace, typename DataType, typename... Properties,
+          typename ValueType,
+          std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
 ValueType reduce(const ExecutionSpace& ex,
                  const ::Kokkos::View<DataType, Properties...>& view,
                  ValueType init_reduction_value) {
@@ -107,13 +125,15 @@ ValueType reduce(const ExecutionSpace& ex,
   namespace KE = ::Kokkos::Experimental;
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
 
-  return Impl::reduce_default_functors_impl(
+  return Impl::reduce_default_functors_exespace_impl(
       "Kokkos::reduce_default_functors_view_api", ex, KE::cbegin(view),
       KE::cend(view), init_reduction_value);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class ValueType>
+template <typename ExecutionSpace, typename DataType, typename... Properties,
+          typename ValueType,
+          std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
 ValueType reduce(const std::string& label, const ExecutionSpace& ex,
                  const ::Kokkos::View<DataType, Properties...>& view,
                  ValueType init_reduction_value) {
@@ -123,40 +143,46 @@ ValueType reduce(const std::string& label, const ExecutionSpace& ex,
   namespace KE = ::Kokkos::Experimental;
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
 
-  return Impl::reduce_default_functors_impl(
+  return Impl::reduce_default_functors_exespace_impl(
       label, ex, KE::cbegin(view), KE::cend(view), init_reduction_value);
 }
 
 //
 // overload set 3
 //
-template <class ExecutionSpace, class IteratorType, class ValueType,
-          class BinaryOp>
+template <typename ExecutionSpace, typename IteratorType, typename ValueType,
+          typename BinaryOp,
+          std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
 ValueType reduce(const ExecutionSpace& ex, IteratorType first,
                  IteratorType last, ValueType init_reduction_value,
                  BinaryOp joiner) {
   static_assert(std::is_move_constructible<ValueType>::value,
                 "ValueType must be move constructible.");
 
-  return Impl::reduce_custom_functors_impl(
+  return Impl::reduce_custom_functors_exespace_impl(
       "Kokkos::reduce_default_functors_iterator_api", ex, first, last,
       init_reduction_value, joiner);
 }
 
-template <class ExecutionSpace, class IteratorType, class ValueType,
-          class BinaryOp>
+template <typename ExecutionSpace, typename IteratorType, typename ValueType,
+          typename BinaryOp,
+          std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
 ValueType reduce(const std::string& label, const ExecutionSpace& ex,
                  IteratorType first, IteratorType last,
                  ValueType init_reduction_value, BinaryOp joiner) {
   static_assert(std::is_move_constructible<ValueType>::value,
                 "ValueType must be move constructible.");
 
-  return Impl::reduce_custom_functors_impl(label, ex, first, last,
-                                           init_reduction_value, joiner);
+  return Impl::reduce_custom_functors_exespace_impl(
+      label, ex, first, last, init_reduction_value, joiner);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class ValueType, class BinaryOp>
+template <typename ExecutionSpace, typename DataType, typename... Properties,
+          typename ValueType, typename BinaryOp,
+          std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
 ValueType reduce(const ExecutionSpace& ex,
                  const ::Kokkos::View<DataType, Properties...>& view,
                  ValueType init_reduction_value, BinaryOp joiner) {
@@ -166,13 +192,15 @@ ValueType reduce(const ExecutionSpace& ex,
   namespace KE = ::Kokkos::Experimental;
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
 
-  return Impl::reduce_custom_functors_impl(
+  return Impl::reduce_custom_functors_exespace_impl(
       "Kokkos::reduce_custom_functors_view_api", ex, KE::cbegin(view),
       KE::cend(view), init_reduction_value, joiner);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class ValueType, class BinaryOp>
+template <typename ExecutionSpace, typename DataType, typename... Properties,
+          typename ValueType, typename BinaryOp,
+          std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
 ValueType reduce(const std::string& label, const ExecutionSpace& ex,
                  const ::Kokkos::View<DataType, Properties...>& view,
                  ValueType init_reduction_value, BinaryOp joiner) {
@@ -182,9 +210,114 @@ ValueType reduce(const std::string& label, const ExecutionSpace& ex,
   namespace KE = ::Kokkos::Experimental;
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
 
-  return Impl::reduce_custom_functors_impl(label, ex, KE::cbegin(view),
-                                           KE::cend(view), init_reduction_value,
-                                           joiner);
+  return Impl::reduce_custom_functors_exespace_impl(
+      label, ex, KE::cbegin(view), KE::cend(view), init_reduction_value,
+      joiner);
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+
+//
+// overload set 1
+//
+template <
+    typename TeamHandleType, typename IteratorType,
+    std::enable_if_t<::Kokkos::is_team_handle<TeamHandleType>::value, int> = 0>
+KOKKOS_FUNCTION typename IteratorType::value_type reduce(
+    const TeamHandleType& teamHandle, IteratorType first, IteratorType last) {
+  return Impl::reduce_default_functors_team_impl(
+      teamHandle, first, last, typename IteratorType::value_type());
+}
+
+template <
+    typename TeamHandleType, typename DataType, typename... Properties,
+    std::enable_if_t<::Kokkos::is_team_handle<TeamHandleType>::value, int> = 0>
+KOKKOS_FUNCTION auto reduce(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType, Properties...>& view) {
+  namespace KE = ::Kokkos::Experimental;
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+
+  using view_type  = ::Kokkos::View<DataType, Properties...>;
+  using value_type = typename view_type::value_type;
+
+  return Impl::reduce_default_functors_team_impl(teamHandle, KE::cbegin(view),
+                                                 KE::cend(view), value_type());
+}
+
+//
+// overload set2:
+//
+template <
+    typename TeamHandleType, typename IteratorType, typename ValueType,
+    std::enable_if_t<::Kokkos::is_team_handle<TeamHandleType>::value, int> = 0>
+KOKKOS_FUNCTION ValueType reduce(const TeamHandleType& teamHandle,
+                                 IteratorType first, IteratorType last,
+                                 ValueType init_reduction_value) {
+  static_assert(std::is_move_constructible<ValueType>::value,
+                "ValueType must be move constructible.");
+
+  return Impl::reduce_default_functors_team_impl(teamHandle, first, last,
+                                                 init_reduction_value);
+}
+
+template <
+    typename TeamHandleType, typename DataType, typename... Properties,
+    typename ValueType,
+    std::enable_if_t<::Kokkos::is_team_handle<TeamHandleType>::value, int> = 0>
+KOKKOS_FUNCTION ValueType
+reduce(const TeamHandleType& teamHandle,
+       const ::Kokkos::View<DataType, Properties...>& view,
+       ValueType init_reduction_value) {
+  static_assert(std::is_move_constructible<ValueType>::value,
+                "ValueType must be move constructible.");
+
+  namespace KE = ::Kokkos::Experimental;
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+
+  return Impl::reduce_default_functors_team_impl(
+      teamHandle, KE::cbegin(view), KE::cend(view), init_reduction_value);
+}
+
+//
+// overload set 3
+//
+template <
+    typename TeamHandleType, typename IteratorType, typename ValueType,
+    typename BinaryOp,
+    std::enable_if_t<::Kokkos::is_team_handle<TeamHandleType>::value, int> = 0>
+KOKKOS_FUNCTION ValueType reduce(const TeamHandleType& teamHandle,
+                                 IteratorType first, IteratorType last,
+                                 ValueType init_reduction_value,
+                                 BinaryOp joiner) {
+  static_assert(std::is_move_constructible<ValueType>::value,
+                "ValueType must be move constructible.");
+
+  return Impl::reduce_custom_functors_team_impl(teamHandle, first, last,
+                                                init_reduction_value, joiner);
+}
+
+template <
+    typename TeamHandleType, typename DataType, typename... Properties,
+    typename ValueType, typename BinaryOp,
+    std::enable_if_t<::Kokkos::is_team_handle<TeamHandleType>::value, int> = 0>
+KOKKOS_FUNCTION ValueType
+reduce(const TeamHandleType& teamHandle,
+       const ::Kokkos::View<DataType, Properties...>& view,
+       ValueType init_reduction_value, BinaryOp joiner) {
+  static_assert(std::is_move_constructible<ValueType>::value,
+                "ValueType must be move constructible.");
+
+  namespace KE = ::Kokkos::Experimental;
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+
+  return Impl::reduce_custom_functors_team_impl(teamHandle, KE::cbegin(view),
+                                                KE::cend(view),
+                                                init_reduction_value, joiner);
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Remove.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Remove.hpp
index c8602d2f53cf5b8c4e9f56de09f2254d0cd67aad..8a429d8d5182b1d4656de4c940909c99ab95a7cd 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Remove.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Remove.hpp
@@ -23,38 +23,74 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class Iterator, class ValueType>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename Iterator, typename ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 Iterator remove(const ExecutionSpace& ex, Iterator first, Iterator last,
                 const ValueType& value) {
-  return Impl::remove_impl("Kokkos::remove_iterator_api_default", ex, first,
-                           last, value);
+  return Impl::remove_exespace_impl("Kokkos::remove_iterator_api_default", ex,
+                                    first, last, value);
 }
 
-template <class ExecutionSpace, class Iterator, class ValueType>
+template <
+    typename ExecutionSpace, typename Iterator, typename ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 Iterator remove(const std::string& label, const ExecutionSpace& ex,
                 Iterator first, Iterator last, const ValueType& value) {
-  return Impl::remove_impl(label, ex, first, last, value);
+  return Impl::remove_exespace_impl(label, ex, first, last, value);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class ValueType>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    typename ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto remove(const ExecutionSpace& ex,
             const ::Kokkos::View<DataType, Properties...>& view,
             const ValueType& value) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  return Impl::remove_impl("Kokkos::remove_iterator_api_default", ex,
-                           ::Kokkos::Experimental::begin(view),
-                           ::Kokkos::Experimental::end(view), value);
+  return Impl::remove_exespace_impl("Kokkos::remove_iterator_api_default", ex,
+                                    ::Kokkos::Experimental::begin(view),
+                                    ::Kokkos::Experimental::end(view), value);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class ValueType>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    typename ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto remove(const std::string& label, const ExecutionSpace& ex,
             const ::Kokkos::View<DataType, Properties...>& view,
             const ValueType& value) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  return Impl::remove_impl(label, ex, ::Kokkos::Experimental::begin(view),
-                           ::Kokkos::Experimental::end(view), value);
+  return Impl::remove_exespace_impl(label, ex,
+                                    ::Kokkos::Experimental::begin(view),
+                                    ::Kokkos::Experimental::end(view), value);
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename Iterator, typename ValueType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION Iterator remove(const TeamHandleType& teamHandle,
+                                Iterator first, Iterator last,
+                                const ValueType& value) {
+  return Impl::remove_team_impl(teamHandle, first, last, value);
+}
+
+template <typename TeamHandleType, typename DataType, typename... Properties,
+          typename ValueType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto remove(const TeamHandleType& teamHandle,
+                            const ::Kokkos::View<DataType, Properties...>& view,
+                            const ValueType& value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  return Impl::remove_team_impl(teamHandle, ::Kokkos::Experimental::begin(view),
+                                ::Kokkos::Experimental::end(view), value);
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_RemoveCopy.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_RemoveCopy.hpp
index c2c06f6202804f47760ebd8d07b2cd4ba9384cf5..4b8fa9fe077c59c00a8a84297deb95b7fdc3ee12 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_RemoveCopy.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_RemoveCopy.hpp
@@ -23,26 +23,36 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class ValueType>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename InputIterator, typename OutputIterator,
+    typename ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 OutputIterator remove_copy(const ExecutionSpace& ex, InputIterator first_from,
                            InputIterator last_from, OutputIterator first_dest,
                            const ValueType& value) {
-  return Impl::remove_copy_impl("Kokkos::remove_copy_iterator_api_default", ex,
-                                first_from, last_from, first_dest, value);
+  return Impl::remove_copy_exespace_impl(
+      "Kokkos::remove_copy_iterator_api_default", ex, first_from, last_from,
+      first_dest, value);
 }
 
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class ValueType>
+template <
+    typename ExecutionSpace, typename InputIterator, typename OutputIterator,
+    typename ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 OutputIterator remove_copy(const std::string& label, const ExecutionSpace& ex,
                            InputIterator first_from, InputIterator last_from,
                            OutputIterator first_dest, const ValueType& value) {
-  return Impl::remove_copy_impl(label, ex, first_from, last_from, first_dest,
-                                value);
+  return Impl::remove_copy_exespace_impl(label, ex, first_from, last_from,
+                                         first_dest, value);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class ValueType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto remove_copy(const ExecutionSpace& ex,
                  const ::Kokkos::View<DataType1, Properties1...>& view_from,
                  const ::Kokkos::View<DataType2, Properties2...>& view_dest,
@@ -50,15 +60,17 @@ auto remove_copy(const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
 
-  return Impl::remove_copy_impl("Kokkos::remove_copy_iterator_api_default", ex,
-                                ::Kokkos::Experimental::cbegin(view_from),
-                                ::Kokkos::Experimental::cend(view_from),
-                                ::Kokkos::Experimental::begin(view_dest),
-                                value);
+  return Impl::remove_copy_exespace_impl(
+      "Kokkos::remove_copy_iterator_api_default", ex,
+      ::Kokkos::Experimental::cbegin(view_from),
+      ::Kokkos::Experimental::cend(view_from),
+      ::Kokkos::Experimental::begin(view_dest), value);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class ValueType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto remove_copy(const std::string& label, const ExecutionSpace& ex,
                  const ::Kokkos::View<DataType1, Properties1...>& view_from,
                  const ::Kokkos::View<DataType2, Properties2...>& view_dest,
@@ -66,12 +78,46 @@ auto remove_copy(const std::string& label, const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
 
-  return Impl::remove_copy_impl(
+  return Impl::remove_copy_exespace_impl(
       label, ex, ::Kokkos::Experimental::cbegin(view_from),
       ::Kokkos::Experimental::cend(view_from),
       ::Kokkos::Experimental::begin(view_dest), value);
 }
 
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename InputIterator,
+          typename OutputIterator, typename ValueType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION OutputIterator remove_copy(const TeamHandleType& teamHandle,
+                                           InputIterator first_from,
+                                           InputIterator last_from,
+                                           OutputIterator first_dest,
+                                           const ValueType& value) {
+  return Impl::remove_copy_team_impl(teamHandle, first_from, last_from,
+                                     first_dest, value);
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2, typename ValueType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto remove_copy(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+    const ValueType& value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+
+  return Impl::remove_copy_team_impl(
+      teamHandle, ::Kokkos::Experimental::cbegin(view_from),
+      ::Kokkos::Experimental::cend(view_from),
+      ::Kokkos::Experimental::begin(view_dest), value);
+}
+
 }  // namespace Experimental
 }  // namespace Kokkos
 
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_RemoveCopyIf.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_RemoveCopyIf.hpp
index 6d642ed6f09b0ad82275fcc3c9dd7e1c842d54a8..45e2b54bb6f2a65b09f6c57b79d01222f3bb5f14 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_RemoveCopyIf.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_RemoveCopyIf.hpp
@@ -23,30 +23,39 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class UnaryPredicate>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename InputIterator, typename OutputIterator,
+    typename UnaryPredicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 OutputIterator remove_copy_if(const ExecutionSpace& ex,
                               InputIterator first_from, InputIterator last_from,
                               OutputIterator first_dest,
                               const UnaryPredicate& pred) {
-  return Impl::remove_copy_if_impl(
+  return Impl::remove_copy_if_exespace_impl(
       "Kokkos::remove_copy_if_iterator_api_default", ex, first_from, last_from,
       first_dest, pred);
 }
 
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class UnaryPredicate>
+template <
+    typename ExecutionSpace, typename InputIterator, typename OutputIterator,
+    typename UnaryPredicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 OutputIterator remove_copy_if(const std::string& label,
                               const ExecutionSpace& ex,
                               InputIterator first_from, InputIterator last_from,
                               OutputIterator first_dest,
                               const UnaryPredicate& pred) {
-  return Impl::remove_copy_if_impl(label, ex, first_from, last_from, first_dest,
-                                   pred);
+  return Impl::remove_copy_if_exespace_impl(label, ex, first_from, last_from,
+                                            first_dest, pred);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class UnaryPredicate>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename UnaryPredicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto remove_copy_if(const ExecutionSpace& ex,
                     const ::Kokkos::View<DataType1, Properties1...>& view_from,
                     const ::Kokkos::View<DataType2, Properties2...>& view_dest,
@@ -54,15 +63,17 @@ auto remove_copy_if(const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
 
-  return Impl::remove_copy_if_impl(
+  return Impl::remove_copy_if_exespace_impl(
       "Kokkos::remove_copy_if_iterator_api_default", ex,
       ::Kokkos::Experimental::cbegin(view_from),
       ::Kokkos::Experimental::cend(view_from),
       ::Kokkos::Experimental::begin(view_dest), pred);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class UnaryPredicate>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename UnaryPredicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto remove_copy_if(const std::string& label, const ExecutionSpace& ex,
                     const ::Kokkos::View<DataType1, Properties1...>& view_from,
                     const ::Kokkos::View<DataType2, Properties2...>& view_dest,
@@ -70,12 +81,46 @@ auto remove_copy_if(const std::string& label, const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
 
-  return Impl::remove_copy_if_impl(
+  return Impl::remove_copy_if_exespace_impl(
       label, ex, ::Kokkos::Experimental::cbegin(view_from),
       ::Kokkos::Experimental::cend(view_from),
       ::Kokkos::Experimental::begin(view_dest), pred);
 }
 
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename InputIterator,
+          typename OutputIterator, typename UnaryPredicate,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION OutputIterator remove_copy_if(const TeamHandleType& teamHandle,
+                                              InputIterator first_from,
+                                              InputIterator last_from,
+                                              OutputIterator first_dest,
+                                              const UnaryPredicate& pred) {
+  return Impl::remove_copy_if_team_impl(teamHandle, first_from, last_from,
+                                        first_dest, pred);
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2, typename UnaryPredicate,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto remove_copy_if(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+    const UnaryPredicate& pred) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+
+  return Impl::remove_copy_if_team_impl(
+      teamHandle, ::Kokkos::Experimental::cbegin(view_from),
+      ::Kokkos::Experimental::cend(view_from),
+      ::Kokkos::Experimental::begin(view_dest), pred);
+}
+
 }  // namespace Experimental
 }  // namespace Kokkos
 
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_RemoveIf.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_RemoveIf.hpp
index 4062e8d373e2be30681351ce86c30d55391c58c4..38461a37f26abccf41d5632e3f09a78dfbda215d 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_RemoveIf.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_RemoveIf.hpp
@@ -23,39 +23,77 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class Iterator, class UnaryPredicate>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename Iterator, typename UnaryPredicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 Iterator remove_if(const ExecutionSpace& ex, Iterator first, Iterator last,
                    UnaryPredicate pred) {
-  return Impl::remove_if_impl("Kokkos::remove_if_iterator_api_default", ex,
-                              first, last, pred);
+  return Impl::remove_if_exespace_impl("Kokkos::remove_if_iterator_api_default",
+                                       ex, first, last, pred);
 }
 
-template <class ExecutionSpace, class Iterator, class UnaryPredicate>
+template <
+    typename ExecutionSpace, typename Iterator, typename UnaryPredicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 Iterator remove_if(const std::string& label, const ExecutionSpace& ex,
                    Iterator first, Iterator last, UnaryPredicate pred) {
-  return Impl::remove_if_impl(label, ex, first, last, pred);
+  return Impl::remove_if_exespace_impl(label, ex, first, last, pred);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class UnaryPredicate>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    typename UnaryPredicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto remove_if(const ExecutionSpace& ex,
                const ::Kokkos::View<DataType, Properties...>& view,
                UnaryPredicate pred) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
 
-  return Impl::remove_if_impl("Kokkos::remove_if_iterator_api_default", ex,
-                              ::Kokkos::Experimental::begin(view),
-                              ::Kokkos::Experimental::end(view), pred);
+  return Impl::remove_if_exespace_impl("Kokkos::remove_if_iterator_api_default",
+                                       ex, ::Kokkos::Experimental::begin(view),
+                                       ::Kokkos::Experimental::end(view), pred);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class UnaryPredicate>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    typename UnaryPredicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto remove_if(const std::string& label, const ExecutionSpace& ex,
                const ::Kokkos::View<DataType, Properties...>& view,
                UnaryPredicate pred) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  return Impl::remove_if_impl(label, ex, ::Kokkos::Experimental::begin(view),
-                              ::Kokkos::Experimental::end(view), pred);
+  return Impl::remove_if_exespace_impl(label, ex,
+                                       ::Kokkos::Experimental::begin(view),
+                                       ::Kokkos::Experimental::end(view), pred);
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename Iterator, typename UnaryPredicate,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION Iterator remove_if(const TeamHandleType& teamHandle,
+                                   Iterator first, Iterator last,
+                                   UnaryPredicate pred) {
+  return Impl::remove_if_team_impl(teamHandle, first, last, pred);
+}
+
+template <typename TeamHandleType, typename DataType, typename... Properties,
+          typename UnaryPredicate,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto remove_if(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType, Properties...>& view, UnaryPredicate pred) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+
+  return Impl::remove_if_team_impl(teamHandle,
+                                   ::Kokkos::Experimental::begin(view),
+                                   ::Kokkos::Experimental::end(view), pred);
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Replace.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Replace.hpp
index 4d1490ded0c8ebe05fdba27f27cfe80b140fdd47..29afc4f0c21395bd53ce0c18b0be787804f845f9 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Replace.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Replace.hpp
@@ -23,40 +23,77 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class Iterator, class ValueType>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename Iterator, typename ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 void replace(const ExecutionSpace& ex, Iterator first, Iterator last,
              const ValueType& old_value, const ValueType& new_value) {
-  return Impl::replace_impl("Kokkos::replace_iterator_api", ex, first, last,
-                            old_value, new_value);
+  Impl::replace_exespace_impl("Kokkos::replace_iterator_api", ex, first, last,
+                              old_value, new_value);
 }
 
-template <class ExecutionSpace, class Iterator, class ValueType>
+template <
+    typename ExecutionSpace, typename Iterator, typename ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 void replace(const std::string& label, const ExecutionSpace& ex, Iterator first,
              Iterator last, const ValueType& old_value,
              const ValueType& new_value) {
-  return Impl::replace_impl(label, ex, first, last, old_value, new_value);
+  Impl::replace_exespace_impl(label, ex, first, last, old_value, new_value);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class ValueType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 void replace(const ExecutionSpace& ex,
              const ::Kokkos::View<DataType1, Properties1...>& view,
              const ValueType& old_value, const ValueType& new_value) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
   namespace KE = ::Kokkos::Experimental;
-  return Impl::replace_impl("Kokkos::replace_view_api", ex, KE::begin(view),
-                            KE::end(view), old_value, new_value);
+  Impl::replace_exespace_impl("Kokkos::replace_view_api", ex, KE::begin(view),
+                              KE::end(view), old_value, new_value);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class ValueType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 void replace(const std::string& label, const ExecutionSpace& ex,
              const ::Kokkos::View<DataType1, Properties1...>& view,
              const ValueType& old_value, const ValueType& new_value) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
   namespace KE = ::Kokkos::Experimental;
-  return Impl::replace_impl(label, ex, KE::begin(view), KE::end(view),
-                            old_value, new_value);
+  Impl::replace_exespace_impl(label, ex, KE::begin(view), KE::end(view),
+                              old_value, new_value);
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename Iterator, typename ValueType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION void replace(const TeamHandleType& teamHandle, Iterator first,
+                             Iterator last, const ValueType& old_value,
+                             const ValueType& new_value) {
+  Impl::replace_team_impl(teamHandle, first, last, old_value, new_value);
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename ValueType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION void replace(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& view,
+    const ValueType& old_value, const ValueType& new_value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  namespace KE = ::Kokkos::Experimental;
+  Impl::replace_team_impl(teamHandle, KE::begin(view), KE::end(view), old_value,
+                          new_value);
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReplaceCopy.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReplaceCopy.hpp
index e7f464e4bd7b7102839dcf38547a51ebd437f1da..04d5767e895fcd9a06146cbb5ddce6fe71a27ea3 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReplaceCopy.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReplaceCopy.hpp
@@ -23,30 +23,39 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class ValueType>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename InputIterator, typename OutputIterator,
+    typename ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 OutputIterator replace_copy(const ExecutionSpace& ex, InputIterator first_from,
                             InputIterator last_from, OutputIterator first_dest,
                             const ValueType& old_value,
                             const ValueType& new_value) {
-  return Impl::replace_copy_impl("Kokkos::replace_copy_iterator_api", ex,
-                                 first_from, last_from, first_dest, old_value,
-                                 new_value);
+  return Impl::replace_copy_exespace_impl("Kokkos::replace_copy_iterator_api",
+                                          ex, first_from, last_from, first_dest,
+                                          old_value, new_value);
 }
 
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class ValueType>
+template <
+    typename ExecutionSpace, typename InputIterator, typename OutputIterator,
+    typename ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 OutputIterator replace_copy(const std::string& label, const ExecutionSpace& ex,
                             InputIterator first_from, InputIterator last_from,
                             OutputIterator first_dest,
                             const ValueType& old_value,
                             const ValueType& new_value) {
-  return Impl::replace_copy_impl(label, ex, first_from, last_from, first_dest,
-                                 old_value, new_value);
+  return Impl::replace_copy_exespace_impl(label, ex, first_from, last_from,
+                                          first_dest, old_value, new_value);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class ValueType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto replace_copy(const ExecutionSpace& ex,
                   const ::Kokkos::View<DataType1, Properties1...>& view_from,
                   const ::Kokkos::View<DataType2, Properties2...>& view_dest,
@@ -54,13 +63,15 @@ auto replace_copy(const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
   namespace KE = ::Kokkos::Experimental;
-  return Impl::replace_copy_impl("Kokkos::replace_copy_view_api", ex,
-                                 KE::cbegin(view_from), KE::cend(view_from),
-                                 KE::begin(view_dest), old_value, new_value);
+  return Impl::replace_copy_exespace_impl(
+      "Kokkos::replace_copy_view_api", ex, KE::cbegin(view_from),
+      KE::cend(view_from), KE::begin(view_dest), old_value, new_value);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class ValueType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto replace_copy(const std::string& label, const ExecutionSpace& ex,
                   const ::Kokkos::View<DataType1, Properties1...>& view_from,
                   const ::Kokkos::View<DataType2, Properties2...>& view_dest,
@@ -68,9 +79,43 @@ auto replace_copy(const std::string& label, const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
   namespace KE = ::Kokkos::Experimental;
-  return Impl::replace_copy_impl(label, ex, KE::cbegin(view_from),
-                                 KE::cend(view_from), KE::begin(view_dest),
-                                 old_value, new_value);
+  return Impl::replace_copy_exespace_impl(
+      label, ex, KE::cbegin(view_from), KE::cend(view_from),
+      KE::begin(view_dest), old_value, new_value);
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename InputIterator,
+          typename OutputIterator, typename ValueType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION OutputIterator replace_copy(const TeamHandleType& teamHandle,
+                                            InputIterator first_from,
+                                            InputIterator last_from,
+                                            OutputIterator first_dest,
+                                            const ValueType& old_value,
+                                            const ValueType& new_value) {
+  return Impl::replace_copy_team_impl(teamHandle, first_from, last_from,
+                                      first_dest, old_value, new_value);
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2, typename ValueType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto replace_copy(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+    const ValueType& old_value, const ValueType& new_value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::replace_copy_team_impl(teamHandle, KE::cbegin(view_from),
+                                      KE::cend(view_from), KE::begin(view_dest),
+                                      old_value, new_value);
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReplaceCopyIf.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReplaceCopyIf.hpp
index 71ae8f845280d554f625815a80b3b0f0db9b00b3..b87163f194fb2b597b36318e35e3a7b103e62da8 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReplaceCopyIf.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReplaceCopyIf.hpp
@@ -23,33 +23,42 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class PredicateType, class ValueType>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename InputIterator, typename OutputIterator,
+    typename PredicateType, typename ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 OutputIterator replace_copy_if(const ExecutionSpace& ex,
                                InputIterator first_from,
                                InputIterator last_from,
                                OutputIterator first_dest, PredicateType pred,
                                const ValueType& new_value) {
-  return Impl::replace_copy_if_impl("Kokkos::replace_copy_if_iterator_api", ex,
-                                    first_from, last_from, first_dest, pred,
-                                    new_value);
+  return Impl::replace_copy_if_exespace_impl(
+      "Kokkos::replace_copy_if_iterator_api", ex, first_from, last_from,
+      first_dest, pred, new_value);
 }
 
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class PredicateType, class ValueType>
+template <
+    typename ExecutionSpace, typename InputIterator, typename OutputIterator,
+    typename PredicateType, typename ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 OutputIterator replace_copy_if(const std::string& label,
                                const ExecutionSpace& ex,
                                InputIterator first_from,
                                InputIterator last_from,
                                OutputIterator first_dest, PredicateType pred,
                                const ValueType& new_value) {
-  return Impl::replace_copy_if_impl(label, ex, first_from, last_from,
-                                    first_dest, pred, new_value);
+  return Impl::replace_copy_if_exespace_impl(label, ex, first_from, last_from,
+                                             first_dest, pred, new_value);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class PredicateType,
-          class ValueType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename PredicateType,
+    typename ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto replace_copy_if(const ExecutionSpace& ex,
                      const ::Kokkos::View<DataType1, Properties1...>& view_from,
                      const ::Kokkos::View<DataType2, Properties2...>& view_dest,
@@ -57,14 +66,16 @@ auto replace_copy_if(const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
   namespace KE = ::Kokkos::Experimental;
-  return Impl::replace_copy_if_impl("Kokkos::replace_copy_if_view_api", ex,
-                                    KE::cbegin(view_from), KE::cend(view_from),
-                                    KE::begin(view_dest), pred, new_value);
+  return Impl::replace_copy_if_exespace_impl(
+      "Kokkos::replace_copy_if_view_api", ex, KE::cbegin(view_from),
+      KE::cend(view_from), KE::begin(view_dest), pred, new_value);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class PredicateType,
-          class ValueType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename PredicateType,
+    typename ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto replace_copy_if(const std::string& label, const ExecutionSpace& ex,
                      const ::Kokkos::View<DataType1, Properties1...>& view_from,
                      const ::Kokkos::View<DataType2, Properties2...>& view_dest,
@@ -72,9 +83,44 @@ auto replace_copy_if(const std::string& label, const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
   namespace KE = ::Kokkos::Experimental;
-  return Impl::replace_copy_if_impl(label, ex, KE::cbegin(view_from),
-                                    KE::cend(view_from), KE::begin(view_dest),
-                                    pred, new_value);
+  return Impl::replace_copy_if_exespace_impl(
+      label, ex, KE::cbegin(view_from), KE::cend(view_from),
+      KE::begin(view_dest), pred, new_value);
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename InputIterator,
+          typename OutputIterator, typename PredicateType, typename ValueType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION OutputIterator replace_copy_if(const TeamHandleType& teamHandle,
+                                               InputIterator first_from,
+                                               InputIterator last_from,
+                                               OutputIterator first_dest,
+                                               PredicateType pred,
+                                               const ValueType& new_value) {
+  return Impl::replace_copy_if_team_impl(teamHandle, first_from, last_from,
+                                         first_dest, pred, new_value);
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2, typename PredicateType,
+          typename ValueType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto replace_copy_if(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+    PredicateType pred, const ValueType& new_value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::replace_copy_if_team_impl(teamHandle, KE::cbegin(view_from),
+                                         KE::cend(view_from),
+                                         KE::begin(view_dest), pred, new_value);
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReplaceIf.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReplaceIf.hpp
index 7f06540e068b76b38052adf94767b6603c9e4298..73af1f16f02a79334d531a3218d3416a995d9e24 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReplaceIf.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReplaceIf.hpp
@@ -23,43 +23,82 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class InputIterator, class Predicate,
-          class ValueType>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename InputIterator, typename Predicate,
+    typename ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 void replace_if(const ExecutionSpace& ex, InputIterator first,
                 InputIterator last, Predicate pred,
                 const ValueType& new_value) {
-  return Impl::replace_if_impl("Kokkos::replace_if_iterator_api", ex, first,
-                               last, pred, new_value);
+  Impl::replace_if_exespace_impl("Kokkos::replace_if_iterator_api", ex, first,
+                                 last, pred, new_value);
 }
 
-template <class ExecutionSpace, class InputIterator, class Predicate,
-          class ValueType>
+template <
+    typename ExecutionSpace, typename InputIterator, typename Predicate,
+    typename ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 void replace_if(const std::string& label, const ExecutionSpace& ex,
                 InputIterator first, InputIterator last, Predicate pred,
                 const ValueType& new_value) {
-  return Impl::replace_if_impl(label, ex, first, last, pred, new_value);
+  Impl::replace_if_exespace_impl(label, ex, first, last, pred, new_value);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class Predicate, class ValueType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename Predicate, typename ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 void replace_if(const ExecutionSpace& ex,
                 const ::Kokkos::View<DataType1, Properties1...>& view,
                 Predicate pred, const ValueType& new_value) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
   namespace KE = ::Kokkos::Experimental;
-  return Impl::replace_if_impl("Kokkos::replace_if_view_api", ex,
-                               KE::begin(view), KE::end(view), pred, new_value);
+  Impl::replace_if_exespace_impl("Kokkos::replace_if_view_api", ex,
+                                 KE::begin(view), KE::end(view), pred,
+                                 new_value);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class Predicate, class ValueType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename Predicate, typename ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 void replace_if(const std::string& label, const ExecutionSpace& ex,
                 const ::Kokkos::View<DataType1, Properties1...>& view,
                 Predicate pred, const ValueType& new_value) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
   namespace KE = ::Kokkos::Experimental;
-  return Impl::replace_if_impl(label, ex, KE::begin(view), KE::end(view), pred,
-                               new_value);
+  Impl::replace_if_exespace_impl(label, ex, KE::begin(view), KE::end(view),
+                                 pred, new_value);
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename InputIterator, typename Predicate,
+          typename ValueType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION void replace_if(const TeamHandleType& teamHandle,
+                                InputIterator first, InputIterator last,
+                                Predicate pred, const ValueType& new_value) {
+  Impl::replace_if_team_impl(teamHandle, first, last, pred, new_value);
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename Predicate, typename ValueType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION void replace_if(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& view, Predicate pred,
+    const ValueType& new_value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  namespace KE = ::Kokkos::Experimental;
+  Impl::replace_if_team_impl(teamHandle, KE::begin(view), KE::end(view), pred,
+                             new_value);
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Reverse.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Reverse.hpp
index 9f2fc5f3ccffa493575d2d7bd745fb83094a4dad..a0786d3a2ebe8a0694fc27cb5c90d5718c39b21f 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Reverse.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Reverse.hpp
@@ -23,34 +23,67 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class InputIterator>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename InputIterator,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 void reverse(const ExecutionSpace& ex, InputIterator first,
              InputIterator last) {
-  return Impl::reverse_impl("Kokkos::reverse_iterator_api_default", ex, first,
-                            last);
+  return Impl::reverse_exespace_impl("Kokkos::reverse_iterator_api_default", ex,
+                                     first, last);
 }
 
-template <class ExecutionSpace, class InputIterator>
+template <
+    typename ExecutionSpace, typename InputIterator,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 void reverse(const std::string& label, const ExecutionSpace& ex,
              InputIterator first, InputIterator last) {
-  return Impl::reverse_impl(label, ex, first, last);
+  return Impl::reverse_exespace_impl(label, ex, first, last);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 void reverse(const ExecutionSpace& ex,
              const ::Kokkos::View<DataType, Properties...>& view) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
   namespace KE = ::Kokkos::Experimental;
-  return Impl::reverse_impl("Kokkos::reverse_view_api_default", ex,
-                            KE::begin(view), KE::end(view));
+  return Impl::reverse_exespace_impl("Kokkos::reverse_view_api_default", ex,
+                                     KE::begin(view), KE::end(view));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 void reverse(const std::string& label, const ExecutionSpace& ex,
              const ::Kokkos::View<DataType, Properties...>& view) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
   namespace KE = ::Kokkos::Experimental;
-  return Impl::reverse_impl(label, ex, KE::begin(view), KE::end(view));
+  return Impl::reverse_exespace_impl(label, ex, KE::begin(view), KE::end(view));
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename InputIterator,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION void reverse(const TeamHandleType& teamHandle,
+                             InputIterator first, InputIterator last) {
+  return Impl::reverse_team_impl(teamHandle, first, last);
+}
+
+template <typename TeamHandleType, typename DataType, typename... Properties,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION void reverse(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType, Properties...>& view) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::reverse_team_impl(teamHandle, KE::begin(view), KE::end(view));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp
index 279bb220869940f55a2689d519b4708838572d88..37336c983ab0e53d0a2172b5a7605e0edd75ea39 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp
@@ -23,42 +23,83 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class InputIterator, class OutputIterator>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename InputIterator, typename OutputIterator,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 OutputIterator reverse_copy(const ExecutionSpace& ex, InputIterator first,
                             InputIterator last, OutputIterator d_first) {
-  return Impl::reverse_copy_impl("Kokkos::reverse_copy_iterator_api_default",
-                                 ex, first, last, d_first);
+  return Impl::reverse_copy_exespace_impl(
+      "Kokkos::reverse_copy_iterator_api_default", ex, first, last, d_first);
 }
 
-template <class ExecutionSpace, class InputIterator, class OutputIterator>
+template <
+    typename ExecutionSpace, typename InputIterator, typename OutputIterator,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 OutputIterator reverse_copy(const std::string& label, const ExecutionSpace& ex,
                             InputIterator first, InputIterator last,
                             OutputIterator d_first) {
-  return Impl::reverse_copy_impl(label, ex, first, last, d_first);
+  return Impl::reverse_copy_exespace_impl(label, ex, first, last, d_first);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto reverse_copy(const ExecutionSpace& ex,
                   const ::Kokkos::View<DataType1, Properties1...>& source,
                   ::Kokkos::View<DataType2, Properties2...>& dest) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
 
-  return Impl::reverse_copy_impl("Kokkos::reverse_copy_view_api_default", ex,
-                                 cbegin(source), cend(source), begin(dest));
+  return Impl::reverse_copy_exespace_impl(
+      "Kokkos::reverse_copy_view_api_default", ex, cbegin(source), cend(source),
+      begin(dest));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto reverse_copy(const std::string& label, const ExecutionSpace& ex,
                   const ::Kokkos::View<DataType1, Properties1...>& source,
                   ::Kokkos::View<DataType2, Properties2...>& dest) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
 
-  return Impl::reverse_copy_impl(label, ex, cbegin(source), cend(source),
-                                 begin(dest));
+  return Impl::reverse_copy_exespace_impl(label, ex, cbegin(source),
+                                          cend(source), begin(dest));
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename InputIterator,
+          typename OutputIterator,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION OutputIterator reverse_copy(const TeamHandleType& teamHandle,
+                                            InputIterator first,
+                                            InputIterator last,
+                                            OutputIterator d_first) {
+  return Impl::reverse_copy_team_impl(teamHandle, first, last, d_first);
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto reverse_copy(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& source,
+    ::Kokkos::View<DataType2, Properties2...>& dest) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  return Impl::reverse_copy_team_impl(teamHandle, cbegin(source), cend(source),
+                                      begin(dest));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Rotate.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Rotate.hpp
index 738e9bf1374388dfc5be8600c2fb27cbb95edde0..aff04b47d63914897613f462b081d127dbfd9da0 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Rotate.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Rotate.hpp
@@ -23,36 +23,71 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class IteratorType>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename IteratorType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType rotate(const ExecutionSpace& ex, IteratorType first,
                     IteratorType n_first, IteratorType last) {
-  return Impl::rotate_impl("Kokkos::rotate_iterator_api_default", ex, first,
-                           n_first, last);
+  return Impl::rotate_exespace_impl("Kokkos::rotate_iterator_api_default", ex,
+                                    first, n_first, last);
 }
 
-template <class ExecutionSpace, class IteratorType>
+template <
+    typename ExecutionSpace, typename IteratorType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType rotate(const std::string& label, const ExecutionSpace& ex,
                     IteratorType first, IteratorType n_first,
                     IteratorType last) {
-  return Impl::rotate_impl(label, ex, first, n_first, last);
+  return Impl::rotate_exespace_impl(label, ex, first, n_first, last);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto rotate(const ExecutionSpace& ex,
             const ::Kokkos::View<DataType, Properties...>& view,
             std::size_t n_location) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  return Impl::rotate_impl("Kokkos::rotate_view_api_default", ex, begin(view),
-                           begin(view) + n_location, end(view));
+  return Impl::rotate_exespace_impl("Kokkos::rotate_view_api_default", ex,
+                                    begin(view), begin(view) + n_location,
+                                    end(view));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto rotate(const std::string& label, const ExecutionSpace& ex,
             const ::Kokkos::View<DataType, Properties...>& view,
             std::size_t n_location) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  return Impl::rotate_impl(label, ex, begin(view), begin(view) + n_location,
-                           end(view));
+  return Impl::rotate_exespace_impl(label, ex, begin(view),
+                                    begin(view) + n_location, end(view));
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename IteratorType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION IteratorType rotate(const TeamHandleType& teamHandle,
+                                    IteratorType first, IteratorType n_first,
+                                    IteratorType last) {
+  return Impl::rotate_team_impl(teamHandle, first, n_first, last);
+}
+
+template <typename TeamHandleType, typename DataType, typename... Properties,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto rotate(const TeamHandleType& teamHandle,
+                            const ::Kokkos::View<DataType, Properties...>& view,
+                            std::size_t n_location) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  return Impl::rotate_team_impl(teamHandle, begin(view),
+                                begin(view) + n_location, end(view));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_RotateCopy.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_RotateCopy.hpp
index f5d826c4bb4f32b1a3d0f66700cb22af5ae33134..cce37fccfae7a1bd2319b2639dfaaa11f5354f89 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_RotateCopy.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_RotateCopy.hpp
@@ -23,23 +23,34 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class InputIterator, class OutputIterator>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename InputIterator, typename OutputIterator,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 OutputIterator rotate_copy(const ExecutionSpace& ex, InputIterator first,
                            InputIterator n_first, InputIterator last,
                            OutputIterator d_first) {
-  return Impl::rotate_copy_impl("Kokkos::rotate_copy_iterator_api_default", ex,
-                                first, n_first, last, d_first);
+  return Impl::rotate_copy_exespace_impl(
+      "Kokkos::rotate_copy_iterator_api_default", ex, first, n_first, last,
+      d_first);
 }
 
-template <class ExecutionSpace, class InputIterator, class OutputIterator>
+template <
+    typename ExecutionSpace, typename InputIterator, typename OutputIterator,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 OutputIterator rotate_copy(const std::string& label, const ExecutionSpace& ex,
                            InputIterator first, InputIterator n_first,
                            InputIterator last, OutputIterator d_first) {
-  return Impl::rotate_copy_impl(label, ex, first, n_first, last, d_first);
+  return Impl::rotate_copy_exespace_impl(label, ex, first, n_first, last,
+                                         d_first);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto rotate_copy(const ExecutionSpace& ex,
                  const ::Kokkos::View<DataType1, Properties1...>& source,
                  std::size_t n_location,
@@ -47,13 +58,15 @@ auto rotate_copy(const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
 
-  return Impl::rotate_copy_impl("Kokkos::rotate_copy_view_api_default", ex,
-                                cbegin(source), cbegin(source) + n_location,
-                                cend(source), begin(dest));
+  return Impl::rotate_copy_exespace_impl(
+      "Kokkos::rotate_copy_view_api_default", ex, cbegin(source),
+      cbegin(source) + n_location, cend(source), begin(dest));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto rotate_copy(const std::string& label, const ExecutionSpace& ex,
                  const ::Kokkos::View<DataType1, Properties1...>& source,
                  std::size_t n_location,
@@ -61,9 +74,41 @@ auto rotate_copy(const std::string& label, const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
 
-  return Impl::rotate_copy_impl(label, ex, cbegin(source),
-                                cbegin(source) + n_location, cend(source),
-                                begin(dest));
+  return Impl::rotate_copy_exespace_impl(label, ex, cbegin(source),
+                                         cbegin(source) + n_location,
+                                         cend(source), begin(dest));
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename InputIterator,
+          typename OutputIterator,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION OutputIterator rotate_copy(const TeamHandleType& teamHandle,
+                                           InputIterator first,
+                                           InputIterator n_first,
+                                           InputIterator last,
+                                           OutputIterator d_first) {
+  return Impl::rotate_copy_team_impl(teamHandle, first, n_first, last, d_first);
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto rotate_copy(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& source,
+    std::size_t n_location,
+    const ::Kokkos::View<DataType2, Properties2...>& dest) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  return Impl::rotate_copy_team_impl(teamHandle, cbegin(source),
+                                     cbegin(source) + n_location, cend(source),
+                                     begin(dest));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Search.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Search.hpp
index b1154b297edf01c76a3b791c8f1c50314d3f266b..43258a484eca86b970360cfb0cc5b57c2b613767 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Search.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Search.hpp
@@ -23,24 +23,34 @@
 namespace Kokkos {
 namespace Experimental {
 
+//
+// overload set accepting execution space
+//
+
 // overload set 1: no binary predicate passed
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+template <
+    typename ExecutionSpace, typename IteratorType1, typename IteratorType2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType1 search(const ExecutionSpace& ex, IteratorType1 first,
                      IteratorType1 last, IteratorType2 s_first,
                      IteratorType2 s_last) {
-  return Impl::search_impl("Kokkos::search_iterator_api_default", ex, first,
-                           last, s_first, s_last);
+  return Impl::search_exespace_impl("Kokkos::search_iterator_api_default", ex,
+                                    first, last, s_first, s_last);
 }
 
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+template <
+    typename ExecutionSpace, typename IteratorType1, typename IteratorType2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType1 search(const std::string& label, const ExecutionSpace& ex,
                      IteratorType1 first, IteratorType1 last,
                      IteratorType2 s_first, IteratorType2 s_last) {
-  return Impl::search_impl(label, ex, first, last, s_first, s_last);
+  return Impl::search_exespace_impl(label, ex, first, last, s_first, s_last);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto search(const ExecutionSpace& ex,
             const ::Kokkos::View<DataType1, Properties1...>& view,
             const ::Kokkos::View<DataType2, Properties2...>& s_view) {
@@ -48,13 +58,15 @@ auto search(const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::search_impl("Kokkos::search_view_api_default", ex,
-                           KE::begin(view), KE::end(view), KE::begin(s_view),
-                           KE::end(s_view));
+  return Impl::search_exespace_impl("Kokkos::search_view_api_default", ex,
+                                    KE::begin(view), KE::end(view),
+                                    KE::begin(s_view), KE::end(s_view));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto search(const std::string& label, const ExecutionSpace& ex,
             const ::Kokkos::View<DataType1, Properties1...>& view,
             const ::Kokkos::View<DataType2, Properties2...>& s_view) {
@@ -62,31 +74,38 @@ auto search(const std::string& label, const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::search_impl(label, ex, KE::begin(view), KE::end(view),
-                           KE::begin(s_view), KE::end(s_view));
+  return Impl::search_exespace_impl(label, ex, KE::begin(view), KE::end(view),
+                                    KE::begin(s_view), KE::end(s_view));
 }
 
 // overload set 2: binary predicate passed
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
+template <
+    typename ExecutionSpace, typename IteratorType1, typename IteratorType2,
+    typename BinaryPredicateType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType1 search(const ExecutionSpace& ex, IteratorType1 first,
                      IteratorType1 last, IteratorType2 s_first,
                      IteratorType2 s_last, const BinaryPredicateType& pred) {
-  return Impl::search_impl("Kokkos::search_iterator_api_default", ex, first,
-                           last, s_first, s_last, pred);
+  return Impl::search_exespace_impl("Kokkos::search_iterator_api_default", ex,
+                                    first, last, s_first, s_last, pred);
 }
 
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
+template <
+    typename ExecutionSpace, typename IteratorType1, typename IteratorType2,
+    typename BinaryPredicateType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType1 search(const std::string& label, const ExecutionSpace& ex,
                      IteratorType1 first, IteratorType1 last,
                      IteratorType2 s_first, IteratorType2 s_last,
                      const BinaryPredicateType& pred) {
-  return Impl::search_impl(label, ex, first, last, s_first, s_last, pred);
+  return Impl::search_exespace_impl(label, ex, first, last, s_first, s_last,
+                                    pred);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryPredicateType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename BinaryPredicateType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto search(const ExecutionSpace& ex,
             const ::Kokkos::View<DataType1, Properties1...>& view,
             const ::Kokkos::View<DataType2, Properties2...>& s_view,
@@ -95,13 +114,15 @@ auto search(const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::search_impl("Kokkos::search_view_api_default", ex,
-                           KE::begin(view), KE::end(view), KE::begin(s_view),
-                           KE::end(s_view), pred);
+  return Impl::search_exespace_impl("Kokkos::search_view_api_default", ex,
+                                    KE::begin(view), KE::end(view),
+                                    KE::begin(s_view), KE::end(s_view), pred);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryPredicateType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename BinaryPredicateType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto search(const std::string& label, const ExecutionSpace& ex,
             const ::Kokkos::View<DataType1, Properties1...>& view,
             const ::Kokkos::View<DataType2, Properties2...>& s_view,
@@ -110,8 +131,70 @@ auto search(const std::string& label, const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::search_impl(label, ex, KE::begin(view), KE::end(view),
-                           KE::begin(s_view), KE::end(s_view), pred);
+  return Impl::search_exespace_impl(label, ex, KE::begin(view), KE::end(view),
+                                    KE::begin(s_view), KE::end(s_view), pred);
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+
+// overload set 1: no binary predicate passed
+template <typename TeamHandleType, typename IteratorType1,
+          typename IteratorType2,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION IteratorType1 search(const TeamHandleType& teamHandle,
+                                     IteratorType1 first, IteratorType1 last,
+                                     IteratorType2 s_first,
+                                     IteratorType2 s_last) {
+  return Impl::search_team_impl(teamHandle, first, last, s_first, s_last);
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto search(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& view,
+    const ::Kokkos::View<DataType2, Properties2...>& s_view) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::search_team_impl(teamHandle, KE::begin(view), KE::end(view),
+                                KE::begin(s_view), KE::end(s_view));
+}
+
+// overload set 2: binary predicate passed
+template <typename TeamHandleType, typename IteratorType1,
+          typename IteratorType2, typename BinaryPredicateType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+
+KOKKOS_FUNCTION IteratorType1 search(const TeamHandleType& teamHandle,
+                                     IteratorType1 first, IteratorType1 last,
+                                     IteratorType2 s_first,
+                                     IteratorType2 s_last,
+                                     const BinaryPredicateType& pred) {
+  return Impl::search_team_impl(teamHandle, first, last, s_first, s_last, pred);
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2,
+          typename BinaryPredicateType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto search(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& view,
+    const ::Kokkos::View<DataType2, Properties2...>& s_view,
+    const BinaryPredicateType& pred) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::search_team_impl(teamHandle, KE::begin(view), KE::end(view),
+                                KE::begin(s_view), KE::end(s_view), pred);
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_SearchN.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_SearchN.hpp
index a649c8f2053a0817bc6683900ff5825f3e4c1af4..0f8aa5f1c132b97395e2e91b119caa12d942de21 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_SearchN.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_SearchN.hpp
@@ -23,68 +23,86 @@
 namespace Kokkos {
 namespace Experimental {
 
+//
+// overload set accepting execution space
+//
+
 // overload set 1: no binary predicate passed
-template <class ExecutionSpace, class IteratorType, class SizeType,
-          class ValueType>
+template <
+    class ExecutionSpace, class IteratorType, class SizeType, class ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType search_n(const ExecutionSpace& ex, IteratorType first,
                       IteratorType last, SizeType count,
                       const ValueType& value) {
-  return Impl::search_n_impl("Kokkos::search_n_iterator_api_default", ex, first,
-                             last, count, value);
+  return Impl::search_n_exespace_impl("Kokkos::search_n_iterator_api_default",
+                                      ex, first, last, count, value);
 }
 
-template <class ExecutionSpace, class IteratorType, class SizeType,
-          class ValueType>
+template <
+    class ExecutionSpace, class IteratorType, class SizeType, class ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType search_n(const std::string& label, const ExecutionSpace& ex,
                       IteratorType first, IteratorType last, SizeType count,
                       const ValueType& value) {
-  return Impl::search_n_impl(label, ex, first, last, count, value);
+  return Impl::search_n_exespace_impl(label, ex, first, last, count, value);
 }
 
 template <class ExecutionSpace, class DataType, class... Properties,
-          class SizeType, class ValueType>
+          class SizeType, class ValueType,
+          std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
 auto search_n(const ExecutionSpace& ex,
               const ::Kokkos::View<DataType, Properties...>& view,
               SizeType count, const ValueType& value) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::search_n_impl("Kokkos::search_n_view_api_default", ex,
-                             KE::begin(view), KE::end(view), count, value);
+  return Impl::search_n_exespace_impl("Kokkos::search_n_view_api_default", ex,
+                                      KE::begin(view), KE::end(view), count,
+                                      value);
 }
 
 template <class ExecutionSpace, class DataType, class... Properties,
-          class SizeType, class ValueType>
+          class SizeType, class ValueType,
+          std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
 auto search_n(const std::string& label, const ExecutionSpace& ex,
               const ::Kokkos::View<DataType, Properties...>& view,
               SizeType count, const ValueType& value) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::search_n_impl(label, ex, KE::begin(view), KE::end(view), count,
-                             value);
+  return Impl::search_n_exespace_impl(label, ex, KE::begin(view), KE::end(view),
+                                      count, value);
 }
 
 // overload set 2: binary predicate passed
-template <class ExecutionSpace, class IteratorType, class SizeType,
-          class ValueType, class BinaryPredicateType>
+template <
+    class ExecutionSpace, class IteratorType, class SizeType, class ValueType,
+    class BinaryPredicateType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType search_n(const ExecutionSpace& ex, IteratorType first,
                       IteratorType last, SizeType count, const ValueType& value,
                       const BinaryPredicateType& pred) {
-  return Impl::search_n_impl("Kokkos::search_n_iterator_api_default", ex, first,
-                             last, count, value, pred);
+  return Impl::search_n_exespace_impl("Kokkos::search_n_iterator_api_default",
+                                      ex, first, last, count, value, pred);
 }
 
-template <class ExecutionSpace, class IteratorType, class SizeType,
-          class ValueType, class BinaryPredicateType>
+template <
+    class ExecutionSpace, class IteratorType, class SizeType, class ValueType,
+    class BinaryPredicateType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType search_n(const std::string& label, const ExecutionSpace& ex,
                       IteratorType first, IteratorType last, SizeType count,
                       const ValueType& value, const BinaryPredicateType& pred) {
-  return Impl::search_n_impl(label, ex, first, last, count, value, pred);
+  return Impl::search_n_exespace_impl(label, ex, first, last, count, value,
+                                      pred);
 }
 
 template <class ExecutionSpace, class DataType, class... Properties,
-          class SizeType, class ValueType, class BinaryPredicateType>
+          class SizeType, class ValueType, class BinaryPredicateType,
+          std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
 auto search_n(const ExecutionSpace& ex,
               const ::Kokkos::View<DataType, Properties...>& view,
               SizeType count, const ValueType& value,
@@ -92,13 +110,15 @@ auto search_n(const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::search_n_impl("Kokkos::search_n_view_api_default", ex,
-                             KE::begin(view), KE::end(view), count, value,
-                             pred);
+  return Impl::search_n_exespace_impl("Kokkos::search_n_view_api_default", ex,
+                                      KE::begin(view), KE::end(view), count,
+                                      value, pred);
 }
 
 template <class ExecutionSpace, class DataType, class... Properties,
-          class SizeType, class ValueType, class BinaryPredicateType>
+          class SizeType, class ValueType, class BinaryPredicateType,
+          std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
 auto search_n(const std::string& label, const ExecutionSpace& ex,
               const ::Kokkos::View<DataType, Properties...>& view,
               SizeType count, const ValueType& value,
@@ -106,8 +126,65 @@ auto search_n(const std::string& label, const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
 
   namespace KE = ::Kokkos::Experimental;
-  return Impl::search_n_impl(label, ex, KE::begin(view), KE::end(view), count,
-                             value, pred);
+  return Impl::search_n_exespace_impl(label, ex, KE::begin(view), KE::end(view),
+                                      count, value, pred);
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+
+// overload set 1: no binary predicate passed
+template <class TeamHandleType, class IteratorType, class SizeType,
+          class ValueType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION IteratorType search_n(const TeamHandleType& teamHandle,
+                                      IteratorType first, IteratorType last,
+                                      SizeType count, const ValueType& value) {
+  return Impl::search_n_team_impl(teamHandle, first, last, count, value);
+}
+
+template <
+    class TeamHandleType, class DataType, class... Properties, class SizeType,
+    class ValueType,
+    std::enable_if_t<::Kokkos::is_team_handle<TeamHandleType>::value, int> = 0>
+KOKKOS_FUNCTION auto search_n(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType, Properties...>& view, SizeType count,
+    const ValueType& value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::search_n_team_impl(teamHandle, KE::begin(view), KE::end(view),
+                                  count, value);
+}
+
+// overload set 2: binary predicate passed
+template <class TeamHandleType, class IteratorType, class SizeType,
+          class ValueType, class BinaryPredicateType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION IteratorType search_n(const TeamHandleType& teamHandle,
+                                      IteratorType first, IteratorType last,
+                                      SizeType count, const ValueType& value,
+                                      const BinaryPredicateType& pred) {
+  return Impl::search_n_team_impl(teamHandle, first, last, count, value, pred);
+}
+
+template <
+    class TeamHandleType, class DataType, class... Properties, class SizeType,
+    class ValueType, class BinaryPredicateType,
+    std::enable_if_t<::Kokkos::is_team_handle<TeamHandleType>::value, int> = 0>
+KOKKOS_FUNCTION auto search_n(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType, Properties...>& view, SizeType count,
+    const ValueType& value, const BinaryPredicateType& pred) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::search_n_team_impl(teamHandle, KE::begin(view), KE::end(view),
+                                  count, value, pred);
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ShiftLeft.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ShiftLeft.hpp
index 4b91a17ab838e5d5fedc16570100614e1e8e0eef..b3e04a3b974a439268452898964b9113da30a8c2 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ShiftLeft.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ShiftLeft.hpp
@@ -23,36 +23,70 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class IteratorType>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename IteratorType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType shift_left(const ExecutionSpace& ex, IteratorType first,
                         IteratorType last,
                         typename IteratorType::difference_type n) {
-  return Impl::shift_left_impl("Kokkos::shift_left_iterator_api_default", ex,
-                               first, last, n);
+  return Impl::shift_left_exespace_impl(
+      "Kokkos::shift_left_iterator_api_default", ex, first, last, n);
 }
 
-template <class ExecutionSpace, class IteratorType>
+template <
+    typename ExecutionSpace, typename IteratorType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType shift_left(const std::string& label, const ExecutionSpace& ex,
                         IteratorType first, IteratorType last,
                         typename IteratorType::difference_type n) {
-  return Impl::shift_left_impl(label, ex, first, last, n);
+  return Impl::shift_left_exespace_impl(label, ex, first, last, n);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto shift_left(const ExecutionSpace& ex,
                 const ::Kokkos::View<DataType, Properties...>& view,
                 typename decltype(begin(view))::difference_type n) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  return Impl::shift_left_impl("Kokkos::shift_left_view_api_default", ex,
-                               begin(view), end(view), n);
+  return Impl::shift_left_exespace_impl("Kokkos::shift_left_view_api_default",
+                                        ex, begin(view), end(view), n);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto shift_left(const std::string& label, const ExecutionSpace& ex,
                 const ::Kokkos::View<DataType, Properties...>& view,
                 typename decltype(begin(view))::difference_type n) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  return Impl::shift_left_impl(label, ex, begin(view), end(view), n);
+  return Impl::shift_left_exespace_impl(label, ex, begin(view), end(view), n);
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename IteratorType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION IteratorType
+shift_left(const TeamHandleType& teamHandle, IteratorType first,
+           IteratorType last, typename IteratorType::difference_type n) {
+  return Impl::shift_left_team_impl(teamHandle, first, last, n);
+}
+
+template <typename TeamHandleType, typename DataType, typename... Properties,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto shift_left(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType, Properties...>& view,
+    typename decltype(begin(view))::difference_type n) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  return Impl::shift_left_team_impl(teamHandle, begin(view), end(view), n);
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ShiftRight.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ShiftRight.hpp
index 2ea50fd74e5c1910c77040af07dd5e56661ea716..0f7ed539487812da6edb900ed7cca6d3750f189b 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ShiftRight.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ShiftRight.hpp
@@ -23,36 +23,70 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class IteratorType>
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename IteratorType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType shift_right(const ExecutionSpace& ex, IteratorType first,
                          IteratorType last,
                          typename IteratorType::difference_type n) {
-  return Impl::shift_right_impl("Kokkos::shift_right_iterator_api_default", ex,
-                                first, last, n);
+  return Impl::shift_right_exespace_impl(
+      "Kokkos::shift_right_iterator_api_default", ex, first, last, n);
 }
 
-template <class ExecutionSpace, class IteratorType>
+template <
+    typename ExecutionSpace, typename IteratorType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType shift_right(const std::string& label, const ExecutionSpace& ex,
                          IteratorType first, IteratorType last,
                          typename IteratorType::difference_type n) {
-  return Impl::shift_right_impl(label, ex, first, last, n);
+  return Impl::shift_right_exespace_impl(label, ex, first, last, n);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto shift_right(const ExecutionSpace& ex,
                  const ::Kokkos::View<DataType, Properties...>& view,
                  typename decltype(begin(view))::difference_type n) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  return Impl::shift_right_impl("Kokkos::shift_right_view_api_default", ex,
-                                begin(view), end(view), n);
+  return Impl::shift_right_exespace_impl("Kokkos::shift_right_view_api_default",
+                                         ex, begin(view), end(view), n);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto shift_right(const std::string& label, const ExecutionSpace& ex,
                  const ::Kokkos::View<DataType, Properties...>& view,
                  typename decltype(begin(view))::difference_type n) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  return Impl::shift_right_impl(label, ex, begin(view), end(view), n);
+  return Impl::shift_right_exespace_impl(label, ex, begin(view), end(view), n);
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename IteratorType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION IteratorType
+shift_right(const TeamHandleType& teamHandle, IteratorType first,
+            IteratorType last, typename IteratorType::difference_type n) {
+  return Impl::shift_right_team_impl(teamHandle, first, last, n);
+}
+
+template <typename TeamHandleType, typename DataType, typename... Properties,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto shift_right(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType, Properties...>& view,
+    typename decltype(begin(view))::difference_type n) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  return Impl::shift_right_team_impl(teamHandle, begin(view), end(view), n);
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp
index 5fbf04531885c22887d3d4cf0998f6e07da1a773..39f33b64879a2509edf9d7ee111755df3b6f31a6 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp
@@ -23,15 +23,21 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+//
+// overload set accepting execution space
+//
+template <typename ExecutionSpace, typename IteratorType1,
+          typename IteratorType2,
+          std::enable_if_t<is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType2 swap_ranges(const ExecutionSpace& ex, IteratorType1 first1,
                           IteratorType1 last1, IteratorType2 first2) {
-  return Impl::swap_ranges_impl("Kokkos::swap_ranges_iterator_api_default", ex,
-                                first1, last1, first2);
+  return Impl::swap_ranges_exespace_impl(
+      "Kokkos::swap_ranges_iterator_api_default", ex, first1, last1, first2);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <typename ExecutionSpace, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2,
+          std::enable_if_t<is_execution_space_v<ExecutionSpace>, int> = 0>
 auto swap_ranges(const ExecutionSpace& ex,
                  const ::Kokkos::View<DataType1, Properties1...>& source,
                  ::Kokkos::View<DataType2, Properties2...>& dest) {
@@ -39,19 +45,23 @@ auto swap_ranges(const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
 
   assert(source.extent(0) == dest.extent(0));
-  return Impl::swap_ranges_impl("Kokkos::swap_ranges_view_api_default", ex,
-                                begin(source), end(source), begin(dest));
+  return Impl::swap_ranges_exespace_impl("Kokkos::swap_ranges_view_api_default",
+                                         ex, begin(source), end(source),
+                                         begin(dest));
 }
 
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+template <typename ExecutionSpace, typename IteratorType1,
+          typename IteratorType2,
+          std::enable_if_t<is_execution_space_v<ExecutionSpace>, int> = 0>
 IteratorType2 swap_ranges(const std::string& label, const ExecutionSpace& ex,
                           IteratorType1 first1, IteratorType1 last1,
                           IteratorType2 first2) {
-  return Impl::swap_ranges_impl(label, ex, first1, last1, first2);
+  return Impl::swap_ranges_exespace_impl(label, ex, first1, last1, first2);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <typename ExecutionSpace, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2,
+          std::enable_if_t<is_execution_space_v<ExecutionSpace>, int> = 0>
 auto swap_ranges(const std::string& label, const ExecutionSpace& ex,
                  const ::Kokkos::View<DataType1, Properties1...>& source,
                  ::Kokkos::View<DataType2, Properties2...>& dest) {
@@ -59,8 +69,38 @@ auto swap_ranges(const std::string& label, const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
 
   assert(source.extent(0) == dest.extent(0));
-  return Impl::swap_ranges_impl(label, ex, begin(source), end(source),
-                                begin(dest));
+  return Impl::swap_ranges_exespace_impl(label, ex, begin(source), end(source),
+                                         begin(dest));
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename IteratorType1,
+          typename IteratorType2,
+          std::enable_if_t<is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION IteratorType2 swap_ranges(const TeamHandleType& teamHandle,
+                                          IteratorType1 first1,
+                                          IteratorType1 last1,
+                                          IteratorType2 first2) {
+  return Impl::swap_ranges_team_impl(teamHandle, first1, last1, first2);
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2,
+          std::enable_if_t<is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto swap_ranges(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& source,
+    ::Kokkos::View<DataType2, Properties2...>& dest) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  assert(source.extent(0) == dest.extent(0));
+  return Impl::swap_ranges_team_impl(teamHandle, begin(source), end(source),
+                                     begin(dest));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Transform.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Transform.hpp
index 27dee304261d14ebad1dd066d3cfe7655715bd2f..838c9169e25c4411c22259b8ef5a21b5f964a4fe 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Transform.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Transform.hpp
@@ -23,31 +23,39 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class UnaryOperation>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      InputIterator, OutputIterator>::value,
-                  OutputIterator>
-transform(const ExecutionSpace& ex, InputIterator first1, InputIterator last1,
-          OutputIterator d_first, UnaryOperation unary_op) {
-  return Impl::transform_impl("Kokkos::transform_iterator_api_default", ex,
-                              first1, last1, d_first, std::move(unary_op));
+//
+// overload set accepting execution space
+//
+template <
+    typename ExecutionSpace, typename InputIterator, typename OutputIterator,
+    typename UnaryOperation,
+    std::enable_if_t<Impl::are_iterators_v<InputIterator, OutputIterator> &&
+                         is_execution_space_v<ExecutionSpace>,
+                     int> = 0>
+OutputIterator transform(const ExecutionSpace& ex, InputIterator first1,
+                         InputIterator last1, OutputIterator d_first,
+                         UnaryOperation unary_op) {
+  return Impl::transform_exespace_impl("Kokkos::transform_iterator_api_default",
+                                       ex, first1, last1, d_first,
+                                       std::move(unary_op));
 }
 
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class UnaryOperation>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      InputIterator, OutputIterator>::value,
-                  OutputIterator>
-transform(const std::string& label, const ExecutionSpace& ex,
-          InputIterator first1, InputIterator last1, OutputIterator d_first,
-          UnaryOperation unary_op) {
-  return Impl::transform_impl(label, ex, first1, last1, d_first,
-                              std::move(unary_op));
+template <
+    typename ExecutionSpace, typename InputIterator, typename OutputIterator,
+    typename UnaryOperation,
+    std::enable_if_t<Impl::are_iterators_v<InputIterator, OutputIterator> &&
+                         is_execution_space_v<ExecutionSpace>,
+                     int> = 0>
+OutputIterator transform(const std::string& label, const ExecutionSpace& ex,
+                         InputIterator first1, InputIterator last1,
+                         OutputIterator d_first, UnaryOperation unary_op) {
+  return Impl::transform_exespace_impl(label, ex, first1, last1, d_first,
+                                       std::move(unary_op));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class UnaryOperation>
+template <typename ExecutionSpace, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2, typename UnaryOperation,
+          std::enable_if_t<is_execution_space_v<ExecutionSpace>, int> = 0>
 auto transform(const ExecutionSpace& ex,
                const ::Kokkos::View<DataType1, Properties1...>& source,
                ::Kokkos::View<DataType2, Properties2...>& dest,
@@ -55,13 +63,14 @@ auto transform(const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
 
-  return Impl::transform_impl("Kokkos::transform_view_api_default", ex,
-                              begin(source), end(source), begin(dest),
-                              std::move(unary_op));
+  return Impl::transform_exespace_impl("Kokkos::transform_view_api_default", ex,
+                                       begin(source), end(source), begin(dest),
+                                       std::move(unary_op));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class UnaryOperation>
+template <typename ExecutionSpace, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2, typename UnaryOperation,
+          std::enable_if_t<is_execution_space_v<ExecutionSpace>, int> = 0>
 auto transform(const std::string& label, const ExecutionSpace& ex,
                const ::Kokkos::View<DataType1, Properties1...>& source,
                ::Kokkos::View<DataType2, Properties2...>& dest,
@@ -69,38 +78,44 @@ auto transform(const std::string& label, const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
 
-  return Impl::transform_impl(label, ex, begin(source), end(source),
-                              begin(dest), std::move(unary_op));
+  return Impl::transform_exespace_impl(label, ex, begin(source), end(source),
+                                       begin(dest), std::move(unary_op));
 }
 
-template <class ExecutionSpace, class InputIterator1, class InputIterator2,
-          class OutputIterator, class BinaryOperation>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      InputIterator1, InputIterator2, OutputIterator>::value,
-                  OutputIterator>
-transform(const ExecutionSpace& ex, InputIterator1 first1, InputIterator1 last1,
-          InputIterator2 first2, OutputIterator d_first,
-          BinaryOperation binary_op) {
-  return Impl::transform_impl("Kokkos::transform_iterator_api_default", ex,
-                              first1, last1, first2, d_first,
-                              std::move(binary_op));
+template <
+    typename ExecutionSpace, typename InputIterator1, typename InputIterator2,
+    typename OutputIterator, typename BinaryOperation,
+    std::enable_if_t<
+        Impl::are_iterators_v<InputIterator1, InputIterator2, OutputIterator> &&
+            is_execution_space_v<ExecutionSpace>,
+        int> = 0>
+OutputIterator transform(const ExecutionSpace& ex, InputIterator1 first1,
+                         InputIterator1 last1, InputIterator2 first2,
+                         OutputIterator d_first, BinaryOperation binary_op) {
+  return Impl::transform_exespace_impl("Kokkos::transform_iterator_api_default",
+                                       ex, first1, last1, first2, d_first,
+                                       std::move(binary_op));
 }
 
-template <class ExecutionSpace, class InputIterator1, class InputIterator2,
-          class OutputIterator, class BinaryOperation>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      InputIterator1, InputIterator2, OutputIterator>::value,
-                  OutputIterator>
-transform(const std::string& label, const ExecutionSpace& ex,
-          InputIterator1 first1, InputIterator1 last1, InputIterator2 first2,
-          OutputIterator d_first, BinaryOperation binary_op) {
-  return Impl::transform_impl(label, ex, first1, last1, first2, d_first,
-                              std::move(binary_op));
+template <
+    typename ExecutionSpace, typename InputIterator1, typename InputIterator2,
+    typename OutputIterator, typename BinaryOperation,
+    std::enable_if_t<
+        Impl::are_iterators_v<InputIterator1, InputIterator2, OutputIterator> &&
+            is_execution_space_v<ExecutionSpace>,
+        int> = 0>
+OutputIterator transform(const std::string& label, const ExecutionSpace& ex,
+                         InputIterator1 first1, InputIterator1 last1,
+                         InputIterator2 first2, OutputIterator d_first,
+                         BinaryOperation binary_op) {
+  return Impl::transform_exespace_impl(label, ex, first1, last1, first2,
+                                       d_first, std::move(binary_op));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class DataType3,
-          class... Properties3, class BinaryOperation>
+template <typename ExecutionSpace, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2, typename DataType3,
+          typename... Properties3, typename BinaryOperation,
+          std::enable_if_t<is_execution_space_v<ExecutionSpace>, int> = 0>
 auto transform(const ExecutionSpace& ex,
                const ::Kokkos::View<DataType1, Properties1...>& source1,
                const ::Kokkos::View<DataType2, Properties2...>& source2,
@@ -110,14 +125,15 @@ auto transform(const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
 
-  return Impl::transform_impl("Kokkos::transform_view_api_default", ex,
-                              begin(source1), end(source1), begin(source2),
-                              begin(dest), std::move(binary_op));
+  return Impl::transform_exespace_impl(
+      "Kokkos::transform_view_api_default", ex, begin(source1), end(source1),
+      begin(source2), begin(dest), std::move(binary_op));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class DataType3,
-          class... Properties3, class BinaryOperation>
+template <typename ExecutionSpace, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2, typename DataType3,
+          typename... Properties3, typename BinaryOperation,
+          std::enable_if_t<is_execution_space_v<ExecutionSpace>, int> = 0>
 auto transform(const std::string& label, const ExecutionSpace& ex,
                const ::Kokkos::View<DataType1, Properties1...>& source1,
                const ::Kokkos::View<DataType2, Properties2...>& source2,
@@ -127,9 +143,79 @@ auto transform(const std::string& label, const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
 
-  return Impl::transform_impl(label, ex, begin(source1), end(source1),
-                              begin(source2), begin(dest),
-                              std::move(binary_op));
+  return Impl::transform_exespace_impl(label, ex, begin(source1), end(source1),
+                                       begin(source2), begin(dest),
+                                       std::move(binary_op));
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <
+    typename TeamHandleType, typename InputIterator, typename OutputIterator,
+    typename UnaryOperation,
+    std::enable_if_t<Impl::are_iterators_v<InputIterator, OutputIterator> &&
+                         is_team_handle_v<TeamHandleType>,
+                     int> = 0>
+KOKKOS_FUNCTION OutputIterator transform(const TeamHandleType& teamHandle,
+                                         InputIterator first1,
+                                         InputIterator last1,
+                                         OutputIterator d_first,
+                                         UnaryOperation unary_op) {
+  return Impl::transform_team_impl(teamHandle, first1, last1, d_first,
+                                   std::move(unary_op));
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2, typename UnaryOperation,
+          std::enable_if_t<is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto transform(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& source,
+    ::Kokkos::View<DataType2, Properties2...>& dest, UnaryOperation unary_op) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  return Impl::transform_team_impl(teamHandle, begin(source), end(source),
+                                   begin(dest), std::move(unary_op));
+}
+
+template <
+    typename TeamHandleType, typename InputIterator1, typename InputIterator2,
+    typename OutputIterator, typename BinaryOperation,
+    std::enable_if_t<
+        Impl::are_iterators_v<InputIterator1, InputIterator2, OutputIterator> &&
+            is_team_handle_v<TeamHandleType>,
+        int> = 0>
+KOKKOS_FUNCTION OutputIterator transform(const TeamHandleType& teamHandle,
+                                         InputIterator1 first1,
+                                         InputIterator1 last1,
+                                         InputIterator2 first2,
+                                         OutputIterator d_first,
+                                         BinaryOperation binary_op) {
+  return Impl::transform_team_impl(teamHandle, first1, last1, first2, d_first,
+                                   std::move(binary_op));
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2, typename DataType3,
+          typename... Properties3, typename BinaryOperation,
+          std::enable_if_t<is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto transform(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& source1,
+    const ::Kokkos::View<DataType2, Properties2...>& source2,
+    ::Kokkos::View<DataType3, Properties3...>& dest,
+    BinaryOperation binary_op) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source1);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  return Impl::transform_team_impl(teamHandle, begin(source1), end(source1),
+                                   begin(source2), begin(dest),
+                                   std::move(binary_op));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformExclusiveScan.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformExclusiveScan.hpp
index 9d85aee06f8e5fc2dbfb46792e3b4c9ab9643a1a..37fc0f860ee44b304b21e489cf35ec2cfbe1e405 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformExclusiveScan.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformExclusiveScan.hpp
@@ -23,44 +23,52 @@
 namespace Kokkos {
 namespace Experimental {
 
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class ValueType, class BinaryOpType,
-          class UnaryOpType>
-std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
-                     InputIteratorType, OutputIteratorType>::value,
-                 OutputIteratorType>
-transform_exclusive_scan(const ExecutionSpace& ex, InputIteratorType first,
-                         InputIteratorType last, OutputIteratorType first_dest,
-                         ValueType init_value, BinaryOpType binary_op,
-                         UnaryOpType unary_op) {
+//
+// overload set accepting execution space
+//
+template <typename ExecutionSpace, typename InputIteratorType,
+          typename OutputIteratorType, typename ValueType,
+          typename BinaryOpType, typename UnaryOpType,
+          std::enable_if_t<
+              Impl::are_iterators_v<InputIteratorType, OutputIteratorType>&& ::
+                  Kokkos::is_execution_space_v<ExecutionSpace>,
+              int> = 0>
+OutputIteratorType transform_exclusive_scan(
+    const ExecutionSpace& ex, InputIteratorType first, InputIteratorType last,
+    OutputIteratorType first_dest, ValueType init_value, BinaryOpType binary_op,
+    UnaryOpType unary_op) {
   Impl::static_assert_is_not_openmptarget(ex);
-  static_assert(std::is_move_constructible<ValueType>::value,
+  static_assert(std::is_move_constructible_v<ValueType>,
                 "ValueType must be move constructible.");
-  return Impl::transform_exclusive_scan_impl(
+  return Impl::transform_exclusive_scan_exespace_impl(
       "Kokkos::transform_exclusive_scan_custom_functors_iterator_api", ex,
-      first, last, first_dest, init_value, binary_op, unary_op);
+      first, last, first_dest, std::move(init_value), binary_op, unary_op);
 }
 
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class ValueType, class BinaryOpType,
-          class UnaryOpType>
-std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
-                     InputIteratorType, OutputIteratorType>::value,
-                 OutputIteratorType>
-transform_exclusive_scan(const std::string& label, const ExecutionSpace& ex,
-                         InputIteratorType first, InputIteratorType last,
-                         OutputIteratorType first_dest, ValueType init_value,
-                         BinaryOpType binary_op, UnaryOpType unary_op) {
+template <typename ExecutionSpace, typename InputIteratorType,
+          typename OutputIteratorType, typename ValueType,
+          typename BinaryOpType, typename UnaryOpType,
+          std::enable_if_t<
+              Impl::are_iterators_v<InputIteratorType, OutputIteratorType>&& ::
+                  Kokkos::is_execution_space_v<ExecutionSpace>,
+              int> = 0>
+OutputIteratorType transform_exclusive_scan(
+    const std::string& label, const ExecutionSpace& ex, InputIteratorType first,
+    InputIteratorType last, OutputIteratorType first_dest, ValueType init_value,
+    BinaryOpType binary_op, UnaryOpType unary_op) {
   Impl::static_assert_is_not_openmptarget(ex);
-  static_assert(std::is_move_constructible<ValueType>::value,
+  static_assert(std::is_move_constructible_v<ValueType>,
                 "ValueType must be move constructible.");
-  return Impl::transform_exclusive_scan_impl(label, ex, first, last, first_dest,
-                                             init_value, binary_op, unary_op);
+  return Impl::transform_exclusive_scan_exespace_impl(
+      label, ex, first, last, first_dest, std::move(init_value), binary_op,
+      unary_op);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class ValueType,
-          class BinaryOpType, class UnaryOpType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename ValueType,
+    typename BinaryOpType, typename UnaryOpType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto transform_exclusive_scan(
     const ExecutionSpace& ex,
     const ::Kokkos::View<DataType1, Properties1...>& view_from,
@@ -69,18 +77,20 @@ auto transform_exclusive_scan(
   Impl::static_assert_is_not_openmptarget(ex);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-  static_assert(std::is_move_constructible<ValueType>::value,
+  static_assert(std::is_move_constructible_v<ValueType>,
                 "ValueType must be move constructible.");
   namespace KE = ::Kokkos::Experimental;
-  return Impl::transform_exclusive_scan_impl(
+  return Impl::transform_exclusive_scan_exespace_impl(
       "Kokkos::transform_exclusive_scan_custom_functors_view_api", ex,
       KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest),
-      init_value, binary_op, unary_op);
+      std::move(init_value), binary_op, unary_op);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class ValueType,
-          class BinaryOpType, class UnaryOpType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename ValueType,
+    typename BinaryOpType, typename UnaryOpType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto transform_exclusive_scan(
     const std::string& label, const ExecutionSpace& ex,
     const ::Kokkos::View<DataType1, Properties1...>& view_from,
@@ -89,12 +99,56 @@ auto transform_exclusive_scan(
   Impl::static_assert_is_not_openmptarget(ex);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-  static_assert(std::is_move_constructible<ValueType>::value,
+  static_assert(std::is_move_constructible_v<ValueType>,
                 "ValueType must be move constructible.");
   namespace KE = ::Kokkos::Experimental;
-  return Impl::transform_exclusive_scan_impl(
+  return Impl::transform_exclusive_scan_exespace_impl(
       label, ex, KE::cbegin(view_from), KE::cend(view_from),
-      KE::begin(view_dest), init_value, binary_op, unary_op);
+      KE::begin(view_dest), std::move(init_value), binary_op, unary_op);
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename InputIteratorType,
+          typename OutputIteratorType, typename ValueType,
+          typename BinaryOpType, typename UnaryOpType,
+          std::enable_if_t<
+              Impl::are_iterators_v<InputIteratorType, OutputIteratorType>&& ::
+                  Kokkos::is_team_handle_v<TeamHandleType>,
+              int> = 0>
+KOKKOS_FUNCTION OutputIteratorType transform_exclusive_scan(
+    const TeamHandleType& teamHandle, InputIteratorType first,
+    InputIteratorType last, OutputIteratorType first_dest, ValueType init_value,
+    BinaryOpType binary_op, UnaryOpType unary_op) {
+  Impl::static_assert_is_not_openmptarget(teamHandle);
+  static_assert(std::is_move_constructible_v<ValueType>,
+                "ValueType must be move constructible.");
+  return Impl::transform_exclusive_scan_team_impl(
+      teamHandle, first, last, first_dest, std::move(init_value), binary_op,
+      unary_op);
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2, typename ValueType,
+          typename BinaryOpType, typename UnaryOpType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto transform_exclusive_scan(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+    ValueType init_value, BinaryOpType binary_op, UnaryOpType unary_op) {
+  Impl::static_assert_is_not_openmptarget(teamHandle);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  static_assert(std::is_move_constructible_v<ValueType>,
+                "ValueType must be move constructible.");
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::transform_exclusive_scan_team_impl(
+      teamHandle, KE::cbegin(view_from), KE::cend(view_from),
+      KE::begin(view_dest), std::move(init_value), binary_op, unary_op);
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformInclusiveScan.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformInclusiveScan.hpp
index 7489af7e379b576bc38e555d425c65cd33c675a7..5f694dbfd98bdcd99fbf81609a6f4da21c2a2deb 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformInclusiveScan.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformInclusiveScan.hpp
@@ -23,40 +23,53 @@
 namespace Kokkos {
 namespace Experimental {
 
+//
+// overload set accepting execution space
+//
+
 // overload set 1 (no init value)
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class BinaryOpType, class UnaryOpType>
-std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
-                     InputIteratorType, OutputIteratorType>::value,
-                 OutputIteratorType>
-transform_inclusive_scan(const ExecutionSpace& ex, InputIteratorType first,
-                         InputIteratorType last, OutputIteratorType first_dest,
-                         BinaryOpType binary_op, UnaryOpType unary_op) {
+template <typename ExecutionSpace, typename InputIteratorType,
+          typename OutputIteratorType, typename BinaryOpType,
+          typename UnaryOpType,
+          std::enable_if_t<
+              Impl::are_iterators_v<InputIteratorType, OutputIteratorType>&& ::
+                  Kokkos::is_execution_space_v<ExecutionSpace>,
+              int> = 0>
+OutputIteratorType transform_inclusive_scan(const ExecutionSpace& ex,
+                                            InputIteratorType first,
+                                            InputIteratorType last,
+                                            OutputIteratorType first_dest,
+                                            BinaryOpType binary_op,
+                                            UnaryOpType unary_op) {
   Impl::static_assert_is_not_openmptarget(ex);
 
-  return Impl::transform_inclusive_scan_impl(
+  return Impl::transform_inclusive_scan_exespace_impl(
       "Kokkos::transform_inclusive_scan_custom_functors_iterator_api", ex,
       first, last, first_dest, binary_op, unary_op);
 }
 
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class BinaryOpType, class UnaryOpType>
-std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
-                     InputIteratorType, OutputIteratorType>::value,
-                 OutputIteratorType>
-transform_inclusive_scan(const std::string& label, const ExecutionSpace& ex,
-                         InputIteratorType first, InputIteratorType last,
-                         OutputIteratorType first_dest, BinaryOpType binary_op,
-                         UnaryOpType unary_op) {
+template <typename ExecutionSpace, typename InputIteratorType,
+          typename OutputIteratorType, typename BinaryOpType,
+          typename UnaryOpType,
+          std::enable_if_t<
+              Impl::are_iterators_v<InputIteratorType, OutputIteratorType>&& ::
+                  Kokkos::is_execution_space_v<ExecutionSpace>,
+              int> = 0>
+OutputIteratorType transform_inclusive_scan(
+    const std::string& label, const ExecutionSpace& ex, InputIteratorType first,
+    InputIteratorType last, OutputIteratorType first_dest,
+    BinaryOpType binary_op, UnaryOpType unary_op) {
   Impl::static_assert_is_not_openmptarget(ex);
 
-  return Impl::transform_inclusive_scan_impl(label, ex, first, last, first_dest,
-                                             binary_op, unary_op);
+  return Impl::transform_inclusive_scan_exespace_impl(
+      label, ex, first, last, first_dest, binary_op, unary_op);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryOpType,
-          class UnaryOpType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename BinaryOpType,
+    typename UnaryOpType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto transform_inclusive_scan(
     const ExecutionSpace& ex,
     const ::Kokkos::View<DataType1, Properties1...>& view_from,
@@ -66,15 +79,17 @@ auto transform_inclusive_scan(
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
   namespace KE = ::Kokkos::Experimental;
-  return Impl::transform_inclusive_scan_impl(
+  return Impl::transform_inclusive_scan_exespace_impl(
       "Kokkos::transform_inclusive_scan_custom_functors_view_api", ex,
       KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest),
       binary_op, unary_op);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryOpType,
-          class UnaryOpType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename BinaryOpType,
+    typename UnaryOpType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto transform_inclusive_scan(
     const std::string& label, const ExecutionSpace& ex,
     const ::Kokkos::View<DataType1, Properties1...>& view_from,
@@ -84,46 +99,59 @@ auto transform_inclusive_scan(
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
   namespace KE = ::Kokkos::Experimental;
-  return Impl::transform_inclusive_scan_impl(
+  return Impl::transform_inclusive_scan_exespace_impl(
       label, ex, KE::cbegin(view_from), KE::cend(view_from),
       KE::begin(view_dest), binary_op, unary_op);
 }
 
 // overload set 2 (init value)
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class BinaryOpType, class UnaryOpType,
-          class ValueType>
-std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
-                     InputIteratorType, OutputIteratorType>::value,
-                 OutputIteratorType>
-transform_inclusive_scan(const ExecutionSpace& ex, InputIteratorType first,
-                         InputIteratorType last, OutputIteratorType first_dest,
-                         BinaryOpType binary_op, UnaryOpType unary_op,
-                         ValueType init_value) {
+template <typename ExecutionSpace, typename InputIteratorType,
+          typename OutputIteratorType, typename BinaryOpType,
+          typename UnaryOpType, typename ValueType,
+
+          std::enable_if_t<
+              Impl::are_iterators_v<InputIteratorType, OutputIteratorType>&& ::
+                  Kokkos::is_execution_space_v<ExecutionSpace>,
+              int> = 0>
+OutputIteratorType transform_inclusive_scan(
+    const ExecutionSpace& ex, InputIteratorType first, InputIteratorType last,
+    OutputIteratorType first_dest, BinaryOpType binary_op, UnaryOpType unary_op,
+    ValueType init_value) {
   Impl::static_assert_is_not_openmptarget(ex);
-  return Impl::transform_inclusive_scan_impl(
+  static_assert(std::is_move_constructible_v<ValueType>,
+                "ValueType must be move constructible.");
+
+  return Impl::transform_inclusive_scan_exespace_impl(
       "Kokkos::transform_inclusive_scan_custom_functors_iterator_api", ex,
-      first, last, first_dest, binary_op, unary_op, init_value);
+      first, last, first_dest, binary_op, unary_op, std::move(init_value));
 }
 
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class BinaryOpType, class UnaryOpType,
-          class ValueType>
-std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
-                     InputIteratorType, OutputIteratorType>::value,
-                 OutputIteratorType>
-transform_inclusive_scan(const std::string& label, const ExecutionSpace& ex,
-                         InputIteratorType first, InputIteratorType last,
-                         OutputIteratorType first_dest, BinaryOpType binary_op,
-                         UnaryOpType unary_op, ValueType init_value) {
+template <typename ExecutionSpace, typename InputIteratorType,
+          typename OutputIteratorType, typename BinaryOpType,
+          typename UnaryOpType, typename ValueType,
+
+          std::enable_if_t<
+              Impl::are_iterators_v<InputIteratorType, OutputIteratorType>&& ::
+                  Kokkos::is_execution_space_v<ExecutionSpace>,
+              int> = 0>
+OutputIteratorType transform_inclusive_scan(
+    const std::string& label, const ExecutionSpace& ex, InputIteratorType first,
+    InputIteratorType last, OutputIteratorType first_dest,
+    BinaryOpType binary_op, UnaryOpType unary_op, ValueType init_value) {
   Impl::static_assert_is_not_openmptarget(ex);
-  return Impl::transform_inclusive_scan_impl(label, ex, first, last, first_dest,
-                                             binary_op, unary_op, init_value);
+  static_assert(std::is_move_constructible_v<ValueType>,
+                "ValueType must be move constructible.");
+
+  return Impl::transform_inclusive_scan_exespace_impl(
+      label, ex, first, last, first_dest, binary_op, unary_op,
+      std::move(init_value));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryOpType,
-          class UnaryOpType, class ValueType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename BinaryOpType,
+    typename UnaryOpType, typename ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto transform_inclusive_scan(
     const ExecutionSpace& ex,
     const ::Kokkos::View<DataType1, Properties1...>& view_from,
@@ -132,16 +160,21 @@ auto transform_inclusive_scan(
   Impl::static_assert_is_not_openmptarget(ex);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  static_assert(std::is_move_constructible_v<ValueType>,
+                "ValueType must be move constructible.");
+
   namespace KE = ::Kokkos::Experimental;
-  return Impl::transform_inclusive_scan_impl(
+  return Impl::transform_inclusive_scan_exespace_impl(
       "Kokkos::transform_inclusive_scan_custom_functors_view_api", ex,
       KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest),
-      binary_op, unary_op, init_value);
+      binary_op, unary_op, std::move(init_value));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryOpType,
-          class UnaryOpType, class ValueType>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename BinaryOpType,
+    typename UnaryOpType, typename ValueType,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto transform_inclusive_scan(
     const std::string& label, const ExecutionSpace& ex,
     const ::Kokkos::View<DataType1, Properties1...>& view_from,
@@ -150,10 +183,97 @@ auto transform_inclusive_scan(
   Impl::static_assert_is_not_openmptarget(ex);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  static_assert(std::is_move_constructible_v<ValueType>,
+                "ValueType must be move constructible.");
+
   namespace KE = ::Kokkos::Experimental;
-  return Impl::transform_inclusive_scan_impl(
+  return Impl::transform_inclusive_scan_exespace_impl(
       label, ex, KE::cbegin(view_from), KE::cend(view_from),
-      KE::begin(view_dest), binary_op, unary_op, init_value);
+      KE::begin(view_dest), binary_op, unary_op, std::move(init_value));
+}
+
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+
+// overload set 1 (no init value)
+template <typename TeamHandleType, typename InputIteratorType,
+          typename OutputIteratorType, typename BinaryOpType,
+          typename UnaryOpType,
+          std::enable_if_t<
+              Impl::are_iterators_v<InputIteratorType, OutputIteratorType> &&
+                  Kokkos::is_team_handle_v<TeamHandleType>,
+              int> = 0>
+KOKKOS_FUNCTION OutputIteratorType transform_inclusive_scan(
+    const TeamHandleType& teamHandle, InputIteratorType first,
+    InputIteratorType last, OutputIteratorType first_dest,
+    BinaryOpType binary_op, UnaryOpType unary_op) {
+  Impl::static_assert_is_not_openmptarget(teamHandle);
+
+  return Impl::transform_inclusive_scan_team_impl(
+      teamHandle, first, last, first_dest, binary_op, unary_op);
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2, typename BinaryOpType,
+          typename UnaryOpType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto transform_inclusive_scan(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+    BinaryOpType binary_op, UnaryOpType unary_op) {
+  Impl::static_assert_is_not_openmptarget(teamHandle);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::transform_inclusive_scan_team_impl(
+      teamHandle, KE::cbegin(view_from), KE::cend(view_from),
+      KE::begin(view_dest), binary_op, unary_op);
+}
+
+// overload set 2 (init value)
+template <typename TeamHandleType, typename InputIteratorType,
+          typename OutputIteratorType, typename BinaryOpType,
+          typename UnaryOpType, typename ValueType,
+          std::enable_if_t<
+              Impl::are_iterators_v<InputIteratorType, OutputIteratorType> &&
+                  Kokkos::is_team_handle_v<TeamHandleType>,
+              int> = 0>
+KOKKOS_FUNCTION OutputIteratorType transform_inclusive_scan(
+    const TeamHandleType& teamHandle, InputIteratorType first,
+    InputIteratorType last, OutputIteratorType first_dest,
+    BinaryOpType binary_op, UnaryOpType unary_op, ValueType init_value) {
+  Impl::static_assert_is_not_openmptarget(teamHandle);
+  static_assert(std::is_move_constructible_v<ValueType>,
+                "ValueType must be move constructible.");
+
+  return Impl::transform_inclusive_scan_team_impl(
+      teamHandle, first, last, first_dest, binary_op, unary_op,
+      std::move(init_value));
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2, typename BinaryOpType,
+          typename UnaryOpType, typename ValueType,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto transform_inclusive_scan(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+    BinaryOpType binary_op, UnaryOpType unary_op, ValueType init_value) {
+  Impl::static_assert_is_not_openmptarget(teamHandle);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  static_assert(std::is_move_constructible_v<ValueType>,
+                "ValueType must be move constructible.");
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::transform_inclusive_scan_team_impl(
+      teamHandle, KE::cbegin(view_from), KE::cend(view_from),
+      KE::begin(view_dest), binary_op, unary_op, std::move(init_value));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp
index b5ec9066d2e25709575e1dcd6e27795de9d00de8..101f5113f68a406a2aa59c0542ff5f2cfe24f9dc 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp
@@ -23,34 +23,44 @@
 namespace Kokkos {
 namespace Experimental {
 
+//
+// overload set accepting execution space
+//
+
 // ----------------------------
 // overload set1:
 // no custom functors passed, so equivalent to
 // transform_reduce(first1, last1, first2, init, plus<>(), multiplies<>());
 // ----------------------------
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class ValueType>
+template <typename ExecutionSpace, typename IteratorType1,
+          typename IteratorType2, typename ValueType,
+          std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
 ValueType transform_reduce(const ExecutionSpace& ex, IteratorType1 first1,
                            IteratorType1 last1, IteratorType2 first2,
                            ValueType init_reduction_value) {
-  return Impl::transform_reduce_default_functors_impl(
+  return Impl::transform_reduce_default_functors_exespace_impl(
       "Kokkos::transform_reduce_default_functors_iterator_api", ex, first1,
       last1, first2, std::move(init_reduction_value));
 }
 
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class ValueType>
+template <typename ExecutionSpace, typename IteratorType1,
+          typename IteratorType2, typename ValueType,
+          std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
 ValueType transform_reduce(const std::string& label, const ExecutionSpace& ex,
                            IteratorType1 first1, IteratorType1 last1,
                            IteratorType2 first2,
                            ValueType init_reduction_value) {
-  return Impl::transform_reduce_default_functors_impl(
+  return Impl::transform_reduce_default_functors_exespace_impl(
       label, ex, first1, last1, first2, std::move(init_reduction_value));
 }
 
 // overload1 accepting views
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class ValueType>
+template <typename ExecutionSpace, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2, typename ValueType,
+          std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
 ValueType transform_reduce(
     const ExecutionSpace& ex,
     const ::Kokkos::View<DataType1, Properties1...>& first_view,
@@ -60,14 +70,16 @@ ValueType transform_reduce(
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(first_view);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(second_view);
 
-  return Impl::transform_reduce_default_functors_impl(
+  return Impl::transform_reduce_default_functors_exespace_impl(
       "Kokkos::transform_reduce_default_functors_iterator_api", ex,
       KE::cbegin(first_view), KE::cend(first_view), KE::cbegin(second_view),
       std::move(init_reduction_value));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class ValueType>
+template <typename ExecutionSpace, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2, typename ValueType,
+          std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
 ValueType transform_reduce(
     const std::string& label, const ExecutionSpace& ex,
     const ::Kokkos::View<DataType1, Properties1...>& first_view,
@@ -77,7 +89,7 @@ ValueType transform_reduce(
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(first_view);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(second_view);
 
-  return Impl::transform_reduce_default_functors_impl(
+  return Impl::transform_reduce_default_functors_exespace_impl(
       label, ex, KE::cbegin(first_view), KE::cend(first_view),
       KE::cbegin(second_view), std::move(init_reduction_value));
 }
@@ -95,8 +107,11 @@ ValueType transform_reduce(
 // https://en.cppreference.com/w/cpp/algorithm/transform_reduce
 
 // api accepting iterators
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class ValueType, class BinaryJoinerType, class BinaryTransform>
+template <
+    typename ExecutionSpace, typename IteratorType1, typename IteratorType2,
+    typename ValueType, typename BinaryJoinerType, typename BinaryTransform,
+    std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value, int> =
+        0>
 ValueType transform_reduce(const ExecutionSpace& ex, IteratorType1 first1,
                            IteratorType1 last1, IteratorType2 first2,
                            ValueType init_reduction_value,
@@ -105,14 +120,17 @@ ValueType transform_reduce(const ExecutionSpace& ex, IteratorType1 first1,
   static_assert(std::is_move_constructible<ValueType>::value,
                 "ValueType must be move constructible.");
 
-  return Impl::transform_reduce_custom_functors_impl(
+  return Impl::transform_reduce_custom_functors_exespace_impl(
       "Kokkos::transform_reduce_custom_functors_iterator_api", ex, first1,
       last1, first2, std::move(init_reduction_value), std::move(joiner),
       std::move(transformer));
 }
 
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class ValueType, class BinaryJoinerType, class BinaryTransform>
+template <
+    typename ExecutionSpace, typename IteratorType1, typename IteratorType2,
+    typename ValueType, typename BinaryJoinerType, typename BinaryTransform,
+    std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value, int> =
+        0>
 ValueType transform_reduce(const std::string& label, const ExecutionSpace& ex,
                            IteratorType1 first1, IteratorType1 last1,
                            IteratorType2 first2, ValueType init_reduction_value,
@@ -121,15 +139,17 @@ ValueType transform_reduce(const std::string& label, const ExecutionSpace& ex,
   static_assert(std::is_move_constructible<ValueType>::value,
                 "ValueType must be move constructible.");
 
-  return Impl::transform_reduce_custom_functors_impl(
+  return Impl::transform_reduce_custom_functors_exespace_impl(
       label, ex, first1, last1, first2, std::move(init_reduction_value),
       std::move(joiner), std::move(transformer));
 }
 
 // accepting views
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class ValueType,
-          class BinaryJoinerType, class BinaryTransform>
+template <typename ExecutionSpace, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2, typename ValueType,
+          typename BinaryJoinerType, typename BinaryTransform,
+          std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
 ValueType transform_reduce(
     const ExecutionSpace& ex,
     const ::Kokkos::View<DataType1, Properties1...>& first_view,
@@ -143,16 +163,18 @@ ValueType transform_reduce(
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(first_view);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(second_view);
 
-  return Impl::transform_reduce_custom_functors_impl(
+  return Impl::transform_reduce_custom_functors_exespace_impl(
       "Kokkos::transform_reduce_custom_functors_view_api", ex,
       KE::cbegin(first_view), KE::cend(first_view), KE::cbegin(second_view),
       std::move(init_reduction_value), std::move(joiner),
       std::move(transformer));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class ValueType,
-          class BinaryJoinerType, class BinaryTransform>
+template <typename ExecutionSpace, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2, typename ValueType,
+          typename BinaryJoinerType, typename BinaryTransform,
+          std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
 ValueType transform_reduce(
     const std::string& label, const ExecutionSpace& ex,
     const ::Kokkos::View<DataType1, Properties1...>& first_view,
@@ -166,7 +188,7 @@ ValueType transform_reduce(
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(first_view);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(second_view);
 
-  return Impl::transform_reduce_custom_functors_impl(
+  return Impl::transform_reduce_custom_functors_exespace_impl(
       label, ex, KE::cbegin(first_view), KE::cend(first_view),
       KE::cbegin(second_view), std::move(init_reduction_value),
       std::move(joiner), std::move(transformer));
@@ -176,43 +198,50 @@ ValueType transform_reduce(
 // overload set3:
 //
 // accepting iterators
-template <class ExecutionSpace, class IteratorType, class ValueType,
-          class BinaryJoinerType, class UnaryTransform>
-// need this to avoid ambiguous call
-std::enable_if_t<
-    ::Kokkos::Experimental::Impl::are_iterators<IteratorType>::value, ValueType>
-transform_reduce(const ExecutionSpace& ex, IteratorType first1,
-                 IteratorType last1, ValueType init_reduction_value,
-                 BinaryJoinerType joiner, UnaryTransform transformer) {
+template <typename ExecutionSpace, typename IteratorType, typename ValueType,
+          typename BinaryJoinerType, typename UnaryTransform,
+          std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
+                               IteratorType>::value &&
+                               is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
+ValueType transform_reduce(const ExecutionSpace& ex, IteratorType first1,
+                           IteratorType last1, ValueType init_reduction_value,
+                           BinaryJoinerType joiner,
+                           UnaryTransform transformer) {
   static_assert(std::is_move_constructible<ValueType>::value,
                 "ValueType must be move constructible.");
 
-  return Impl::transform_reduce_custom_functors_impl(
+  return Impl::transform_reduce_custom_functors_exespace_impl(
       "Kokkos::transform_reduce_custom_functors_iterator_api", ex, first1,
       last1, std::move(init_reduction_value), std::move(joiner),
       std::move(transformer));
 }
 
-template <class ExecutionSpace, class IteratorType, class ValueType,
-          class BinaryJoinerType, class UnaryTransform>
-// need this to avoid ambiguous call
-std::enable_if_t<
-    ::Kokkos::Experimental::Impl::are_iterators<IteratorType>::value, ValueType>
-transform_reduce(const std::string& label, const ExecutionSpace& ex,
-                 IteratorType first1, IteratorType last1,
-                 ValueType init_reduction_value, BinaryJoinerType joiner,
-                 UnaryTransform transformer) {
+template <typename ExecutionSpace, typename IteratorType, typename ValueType,
+          typename BinaryJoinerType, typename UnaryTransform,
+          std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
+                               IteratorType>::value &&
+                               is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
+ValueType transform_reduce(const std::string& label, const ExecutionSpace& ex,
+                           IteratorType first1, IteratorType last1,
+                           ValueType init_reduction_value,
+                           BinaryJoinerType joiner,
+                           UnaryTransform transformer) {
   static_assert(std::is_move_constructible<ValueType>::value,
                 "ValueType must be move constructible.");
 
-  return Impl::transform_reduce_custom_functors_impl(
+  return Impl::transform_reduce_custom_functors_exespace_impl(
       label, ex, first1, last1, std::move(init_reduction_value),
       std::move(joiner), std::move(transformer));
 }
 
 // accepting views
-template <class ExecutionSpace, class DataType, class... Properties,
-          class ValueType, class BinaryJoinerType, class UnaryTransform>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    typename ValueType, typename BinaryJoinerType, typename UnaryTransform,
+    std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value, int> =
+        0>
 ValueType transform_reduce(const ExecutionSpace& ex,
                            const ::Kokkos::View<DataType, Properties...>& view,
                            ValueType init_reduction_value,
@@ -224,14 +253,17 @@ ValueType transform_reduce(const ExecutionSpace& ex,
 
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
 
-  return Impl::transform_reduce_custom_functors_impl(
+  return Impl::transform_reduce_custom_functors_exespace_impl(
       "Kokkos::transform_reduce_custom_functors_view_api", ex, KE::cbegin(view),
       KE::cend(view), std::move(init_reduction_value), std::move(joiner),
       std::move(transformer));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class ValueType, class BinaryJoinerType, class UnaryTransform>
+template <
+    typename ExecutionSpace, typename DataType, typename... Properties,
+    typename ValueType, typename BinaryJoinerType, typename UnaryTransform,
+    std::enable_if_t<::Kokkos::is_execution_space<ExecutionSpace>::value, int> =
+        0>
 ValueType transform_reduce(const std::string& label, const ExecutionSpace& ex,
                            const ::Kokkos::View<DataType, Properties...>& view,
                            ValueType init_reduction_value,
@@ -243,12 +275,154 @@ ValueType transform_reduce(const std::string& label, const ExecutionSpace& ex,
 
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
 
-  return Impl::transform_reduce_custom_functors_impl(
+  return Impl::transform_reduce_custom_functors_exespace_impl(
       label, ex, KE::cbegin(view), KE::cend(view),
       std::move(init_reduction_value), std::move(joiner),
       std::move(transformer));
 }
 
+//
+// overload set accepting a team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+
+// ----------------------------
+// overload set1:
+// no custom functors passed, so equivalent to
+// transform_reduce(first1, last1, first2, init, plus<>(), multiplies<>());
+// ----------------------------
+template <
+    typename TeamHandleType, typename IteratorType1, typename IteratorType2,
+    typename ValueType,
+    std::enable_if_t<::Kokkos::is_team_handle<TeamHandleType>::value, int> = 0>
+KOKKOS_FUNCTION ValueType transform_reduce(const TeamHandleType& teamHandle,
+                                           IteratorType1 first1,
+                                           IteratorType1 last1,
+                                           IteratorType2 first2,
+                                           ValueType init_reduction_value) {
+  return Impl::transform_reduce_default_functors_team_impl(
+      teamHandle, first1, last1, first2, std::move(init_reduction_value));
+}
+
+// overload1 accepting views
+template <
+    typename TeamHandleType, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename ValueType,
+    std::enable_if_t<::Kokkos::is_team_handle<TeamHandleType>::value, int> = 0>
+KOKKOS_FUNCTION ValueType
+transform_reduce(const TeamHandleType& teamHandle,
+                 const ::Kokkos::View<DataType1, Properties1...>& first_view,
+                 const ::Kokkos::View<DataType2, Properties2...>& second_view,
+                 ValueType init_reduction_value) {
+  namespace KE = ::Kokkos::Experimental;
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(first_view);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(second_view);
+
+  return Impl::transform_reduce_default_functors_team_impl(
+      teamHandle, KE::cbegin(first_view), KE::cend(first_view),
+      KE::cbegin(second_view), std::move(init_reduction_value));
+}
+
+//
+// overload set2:
+// accepts a custom transform and joiner functor
+//
+
+// Note the std refers to the arg BinaryReductionOp
+// but in the Kokkos naming convention, it corresponds
+// to a "joiner" that knows how to join two values
+// NOTE: "joiner/transformer" need to be commutative.
+
+// https://en.cppreference.com/w/cpp/algorithm/transform_reduce
+
+// api accepting iterators
+template <
+    typename TeamHandleType, typename IteratorType1, typename IteratorType2,
+    typename ValueType, typename BinaryJoinerType, typename BinaryTransform,
+    std::enable_if_t<::Kokkos::is_team_handle<TeamHandleType>::value, int> = 0>
+KOKKOS_FUNCTION ValueType transform_reduce(
+    const TeamHandleType& teamHandle, IteratorType1 first1, IteratorType1 last1,
+    IteratorType2 first2, ValueType init_reduction_value,
+    BinaryJoinerType joiner, BinaryTransform transformer) {
+  static_assert(std::is_move_constructible<ValueType>::value,
+                "ValueType must be move constructible.");
+
+  return Impl::transform_reduce_custom_functors_team_impl(
+      teamHandle, first1, last1, first2, std::move(init_reduction_value),
+      std::move(joiner), std::move(transformer));
+}
+
+// accepting views
+template <
+    typename TeamHandleType, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename ValueType,
+    typename BinaryJoinerType, typename BinaryTransform,
+    std::enable_if_t<::Kokkos::is_team_handle<TeamHandleType>::value, int> = 0>
+KOKKOS_FUNCTION ValueType
+transform_reduce(const TeamHandleType& teamHandle,
+                 const ::Kokkos::View<DataType1, Properties1...>& first_view,
+                 const ::Kokkos::View<DataType2, Properties2...>& second_view,
+                 ValueType init_reduction_value, BinaryJoinerType joiner,
+                 BinaryTransform transformer) {
+  namespace KE = ::Kokkos::Experimental;
+  static_assert(std::is_move_constructible<ValueType>::value,
+                "ValueType must be move constructible.");
+
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(first_view);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(second_view);
+
+  return Impl::transform_reduce_custom_functors_team_impl(
+      teamHandle, KE::cbegin(first_view), KE::cend(first_view),
+      KE::cbegin(second_view), std::move(init_reduction_value),
+      std::move(joiner), std::move(transformer));
+}
+
+//
+// overload set3:
+//
+// accepting iterators
+template <typename TeamHandleType, typename IteratorType, typename ValueType,
+          typename BinaryJoinerType, typename UnaryTransform,
+          std::enable_if_t<Impl::are_iterators<IteratorType>::value &&
+                               is_team_handle<TeamHandleType>::value,
+                           int> = 0>
+KOKKOS_FUNCTION ValueType transform_reduce(const TeamHandleType& teamHandle,
+                                           IteratorType first1,
+                                           IteratorType last1,
+                                           ValueType init_reduction_value,
+                                           BinaryJoinerType joiner,
+                                           UnaryTransform transformer) {
+  static_assert(std::is_move_constructible<ValueType>::value,
+                "ValueType must be move constructible.");
+
+  return Impl::transform_reduce_custom_functors_team_impl(
+      teamHandle, first1, last1, std::move(init_reduction_value),
+      std::move(joiner), std::move(transformer));
+}
+
+// accepting views
+template <
+    typename TeamHandleType, typename DataType, typename... Properties,
+    typename ValueType, typename BinaryJoinerType, typename UnaryTransform,
+    std::enable_if_t<::Kokkos::is_team_handle<TeamHandleType>::value, int> = 0>
+KOKKOS_FUNCTION ValueType
+transform_reduce(const TeamHandleType& teamHandle,
+                 const ::Kokkos::View<DataType, Properties...>& view,
+                 ValueType init_reduction_value, BinaryJoinerType joiner,
+                 UnaryTransform transformer) {
+  namespace KE = ::Kokkos::Experimental;
+  static_assert(std::is_move_constructible<ValueType>::value,
+                "ValueType must be move constructible.");
+
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+
+  return Impl::transform_reduce_custom_functors_team_impl(
+      teamHandle, KE::cbegin(view), KE::cend(view),
+      std::move(init_reduction_value), std::move(joiner),
+      std::move(transformer));
+}
+
 }  // namespace Experimental
 }  // namespace Kokkos
 
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Unique.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Unique.hpp
index b47ecffb20737ee439c0427c47510855d1f5d7af..2d56315f616ebe18d83cb9c37752a7a37416e6b7 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Unique.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Unique.hpp
@@ -23,71 +23,132 @@
 namespace Kokkos {
 namespace Experimental {
 
-// note: the enable_if below is to avoid "call to ... is ambiguous"
-// for example in the unit test when using a variadic function
-
-// overload set1
-template <class ExecutionSpace, class IteratorType>
-std::enable_if_t<!::Kokkos::is_view<IteratorType>::value, IteratorType> unique(
-    const ExecutionSpace& ex, IteratorType first, IteratorType last) {
-  return Impl::unique_impl("Kokkos::unique_iterator_api_default", ex, first,
-                           last);
+//
+// overload set1: default predicate, accepting execution space
+//
+template <typename ExecutionSpace, typename IteratorType,
+          std::enable_if_t<Impl::is_iterator_v<IteratorType> &&
+                               is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
+IteratorType unique(const ExecutionSpace& ex, IteratorType first,
+                    IteratorType last) {
+  return Impl::unique_exespace_impl("Kokkos::unique_iterator_api_default", ex,
+                                    first, last);
 }
 
-template <class ExecutionSpace, class IteratorType>
-std::enable_if_t<!::Kokkos::is_view<IteratorType>::value, IteratorType> unique(
-    const std::string& label, const ExecutionSpace& ex, IteratorType first,
-    IteratorType last) {
-  return Impl::unique_impl(label, ex, first, last);
+template <typename ExecutionSpace, typename IteratorType,
+          std::enable_if_t<Impl::is_iterator_v<IteratorType> &&
+                               is_execution_space<ExecutionSpace>::value,
+                           int> = 0>
+IteratorType unique(const std::string& label, const ExecutionSpace& ex,
+                    IteratorType first, IteratorType last) {
+  return Impl::unique_exespace_impl(label, ex, first, last);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties>
+template <typename ExecutionSpace, typename DataType, typename... Properties,
+          std::enable_if_t<is_execution_space<ExecutionSpace>::value, int> = 0>
 auto unique(const ExecutionSpace& ex,
             const ::Kokkos::View<DataType, Properties...>& view) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  return ::Kokkos::Experimental::unique("Kokkos::unique_view_api_default", ex,
-                                        begin(view), end(view));
+  return Impl::unique_exespace_impl("Kokkos::unique_view_api_default", ex,
+                                    begin(view), end(view));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties>
+template <typename ExecutionSpace, typename DataType, typename... Properties,
+          std::enable_if_t<is_execution_space<ExecutionSpace>::value, int> = 0>
 auto unique(const std::string& label, const ExecutionSpace& ex,
             const ::Kokkos::View<DataType, Properties...>& view) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  return ::Kokkos::Experimental::unique(label, ex, begin(view), end(view));
+  return Impl::unique_exespace_impl(label, ex, begin(view), end(view));
 }
 
-// overload set2
-template <class ExecutionSpace, class IteratorType, class BinaryPredicate>
+//
+// overload set2: custom predicate, accepting execution space
+//
+template <typename ExecutionSpace, typename IteratorType,
+          typename BinaryPredicate,
+          std::enable_if_t<is_execution_space<ExecutionSpace>::value, int> = 0>
 IteratorType unique(const ExecutionSpace& ex, IteratorType first,
                     IteratorType last, BinaryPredicate pred) {
-  return Impl::unique_impl("Kokkos::unique_iterator_api_default", ex, first,
-                           last, pred);
+  return Impl::unique_exespace_impl("Kokkos::unique_iterator_api_default", ex,
+                                    first, last, pred);
 }
 
-template <class ExecutionSpace, class IteratorType, class BinaryPredicate>
+template <typename ExecutionSpace, typename IteratorType,
+          typename BinaryPredicate,
+          std::enable_if_t<is_execution_space<ExecutionSpace>::value, int> = 0>
 IteratorType unique(const std::string& label, const ExecutionSpace& ex,
                     IteratorType first, IteratorType last,
                     BinaryPredicate pred) {
-  return Impl::unique_impl(label, ex, first, last, pred);
+  return Impl::unique_exespace_impl(label, ex, first, last, pred);
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class BinaryPredicate>
+template <typename ExecutionSpace, typename DataType, typename... Properties,
+          typename BinaryPredicate,
+          std::enable_if_t<is_execution_space<ExecutionSpace>::value, int> = 0>
 auto unique(const ExecutionSpace& ex,
             const ::Kokkos::View<DataType, Properties...>& view,
             BinaryPredicate pred) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  return Impl::unique_impl("Kokkos::unique_view_api_default", ex, begin(view),
-                           end(view), std::move(pred));
+  return Impl::unique_exespace_impl("Kokkos::unique_view_api_default", ex,
+                                    begin(view), end(view), std::move(pred));
 }
 
-template <class ExecutionSpace, class DataType, class... Properties,
-          class BinaryPredicate>
+template <typename ExecutionSpace, typename DataType, typename... Properties,
+          typename BinaryPredicate,
+          std::enable_if_t<is_execution_space<ExecutionSpace>::value, int> = 0>
 auto unique(const std::string& label, const ExecutionSpace& ex,
             const ::Kokkos::View<DataType, Properties...>& view,
             BinaryPredicate pred) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  return Impl::unique_impl(label, ex, begin(view), end(view), std::move(pred));
+  return Impl::unique_exespace_impl(label, ex, begin(view), end(view),
+                                    std::move(pred));
+}
+
+//
+// overload set3: default predicate, accepting team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename IteratorType,
+          std::enable_if_t<Impl::is_iterator_v<IteratorType> &&
+                               is_team_handle<TeamHandleType>::value,
+                           int> = 0>
+KOKKOS_FUNCTION IteratorType unique(const TeamHandleType& teamHandle,
+                                    IteratorType first, IteratorType last) {
+  return Impl::unique_team_impl(teamHandle, first, last);
+}
+
+template <typename TeamHandleType, typename DataType, typename... Properties,
+          std::enable_if_t<is_team_handle<TeamHandleType>::value, int> = 0>
+KOKKOS_FUNCTION auto unique(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType, Properties...>& view) {
+  return Impl::unique_team_impl(teamHandle, begin(view), end(view));
+}
+
+//
+// overload set4: custom predicate, accepting team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename IteratorType,
+          typename BinaryPredicate,
+          std::enable_if_t<is_team_handle<TeamHandleType>::value, int> = 0>
+KOKKOS_FUNCTION IteratorType unique(const TeamHandleType& teamHandle,
+                                    IteratorType first, IteratorType last,
+                                    BinaryPredicate pred) {
+  return Impl::unique_team_impl(teamHandle, first, last, std::move(pred));
+}
+
+template <typename TeamHandleType, typename DataType, typename... Properties,
+          typename BinaryPredicate,
+          std::enable_if_t<is_team_handle<TeamHandleType>::value, int> = 0>
+KOKKOS_FUNCTION auto unique(const TeamHandleType& teamHandle,
+                            const ::Kokkos::View<DataType, Properties...>& view,
+                            BinaryPredicate pred) {
+  return Impl::unique_team_impl(teamHandle, begin(view), end(view),
+                                std::move(pred));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_UniqueCopy.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_UniqueCopy.hpp
index bd2451c220dd06bdcaee1d8c4693af745e3771bb..4a32d7e095d22b924d1567f6bf1a6ed8a502ed8c 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_UniqueCopy.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_UniqueCopy.hpp
@@ -23,67 +23,90 @@
 namespace Kokkos {
 namespace Experimental {
 
-// overload set1
-template <class ExecutionSpace, class InputIterator, class OutputIterator>
-std::enable_if_t<!::Kokkos::is_view<InputIterator>::value, OutputIterator>
-unique_copy(const ExecutionSpace& ex, InputIterator first, InputIterator last,
-            OutputIterator d_first) {
-  return Impl::unique_copy_impl("Kokkos::unique_copy_iterator_api_default", ex,
-                                first, last, d_first);
+//
+// overload set1: default predicate, accepting execution space
+//
+template <
+    typename ExecutionSpace, typename InputIterator, typename OutputIterator,
+    std::enable_if_t<Impl::are_iterators_v<InputIterator, OutputIterator> &&
+                         is_execution_space_v<ExecutionSpace>,
+                     int> = 0>
+OutputIterator unique_copy(const ExecutionSpace& ex, InputIterator first,
+                           InputIterator last, OutputIterator d_first) {
+  return Impl::unique_copy_exespace_impl(
+      "Kokkos::unique_copy_iterator_api_default", ex, first, last, d_first);
 }
 
-template <class ExecutionSpace, class InputIterator, class OutputIterator>
-std::enable_if_t<!::Kokkos::is_view<InputIterator>::value, OutputIterator>
-unique_copy(const std::string& label, const ExecutionSpace& ex,
-            InputIterator first, InputIterator last, OutputIterator d_first) {
-  return Impl::unique_copy_impl(label, ex, first, last, d_first);
+template <
+    typename ExecutionSpace, typename InputIterator, typename OutputIterator,
+    std::enable_if_t<Impl::are_iterators_v<InputIterator, OutputIterator> &&
+                         is_execution_space_v<ExecutionSpace>,
+                     int> = 0>
+OutputIterator unique_copy(const std::string& label, const ExecutionSpace& ex,
+                           InputIterator first, InputIterator last,
+                           OutputIterator d_first) {
+  return Impl::unique_copy_exespace_impl(label, ex, first, last, d_first);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto unique_copy(const ExecutionSpace& ex,
                  const ::Kokkos::View<DataType1, Properties1...>& source,
                  const ::Kokkos::View<DataType2, Properties2...>& dest) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
 
-  return ::Kokkos::Experimental::unique_copy(
-      "Kokkos::unique_copy_view_api_default", ex, cbegin(source), cend(source),
-      begin(dest));
+  return Impl::unique_copy_exespace_impl("Kokkos::unique_copy_view_api_default",
+                                         ex, cbegin(source), cend(source),
+                                         begin(dest));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto unique_copy(const std::string& label, const ExecutionSpace& ex,
                  const ::Kokkos::View<DataType1, Properties1...>& source,
                  const ::Kokkos::View<DataType2, Properties2...>& dest) {
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
 
-  return ::Kokkos::Experimental::unique_copy(label, ex, cbegin(source),
-                                             cend(source), begin(dest));
+  return Impl::unique_copy_exespace_impl(label, ex, cbegin(source),
+                                         cend(source), begin(dest));
 }
 
-// overload set2
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class BinaryPredicate>
+//
+// overload set2: custom predicate, accepting execution space
+//
+
+template <
+    typename ExecutionSpace, typename InputIterator, typename OutputIterator,
+    typename BinaryPredicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 OutputIterator unique_copy(const ExecutionSpace& ex, InputIterator first,
                            InputIterator last, OutputIterator d_first,
                            BinaryPredicate pred) {
-  return Impl::unique_copy_impl("Kokkos::unique_copy_iterator_api_default", ex,
-                                first, last, d_first, pred);
+  return Impl::unique_copy_exespace_impl(
+      "Kokkos::unique_copy_iterator_api_default", ex, first, last, d_first,
+      pred);
 }
 
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class BinaryPredicate>
+template <
+    typename ExecutionSpace, typename InputIterator, typename OutputIterator,
+    typename BinaryPredicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 OutputIterator unique_copy(const std::string& label, const ExecutionSpace& ex,
                            InputIterator first, InputIterator last,
                            OutputIterator d_first, BinaryPredicate pred) {
-  return Impl::unique_copy_impl(label, ex, first, last, d_first, pred);
+  return Impl::unique_copy_exespace_impl(label, ex, first, last, d_first, pred);
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryPredicate>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename BinaryPredicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto unique_copy(const ExecutionSpace& ex,
                  const ::Kokkos::View<DataType1, Properties1...>& source,
                  const ::Kokkos::View<DataType2, Properties2...>& dest,
@@ -91,13 +114,15 @@ auto unique_copy(const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
 
-  return Impl::unique_copy_impl("Kokkos::unique_copy_view_api_default", ex,
-                                cbegin(source), cend(source), begin(dest),
-                                std::move(pred));
+  return Impl::unique_copy_exespace_impl("Kokkos::unique_copy_view_api_default",
+                                         ex, cbegin(source), cend(source),
+                                         begin(dest), std::move(pred));
 }
 
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryPredicate>
+template <
+    typename ExecutionSpace, typename DataType1, typename... Properties1,
+    typename DataType2, typename... Properties2, typename BinaryPredicate,
+    std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0>
 auto unique_copy(const std::string& label, const ExecutionSpace& ex,
                  const ::Kokkos::View<DataType1, Properties1...>& source,
                  const ::Kokkos::View<DataType2, Properties2...>& dest,
@@ -105,8 +130,70 @@ auto unique_copy(const std::string& label, const ExecutionSpace& ex,
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
   Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
 
-  return Impl::unique_copy_impl(label, ex, cbegin(source), cend(source),
-                                begin(dest), std::move(pred));
+  return Impl::unique_copy_exespace_impl(
+      label, ex, cbegin(source), cend(source), begin(dest), std::move(pred));
+}
+
+//
+// overload set3: default predicate, accepting team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <
+    typename TeamHandleType, typename InputIterator, typename OutputIterator,
+    std::enable_if_t<Impl::are_iterators_v<InputIterator, OutputIterator> &&
+                         Kokkos::is_team_handle_v<TeamHandleType>,
+                     int> = 0>
+KOKKOS_FUNCTION OutputIterator unique_copy(const TeamHandleType& teamHandle,
+                                           InputIterator first,
+                                           InputIterator last,
+                                           OutputIterator d_first) {
+  return Impl::unique_copy_team_impl(teamHandle, first, last, d_first);
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto unique_copy(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& source,
+    const ::Kokkos::View<DataType2, Properties2...>& dest) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  return Impl::unique_copy_team_impl(teamHandle, cbegin(source), cend(source),
+                                     begin(dest));
+}
+
+//
+// overload set4: custom predicate, accepting team handle
+// Note: for now omit the overloads accepting a label
+// since they cause issues on device because of the string allocation.
+//
+template <typename TeamHandleType, typename InputIterator,
+          typename OutputIterator, typename BinaryPredicate,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION OutputIterator unique_copy(const TeamHandleType& teamHandle,
+                                           InputIterator first,
+                                           InputIterator last,
+                                           OutputIterator d_first,
+                                           BinaryPredicate pred) {
+  return Impl::unique_copy_team_impl(teamHandle, first, last, d_first, pred);
+}
+
+template <typename TeamHandleType, typename DataType1, typename... Properties1,
+          typename DataType2, typename... Properties2, typename BinaryPredicate,
+          std::enable_if_t<::Kokkos::is_team_handle_v<TeamHandleType>, int> = 0>
+KOKKOS_FUNCTION auto unique_copy(
+    const TeamHandleType& teamHandle,
+    const ::Kokkos::View<DataType1, Properties1...>& source,
+    const ::Kokkos::View<DataType2, Properties2...>& dest,
+    BinaryPredicate pred) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  return Impl::unique_copy_team_impl(teamHandle, cbegin(source), cend(source),
+                                     begin(dest), std::move(pred));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp
index 8a474508d73edfe06b14c7360cf513f4c2c988bd..a8171fa068d1290f8842956f888165e087894cad 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp
@@ -63,14 +63,15 @@ struct StdAdjacentDiffFunctor {
         m_op(std::move(op)) {}
 };
 
+//
+// exespace impl
+//
 template <class ExecutionSpace, class InputIteratorType,
           class OutputIteratorType, class BinaryOp>
-OutputIteratorType adjacent_difference_impl(const std::string& label,
-                                            const ExecutionSpace& ex,
-                                            InputIteratorType first_from,
-                                            InputIteratorType last_from,
-                                            OutputIteratorType first_dest,
-                                            BinaryOp bin_op) {
+OutputIteratorType adjacent_difference_exespace_impl(
+    const std::string& label, const ExecutionSpace& ex,
+    InputIteratorType first_from, InputIteratorType last_from,
+    OutputIteratorType first_dest, BinaryOp bin_op) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
   Impl::static_assert_iterators_have_matching_difference_type(first_from,
@@ -81,25 +82,50 @@ OutputIteratorType adjacent_difference_impl(const std::string& label,
     return first_dest;
   }
 
-  // aliases
-  using value_type    = typename OutputIteratorType::value_type;
-  using aux_view_type = ::Kokkos::View<value_type*, ExecutionSpace>;
-  using functor_t =
-      StdAdjacentDiffFunctor<InputIteratorType, OutputIteratorType, BinaryOp>;
-
   // run
   const auto num_elements =
       Kokkos::Experimental::distance(first_from, last_from);
-  aux_view_type aux_view("aux_view", num_elements);
-  ::Kokkos::parallel_for(label,
-                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                         functor_t(first_from, first_dest, bin_op));
+  ::Kokkos::parallel_for(
+      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+      StdAdjacentDiffFunctor(first_from, first_dest, bin_op));
   ex.fence("Kokkos::adjacent_difference: fence after operation");
 
   // return
   return first_dest + num_elements;
 }
 
+//
+// team impl
+//
+template <class TeamHandleType, class InputIteratorType,
+          class OutputIteratorType, class BinaryOp>
+KOKKOS_FUNCTION OutputIteratorType adjacent_difference_team_impl(
+    const TeamHandleType& teamHandle, InputIteratorType first_from,
+    InputIteratorType last_from, OutputIteratorType first_dest,
+    BinaryOp bin_op) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first_from,
+                                                   first_dest);
+  Impl::static_assert_iterators_have_matching_difference_type(first_from,
+                                                              first_dest);
+  Impl::expect_valid_range(first_from, last_from);
+
+  if (first_from == last_from) {
+    return first_dest;
+  }
+
+  // run
+  const auto num_elements =
+      Kokkos::Experimental::distance(first_from, last_from);
+  ::Kokkos::parallel_for(
+      TeamThreadRange(teamHandle, 0, num_elements),
+      StdAdjacentDiffFunctor(first_from, first_dest, bin_op));
+  teamHandle.team_barrier();
+
+  // return
+  return first_dest + num_elements;
+}
+
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AdjacentFind.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AdjacentFind.hpp
index cc6b63f028760d02da7a0b46e250b1bd1ee8439a..f30b7be06a6fa560207b13f6f7d915ba3f295534 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AdjacentFind.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AdjacentFind.hpp
@@ -27,9 +27,9 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
-template <class IndexType, class IteratorType, class ReducerType,
-          class PredicateType>
+template <class IteratorType, class ReducerType, class PredicateType>
 struct StdAdjacentFindFunctor {
+  using index_type     = typename IteratorType::difference_type;
   using red_value_type = typename ReducerType::value_type;
 
   IteratorType m_first;
@@ -37,17 +37,18 @@ struct StdAdjacentFindFunctor {
   PredicateType m_p;
 
   KOKKOS_FUNCTION
-  void operator()(const IndexType i, red_value_type& red_value) const {
+  void operator()(const index_type i, red_value_type& red_value) const {
     const auto& my_value   = m_first[i];
     const auto& next_value = m_first[i + 1];
     const bool are_equal   = m_p(my_value, next_value);
 
-    auto rv =
-        are_equal
-            ? red_value_type{i}
-            : red_value_type{::Kokkos::reduction_identity<IndexType>::min()};
+    // FIXME_NVHPC using a ternary operator causes problems
+    red_value_type value = {::Kokkos::reduction_identity<index_type>::min()};
+    if (are_equal) {
+      value.min_loc_true = i;
+    }
 
-    m_reducer.join(red_value, rv);
+    m_reducer.join(red_value, value);
   }
 
   KOKKOS_FUNCTION
@@ -58,10 +59,14 @@ struct StdAdjacentFindFunctor {
         m_p(std::move(p)) {}
 };
 
+//
+// exespace impl
+//
 template <class ExecutionSpace, class IteratorType, class PredicateType>
-IteratorType adjacent_find_impl(const std::string& label,
-                                const ExecutionSpace& ex, IteratorType first,
-                                IteratorType last, PredicateType pred) {
+IteratorType adjacent_find_exespace_impl(const std::string& label,
+                                         const ExecutionSpace& ex,
+                                         IteratorType first, IteratorType last,
+                                         PredicateType pred) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first);
   Impl::expect_valid_range(first, last);
@@ -75,8 +80,6 @@ IteratorType adjacent_find_impl(const std::string& label,
   using index_type           = typename IteratorType::difference_type;
   using reducer_type         = FirstLoc<index_type>;
   using reduction_value_type = typename reducer_type::value_type;
-  using func_t = StdAdjacentFindFunctor<index_type, IteratorType, reducer_type,
-                                        PredicateType>;
 
   reduction_value_type red_result;
   reducer_type reducer(red_result);
@@ -85,7 +88,8 @@ IteratorType adjacent_find_impl(const std::string& label,
   // each index i in the reduction checks i and (i+1).
   ::Kokkos::parallel_reduce(
       label, RangePolicy<ExecutionSpace>(ex, 0, num_elements - 1),
-      func_t(first, reducer, pred), reducer);
+      // use CTAD
+      StdAdjacentFindFunctor(first, reducer, pred), reducer);
 
   // fence not needed because reducing into scalar
   if (red_result.min_loc_true ==
@@ -97,12 +101,62 @@ IteratorType adjacent_find_impl(const std::string& label,
 }
 
 template <class ExecutionSpace, class IteratorType>
-IteratorType adjacent_find_impl(const std::string& label,
-                                const ExecutionSpace& ex, IteratorType first,
-                                IteratorType last) {
+IteratorType adjacent_find_exespace_impl(const std::string& label,
+                                         const ExecutionSpace& ex,
+                                         IteratorType first,
+                                         IteratorType last) {
+  using value_type     = typename IteratorType::value_type;
+  using default_pred_t = StdAlgoEqualBinaryPredicate<value_type>;
+  return adjacent_find_exespace_impl(label, ex, first, last, default_pred_t());
+}
+
+//
+// team impl
+//
+template <class TeamHandleType, class IteratorType, class PredicateType>
+KOKKOS_FUNCTION IteratorType
+adjacent_find_team_impl(const TeamHandleType& teamHandle, IteratorType first,
+                        IteratorType last, PredicateType pred) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first);
+  Impl::expect_valid_range(first, last);
+
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+
+  if (num_elements <= 1) {
+    return last;
+  }
+
+  using index_type           = typename IteratorType::difference_type;
+  using reducer_type         = FirstLoc<index_type>;
+  using reduction_value_type = typename reducer_type::value_type;
+
+  reduction_value_type red_result;
+  reducer_type reducer(red_result);
+
+  // note that we use below num_elements-1 because
+  // each index i in the reduction checks i and (i+1).
+  ::Kokkos::parallel_reduce(TeamThreadRange(teamHandle, 0, num_elements - 1),
+                            // use CTAD
+                            StdAdjacentFindFunctor(first, reducer, pred),
+                            reducer);
+
+  teamHandle.team_barrier();
+
+  if (red_result.min_loc_true ==
+      ::Kokkos::reduction_identity<index_type>::min()) {
+    return last;
+  } else {
+    return first + red_result.min_loc_true;
+  }
+}
+
+template <class TeamHandleType, class IteratorType>
+KOKKOS_FUNCTION IteratorType adjacent_find_team_impl(
+    const TeamHandleType& teamHandle, IteratorType first, IteratorType last) {
   using value_type     = typename IteratorType::value_type;
   using default_pred_t = StdAlgoEqualBinaryPredicate<value_type>;
-  return adjacent_find_impl(label, ex, first, last, default_pred_t());
+  return adjacent_find_team_impl(teamHandle, first, last, default_pred_t());
 }
 
 }  // namespace Impl
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AllOfAnyOfNoneOf.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AllOfAnyOfNoneOf.hpp
index ad562070a044f337970129f9e9a1c241cfaf4944..bdc050f9c19dc14739d2844481750b133e7ee5d1 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AllOfAnyOfNoneOf.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AllOfAnyOfNoneOf.hpp
@@ -23,23 +23,58 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
+//
+// exespace impl
+//
 template <class ExecutionSpace, class InputIterator, class Predicate>
-bool all_of_impl(const std::string& label, const ExecutionSpace& ex,
-                 InputIterator first, InputIterator last, Predicate predicate) {
-  return (find_if_or_not_impl<false>(label, ex, first, last, predicate) ==
-          last);
+bool all_of_exespace_impl(const std::string& label, const ExecutionSpace& ex,
+                          InputIterator first, InputIterator last,
+                          Predicate predicate) {
+  return (find_if_or_not_exespace_impl<false>(label, ex, first, last,
+                                              predicate) == last);
 }
 
 template <class ExecutionSpace, class InputIterator, class Predicate>
-bool any_of_impl(const std::string& label, const ExecutionSpace& ex,
-                 InputIterator first, InputIterator last, Predicate predicate) {
-  return (find_if_or_not_impl<true>(label, ex, first, last, predicate) != last);
+bool any_of_exespace_impl(const std::string& label, const ExecutionSpace& ex,
+                          InputIterator first, InputIterator last,
+                          Predicate predicate) {
+  return (find_if_or_not_exespace_impl<true>(label, ex, first, last,
+                                             predicate) != last);
 }
 
 template <class ExecutionSpace, class IteratorType, class Predicate>
-bool none_of_impl(const std::string& label, const ExecutionSpace& ex,
-                  IteratorType first, IteratorType last, Predicate predicate) {
-  return (find_if_or_not_impl<true>(label, ex, first, last, predicate) == last);
+bool none_of_exespace_impl(const std::string& label, const ExecutionSpace& ex,
+                           IteratorType first, IteratorType last,
+                           Predicate predicate) {
+  return (find_if_or_not_exespace_impl<true>(label, ex, first, last,
+                                             predicate) == last);
+}
+
+//
+// team impl
+//
+template <class TeamHandleType, class InputIterator, class Predicate>
+KOKKOS_FUNCTION bool all_of_team_impl(const TeamHandleType& teamHandle,
+                                      InputIterator first, InputIterator last,
+                                      Predicate predicate) {
+  return (find_if_or_not_team_impl<false>(teamHandle, first, last, predicate) ==
+          last);
+}
+
+template <class TeamHandleType, class InputIterator, class Predicate>
+KOKKOS_FUNCTION bool any_of_team_impl(const TeamHandleType& teamHandle,
+                                      InputIterator first, InputIterator last,
+                                      Predicate predicate) {
+  return (find_if_or_not_team_impl<true>(teamHandle, first, last, predicate) !=
+          last);
+}
+
+template <class TeamHandleType, class IteratorType, class Predicate>
+KOKKOS_FUNCTION bool none_of_team_impl(const TeamHandleType& teamHandle,
+                                       IteratorType first, IteratorType last,
+                                       Predicate predicate) {
+  return (find_if_or_not_team_impl<true>(teamHandle, first, last, predicate) ==
+          last);
 }
 
 }  // namespace Impl
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp
index 52e7625e4d2f0e55c89d8e42634a3083c8d2088a..27ce5a6fad6eb89124b2dc13462102852171cf8c 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp
@@ -29,7 +29,7 @@ struct is_admissible_to_kokkos_std_algorithms : std::false_type {};
 
 template <typename T>
 struct is_admissible_to_kokkos_std_algorithms<
-    T, std::enable_if_t< ::Kokkos::is_view<T>::value && T::rank == 1 &&
+    T, std::enable_if_t< ::Kokkos::is_view<T>::value && T::rank() == 1 &&
                          (std::is_same<typename T::traits::array_layout,
                                        Kokkos::LayoutLeft>::value ||
                           std::is_same<typename T::traits::array_layout,
@@ -55,6 +55,9 @@ using iterator_category_t = typename T::iterator_category;
 template <class T>
 using is_iterator = Kokkos::is_detected<iterator_category_t, T>;
 
+template <class T>
+inline constexpr bool is_iterator_v = is_iterator<T>::value;
+
 //
 // are_iterators
 //
@@ -63,15 +66,18 @@ struct are_iterators;
 
 template <class T>
 struct are_iterators<T> {
-  static constexpr bool value = is_iterator<T>::value;
+  static constexpr bool value = is_iterator_v<T>;
 };
 
 template <class Head, class... Tail>
 struct are_iterators<Head, Tail...> {
   static constexpr bool value =
-      are_iterators<Head>::value && are_iterators<Tail...>::value;
+      are_iterators<Head>::value && (are_iterators<Tail>::value && ... && true);
 };
 
+template <class... Ts>
+inline constexpr bool are_iterators_v = are_iterators<Ts...>::value;
+
 //
 // are_random_access_iterators
 //
@@ -81,17 +87,21 @@ struct are_random_access_iterators;
 template <class T>
 struct are_random_access_iterators<T> {
   static constexpr bool value =
-      is_iterator<T>::value &&
-      std::is_base_of<std::random_access_iterator_tag,
-                      typename T::iterator_category>::value;
+      is_iterator_v<T> && std::is_base_of<std::random_access_iterator_tag,
+                                          typename T::iterator_category>::value;
 };
 
 template <class Head, class... Tail>
 struct are_random_access_iterators<Head, Tail...> {
-  static constexpr bool value = are_random_access_iterators<Head>::value &&
-                                are_random_access_iterators<Tail...>::value;
+  static constexpr bool value =
+      are_random_access_iterators<Head>::value &&
+      (are_random_access_iterators<Tail>::value && ... && true);
 };
 
+template <class... Ts>
+inline constexpr bool are_random_access_iterators_v =
+    are_random_access_iterators<Ts...>::value;
+
 //
 // iterators_are_accessible_from
 //
@@ -113,16 +123,18 @@ struct iterators_are_accessible_from<ExeSpace, Head, Tail...> {
       iterators_are_accessible_from<ExeSpace, Tail...>::value;
 };
 
-template <class ExecutionSpace, class... IteratorTypes>
+template <class ExecutionSpaceOrTeamHandleType, class... IteratorTypes>
 KOKKOS_INLINE_FUNCTION constexpr void
-static_assert_random_access_and_accessible(const ExecutionSpace& /* ex */,
-                                           IteratorTypes... /* iterators */) {
+static_assert_random_access_and_accessible(
+    const ExecutionSpaceOrTeamHandleType& /* ex_or_th*/,
+    IteratorTypes... /* iterators */) {
   static_assert(
       are_random_access_iterators<IteratorTypes...>::value,
       "Currently, Kokkos standard algorithms require random access iterators.");
-  static_assert(
-      iterators_are_accessible_from<ExecutionSpace, IteratorTypes...>::value,
-      "Incompatible view/iterator and execution space");
+  static_assert(iterators_are_accessible_from<
+                    typename ExecutionSpaceOrTeamHandleType::execution_space,
+                    IteratorTypes...>::value,
+                "Incompatible view/iterator and execution space");
 }
 
 //
@@ -182,10 +194,10 @@ struct not_openmptarget {
 #endif
 };
 
-template <class ExecutionSpace>
+template <class ExecutionSpaceOrTeamHandleType>
 KOKKOS_INLINE_FUNCTION constexpr void static_assert_is_not_openmptarget(
-    const ExecutionSpace&) {
-  static_assert(not_openmptarget<ExecutionSpace>::value,
+    const ExecutionSpaceOrTeamHandleType& /*ex_or_th*/) {
+  static_assert(not_openmptarget<ExecutionSpaceOrTeamHandleType>::value,
                 "Currently, Kokkos standard algorithms do not support custom "
                 "comparators in OpenMPTarget");
 }
@@ -194,7 +206,8 @@ KOKKOS_INLINE_FUNCTION constexpr void static_assert_is_not_openmptarget(
 // valid range
 //
 template <class IteratorType>
-void expect_valid_range(IteratorType first, IteratorType last) {
+KOKKOS_INLINE_FUNCTION void expect_valid_range(IteratorType first,
+                                               IteratorType last) {
   // this is a no-op for release
   KOKKOS_EXPECTS(last >= first);
   // avoid compiler complaining when KOKKOS_EXPECTS is no-op
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyBackward.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyBackward.hpp
index b3adbc5e2dcf5f970bf3509944fc34c71131117d..0f68c9e978094f50e917a45fa2663765e4325969 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyBackward.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyBackward.hpp
@@ -27,16 +27,18 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
-template <class IndexType, class IteratorType1, class IteratorType2>
+template <class IteratorType1, class IteratorType2>
 struct StdCopyBackwardFunctor {
-  static_assert(std::is_signed<IndexType>::value,
-                "Kokkos: StdCopyBackwardFunctor requires signed index type");
+  // we can use difference type from IteratorType1 since
+  // the calling functions below already static assert that
+  // the iterators have matching difference type
+  using index_type = typename IteratorType1::difference_type;
 
   IteratorType1 m_last;
   IteratorType2 m_dest_last;
 
   KOKKOS_FUNCTION
-  void operator()(IndexType i) const { m_dest_last[-i - 1] = m_last[-i - 1]; }
+  void operator()(index_type i) const { m_dest_last[-i - 1] = m_last[-i - 1]; }
 
   KOKKOS_FUNCTION
   StdCopyBackwardFunctor(IteratorType1 _last, IteratorType2 _dest_last)
@@ -44,30 +46,51 @@ struct StdCopyBackwardFunctor {
 };
 
 template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-IteratorType2 copy_backward_impl(const std::string& label,
-                                 const ExecutionSpace& ex, IteratorType1 first,
-                                 IteratorType1 last, IteratorType2 d_last) {
+IteratorType2 copy_backward_exespace_impl(const std::string& label,
+                                          const ExecutionSpace& ex,
+                                          IteratorType1 first,
+                                          IteratorType1 last,
+                                          IteratorType2 d_last) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first, d_last);
   Impl::static_assert_iterators_have_matching_difference_type(first, d_last);
   Impl::expect_valid_range(first, last);
 
-  // aliases
-  using index_type = typename IteratorType1::difference_type;
-  using func_t =
-      StdCopyBackwardFunctor<index_type, IteratorType1, IteratorType2>;
-
   // run
   const auto num_elements = Kokkos::Experimental::distance(first, last);
   ::Kokkos::parallel_for(label,
                          RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                         func_t(last, d_last));
+                         // use CTAD
+                         StdCopyBackwardFunctor(last, d_last));
   ex.fence("Kokkos::copy_backward: fence after operation");
 
   // return
   return d_last - num_elements;
 }
 
+//
+// team-level impl
+//
+template <class TeamHandleType, class IteratorType1, class IteratorType2>
+KOKKOS_FUNCTION IteratorType2
+copy_backward_team_impl(const TeamHandleType& teamHandle, IteratorType1 first,
+                        IteratorType1 last, IteratorType2 d_last) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first, d_last);
+  Impl::static_assert_iterators_have_matching_difference_type(first, d_last);
+  Impl::expect_valid_range(first, last);
+
+  // run
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_for(TeamThreadRange(teamHandle, 0, num_elements),
+                         // use CTAD
+                         StdCopyBackwardFunctor(last, d_last));
+  teamHandle.team_barrier();
+
+  // return
+  return d_last - num_elements;
+}
+
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyCopyN.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyCopyN.hpp
index 1b120c46d0844da292652683b5234cfc7c8417d4..86e99ecbd0ce3107fdffa5d4a6bc7b1dd915dd6c 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyCopyN.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyCopyN.hpp
@@ -27,13 +27,18 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
-template <class IndexType, class InputIterator, class OutputIterator>
+template <class InputIterator, class OutputIterator>
 struct StdCopyFunctor {
+  // we can use difference type from InputIterator since
+  // the calling functions below already static assert that
+  // the iterators have matching difference type
+  using index_type = typename InputIterator::difference_type;
+
   InputIterator m_first;
   OutputIterator m_dest_first;
 
   KOKKOS_FUNCTION
-  void operator()(IndexType i) const { m_dest_first[i] = m_first[i]; }
+  void operator()(index_type i) const { m_dest_first[i] = m_first[i]; }
 
   KOKKOS_FUNCTION
   StdCopyFunctor(InputIterator _first, OutputIterator _dest_first)
@@ -41,23 +46,20 @@ struct StdCopyFunctor {
 };
 
 template <class ExecutionSpace, class InputIterator, class OutputIterator>
-OutputIterator copy_impl(const std::string& label, const ExecutionSpace& ex,
-                         InputIterator first, InputIterator last,
-                         OutputIterator d_first) {
+OutputIterator copy_exespace_impl(const std::string& label,
+                                  const ExecutionSpace& ex, InputIterator first,
+                                  InputIterator last, OutputIterator d_first) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first, d_first);
   Impl::static_assert_iterators_have_matching_difference_type(first, d_first);
   Impl::expect_valid_range(first, last);
 
-  // aliases
-  using index_type = typename InputIterator::difference_type;
-  using func_t     = StdCopyFunctor<index_type, InputIterator, OutputIterator>;
-
   // run
   const auto num_elements = Kokkos::Experimental::distance(first, last);
   ::Kokkos::parallel_for(label,
                          RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                         func_t(first, d_first));
+                         // use CTAD
+                         StdCopyFunctor(first, d_first));
   ex.fence("Kokkos::copy: fence after operation");
 
   // return
@@ -66,16 +68,61 @@ OutputIterator copy_impl(const std::string& label, const ExecutionSpace& ex,
 
 template <class ExecutionSpace, class InputIterator, class Size,
           class OutputIterator>
-OutputIterator copy_n_impl(const std::string& label, const ExecutionSpace& ex,
-                           InputIterator first_from, Size count,
-                           OutputIterator first_dest) {
+OutputIterator copy_n_exespace_impl(const std::string& label,
+                                    const ExecutionSpace& ex,
+                                    InputIterator first_from, Size count,
+                                    OutputIterator first_dest) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
   Impl::static_assert_iterators_have_matching_difference_type(first_from,
                                                               first_dest);
 
   if (count > 0) {
-    return copy_impl(label, ex, first_from, first_from + count, first_dest);
+    return copy_exespace_impl(label, ex, first_from, first_from + count,
+                              first_dest);
+  } else {
+    return first_dest;
+  }
+}
+
+//
+// team-level impl
+//
+template <class TeamHandleType, class InputIterator, class OutputIterator>
+KOKKOS_FUNCTION OutputIterator copy_team_impl(const TeamHandleType& teamHandle,
+                                              InputIterator first,
+                                              InputIterator last,
+                                              OutputIterator d_first) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first, d_first);
+  Impl::static_assert_iterators_have_matching_difference_type(first, d_first);
+  Impl::expect_valid_range(first, last);
+
+  // run
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_for(TeamThreadRange(teamHandle, 0, num_elements),
+                         // use CTAD
+                         StdCopyFunctor(first, d_first));
+  teamHandle.team_barrier();
+
+  // return
+  return d_first + num_elements;
+}
+
+template <class TeamHandleType, class InputIterator, class Size,
+          class OutputIterator>
+KOKKOS_FUNCTION OutputIterator
+copy_n_team_impl(const TeamHandleType& teamHandle, InputIterator first_from,
+                 Size count, OutputIterator first_dest) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first_from,
+                                                   first_dest);
+  Impl::static_assert_iterators_have_matching_difference_type(first_from,
+                                                              first_dest);
+
+  if (count > 0) {
+    return copy_team_impl(teamHandle, first_from, first_from + count,
+                          first_dest);
   } else {
     return first_dest;
   }
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp
index 3c0c4f7e9bce2f0ce3d11303566204db612739ec..3c1e2474bc9ce7dd82c1030e6f771bf2e710c174 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp
@@ -20,6 +20,7 @@
 #include <Kokkos_Core.hpp>
 #include "Kokkos_Constraints.hpp"
 #include "Kokkos_HelperPredicates.hpp"
+#include "Kokkos_MustUseKokkosSingleInTeam.hpp"
 #include <std_algorithms/Kokkos_Distance.hpp>
 #include <string>
 
@@ -27,8 +28,10 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
-template <class IndexType, class FirstFrom, class FirstDest, class PredType>
+template <class FirstFrom, class FirstDest, class PredType>
 struct StdCopyIfFunctor {
+  using index_type = typename FirstFrom::difference_type;
+
   FirstFrom m_first_from;
   FirstDest m_first_dest;
   PredType m_pred;
@@ -40,7 +43,7 @@ struct StdCopyIfFunctor {
         m_pred(std::move(pred)) {}
 
   KOKKOS_FUNCTION
-  void operator()(const IndexType i, IndexType& update,
+  void operator()(const index_type i, index_type& update,
                   const bool final_pass) const {
     const auto& myval = m_first_from[i];
     if (final_pass) {
@@ -57,9 +60,11 @@ struct StdCopyIfFunctor {
 
 template <class ExecutionSpace, class InputIterator, class OutputIterator,
           class PredicateType>
-OutputIterator copy_if_impl(const std::string& label, const ExecutionSpace& ex,
-                            InputIterator first, InputIterator last,
-                            OutputIterator d_first, PredicateType pred) {
+OutputIterator copy_if_exespace_impl(const std::string& label,
+                                     const ExecutionSpace& ex,
+                                     InputIterator first, InputIterator last,
+                                     OutputIterator d_first,
+                                     PredicateType pred) {
   /*
     To explain the impl, suppose that our data is:
 
@@ -90,23 +95,67 @@ OutputIterator copy_if_impl(const std::string& label, const ExecutionSpace& ex,
   if (first == last) {
     return d_first;
   } else {
-    // aliases
-    using index_type = typename InputIterator::difference_type;
-    using func_type  = StdCopyIfFunctor<index_type, InputIterator,
-                                       OutputIterator, PredicateType>;
-
     // run
     const auto num_elements = Kokkos::Experimental::distance(first, last);
-    index_type count        = 0;
+
+    typename InputIterator::difference_type count = 0;
     ::Kokkos::parallel_scan(label,
                             RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                            func_type(first, d_first, pred), count);
+                            // use CTAD
+                            StdCopyIfFunctor(first, d_first, pred), count);
 
     // fence not needed because of the scan accumulating into count
     return d_first + count;
   }
 }
 
+template <class TeamHandleType, class InputIterator, class OutputIterator,
+          class PredicateType>
+KOKKOS_FUNCTION OutputIterator copy_if_team_impl(
+    const TeamHandleType& teamHandle, InputIterator first, InputIterator last,
+    OutputIterator d_first, PredicateType pred) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first, d_first);
+  Impl::static_assert_iterators_have_matching_difference_type(first, d_first);
+  Impl::expect_valid_range(first, last);
+
+  if (first == last) {
+    return d_first;
+  }
+
+  const std::size_t num_elements = Kokkos::Experimental::distance(first, last);
+  if constexpr (stdalgo_must_use_kokkos_single_for_team_scan_v<
+                    typename TeamHandleType::execution_space>) {
+    std::size_t count = 0;
+    Kokkos::single(
+        Kokkos::PerTeam(teamHandle),
+        [=](std::size_t& lcount) {
+          lcount = 0;
+          for (std::size_t i = 0; i < num_elements; ++i) {
+            const auto& myval = first[i];
+            if (pred(myval)) {
+              d_first[lcount++] = myval;
+            }
+          }
+        },
+        count);
+    // no barrier needed since single above broadcasts to all members
+    return d_first + count;
+
+  } else {
+    typename InputIterator::difference_type count = 0;
+    ::Kokkos::parallel_scan(TeamThreadRange(teamHandle, 0, num_elements),
+                            StdCopyIfFunctor(first, d_first, pred), count);
+    // no barrier needed because of the scan accumulating into count
+    return d_first + count;
+  }
+
+#if defined KOKKOS_COMPILER_INTEL || \
+    (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130)
+  __builtin_unreachable();
+#endif
+}
+
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CountCountIf.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CountCountIf.hpp
index 18b8c463594f2c3da1268db5ac57c59b0f40fbef..9b6b403aa494ad55e0db3433e9ecdd6a65201be0 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CountCountIf.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CountCountIf.hpp
@@ -46,37 +46,65 @@ struct StdCountIfFunctor {
 };
 
 template <class ExecutionSpace, class IteratorType, class Predicate>
-typename IteratorType::difference_type count_if_impl(const std::string& label,
-                                                     const ExecutionSpace& ex,
-                                                     IteratorType first,
-                                                     IteratorType last,
-                                                     Predicate predicate) {
+typename IteratorType::difference_type count_if_exespace_impl(
+    const std::string& label, const ExecutionSpace& ex, IteratorType first,
+    IteratorType last, Predicate predicate) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first);
   Impl::expect_valid_range(first, last);
 
-  // aliases
-  using func_t = StdCountIfFunctor<IteratorType, Predicate>;
-
   // run
   const auto num_elements = Kokkos::Experimental::distance(first, last);
   typename IteratorType::difference_type count = 0;
   ::Kokkos::parallel_reduce(label,
                             RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                            func_t(first, predicate), count);
+                            // use CTAD
+                            StdCountIfFunctor(first, predicate), count);
   ex.fence("Kokkos::count_if: fence after operation");
 
   return count;
 }
 
 template <class ExecutionSpace, class IteratorType, class T>
-auto count_impl(const std::string& label, const ExecutionSpace& ex,
-                IteratorType first, IteratorType last, const T& value) {
-  return count_if_impl(
+auto count_exespace_impl(const std::string& label, const ExecutionSpace& ex,
+                         IteratorType first, IteratorType last,
+                         const T& value) {
+  return count_if_exespace_impl(
       label, ex, first, last,
       ::Kokkos::Experimental::Impl::StdAlgoEqualsValUnaryPredicate<T>(value));
 }
 
+//
+// team-level impl
+//
+template <class TeamHandleType, class IteratorType, class Predicate>
+KOKKOS_FUNCTION typename IteratorType::difference_type count_if_team_impl(
+    const TeamHandleType& teamHandle, IteratorType first, IteratorType last,
+    Predicate predicate) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first);
+  Impl::expect_valid_range(first, last);
+
+  // run
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  typename IteratorType::difference_type count = 0;
+  ::Kokkos::parallel_reduce(TeamThreadRange(teamHandle, 0, num_elements),
+                            // use CTAD
+                            StdCountIfFunctor(first, predicate), count);
+  teamHandle.team_barrier();
+
+  return count;
+}
+
+template <class TeamHandleType, class IteratorType, class T>
+KOKKOS_FUNCTION auto count_team_impl(const TeamHandleType& teamHandle,
+                                     IteratorType first, IteratorType last,
+                                     const T& value) {
+  return count_if_team_impl(
+      teamHandle, first, last,
+      ::Kokkos::Experimental::Impl::StdAlgoEqualsValUnaryPredicate<T>(value));
+}
+
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Equal.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Equal.hpp
index e045080d4a6a3d8bf6a78f3041a31681c0491027..62b7d226f6363436d359ca55623d081ca5766d21 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Equal.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Equal.hpp
@@ -27,15 +27,16 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
-template <class IndexType, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
+template <class IteratorType1, class IteratorType2, class BinaryPredicateType>
 struct StdEqualFunctor {
+  using index_type = typename IteratorType1::difference_type;
+
   IteratorType1 m_first1;
   IteratorType2 m_first2;
   BinaryPredicateType m_predicate;
 
   KOKKOS_FUNCTION
-  void operator()(IndexType i, std::size_t& lsum) const {
+  void operator()(index_type i, std::size_t& lsum) const {
     if (!m_predicate(m_first1[i], m_first2[i])) {
       lsum = 1;
     }
@@ -49,67 +50,130 @@ struct StdEqualFunctor {
         m_predicate(std::move(_predicate)) {}
 };
 
+//
+// exespace impl
+//
 template <class ExecutionSpace, class IteratorType1, class IteratorType2,
           class BinaryPredicateType>
-bool equal_impl(const std::string& label, const ExecutionSpace& ex,
-                IteratorType1 first1, IteratorType1 last1, IteratorType2 first2,
-                BinaryPredicateType predicate) {
+bool equal_exespace_impl(const std::string& label, const ExecutionSpace& ex,
+                         IteratorType1 first1, IteratorType1 last1,
+                         IteratorType2 first2, BinaryPredicateType predicate) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first1, first2);
   Impl::static_assert_iterators_have_matching_difference_type(first1, first2);
   Impl::expect_valid_range(first1, last1);
 
-  // aliases
-  using index_type = typename IteratorType1::difference_type;
-  using func_t     = StdEqualFunctor<index_type, IteratorType1, IteratorType2,
-                                 BinaryPredicateType>;
-
   // run
   const auto num_elements = Kokkos::Experimental::distance(first1, last1);
   std::size_t different   = 0;
-  ::Kokkos::parallel_reduce(label,
-                            RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                            func_t(first1, first2, predicate), different);
+  ::Kokkos::parallel_reduce(
+      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+      StdEqualFunctor(first1, first2, predicate), different);
   ex.fence("Kokkos::equal: fence after operation");
 
   return !different;
 }
 
 template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-bool equal_impl(const std::string& label, const ExecutionSpace& ex,
-                IteratorType1 first1, IteratorType1 last1,
-                IteratorType2 first2) {
+bool equal_exespace_impl(const std::string& label, const ExecutionSpace& ex,
+                         IteratorType1 first1, IteratorType1 last1,
+                         IteratorType2 first2) {
   using value_type1 = typename IteratorType1::value_type;
   using value_type2 = typename IteratorType2::value_type;
   using pred_t      = StdAlgoEqualBinaryPredicate<value_type1, value_type2>;
-  return equal_impl(label, ex, first1, last1, first2, pred_t());
+  return equal_exespace_impl(label, ex, first1, last1, first2, pred_t());
 }
 
 template <class ExecutionSpace, class IteratorType1, class IteratorType2,
           class BinaryPredicateType>
-bool equal_impl(const std::string& label, const ExecutionSpace& ex,
-                IteratorType1 first1, IteratorType1 last1, IteratorType2 first2,
-                IteratorType2 last2, BinaryPredicateType predicate) {
+bool equal_exespace_impl(const std::string& label, const ExecutionSpace& ex,
+                         IteratorType1 first1, IteratorType1 last1,
+                         IteratorType2 first2, IteratorType2 last2,
+                         BinaryPredicateType predicate) {
   const auto d1 = ::Kokkos::Experimental::distance(first1, last1);
   const auto d2 = ::Kokkos::Experimental::distance(first2, last2);
   if (d1 != d2) {
     return false;
   }
 
-  return equal_impl(label, ex, first1, last1, first2, predicate);
+  return equal_exespace_impl(label, ex, first1, last1, first2, predicate);
 }
 
 template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-bool equal_impl(const std::string& label, const ExecutionSpace& ex,
-                IteratorType1 first1, IteratorType1 last1, IteratorType2 first2,
-                IteratorType2 last2) {
+bool equal_exespace_impl(const std::string& label, const ExecutionSpace& ex,
+                         IteratorType1 first1, IteratorType1 last1,
+                         IteratorType2 first2, IteratorType2 last2) {
+  Impl::expect_valid_range(first1, last1);
+  Impl::expect_valid_range(first2, last2);
+
+  using value_type1 = typename IteratorType1::value_type;
+  using value_type2 = typename IteratorType2::value_type;
+  using pred_t      = StdAlgoEqualBinaryPredicate<value_type1, value_type2>;
+  return equal_exespace_impl(label, ex, first1, last1, first2, last2, pred_t());
+}
+
+//
+// team impl
+//
+template <class TeamHandleType, class IteratorType1, class IteratorType2,
+          class BinaryPredicateType>
+KOKKOS_FUNCTION bool equal_team_impl(const TeamHandleType& teamHandle,
+                                     IteratorType1 first1, IteratorType1 last1,
+                                     IteratorType2 first2,
+                                     BinaryPredicateType predicate) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first1, first2);
+  Impl::static_assert_iterators_have_matching_difference_type(first1, first2);
+  Impl::expect_valid_range(first1, last1);
+
+  // run
+  const auto num_elements = Kokkos::Experimental::distance(first1, last1);
+  std::size_t different   = 0;
+  ::Kokkos::parallel_reduce(TeamThreadRange(teamHandle, 0, num_elements),
+                            StdEqualFunctor(first1, first2, predicate),
+                            different);
+  teamHandle.team_barrier();
+
+  return !different;
+}
+
+template <class TeamHandleType, class IteratorType1, class IteratorType2>
+KOKKOS_FUNCTION bool equal_team_impl(const TeamHandleType& teamHandle,
+                                     IteratorType1 first1, IteratorType1 last1,
+                                     IteratorType2 first2) {
+  using value_type1 = typename IteratorType1::value_type;
+  using value_type2 = typename IteratorType2::value_type;
+  using pred_t      = StdAlgoEqualBinaryPredicate<value_type1, value_type2>;
+  return equal_team_impl(teamHandle, first1, last1, first2, pred_t());
+}
+
+template <class TeamHandleType, class IteratorType1, class IteratorType2,
+          class BinaryPredicateType>
+KOKKOS_FUNCTION bool equal_team_impl(const TeamHandleType& teamHandle,
+                                     IteratorType1 first1, IteratorType1 last1,
+                                     IteratorType2 first2, IteratorType2 last2,
+                                     BinaryPredicateType predicate) {
+  const auto d1 = ::Kokkos::Experimental::distance(first1, last1);
+  const auto d2 = ::Kokkos::Experimental::distance(first2, last2);
+  if (d1 != d2) {
+    return false;
+  }
+
+  return equal_team_impl(teamHandle, first1, last1, first2, predicate);
+}
+
+template <class TeamHandleType, class IteratorType1, class IteratorType2>
+KOKKOS_FUNCTION bool equal_team_impl(const TeamHandleType& teamHandle,
+                                     IteratorType1 first1, IteratorType1 last1,
+                                     IteratorType2 first2,
+                                     IteratorType2 last2) {
   Impl::expect_valid_range(first1, last1);
   Impl::expect_valid_range(first2, last2);
 
   using value_type1 = typename IteratorType1::value_type;
   using value_type2 = typename IteratorType2::value_type;
   using pred_t      = StdAlgoEqualBinaryPredicate<value_type1, value_type2>;
-  return equal_impl(label, ex, first1, last1, first2, last2, pred_t());
+  return equal_team_impl(teamHandle, first1, last1, first2, last2, pred_t());
 }
 
 }  // namespace Impl
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ExclusiveScan.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ExclusiveScan.hpp
index f2bfa23ccdcc1489d07ccb4af9db249dd19180cc..6da992b4bbe586335ad547f684708654d121f07f 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ExclusiveScan.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ExclusiveScan.hpp
@@ -22,6 +22,7 @@
 #include "Kokkos_HelperPredicates.hpp"
 #include "Kokkos_ValueWrapperForNoNeutralElement.hpp"
 #include "Kokkos_IdentityReferenceUnaryFunctor.hpp"
+#include "Kokkos_FunctorsForExclusiveScan.hpp"
 #include <std_algorithms/Kokkos_TransformExclusiveScan.hpp>
 #include <std_algorithms/Kokkos_Distance.hpp>
 #include <string>
@@ -30,125 +31,15 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
-template <class ExeSpace, class IndexType, class ValueType, class FirstFrom,
-          class FirstDest>
-struct ExclusiveScanDefaultFunctorForKnownNeutralElement {
-  using execution_space = ExeSpace;
-
-  ValueType m_init_value;
-  FirstFrom m_first_from;
-  FirstDest m_first_dest;
-
-  KOKKOS_FUNCTION
-  ExclusiveScanDefaultFunctorForKnownNeutralElement(ValueType init,
-                                                    FirstFrom first_from,
-                                                    FirstDest first_dest)
-      : m_init_value(std::move(init)),
-        m_first_from(std::move(first_from)),
-        m_first_dest(std::move(first_dest)) {}
-
-  KOKKOS_FUNCTION
-  void operator()(const IndexType i, ValueType& update,
-                  const bool final_pass) const {
-    if (final_pass) m_first_dest[i] = update + m_init_value;
-    update += m_first_from[i];
-  }
-};
-
-template <class ExeSpace, class IndexType, class ValueType, class FirstFrom,
-          class FirstDest>
-struct ExclusiveScanDefaultFunctor {
-  using execution_space = ExeSpace;
-  using value_type =
-      ::Kokkos::Experimental::Impl::ValueWrapperForNoNeutralElement<ValueType>;
-
-  ValueType m_init_value;
-  FirstFrom m_first_from;
-  FirstDest m_first_dest;
-
-  KOKKOS_FUNCTION
-  ExclusiveScanDefaultFunctor(ValueType init, FirstFrom first_from,
-                              FirstDest first_dest)
-      : m_init_value(std::move(init)),
-        m_first_from(std::move(first_from)),
-        m_first_dest(std::move(first_dest)) {}
-
-  KOKKOS_FUNCTION
-  void operator()(const IndexType i, value_type& update,
-                  const bool final_pass) const {
-    if (final_pass) {
-      if (i == 0) {
-        m_first_dest[i] = m_init_value;
-      } else {
-        m_first_dest[i] = update.val + m_init_value;
-      }
-    }
-
-    const auto tmp = value_type{m_first_from[i], false};
-    this->join(update, tmp);
-  }
-
-  KOKKOS_FUNCTION
-  void init(value_type& update) const {
-    update.val        = {};
-    update.is_initial = true;
-  }
-
-  KOKKOS_FUNCTION
-  void join(value_type& update, const value_type& input) const {
-    if (update.is_initial) {
-      update.val        = input.val;
-      update.is_initial = false;
-    } else {
-      update.val = update.val + input.val;
-    }
-  }
-};
-
+//
+// exespace impl
+//
 template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class ValueType, class BinaryOpType>
-OutputIteratorType exclusive_scan_custom_op_impl(
+          class OutputIteratorType, class ValueType>
+OutputIteratorType exclusive_scan_default_op_exespace_impl(
     const std::string& label, const ExecutionSpace& ex,
     InputIteratorType first_from, InputIteratorType last_from,
-    OutputIteratorType first_dest, ValueType init_value, BinaryOpType bop) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
-  Impl::static_assert_iterators_have_matching_difference_type(first_from,
-                                                              first_dest);
-  Impl::expect_valid_range(first_from, last_from);
-
-  // aliases
-  using index_type    = typename InputIteratorType::difference_type;
-  using unary_op_type = StdNumericScanIdentityReferenceUnaryFunctor<ValueType>;
-  using func_type =
-      TransformExclusiveScanFunctor<ExecutionSpace, index_type, ValueType,
-                                    InputIteratorType, OutputIteratorType,
-                                    BinaryOpType, unary_op_type>;
-
-  // run
-  const auto num_elements =
-      Kokkos::Experimental::distance(first_from, last_from);
-  ::Kokkos::parallel_scan(
-      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-      func_type(init_value, first_from, first_dest, bop, unary_op_type()));
-  ex.fence("Kokkos::exclusive_scan_custom_op: fence after operation");
-
-  // return
-  return first_dest + num_elements;
-}
-
-template <typename ValueType>
-using ex_scan_has_reduction_identity_sum_t =
-    decltype(Kokkos::reduction_identity<ValueType>::sum());
-
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class ValueType>
-OutputIteratorType exclusive_scan_default_op_impl(const std::string& label,
-                                                  const ExecutionSpace& ex,
-                                                  InputIteratorType first_from,
-                                                  InputIteratorType last_from,
-                                                  OutputIteratorType first_dest,
-                                                  ValueType init_value) {
+    OutputIteratorType first_dest, ValueType init_value) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
   Impl::static_assert_iterators_have_matching_difference_type(first_from,
@@ -182,17 +73,122 @@ OutputIteratorType exclusive_scan_default_op_impl(const std::string& label,
       ExclusiveScanDefaultFunctorForKnownNeutralElement<
           ExecutionSpace, index_type, ValueType, InputIteratorType,
           OutputIteratorType>,
-      ExclusiveScanDefaultFunctor<ExecutionSpace, index_type, ValueType,
-                                  InputIteratorType, OutputIteratorType>>;
+      ExclusiveScanDefaultFunctorWithValueWrapper<ExecutionSpace, index_type,
+                                                  ValueType, InputIteratorType,
+                                                  OutputIteratorType>>;
+
+  // run
+  const auto num_elements =
+      Kokkos::Experimental::distance(first_from, last_from);
+  ::Kokkos::parallel_scan(
+      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+      func_type(std::move(init_value), first_from, first_dest));
+
+  ex.fence("Kokkos::exclusive_scan_default_op: fence after operation");
+
+  return first_dest + num_elements;
+}
+
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorType, class ValueType, class BinaryOpType>
+OutputIteratorType exclusive_scan_custom_op_exespace_impl(
+    const std::string& label, const ExecutionSpace& ex,
+    InputIteratorType first_from, InputIteratorType last_from,
+    OutputIteratorType first_dest, ValueType init_value, BinaryOpType bop) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
+  Impl::static_assert_iterators_have_matching_difference_type(first_from,
+                                                              first_dest);
+  Impl::expect_valid_range(first_from, last_from);
+
+  // aliases
+  using index_type    = typename InputIteratorType::difference_type;
+  using unary_op_type = StdNumericScanIdentityReferenceUnaryFunctor<ValueType>;
+  using func_type     = TransformExclusiveScanFunctorWithValueWrapper<
+      ExecutionSpace, index_type, ValueType, InputIteratorType,
+      OutputIteratorType, BinaryOpType, unary_op_type>;
 
   // run
   const auto num_elements =
       Kokkos::Experimental::distance(first_from, last_from);
   ::Kokkos::parallel_scan(label,
                           RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                          func_type(init_value, first_from, first_dest));
+                          func_type(std::move(init_value), first_from,
+                                    first_dest, bop, unary_op_type()));
+  ex.fence("Kokkos::exclusive_scan_custom_op: fence after operation");
 
-  ex.fence("Kokkos::exclusive_scan_default_op: fence after operation");
+  // return
+  return first_dest + num_elements;
+}
+
+//
+// team impl
+//
+template <class TeamHandleType, class InputIteratorType,
+          class OutputIteratorType, class ValueType>
+KOKKOS_FUNCTION OutputIteratorType exclusive_scan_default_op_team_impl(
+    const TeamHandleType& teamHandle, InputIteratorType first_from,
+    InputIteratorType last_from, OutputIteratorType first_dest,
+    ValueType init_value) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first_from,
+                                                   first_dest);
+  Impl::static_assert_iterators_have_matching_difference_type(first_from,
+                                                              first_dest);
+  Impl::expect_valid_range(first_from, last_from);
+
+  static_assert(
+      ::Kokkos::is_detected_v<ex_scan_has_reduction_identity_sum_t, ValueType>,
+      "The team-level impl of Kokkos::Experimental::exclusive_scan currently "
+      "does not support types without reduction identity");
+
+  // aliases
+  using exe_space  = typename TeamHandleType::execution_space;
+  using index_type = typename InputIteratorType::difference_type;
+  using func_type  = ExclusiveScanDefaultFunctorForKnownNeutralElement<
+      exe_space, index_type, ValueType, InputIteratorType, OutputIteratorType>;
+
+  const auto num_elements =
+      Kokkos::Experimental::distance(first_from, last_from);
+  ::Kokkos::parallel_scan(
+      TeamThreadRange(teamHandle, 0, num_elements),
+      func_type(std::move(init_value), first_from, first_dest));
+  teamHandle.team_barrier();
+  return first_dest + num_elements;
+}
+
+template <class TeamHandleType, class InputIteratorType,
+          class OutputIteratorType, class ValueType, class BinaryOpType>
+KOKKOS_FUNCTION OutputIteratorType exclusive_scan_custom_op_team_impl(
+    const TeamHandleType& teamHandle, InputIteratorType first_from,
+    InputIteratorType last_from, OutputIteratorType first_dest,
+    ValueType init_value, BinaryOpType bop) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first_from,
+                                                   first_dest);
+  Impl::static_assert_iterators_have_matching_difference_type(first_from,
+                                                              first_dest);
+  Impl::expect_valid_range(first_from, last_from);
+
+  static_assert(
+      ::Kokkos::is_detected_v<ex_scan_has_reduction_identity_sum_t, ValueType>,
+      "The team-level impl of Kokkos::Experimental::exclusive_scan currently "
+      "does not support types without reduction identity");
+
+  // aliases
+  using exe_space     = typename TeamHandleType::execution_space;
+  using unary_op_type = StdNumericScanIdentityReferenceUnaryFunctor<ValueType>;
+  using index_type    = typename InputIteratorType::difference_type;
+  using func_type     = TransformExclusiveScanFunctorWithoutValueWrapper<
+      exe_space, index_type, ValueType, InputIteratorType, OutputIteratorType,
+      BinaryOpType, unary_op_type>;
+
+  const auto num_elements =
+      Kokkos::Experimental::distance(first_from, last_from);
+  ::Kokkos::parallel_scan(TeamThreadRange(teamHandle, 0, num_elements),
+                          func_type(std::move(init_value), first_from,
+                                    first_dest, bop, unary_op_type()));
+  teamHandle.team_barrier();
 
   return first_dest + num_elements;
 }
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FillFillN.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FillFillN.hpp
index 316d865f316225528987b038a4b623816ffdfd76..972e57f2ccbe08f97adc83bd31ce01cbb93dfa66 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FillFillN.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FillFillN.hpp
@@ -41,9 +41,12 @@ struct StdFillFunctor {
       : m_first(std::move(_first)), m_value(std::move(_value)) {}
 };
 
+//
+// exespace impl
+//
 template <class ExecutionSpace, class IteratorType, class T>
-void fill_impl(const std::string& label, const ExecutionSpace& ex,
-               IteratorType first, IteratorType last, const T& value) {
+void fill_exespace_impl(const std::string& label, const ExecutionSpace& ex,
+                        IteratorType first, IteratorType last, const T& value) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first);
   Impl::expect_valid_range(first, last);
@@ -52,13 +55,14 @@ void fill_impl(const std::string& label, const ExecutionSpace& ex,
   const auto num_elements = Kokkos::Experimental::distance(first, last);
   ::Kokkos::parallel_for(label,
                          RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                         StdFillFunctor<IteratorType, T>(first, value));
+                         StdFillFunctor(first, value));
   ex.fence("Kokkos::fill: fence after operation");
 }
 
 template <class ExecutionSpace, class IteratorType, class SizeType, class T>
-IteratorType fill_n_impl(const std::string& label, const ExecutionSpace& ex,
-                         IteratorType first, SizeType n, const T& value) {
+IteratorType fill_n_exespace_impl(const std::string& label,
+                                  const ExecutionSpace& ex, IteratorType first,
+                                  SizeType n, const T& value) {
   auto last = first + n;
   Impl::static_assert_random_access_and_accessible(ex, first);
   Impl::expect_valid_range(first, last);
@@ -67,7 +71,40 @@ IteratorType fill_n_impl(const std::string& label, const ExecutionSpace& ex,
     return first;
   }
 
-  fill_impl(label, ex, first, last, value);
+  fill_exespace_impl(label, ex, first, last, value);
+  return last;
+}
+
+//
+// team-level impl
+//
+template <class TeamHandleType, class IteratorType, class T>
+KOKKOS_FUNCTION void fill_team_impl(const TeamHandleType& teamHandle,
+                                    IteratorType first, IteratorType last,
+                                    const T& value) {
+  Impl::static_assert_random_access_and_accessible(teamHandle, first);
+  Impl::expect_valid_range(first, last);
+
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_for(TeamThreadRange(teamHandle, 0, num_elements),
+                         StdFillFunctor(first, value));
+
+  teamHandle.team_barrier();
+}
+
+template <class TeamHandleType, class IteratorType, class SizeType, class T>
+KOKKOS_FUNCTION IteratorType fill_n_team_impl(const TeamHandleType& teamHandle,
+                                              IteratorType first, SizeType n,
+                                              const T& value) {
+  auto last = first + n;
+  Impl::static_assert_random_access_and_accessible(teamHandle, first);
+  Impl::expect_valid_range(first, last);
+
+  if (n <= 0) {
+    return first;
+  }
+
+  fill_team_impl(teamHandle, first, last, value);
   return last;
 }
 
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FindEnd.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FindEnd.hpp
index 3fa41af8ea3271eddbe322a93f49395462463e2c..1f1ec5e54f64badfe080e8f18374e2c912794984 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FindEnd.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FindEnd.hpp
@@ -59,9 +59,11 @@ struct StdFindEndFunctor {
       }
     }
 
-    const auto rv =
-        found ? red_value_type{i}
-              : red_value_type{::Kokkos::reduction_identity<IndexType>::max()};
+    // FIXME_NVHPC using a ternary operator causes problems
+    red_value_type rv = {::Kokkos::reduction_identity<IndexType>::max()};
+    if (found) {
+      rv.max_loc_true = i;
+    }
 
     m_reducer.join(red_value, rv);
   }
@@ -78,12 +80,17 @@ struct StdFindEndFunctor {
         m_p(std::move(p)) {}
 };
 
+//
+// exespace impl
+//
 template <class ExecutionSpace, class IteratorType1, class IteratorType2,
           class BinaryPredicateType>
-IteratorType1 find_end_impl(const std::string& label, const ExecutionSpace& ex,
-                            IteratorType1 first, IteratorType1 last,
-                            IteratorType2 s_first, IteratorType2 s_last,
-                            const BinaryPredicateType& pred) {
+IteratorType1 find_end_exespace_impl(const std::string& label,
+                                     const ExecutionSpace& ex,
+                                     IteratorType1 first, IteratorType1 last,
+                                     IteratorType2 s_first,
+                                     IteratorType2 s_last,
+                                     const BinaryPredicateType& pred) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first, s_first);
   Impl::static_assert_iterators_have_matching_difference_type(first, s_first);
@@ -95,7 +102,6 @@ IteratorType1 find_end_impl(const std::string& label, const ExecutionSpace& ex,
   const auto num_elements = KE::distance(first, last);
   const auto s_count      = KE::distance(s_first, s_last);
   KOKKOS_EXPECTS(num_elements >= s_count);
-  (void)s_count;  // needed when macro above is a no-op
 
   if (s_first == s_last) {
     return last;
@@ -107,7 +113,8 @@ IteratorType1 find_end_impl(const std::string& label, const ExecutionSpace& ex,
 
   // special case where the two ranges have equal size
   if (num_elements == s_count) {
-    const auto equal_result = equal_impl(label, ex, first, last, s_first, pred);
+    const auto equal_result =
+        equal_exespace_impl(label, ex, first, last, s_first, pred);
     return (equal_result) ? first : last;
   } else {
     using index_type           = typename IteratorType1::difference_type;
@@ -146,14 +153,97 @@ IteratorType1 find_end_impl(const std::string& label, const ExecutionSpace& ex,
 }
 
 template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-IteratorType1 find_end_impl(const std::string& label, const ExecutionSpace& ex,
-                            IteratorType1 first, IteratorType1 last,
-                            IteratorType2 s_first, IteratorType2 s_last) {
+IteratorType1 find_end_exespace_impl(const std::string& label,
+                                     const ExecutionSpace& ex,
+                                     IteratorType1 first, IteratorType1 last,
+                                     IteratorType2 s_first,
+                                     IteratorType2 s_last) {
+  using value_type1    = typename IteratorType1::value_type;
+  using value_type2    = typename IteratorType2::value_type;
+  using predicate_type = StdAlgoEqualBinaryPredicate<value_type1, value_type2>;
+  return find_end_exespace_impl(label, ex, first, last, s_first, s_last,
+                                predicate_type());
+}
+
+//
+// team impl
+//
+template <class TeamHandleType, class IteratorType1, class IteratorType2,
+          class BinaryPredicateType>
+KOKKOS_FUNCTION IteratorType1
+find_end_team_impl(const TeamHandleType& teamHandle, IteratorType1 first,
+                   IteratorType1 last, IteratorType2 s_first,
+                   IteratorType2 s_last, const BinaryPredicateType& pred) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first, s_first);
+  Impl::static_assert_iterators_have_matching_difference_type(first, s_first);
+  Impl::expect_valid_range(first, last);
+  Impl::expect_valid_range(s_first, s_last);
+
+  // the target sequence should not be larger than the range [first, last)
+  namespace KE            = ::Kokkos::Experimental;
+  const auto num_elements = KE::distance(first, last);
+  const auto s_count      = KE::distance(s_first, s_last);
+  KOKKOS_EXPECTS(num_elements >= s_count);
+
+  if (s_first == s_last) {
+    return last;
+  }
+
+  if (first == last) {
+    return last;
+  }
+
+  // special case where the two ranges have equal size
+  if (num_elements == s_count) {
+    const auto equal_result =
+        equal_team_impl(teamHandle, first, last, s_first, pred);
+    return (equal_result) ? first : last;
+  } else {
+    using index_type           = typename IteratorType1::difference_type;
+    using reducer_type         = LastLoc<index_type>;
+    using reduction_value_type = typename reducer_type::value_type;
+    using func_t = StdFindEndFunctor<index_type, IteratorType1, IteratorType2,
+                                     reducer_type, BinaryPredicateType>;
+
+    // run
+    reduction_value_type red_result;
+    reducer_type reducer(red_result);
+
+    // decide the size of the range policy of the par_red:
+    // note that the last feasible index to start looking is the index
+    // whose distance from the "last" is equal to the sequence count.
+    // the +1 is because we need to include that location too.
+    const auto range_size = num_elements - s_count + 1;
+
+    // run par reduce
+    ::Kokkos::parallel_reduce(
+        TeamThreadRange(teamHandle, 0, range_size),
+        func_t(first, last, s_first, s_last, reducer, pred), reducer);
+
+    teamHandle.team_barrier();
+
+    // decide and return
+    if (red_result.max_loc_true ==
+        ::Kokkos::reduction_identity<index_type>::max()) {
+      // if here, a subrange has not been found
+      return last;
+    } else {
+      // a location has been found
+      return first + red_result.max_loc_true;
+    }
+  }
+}
+
+template <class TeamHandleType, class IteratorType1, class IteratorType2>
+KOKKOS_FUNCTION IteratorType1 find_end_team_impl(
+    const TeamHandleType& teamHandle, IteratorType1 first, IteratorType1 last,
+    IteratorType2 s_first, IteratorType2 s_last) {
   using value_type1    = typename IteratorType1::value_type;
   using value_type2    = typename IteratorType2::value_type;
   using predicate_type = StdAlgoEqualBinaryPredicate<value_type1, value_type2>;
-  return find_end_impl(label, ex, first, last, s_first, s_last,
-                       predicate_type());
+  return find_end_team_impl(teamHandle, first, last, s_first, s_last,
+                            predicate_type());
 }
 
 }  // namespace Impl
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FindFirstOf.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FindFirstOf.hpp
index df10da2fd55e513112f9a2cf3ca0c710556e1adb..145e235b9ddf2b2bc8fc0ae1e4c3e25aec73a962 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FindFirstOf.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FindFirstOf.hpp
@@ -52,10 +52,11 @@ struct StdFindFirstOfFunctor {
       }
     }
 
-    const auto rv =
-        found ? red_value_type{i}
-              : red_value_type{::Kokkos::reduction_identity<IndexType>::min()};
-
+    // FIXME_NVHPC using a ternary operator causes problems
+    red_value_type rv = {::Kokkos::reduction_identity<IndexType>::min()};
+    if (found) {
+      rv.min_loc_true = i;
+    }
     m_reducer.join(red_value, rv);
   }
 
@@ -70,13 +71,15 @@ struct StdFindFirstOfFunctor {
         m_p(std::move(p)) {}
 };
 
+//
+// exespace impl
+//
 template <class ExecutionSpace, class IteratorType1, class IteratorType2,
           class BinaryPredicateType>
-IteratorType1 find_first_of_impl(const std::string& label,
-                                 const ExecutionSpace& ex, IteratorType1 first,
-                                 IteratorType1 last, IteratorType2 s_first,
-                                 IteratorType2 s_last,
-                                 const BinaryPredicateType& pred) {
+IteratorType1 find_first_of_exespace_impl(
+    const std::string& label, const ExecutionSpace& ex, IteratorType1 first,
+    IteratorType1 last, IteratorType2 s_first, IteratorType2 s_last,
+    const BinaryPredicateType& pred) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first, s_first);
   Impl::static_assert_iterators_have_matching_difference_type(first, s_first);
@@ -115,15 +118,71 @@ IteratorType1 find_first_of_impl(const std::string& label,
 }
 
 template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-IteratorType1 find_first_of_impl(const std::string& label,
-                                 const ExecutionSpace& ex, IteratorType1 first,
-                                 IteratorType1 last, IteratorType2 s_first,
-                                 IteratorType2 s_last) {
+IteratorType1 find_first_of_exespace_impl(
+    const std::string& label, const ExecutionSpace& ex, IteratorType1 first,
+    IteratorType1 last, IteratorType2 s_first, IteratorType2 s_last) {
+  using value_type1    = typename IteratorType1::value_type;
+  using value_type2    = typename IteratorType2::value_type;
+  using predicate_type = StdAlgoEqualBinaryPredicate<value_type1, value_type2>;
+  return find_first_of_exespace_impl(label, ex, first, last, s_first, s_last,
+                                     predicate_type());
+}
+
+//
+// team impl
+//
+template <class TeamHandleType, class IteratorType1, class IteratorType2,
+          class BinaryPredicateType>
+KOKKOS_FUNCTION IteratorType1
+find_first_of_team_impl(const TeamHandleType& teamHandle, IteratorType1 first,
+                        IteratorType1 last, IteratorType2 s_first,
+                        IteratorType2 s_last, const BinaryPredicateType& pred) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first, s_first);
+  Impl::static_assert_iterators_have_matching_difference_type(first, s_first);
+  Impl::expect_valid_range(first, last);
+  Impl::expect_valid_range(s_first, s_last);
+
+  if ((s_first == s_last) || (first == last)) {
+    return last;
+  }
+
+  using index_type           = typename IteratorType1::difference_type;
+  using reducer_type         = FirstLoc<index_type>;
+  using reduction_value_type = typename reducer_type::value_type;
+  using func_t = StdFindFirstOfFunctor<index_type, IteratorType1, IteratorType2,
+                                       reducer_type, BinaryPredicateType>;
+
+  // run
+  reduction_value_type red_result;
+  reducer_type reducer(red_result);
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_reduce(TeamThreadRange(teamHandle, 0, num_elements),
+                            func_t(first, s_first, s_last, reducer, pred),
+                            reducer);
+
+  teamHandle.team_barrier();
+
+  // decide and return
+  if (red_result.min_loc_true ==
+      ::Kokkos::reduction_identity<index_type>::min()) {
+    // if here, nothing found
+    return last;
+  } else {
+    // a location has been found
+    return first + red_result.min_loc_true;
+  }
+}
+
+template <class TeamHandleType, class IteratorType1, class IteratorType2>
+KOKKOS_FUNCTION IteratorType1 find_first_of_team_impl(
+    const TeamHandleType& teamHandle, IteratorType1 first, IteratorType1 last,
+    IteratorType2 s_first, IteratorType2 s_last) {
   using value_type1    = typename IteratorType1::value_type;
   using value_type2    = typename IteratorType2::value_type;
   using predicate_type = StdAlgoEqualBinaryPredicate<value_type1, value_type2>;
-  return find_first_of_impl(label, ex, first, last, s_first, s_last,
-                            predicate_type());
+  return find_first_of_team_impl(teamHandle, first, last, s_first, s_last,
+                                 predicate_type());
 }
 
 }  // namespace Impl
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FindIfOrNot.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FindIfOrNot.hpp
index f7ec4b1110c48a616cc451b04edad2f69616bfe0..8fffb59094a023aa003388aa8eee4c671c0570bf 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FindIfOrNot.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FindIfOrNot.hpp
@@ -44,10 +44,11 @@ struct StdFindIfOrNotFunctor {
     // if doing find_if_not, look for when predicate is false
     const bool found_condition = is_find_if ? m_p(my_value) : !m_p(my_value);
 
-    auto rv =
-        found_condition
-            ? red_value_type{i}
-            : red_value_type{::Kokkos::reduction_identity<IndexType>::min()};
+    // FIXME_NVHPC using a ternary operator causes problems
+    red_value_type rv = {::Kokkos::reduction_identity<IndexType>::min()};
+    if (found_condition) {
+      rv.min_loc_true = i;
+    }
 
     m_reducer.join(red_value, rv);
   }
@@ -60,11 +61,15 @@ struct StdFindIfOrNotFunctor {
         m_p(std::move(p)) {}
 };
 
+//
+// exespace impl
+//
 template <bool is_find_if, class ExecutionSpace, class IteratorType,
           class PredicateType>
-IteratorType find_if_or_not_impl(const std::string& label,
-                                 const ExecutionSpace& ex, IteratorType first,
-                                 IteratorType last, PredicateType pred) {
+IteratorType find_if_or_not_exespace_impl(const std::string& label,
+                                          const ExecutionSpace& ex,
+                                          IteratorType first, IteratorType last,
+                                          PredicateType pred) {
   // checks
   Impl::static_assert_random_access_and_accessible(
       ex, first);  // only need one It per type
@@ -103,14 +108,68 @@ IteratorType find_if_or_not_impl(const std::string& label,
 }
 
 template <class ExecutionSpace, class InputIterator, class T>
-InputIterator find_impl(const std::string& label, ExecutionSpace ex,
-                        InputIterator first, InputIterator last,
-                        const T& value) {
-  return find_if_or_not_impl<true>(
+InputIterator find_exespace_impl(const std::string& label, ExecutionSpace ex,
+                                 InputIterator first, InputIterator last,
+                                 const T& value) {
+  return find_if_or_not_exespace_impl<true>(
       label, ex, first, last,
       ::Kokkos::Experimental::Impl::StdAlgoEqualsValUnaryPredicate<T>(value));
 }
 
+//
+// team impl
+//
+template <bool is_find_if, class TeamHandleType, class IteratorType,
+          class PredicateType>
+KOKKOS_FUNCTION IteratorType
+find_if_or_not_team_impl(const TeamHandleType& teamHandle, IteratorType first,
+                         IteratorType last, PredicateType pred) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(
+      teamHandle, first);  // only need one It per type
+  Impl::expect_valid_range(first, last);
+
+  if (first == last) {
+    return last;
+  }
+
+  // aliases
+  using index_type           = typename IteratorType::difference_type;
+  using reducer_type         = FirstLoc<index_type>;
+  using reduction_value_type = typename reducer_type::value_type;
+  using func_t = StdFindIfOrNotFunctor<is_find_if, index_type, IteratorType,
+                                       reducer_type, PredicateType>;
+
+  // run
+  reduction_value_type red_result;
+  reducer_type reducer(red_result);
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_reduce(TeamThreadRange(teamHandle, 0, num_elements),
+                            func_t(first, reducer, pred), reducer);
+
+  teamHandle.team_barrier();
+
+  // decide and return
+  if (red_result.min_loc_true ==
+      ::Kokkos::reduction_identity<index_type>::min()) {
+    // here, it means a valid loc has not been found,
+    return last;
+  } else {
+    // a location has been found
+    return first + red_result.min_loc_true;
+  }
+}
+
+template <class TeamHandleType, class InputIterator, class T>
+KOKKOS_FUNCTION InputIterator find_team_impl(const TeamHandleType& teamHandle,
+                                             InputIterator first,
+                                             InputIterator last,
+                                             const T& value) {
+  return find_if_or_not_team_impl<true>(
+      teamHandle, first, last,
+      ::Kokkos::Experimental::Impl::StdAlgoEqualsValUnaryPredicate<T>(value));
+}
+
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ForEachForEachN.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ForEachForEachN.hpp
index f9a6ff2e99e6229cfa154585e9ae298c9622ad91..d3be3b7f6670384c2ddc2042515fd69091bf1db3 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ForEachForEachN.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ForEachForEachN.hpp
@@ -41,29 +41,31 @@ struct StdForEachFunctor {
       : m_first(std::move(_first)), m_functor(std::move(_functor)) {}
 };
 
-template <class ExecutionSpace, class IteratorType, class UnaryFunctorType>
-UnaryFunctorType for_each_impl(const std::string& label,
-                               const ExecutionSpace& ex, IteratorType first,
-                               IteratorType last, UnaryFunctorType functor) {
+template <class HandleType, class IteratorType, class UnaryFunctorType>
+UnaryFunctorType for_each_exespace_impl(const std::string& label,
+                                        const HandleType& handle,
+                                        IteratorType first, IteratorType last,
+                                        UnaryFunctorType functor) {
   // checks
-  Impl::static_assert_random_access_and_accessible(ex, first);
+  Impl::static_assert_random_access_and_accessible(handle, first);
   Impl::expect_valid_range(first, last);
 
   // run
   const auto num_elements = Kokkos::Experimental::distance(first, last);
   ::Kokkos::parallel_for(
-      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+      label, RangePolicy<HandleType>(handle, 0, num_elements),
       StdForEachFunctor<IteratorType, UnaryFunctorType>(first, functor));
-  ex.fence("Kokkos::for_each: fence after operation");
+  handle.fence("Kokkos::for_each: fence after operation");
 
   return functor;
 }
 
 template <class ExecutionSpace, class IteratorType, class SizeType,
           class UnaryFunctorType>
-IteratorType for_each_n_impl(const std::string& label, const ExecutionSpace& ex,
-                             IteratorType first, SizeType n,
-                             UnaryFunctorType functor) {
+IteratorType for_each_n_exespace_impl(const std::string& label,
+                                      const ExecutionSpace& ex,
+                                      IteratorType first, SizeType n,
+                                      UnaryFunctorType functor) {
   auto last = first + n;
   Impl::static_assert_random_access_and_accessible(ex, first, last);
   Impl::expect_valid_range(first, last);
@@ -72,8 +74,46 @@ IteratorType for_each_n_impl(const std::string& label, const ExecutionSpace& ex,
     return first;
   }
 
-  for_each_impl(label, ex, first, last, std::move(functor));
-  // no neeed to fence since for_each_impl fences already
+  for_each_exespace_impl(label, ex, first, last, std::move(functor));
+  // no neeed to fence since for_each_exespace_impl fences already
+
+  return last;
+}
+
+//
+// team impl
+//
+template <class TeamHandleType, class IteratorType, class UnaryFunctorType>
+KOKKOS_FUNCTION UnaryFunctorType
+for_each_team_impl(const TeamHandleType& teamHandle, IteratorType first,
+                   IteratorType last, UnaryFunctorType functor) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first);
+  Impl::expect_valid_range(first, last);
+  // run
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_for(
+      TeamThreadRange(teamHandle, 0, num_elements),
+      StdForEachFunctor<IteratorType, UnaryFunctorType>(first, functor));
+  teamHandle.team_barrier();
+  return functor;
+}
+
+template <class TeamHandleType, class IteratorType, class SizeType,
+          class UnaryFunctorType>
+KOKKOS_FUNCTION IteratorType
+for_each_n_team_impl(const TeamHandleType& teamHandle, IteratorType first,
+                     SizeType n, UnaryFunctorType functor) {
+  auto last = first + n;
+  Impl::static_assert_random_access_and_accessible(teamHandle, first, last);
+  Impl::expect_valid_range(first, last);
+
+  if (n == 0) {
+    return first;
+  }
+
+  for_each_team_impl(teamHandle, first, last, std::move(functor));
+  // no neeed to fence since for_each_team_impl fences already
 
   return last;
 }
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8151ee34955295b1c658b974b829d5bbb78440d5
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp
@@ -0,0 +1,220 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_STD_ALGORITHMS_FUNCTORS_FOR_EXCLUSIVE_SCAN_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_FUNCTORS_FOR_EXCLUSIVE_SCAN_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_ValueWrapperForNoNeutralElement.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <typename ValueType>
+using ex_scan_has_reduction_identity_sum_t =
+    decltype(Kokkos::reduction_identity<ValueType>::sum());
+
+template <class ExeSpace, class IndexType, class ValueType, class FirstFrom,
+          class FirstDest>
+struct ExclusiveScanDefaultFunctorForKnownNeutralElement {
+  using execution_space = ExeSpace;
+  ValueType m_init_value;
+  FirstFrom m_first_from;
+  FirstDest m_first_dest;
+
+  KOKKOS_FUNCTION
+  ExclusiveScanDefaultFunctorForKnownNeutralElement(ValueType init,
+                                                    FirstFrom first_from,
+                                                    FirstDest first_dest)
+      : m_init_value(std::move(init)),
+        m_first_from(std::move(first_from)),
+        m_first_dest(std::move(first_dest)) {}
+
+  KOKKOS_FUNCTION
+  void operator()(const IndexType i, ValueType& update,
+                  const bool final_pass) const {
+    if (final_pass) m_first_dest[i] = update + m_init_value;
+    update += m_first_from[i];
+  }
+};
+
+template <class ExeSpace, class IndexType, class ValueType, class FirstFrom,
+          class FirstDest>
+struct ExclusiveScanDefaultFunctorWithValueWrapper {
+  using execution_space = ExeSpace;
+  using value_type =
+      ::Kokkos::Experimental::Impl::ValueWrapperForNoNeutralElement<ValueType>;
+  ValueType m_init_value;
+  FirstFrom m_first_from;
+  FirstDest m_first_dest;
+
+  KOKKOS_FUNCTION
+  ExclusiveScanDefaultFunctorWithValueWrapper(ValueType init,
+                                              FirstFrom first_from,
+                                              FirstDest first_dest)
+      : m_init_value(std::move(init)),
+        m_first_from(std::move(first_from)),
+        m_first_dest(std::move(first_dest)) {}
+
+  KOKKOS_FUNCTION
+  void operator()(const IndexType i, value_type& update,
+                  const bool final_pass) const {
+    if (final_pass) {
+      if (i == 0) {
+        m_first_dest[i] = m_init_value;
+      } else {
+        m_first_dest[i] = update.val + m_init_value;
+      }
+    }
+
+    const auto tmp = value_type{m_first_from[i], false};
+    this->join(update, tmp);
+  }
+
+  KOKKOS_FUNCTION
+  void init(value_type& update) const {
+    update.val        = {};
+    update.is_initial = true;
+  }
+
+  KOKKOS_FUNCTION
+  void join(value_type& update, const value_type& input) const {
+    if (input.is_initial) return;
+
+    if (update.is_initial) {
+      update.val        = input.val;
+      update.is_initial = false;
+    } else {
+      update.val = update.val + input.val;
+    }
+  }
+};
+
+template <class ExeSpace, class IndexType, class ValueType, class FirstFrom,
+          class FirstDest, class BinaryOpType, class UnaryOpType>
+struct TransformExclusiveScanFunctorWithValueWrapper {
+  using execution_space = ExeSpace;
+  using value_type =
+      ::Kokkos::Experimental::Impl::ValueWrapperForNoNeutralElement<ValueType>;
+
+  ValueType m_init_value;
+  FirstFrom m_first_from;
+  FirstDest m_first_dest;
+  BinaryOpType m_binary_op;
+  UnaryOpType m_unary_op;
+
+  KOKKOS_FUNCTION
+  TransformExclusiveScanFunctorWithValueWrapper(ValueType init,
+                                                FirstFrom first_from,
+                                                FirstDest first_dest,
+                                                BinaryOpType bop,
+                                                UnaryOpType uop)
+      : m_init_value(std::move(init)),
+        m_first_from(std::move(first_from)),
+        m_first_dest(std::move(first_dest)),
+        m_binary_op(std::move(bop)),
+        m_unary_op(std::move(uop)) {}
+
+  KOKKOS_FUNCTION
+  void operator()(const IndexType i, value_type& update,
+                  const bool final_pass) const {
+    if (final_pass) {
+      if (i == 0) {
+        // for both ExclusiveScan and TransformExclusiveScan,
+        // init is unmodified
+        m_first_dest[i] = m_init_value;
+      } else {
+        m_first_dest[i] = m_binary_op(update.val, m_init_value);
+      }
+    }
+
+    const auto tmp = value_type{m_unary_op(m_first_from[i]), false};
+    this->join(update, tmp);
+  }
+
+  KOKKOS_FUNCTION void init(value_type& value) const {
+    value.val        = {};
+    value.is_initial = true;
+  }
+
+  KOKKOS_FUNCTION
+  void join(value_type& update, const value_type& input) const {
+    if (input.is_initial) return;
+
+    if (update.is_initial) {
+      update.val = input.val;
+    } else {
+      update.val = m_binary_op(update.val, input.val);
+    }
+    update.is_initial = false;
+  }
+};
+
+template <class ExeSpace, class IndexType, class ValueType, class FirstFrom,
+          class FirstDest, class BinaryOpType, class UnaryOpType>
+struct TransformExclusiveScanFunctorWithoutValueWrapper {
+  using execution_space = ExeSpace;
+
+  ValueType m_init_value;
+  FirstFrom m_first_from;
+  FirstDest m_first_dest;
+  BinaryOpType m_binary_op;
+  UnaryOpType m_unary_op;
+
+  KOKKOS_FUNCTION
+  TransformExclusiveScanFunctorWithoutValueWrapper(ValueType init,
+                                                   FirstFrom first_from,
+                                                   FirstDest first_dest,
+                                                   BinaryOpType bop,
+                                                   UnaryOpType uop)
+      : m_init_value(std::move(init)),
+        m_first_from(std::move(first_from)),
+        m_first_dest(std::move(first_dest)),
+        m_binary_op(std::move(bop)),
+        m_unary_op(std::move(uop)) {}
+
+  KOKKOS_FUNCTION
+  void operator()(const IndexType i, ValueType& update,
+                  const bool final_pass) const {
+    if (final_pass) {
+      if (i == 0) {
+        // for both ExclusiveScan and TransformExclusiveScan,
+        // init is unmodified
+        m_first_dest[i] = m_init_value;
+      } else {
+        m_first_dest[i] = m_binary_op(update, m_init_value);
+      }
+    }
+
+    const auto tmp = ValueType{m_unary_op(m_first_from[i])};
+    this->join(update, tmp);
+  }
+
+  KOKKOS_FUNCTION
+  void init(ValueType& update) const { update = {}; }
+
+  KOKKOS_FUNCTION
+  void join(ValueType& update, const ValueType& input) const {
+    update = m_binary_op(update, input);
+  }
+};
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_GenerateGenerateN.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_GenerateGenerateN.hpp
index 228390bdfff0f6dcbd20cd17912f6babc736daa0..157de1125ee307422419480023f38b20fe269cee 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_GenerateGenerateN.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_GenerateGenerateN.hpp
@@ -41,32 +41,65 @@ struct StdGenerateFunctor {
       : m_first(std::move(_first)), m_generator(std::move(_g)) {}
 };
 
+//
+// generate impl
+//
 template <class ExecutionSpace, class IteratorType, class Generator>
-void generate_impl(const std::string& label, const ExecutionSpace& ex,
-                   IteratorType first, IteratorType last, Generator g) {
+void generate_exespace_impl(const std::string& label, const ExecutionSpace& ex,
+                            IteratorType first, IteratorType last,
+                            Generator g) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first);
   Impl::expect_valid_range(first, last);
 
-  // aliases
-  using func_t = StdGenerateFunctor<IteratorType, Generator>;
-
   // run
   const auto num_elements = Kokkos::Experimental::distance(first, last);
   ::Kokkos::parallel_for(label,
                          RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                         func_t(first, g));
+                         StdGenerateFunctor(first, g));
   ex.fence("Kokkos::generate: fence after operation");
 }
 
+template <class TeamHandleType, class IteratorType, class Generator>
+KOKKOS_FUNCTION void generate_team_impl(const TeamHandleType& teamHandle,
+                                        IteratorType first, IteratorType last,
+                                        Generator g) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first);
+  Impl::expect_valid_range(first, last);
+
+  // run
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_for(TeamThreadRange(teamHandle, 0, num_elements),
+                         StdGenerateFunctor(first, g));
+  teamHandle.team_barrier();
+}
+
+//
+// generate_n impl
+//
 template <class ExecutionSpace, class IteratorType, class Size, class Generator>
-IteratorType generate_n_impl(const std::string& label, const ExecutionSpace& ex,
-                             IteratorType first, Size count, Generator g) {
+IteratorType generate_n_exespace_impl(const std::string& label,
+                                      const ExecutionSpace& ex,
+                                      IteratorType first, Size count,
+                                      Generator g) {
+  if (count <= 0) {
+    return first;
+  }
+
+  generate_exespace_impl(label, ex, first, first + count, g);
+  return first + count;
+}
+
+template <class TeamHandleType, class IteratorType, class Size, class Generator>
+KOKKOS_FUNCTION IteratorType
+generate_n_team_impl(const TeamHandleType& teamHandle, IteratorType first,
+                     Size count, Generator g) {
   if (count <= 0) {
     return first;
   }
 
-  generate_impl(label, ex, first, first + count, g);
+  generate_team_impl(teamHandle, first, first + count, g);
   return first + count;
 }
 
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_InclusiveScan.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_InclusiveScan.hpp
index 55e1a78695d0f39e7a9c3ea84eb785c92cb63a36..0b4acec0feb91986fc4a7e27a66cd9ae7daf100f 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_InclusiveScan.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_InclusiveScan.hpp
@@ -90,6 +90,8 @@ struct InclusiveScanDefaultFunctor {
 
   KOKKOS_FUNCTION
   void join(value_type& update, const value_type& input) const {
+    if (input.is_initial) return;
+
     if (update.is_initial) {
       update.val = input.val;
     } else {
@@ -99,9 +101,12 @@ struct InclusiveScanDefaultFunctor {
   }
 };
 
+//
+// exespace impl
+//
 template <class ExecutionSpace, class InputIteratorType,
           class OutputIteratorType>
-OutputIteratorType inclusive_scan_default_op_impl(
+OutputIteratorType inclusive_scan_default_op_exespace_impl(
     const std::string& label, const ExecutionSpace& ex,
     InputIteratorType first_from, InputIteratorType last_from,
     OutputIteratorType first_dest) {
@@ -141,7 +146,7 @@ OutputIteratorType inclusive_scan_default_op_impl(
 // -------------------------------------------------------------
 template <class ExecutionSpace, class InputIteratorType,
           class OutputIteratorType, class BinaryOpType>
-OutputIteratorType inclusive_scan_custom_binary_op_impl(
+OutputIteratorType inclusive_scan_custom_binary_op_exespace_impl(
     const std::string& label, const ExecutionSpace& ex,
     InputIteratorType first_from, InputIteratorType last_from,
     OutputIteratorType first_dest, BinaryOpType binary_op) {
@@ -156,7 +161,7 @@ OutputIteratorType inclusive_scan_custom_binary_op_impl(
   using value_type =
       std::remove_const_t<typename InputIteratorType::value_type>;
   using unary_op_type = StdNumericScanIdentityReferenceUnaryFunctor<value_type>;
-  using func_type     = TransformInclusiveScanNoInitValueFunctor<
+  using func_type     = ExeSpaceTransformInclusiveScanNoInitValueFunctor<
       ExecutionSpace, index_type, value_type, InputIteratorType,
       OutputIteratorType, BinaryOpType, unary_op_type>;
 
@@ -177,7 +182,7 @@ OutputIteratorType inclusive_scan_custom_binary_op_impl(
 // -------------------------------------------------------------
 template <class ExecutionSpace, class InputIteratorType,
           class OutputIteratorType, class BinaryOpType, class ValueType>
-OutputIteratorType inclusive_scan_custom_binary_op_impl(
+OutputIteratorType inclusive_scan_custom_binary_op_exespace_impl(
     const std::string& label, const ExecutionSpace& ex,
     InputIteratorType first_from, InputIteratorType last_from,
     OutputIteratorType first_dest, BinaryOpType binary_op,
@@ -191,7 +196,7 @@ OutputIteratorType inclusive_scan_custom_binary_op_impl(
   // aliases
   using index_type    = typename InputIteratorType::difference_type;
   using unary_op_type = StdNumericScanIdentityReferenceUnaryFunctor<ValueType>;
-  using func_type     = TransformInclusiveScanWithInitValueFunctor<
+  using func_type     = ExeSpaceTransformInclusiveScanWithInitValueFunctor<
       ExecutionSpace, index_type, ValueType, InputIteratorType,
       OutputIteratorType, BinaryOpType, unary_op_type>;
 
@@ -201,13 +206,142 @@ OutputIteratorType inclusive_scan_custom_binary_op_impl(
   ::Kokkos::parallel_scan(label,
                           RangePolicy<ExecutionSpace>(ex, 0, num_elements),
                           func_type(first_from, first_dest, binary_op,
-                                    unary_op_type(), init_value));
+                                    unary_op_type(), std::move(init_value)));
   ex.fence("Kokkos::inclusive_scan_custom_binary_op: fence after operation");
 
   // return
   return first_dest + num_elements;
 }
 
+//
+// team impl
+//
+template <class TeamHandleType, class InputIteratorType,
+          class OutputIteratorType>
+KOKKOS_FUNCTION OutputIteratorType inclusive_scan_default_op_team_impl(
+    const TeamHandleType& teamHandle, InputIteratorType first_from,
+    InputIteratorType last_from, OutputIteratorType first_dest) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first_from,
+                                                   first_dest);
+  Impl::static_assert_iterators_have_matching_difference_type(first_from,
+                                                              first_dest);
+  Impl::expect_valid_range(first_from, last_from);
+
+  using value_type =
+      std::remove_const_t<typename InputIteratorType::value_type>;
+
+  // #if defined(KOKKOS_ENABLE_CUDA)
+
+  using exe_space  = typename TeamHandleType::execution_space;
+  using index_type = typename InputIteratorType::difference_type;
+  using func_type  = std::conditional_t<
+      ::Kokkos::is_detected<in_scan_has_reduction_identity_sum_t,
+                            value_type>::value,
+      InclusiveScanDefaultFunctorForKnownIdentityElement<
+          exe_space, index_type, value_type, InputIteratorType,
+          OutputIteratorType>,
+      InclusiveScanDefaultFunctor<exe_space, index_type, value_type,
+                                  InputIteratorType, OutputIteratorType>>;
+
+  // run
+  const auto num_elements =
+      Kokkos::Experimental::distance(first_from, last_from);
+  ::Kokkos::parallel_scan(TeamThreadRange(teamHandle, 0, num_elements),
+                          func_type(first_from, first_dest));
+  teamHandle.team_barrier();
+
+  // return
+  return first_dest + num_elements;
+}
+
+// -------------------------------------------------------------
+// inclusive_scan_custom_binary_op_impl
+// -------------------------------------------------------------
+template <class TeamHandleType, class InputIteratorType,
+          class OutputIteratorType, class BinaryOpType>
+KOKKOS_FUNCTION OutputIteratorType inclusive_scan_custom_binary_op_team_impl(
+    const TeamHandleType& teamHandle, InputIteratorType first_from,
+    InputIteratorType last_from, OutputIteratorType first_dest,
+    BinaryOpType binary_op) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first_from,
+                                                   first_dest);
+  Impl::static_assert_iterators_have_matching_difference_type(first_from,
+                                                              first_dest);
+  Impl::expect_valid_range(first_from, last_from);
+
+  using value_type =
+      std::remove_const_t<typename InputIteratorType::value_type>;
+
+  static_assert(
+      ::Kokkos::is_detected_v<ex_scan_has_reduction_identity_sum_t, value_type>,
+      "At the moment inclusive_scan doesn't support types without reduction "
+      "identity");
+
+  // #if defined(KOKKOS_ENABLE_CUDA)
+
+  // aliases
+  using exe_space     = typename TeamHandleType::execution_space;
+  using unary_op_type = StdNumericScanIdentityReferenceUnaryFunctor<value_type>;
+  using func_type     = TeamTransformInclusiveScanNoInitValueFunctor<
+      exe_space, value_type, InputIteratorType, OutputIteratorType,
+      BinaryOpType, unary_op_type>;
+
+  // run
+  const auto num_elements =
+      Kokkos::Experimental::distance(first_from, last_from);
+
+  ::Kokkos::parallel_scan(
+      TeamThreadRange(teamHandle, 0, num_elements),
+      func_type(first_from, first_dest, binary_op, unary_op_type()));
+  teamHandle.team_barrier();
+
+  return first_dest + num_elements;
+}
+
+// -------------------------------------------------------------
+// inclusive_scan_custom_binary_op_impl with init_value
+// -------------------------------------------------------------
+template <class TeamHandleType, class InputIteratorType,
+          class OutputIteratorType, class BinaryOpType, class ValueType>
+KOKKOS_FUNCTION OutputIteratorType inclusive_scan_custom_binary_op_team_impl(
+    const TeamHandleType& teamHandle, InputIteratorType first_from,
+    InputIteratorType last_from, OutputIteratorType first_dest,
+    BinaryOpType binary_op, ValueType init_value) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first_from,
+                                                   first_dest);
+  Impl::static_assert_iterators_have_matching_difference_type(first_from,
+                                                              first_dest);
+  Impl::expect_valid_range(first_from, last_from);
+
+  static_assert(
+      ::Kokkos::is_detected_v<ex_scan_has_reduction_identity_sum_t, ValueType>,
+      "At the moment inclusive_scan doesn't support types without reduction "
+      "identity");
+
+  // #if defined(KOKKOS_ENABLE_CUDA)
+
+  // aliases
+  using exe_space     = typename TeamHandleType::execution_space;
+  using unary_op_type = StdNumericScanIdentityReferenceUnaryFunctor<ValueType>;
+  using func_type     = TeamTransformInclusiveScanWithInitValueFunctor<
+      exe_space, ValueType, InputIteratorType, OutputIteratorType, BinaryOpType,
+      unary_op_type>;
+
+  // run
+  const auto num_elements =
+      Kokkos::Experimental::distance(first_from, last_from);
+  ::Kokkos::parallel_scan(TeamThreadRange(teamHandle, 0, num_elements),
+                          func_type(first_from, first_dest, binary_op,
+                                    unary_op_type(), std::move(init_value)));
+  teamHandle.team_barrier();
+
+  // return
+  return first_dest + num_elements;
+}
+
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_IsPartitioned.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_IsPartitioned.hpp
index 92a22f3c3a82770491eaa8d08912d64f2e8c3ed6..281efca36b5cc15f8bde040ffdc8b9edd4b3eb3c 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_IsPartitioned.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_IsPartitioned.hpp
@@ -43,8 +43,12 @@ struct StdIsPartitionedFunctor {
         ::Kokkos::reduction_identity<index_type>::min();
     constexpr index_type m_red_id_max =
         ::Kokkos::reduction_identity<index_type>::max();
-    auto rv = predicate_value ? red_value_type{i, m_red_id_min}
-                              : red_value_type{m_red_id_max, i};
+
+    // FIXME_NVHPC using a ternary operator causes problems
+    red_value_type rv = {m_red_id_max, i};
+    if (predicate_value) {
+      rv = {i, m_red_id_min};
+    }
 
     m_reducer.join(redValue, rv);
   }
@@ -58,9 +62,9 @@ struct StdIsPartitionedFunctor {
 };
 
 template <class ExecutionSpace, class IteratorType, class PredicateType>
-bool is_partitioned_impl(const std::string& label, const ExecutionSpace& ex,
-                         IteratorType first, IteratorType last,
-                         PredicateType pred) {
+bool is_partitioned_exespace_impl(const std::string& label,
+                                  const ExecutionSpace& ex, IteratorType first,
+                                  IteratorType last, PredicateType pred) {
   // true if all elements in the range [first, last) that satisfy
   // the predicate "pred" appear before all elements that don't.
   // Also returns true if [first, last) is empty.
@@ -93,6 +97,63 @@ bool is_partitioned_impl(const std::string& label, const ExecutionSpace& ex,
   const auto num_elements = Kokkos::Experimental::distance(first, last);
   ::Kokkos::parallel_reduce(label,
                             RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+
+                            func_t(first, reducer, pred), reducer);
+
+  // fence not needed because reducing into scalar
+
+  // decide and return
+  constexpr index_type red_id_min =
+      ::Kokkos::reduction_identity<index_type>::min();
+  constexpr index_type red_id_max =
+      ::Kokkos::reduction_identity<index_type>::max();
+
+  if (red_result.max_loc_true != red_id_max &&
+      red_result.min_loc_false != red_id_min) {
+    // this occurs when the reduction yields nontrivial values
+    return red_result.max_loc_true < red_result.min_loc_false;
+  } else if (red_result.max_loc_true == red_id_max &&
+             red_result.min_loc_false == 0) {
+    // this occurs when all values do NOT satisfy
+    // the predicate, and this corner case should also be true
+    return true;
+  } else if (first + red_result.max_loc_true == --last) {
+    // this occurs when all values satisfy the predicate,
+    // this corner case should also be true
+    return true;
+  } else {
+    return false;
+  }
+}
+
+template <class TeamHandleType, class IteratorType, class PredicateType>
+KOKKOS_FUNCTION bool is_partitioned_team_impl(const TeamHandleType& teamHandle,
+                                              IteratorType first,
+                                              IteratorType last,
+                                              PredicateType pred) {
+  /* see exespace impl for the description of the impl */
+
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first);
+  Impl::expect_valid_range(first, last);
+
+  // trivial case
+  if (first == last) {
+    return true;
+  }
+
+  // aliases
+  using index_type           = typename IteratorType::difference_type;
+  using reducer_type         = StdIsPartitioned<index_type>;
+  using reduction_value_type = typename reducer_type::value_type;
+  using func_t =
+      StdIsPartitionedFunctor<IteratorType, reducer_type, PredicateType>;
+
+  // run
+  reduction_value_type red_result;
+  reducer_type reducer(red_result);
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_reduce(TeamThreadRange(teamHandle, 0, num_elements),
                             func_t(first, reducer, pred), reducer);
 
   // fence not needed because reducing into scalar
@@ -105,8 +166,16 @@ bool is_partitioned_impl(const std::string& label, const ExecutionSpace& ex,
 
   if (red_result.max_loc_true != red_id_max &&
       red_result.min_loc_false != red_id_min) {
+    // this occurs when the reduction yields nontrivial values
     return red_result.max_loc_true < red_result.min_loc_false;
+  } else if (red_result.max_loc_true == red_id_max &&
+             red_result.min_loc_false == 0) {
+    // this occurs when all values do NOT satisfy
+    // the predicate, and this corner case should also be true
+    return true;
   } else if (first + red_result.max_loc_true == --last) {
+    // this occurs when all values satisfy the predicate,
+    // this corner case should also be true
     return true;
   } else {
     return false;
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_IsSorted.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_IsSorted.hpp
index 469682158641f61793821c22f9735b4243dd135d..b2c912848a3b6c8626519bae4ab18351b114d12e 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_IsSorted.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_IsSorted.hpp
@@ -48,10 +48,13 @@ struct StdIsSortedFunctor {
       : m_first(std::move(_first1)), m_comparator(std::move(comparator)) {}
 };
 
+//
+// exespace impl
+//
 template <class ExecutionSpace, class IteratorType, class ComparatorType>
-bool is_sorted_impl(const std::string& label, const ExecutionSpace& ex,
-                    IteratorType first, IteratorType last,
-                    ComparatorType comp) {
+bool is_sorted_exespace_impl(const std::string& label, const ExecutionSpace& ex,
+                             IteratorType first, IteratorType last,
+                             ComparatorType comp) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first);
   Impl::expect_valid_range(first, last);
@@ -75,11 +78,49 @@ bool is_sorted_impl(const std::string& label, const ExecutionSpace& ex,
 }
 
 template <class ExecutionSpace, class IteratorType>
-bool is_sorted_impl(const std::string& label, const ExecutionSpace& ex,
-                    IteratorType first, IteratorType last) {
+bool is_sorted_exespace_impl(const std::string& label, const ExecutionSpace& ex,
+                             IteratorType first, IteratorType last) {
+  using value_type = typename IteratorType::value_type;
+  using pred_t     = Impl::StdAlgoLessThanBinaryPredicate<value_type>;
+  return is_sorted_exespace_impl(label, ex, first, last, pred_t());
+}
+
+//
+// team impl
+//
+template <class TeamHandleType, class IteratorType, class ComparatorType>
+KOKKOS_FUNCTION bool is_sorted_team_impl(const TeamHandleType& teamHandle,
+                                         IteratorType first, IteratorType last,
+                                         ComparatorType comp) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first);
+  Impl::expect_valid_range(first, last);
+
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  if (num_elements <= 1) {
+    return true;
+  }
+
+  // use num_elements-1 because each index handles i and i+1
+  const auto num_elements_minus_one = num_elements - 1;
+
+  // result is incremented by one if sorting breaks at index i
+  std::size_t result = 0;
+  ::Kokkos::parallel_reduce(
+      TeamThreadRange(teamHandle, 0, num_elements_minus_one),
+      // use CTAD here
+      StdIsSortedFunctor(first, std::move(comp)), result);
+
+  return result == 0;
+}
+
+template <class TeamHandleType, class IteratorType>
+KOKKOS_FUNCTION bool is_sorted_team_impl(const TeamHandleType& teamHandle,
+                                         IteratorType first,
+                                         IteratorType last) {
   using value_type = typename IteratorType::value_type;
   using pred_t     = Impl::StdAlgoLessThanBinaryPredicate<value_type>;
-  return is_sorted_impl(label, ex, first, last, pred_t());
+  return is_sorted_team_impl(teamHandle, first, last, pred_t());
 }
 
 }  // namespace Impl
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_IsSortedUntil.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_IsSortedUntil.hpp
index fe52e18a33d070ce3406418c242357d2de785a5a..d33580ca5372dfb7d870d9e9e4ef774111a8798c 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_IsSortedUntil.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_IsSortedUntil.hpp
@@ -28,39 +28,41 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
-template <class IteratorType, class IndicatorViewType, class ComparatorType>
+template <class IteratorType, class ComparatorType, class ReducerType>
 struct StdIsSortedUntilFunctor {
   using index_type = typename IteratorType::difference_type;
+  using value_type = typename ReducerType::value_type;
+
   IteratorType m_first;
-  IndicatorViewType m_indicator;
   ComparatorType m_comparator;
+  ReducerType m_reducer;
 
   KOKKOS_FUNCTION
-  void operator()(const index_type i, int& update, const bool final) const {
+  void operator()(const index_type i, value_type& reduction_result) const {
     const auto& val_i   = m_first[i];
     const auto& val_ip1 = m_first[i + 1];
-
     if (m_comparator(val_ip1, val_i)) {
-      ++update;
-    }
-
-    if (final) {
-      m_indicator(i) = update;
+      m_reducer.join(reduction_result, i);
     }
   }
 
   KOKKOS_FUNCTION
-  StdIsSortedUntilFunctor(IteratorType _first1, IndicatorViewType indicator,
-                          ComparatorType comparator)
-      : m_first(std::move(_first1)),
-        m_indicator(std::move(indicator)),
-        m_comparator(std::move(comparator)) {}
+  StdIsSortedUntilFunctor(IteratorType first, ComparatorType comparator,
+                          ReducerType reducer)
+      : m_first(std::move(first)),
+        m_comparator(std::move(comparator)),
+        m_reducer(std::move(reducer)) {}
 };
 
+//
+// overloads accepting exespace
+//
 template <class ExecutionSpace, class IteratorType, class ComparatorType>
-IteratorType is_sorted_until_impl(const std::string& label,
-                                  const ExecutionSpace& ex, IteratorType first,
-                                  IteratorType last, ComparatorType comp) {
+IteratorType is_sorted_until_exespace_impl(const std::string& label,
+                                           const ExecutionSpace& ex,
+                                           IteratorType first,
+                                           IteratorType last,
+                                           ComparatorType comp) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first);
   Impl::expect_valid_range(first, last);
@@ -73,49 +75,93 @@ IteratorType is_sorted_until_impl(const std::string& label,
   }
 
   /*
-    use scan and a helper "indicator" view
-    such that we scan the data and fill the indicator with
-    partial sum that is always 0 unless we find a pair that
-    breaks the sorting, so in that case the indicator will
-    have a 1 starting at the location where the sorting breaks.
-    So finding that 1 means finding the location we want.
-   */
-
-  // aliases
-  using indicator_value_type = std::size_t;
-  using indicator_view_type =
-      ::Kokkos::View<indicator_value_type*, ExecutionSpace>;
-  using functor_type =
-      StdIsSortedUntilFunctor<IteratorType, indicator_view_type,
-                              ComparatorType>;
-
-  // do scan
-  // use num_elements-1 because each index handles i and i+1
-  const auto num_elements_minus_one = num_elements - 1;
-  indicator_view_type indicator("is_sorted_until_indicator_helper",
-                                num_elements_minus_one);
-  ::Kokkos::parallel_scan(
-      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements_minus_one),
-      functor_type(first, indicator, std::move(comp)));
-
-  // try to find the first sentinel value, which indicates
-  // where the sorting condition breaks
-  namespace KE                                  = ::Kokkos::Experimental;
-  constexpr indicator_value_type sentinel_value = 1;
-  auto r =
-      KE::find(ex, KE::cbegin(indicator), KE::cend(indicator), sentinel_value);
-  const auto shift = r - ::Kokkos::Experimental::cbegin(indicator);
-
-  return first + (shift + 1);
+    Do a par_reduce computing the *min* index that breaks the sorting.
+    If such an index is found, then the range is sorted until that element.
+    If no such index is found, then the range is sorted until the end.
+  */
+  using index_type = typename IteratorType::difference_type;
+  index_type reduction_result;
+  ::Kokkos::Min<index_type> reducer(reduction_result);
+  ::Kokkos::parallel_reduce(
+      label,
+      // use num_elements-1 because each index handles i and i+1
+      RangePolicy<ExecutionSpace>(ex, 0, num_elements - 1),
+      StdIsSortedUntilFunctor(first, comp, reducer), reducer);
+
+  /* If the reduction result is equal to the initial value,
+     it means the range is sorted until the end */
+  index_type reduction_result_init;
+  reducer.init(reduction_result_init);
+  if (reduction_result == reduction_result_init) {
+    return last;
+  } else {
+    /* If such an index is found, then the range is sorted until there and
+       we need to return an iterator past the element found so do +1 */
+    return first + (reduction_result + 1);
+  }
+}
+
+template <class ExecutionSpace, class IteratorType>
+IteratorType is_sorted_until_exespace_impl(const std::string& label,
+                                           const ExecutionSpace& ex,
+                                           IteratorType first,
+                                           IteratorType last) {
+  using value_type = typename IteratorType::value_type;
+  using pred_t     = Impl::StdAlgoLessThanBinaryPredicate<value_type>;
+  return is_sorted_until_exespace_impl(label, ex, first, last, pred_t());
+}
+
+//
+// overloads accepting team handle
+//
+template <class ExecutionSpace, class IteratorType, class ComparatorType>
+KOKKOS_FUNCTION IteratorType
+is_sorted_until_team_impl(const ExecutionSpace& teamHandle, IteratorType first,
+                          IteratorType last, ComparatorType comp) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first);
+  Impl::expect_valid_range(first, last);
+
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+
+  // trivial case
+  if (num_elements <= 1) {
+    return last;
+  }
+
+  /*
+    Do a par_reduce computing the *min* index that breaks the sorting.
+    If one such index is found, then the range is sorted until that element,
+    if no such index is found, then it means the range is sorted until the end.
+  */
+  using index_type = typename IteratorType::difference_type;
+  index_type red_result;
+  index_type red_result_init;
+  ::Kokkos::Min<index_type> reducer(red_result);
+  reducer.init(red_result_init);
+  ::Kokkos::parallel_reduce(  // use num_elements-1 because each index handles i
+                              // and i+1
+      TeamThreadRange(teamHandle, 0, num_elements - 1),
+      StdIsSortedUntilFunctor(first, comp, reducer), reducer);
+  teamHandle.team_barrier();
+
+  /* If the reduction result is equal to the initial value,
+     and it means the range is sorted until the end */
+  if (red_result == red_result_init) {
+    return last;
+  } else {
+    /* If  such index is found, then the range is sorted until there and
+       we need to return an iterator past the element found so do +1 */
+    return first + (red_result + 1);
+  }
 }
 
 template <class ExecutionSpace, class IteratorType>
-IteratorType is_sorted_until_impl(const std::string& label,
-                                  const ExecutionSpace& ex, IteratorType first,
-                                  IteratorType last) {
+KOKKOS_FUNCTION IteratorType is_sorted_until_team_impl(
+    const ExecutionSpace& teamHandle, IteratorType first, IteratorType last) {
   using value_type = typename IteratorType::value_type;
   using pred_t     = Impl::StdAlgoLessThanBinaryPredicate<value_type>;
-  return is_sorted_until_impl(label, ex, first, last, pred_t());
+  return is_sorted_until_team_impl(teamHandle, first, last, pred_t());
 }
 
 }  // namespace Impl
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_LexicographicalCompare.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_LexicographicalCompare.hpp
index 170ec9f2911d6ad303f1262998dac1912b733c29..b95a66c3bd9b94fe25bcff8c7140d046e5c77627 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_LexicographicalCompare.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_LexicographicalCompare.hpp
@@ -63,12 +63,14 @@ struct StdLexicographicalCompareFunctor {
     const auto& my_value1 = m_first1[i];
     const auto& my_value2 = m_first2[i];
 
-    bool different = m_comparator(my_value1, my_value2) ||
-                     m_comparator(my_value2, my_value1);
-    auto rv =
-        different
-            ? red_value_type{i}
-            : red_value_type{::Kokkos::reduction_identity<IndexType>::min()};
+    const bool different = m_comparator(my_value1, my_value2) ||
+                           m_comparator(my_value2, my_value1);
+
+    // FIXME_NVHPC using a ternary operator causes problems
+    red_value_type rv = {::Kokkos::reduction_identity<IndexType>::min()};
+    if (different) {
+      rv.min_loc_true = i;
+    }
 
     m_reducer.join(red_value, rv);
   }
@@ -82,13 +84,15 @@ struct StdLexicographicalCompareFunctor {
         m_comparator(std::move(_comp)) {}
 };
 
+//
+// exespace impl
+//
 template <class ExecutionSpace, class IteratorType1, class IteratorType2,
           class ComparatorType>
-bool lexicographical_compare_impl(const std::string& label,
-                                  const ExecutionSpace& ex,
-                                  IteratorType1 first1, IteratorType1 last1,
-                                  IteratorType2 first2, IteratorType2 last2,
-                                  ComparatorType comp) {
+bool lexicographical_compare_exespace_impl(
+    const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
+    IteratorType1 last1, IteratorType2 first2, IteratorType2 last2,
+    ComparatorType comp) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first1, first2);
   Impl::static_assert_iterators_have_matching_difference_type(first1, first2);
@@ -137,16 +141,84 @@ bool lexicographical_compare_impl(const std::string& label,
 }
 
 template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-bool lexicographical_compare_impl(const std::string& label,
-                                  const ExecutionSpace& ex,
-                                  IteratorType1 first1, IteratorType1 last1,
-                                  IteratorType2 first2, IteratorType2 last2) {
+bool lexicographical_compare_exespace_impl(
+    const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
+    IteratorType1 last1, IteratorType2 first2, IteratorType2 last2) {
+  using value_type_1 = typename IteratorType1::value_type;
+  using value_type_2 = typename IteratorType2::value_type;
+  using predicate_t =
+      Impl::StdAlgoLessThanBinaryPredicate<value_type_1, value_type_2>;
+  return lexicographical_compare_exespace_impl(label, ex, first1, last1, first2,
+                                               last2, predicate_t());
+}
+
+//
+// team impl
+//
+template <class TeamHandleType, class IteratorType1, class IteratorType2,
+          class ComparatorType>
+KOKKOS_FUNCTION bool lexicographical_compare_team_impl(
+    const TeamHandleType& teamHandle, IteratorType1 first1, IteratorType1 last1,
+    IteratorType2 first2, IteratorType2 last2, ComparatorType comp) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first1, first2);
+  Impl::static_assert_iterators_have_matching_difference_type(first1, first2);
+  Impl::expect_valid_range(first1, last1);
+  Impl::expect_valid_range(first2, last2);
+
+  // aliases
+  using index_type           = typename IteratorType1::difference_type;
+  using reducer_type         = FirstLoc<index_type>;
+  using reduction_value_type = typename reducer_type::value_type;
+
+  // run
+  const auto d1    = Kokkos::Experimental::distance(first1, last1);
+  const auto d2    = Kokkos::Experimental::distance(first2, last2);
+  const auto range = Kokkos::min(d1, d2);
+  reduction_value_type red_result;
+  reducer_type reducer(red_result);
+  using func1_t =
+      StdLexicographicalCompareFunctor<index_type, IteratorType1, IteratorType2,
+                                       reducer_type, ComparatorType>;
+
+  ::Kokkos::parallel_reduce(TeamThreadRange(teamHandle, 0, range),
+                            func1_t(first1, first2, reducer, comp), reducer);
+
+  teamHandle.team_barrier();
+
+  // no mismatch
+  if (red_result.min_loc_true ==
+      ::Kokkos::reduction_identity<index_type>::min()) {
+    auto new_last1 = first1 + range;
+    auto new_last2 = first2 + range;
+    bool is_prefix = (new_last1 == last1) && (new_last2 != last2);
+    return is_prefix;
+  }
+
+  // check mismatched
+  int less      = 0;
+  auto it1      = first1 + red_result.min_loc_true;
+  auto it2      = first2 + red_result.min_loc_true;
+  using func2_t = StdCompareFunctor<index_type, IteratorType1, IteratorType2,
+                                    ComparatorType>;
+  ::Kokkos::parallel_reduce(TeamThreadRange(teamHandle, 0, 1),
+                            func2_t(it1, it2, comp), less);
+
+  teamHandle.team_barrier();
+
+  return static_cast<bool>(less);
+}
+
+template <class TeamHandleType, class IteratorType1, class IteratorType2>
+KOKKOS_FUNCTION bool lexicographical_compare_team_impl(
+    const TeamHandleType& teamHandle, IteratorType1 first1, IteratorType1 last1,
+    IteratorType2 first2, IteratorType2 last2) {
   using value_type_1 = typename IteratorType1::value_type;
   using value_type_2 = typename IteratorType2::value_type;
   using predicate_t =
       Impl::StdAlgoLessThanBinaryPredicate<value_type_1, value_type_2>;
-  return lexicographical_compare_impl(label, ex, first1, last1, first2, last2,
-                                      predicate_t());
+  return lexicographical_compare_team_impl(teamHandle, first1, last1, first2,
+                                           last2, predicate_t());
 }
 
 }  // namespace Impl
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MinMaxMinmaxElement.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MinMaxMinmaxElement.hpp
index 048420f7a83dbc554cd491242021d933acc49f35..2f51db03b4625d157f4f098270377d83f6dd99e3 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MinMaxMinmaxElement.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MinMaxMinmaxElement.hpp
@@ -63,12 +63,16 @@ struct StdMinMaxElemFunctor {
       : m_first(std::move(first)), m_reducer(std::move(reducer)) {}
 };
 
+//
+// exespace impl
+//
 template <template <class... Args> class ReducerType, class ExecutionSpace,
           class IteratorType, class... Args>
-IteratorType min_or_max_element_impl(const std::string& label,
-                                     const ExecutionSpace& ex,
-                                     IteratorType first, IteratorType last,
-                                     Args&&... args) {
+IteratorType min_or_max_element_exespace_impl(const std::string& label,
+                                              const ExecutionSpace& ex,
+                                              IteratorType first,
+                                              IteratorType last,
+                                              Args&&... args) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first);
   Impl::expect_valid_range(first, last);
@@ -100,7 +104,7 @@ IteratorType min_or_max_element_impl(const std::string& label,
 
 template <template <class... Args> class ReducerType, class ExecutionSpace,
           class IteratorType, class... Args>
-::Kokkos::pair<IteratorType, IteratorType> minmax_element_impl(
+::Kokkos::pair<IteratorType, IteratorType> minmax_element_exespace_impl(
     const std::string& label, const ExecutionSpace& ex, IteratorType first,
     IteratorType last, Args&&... args) {
   // checks
@@ -132,6 +136,75 @@ template <template <class... Args> class ReducerType, class ExecutionSpace,
   return {first + red_result.min_loc, first + red_result.max_loc};
 }
 
+//
+// team level impl
+//
+template <template <class... Args> class ReducerType, class TeamHandleType,
+          class IteratorType, class... Args>
+KOKKOS_FUNCTION IteratorType min_or_max_element_team_impl(
+    const TeamHandleType& teamHandle, IteratorType first, IteratorType last,
+    Args&&... args) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first);
+  Impl::expect_valid_range(first, last);
+
+  if (first == last) {
+    return last;
+  }
+
+  // aliases
+  using index_type           = typename IteratorType::difference_type;
+  using value_type           = typename IteratorType::value_type;
+  using reducer_type         = ReducerType<value_type, index_type, Args...>;
+  using reduction_value_type = typename reducer_type::value_type;
+  using func_t = StdMinOrMaxElemFunctor<IteratorType, reducer_type>;
+
+  // run
+  reduction_value_type red_result;
+  reducer_type reducer(red_result, std::forward<Args>(args)...);
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_reduce(TeamThreadRange(teamHandle, 0, num_elements),
+                            func_t(first, reducer), reducer);
+  teamHandle.team_barrier();
+  // maybe the barrier is not needed since reducing into scalar?
+
+  // return
+  return first + red_result.loc;
+}
+
+template <template <class... Args> class ReducerType, class TeamHandleType,
+          class IteratorType, class... Args>
+KOKKOS_FUNCTION ::Kokkos::pair<IteratorType, IteratorType>
+minmax_element_team_impl(const TeamHandleType& teamHandle, IteratorType first,
+                         IteratorType last, Args&&... args) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first);
+  Impl::expect_valid_range(first, last);
+
+  if (first == last) {
+    return {first, first};
+  }
+
+  // aliases
+  using index_type           = typename IteratorType::difference_type;
+  using value_type           = typename IteratorType::value_type;
+  using reducer_type         = ReducerType<value_type, index_type, Args...>;
+  using reduction_value_type = typename reducer_type::value_type;
+  using func_t               = StdMinMaxElemFunctor<IteratorType, reducer_type>;
+
+  // run
+  reduction_value_type red_result;
+  reducer_type reducer(red_result, std::forward<Args>(args)...);
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_reduce(TeamThreadRange(teamHandle, 0, num_elements),
+                            func_t(first, reducer), reducer);
+  teamHandle.team_barrier();
+  // maybe the barrier is not needed since reducing into scalar?
+
+  // return
+  return {first + red_result.min_loc, first + red_result.max_loc};
+}
+
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Mismatch.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Mismatch.hpp
index 9d2e31f63fcf782284a60d3e5c0d23c7de987f04..dfe96aaf586b9a116624f3a7afd545b5da582b54 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Mismatch.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Mismatch.hpp
@@ -27,9 +27,10 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
-template <class IndexType, class IteratorType1, class IteratorType2,
-          class ReducerType, class BinaryPredicateType>
+template <class IteratorType1, class IteratorType2, class ReducerType,
+          class BinaryPredicateType>
 struct StdMismatchRedFunctor {
+  using index_type     = typename IteratorType1::difference_type;
   using red_value_type = typename ReducerType::value_type;
 
   IteratorType1 m_first1;
@@ -38,14 +39,15 @@ struct StdMismatchRedFunctor {
   BinaryPredicateType m_predicate;
 
   KOKKOS_FUNCTION
-  void operator()(const IndexType i, red_value_type& red_value) const {
+  void operator()(const index_type i, red_value_type& red_value) const {
     const auto& my_value1 = m_first1[i];
     const auto& my_value2 = m_first2[i];
 
-    auto rv =
-        !m_predicate(my_value1, my_value2)
-            ? red_value_type{i}
-            : red_value_type{::Kokkos::reduction_identity<IndexType>::min()};
+    // FIXME_NVHPC using a ternary operator causes problems
+    red_value_type rv = {i};
+    if (m_predicate(my_value1, my_value2)) {
+      rv = {::Kokkos::reduction_identity<index_type>::min()};
+    }
 
     m_reducer.join(red_value, rv);
   }
@@ -59,9 +61,12 @@ struct StdMismatchRedFunctor {
         m_predicate(std::move(predicate)) {}
 };
 
+//
+// exespace impl
+//
 template <class ExecutionSpace, class IteratorType1, class IteratorType2,
           class BinaryPredicateType>
-::Kokkos::pair<IteratorType1, IteratorType2> mismatch_impl(
+::Kokkos::pair<IteratorType1, IteratorType2> mismatch_exespace_impl(
     const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
     IteratorType1 last1, IteratorType2 first2, IteratorType2 last2,
     BinaryPredicateType predicate) {
@@ -76,9 +81,6 @@ template <class ExecutionSpace, class IteratorType1, class IteratorType2,
   using index_type           = typename IteratorType1::difference_type;
   using reducer_type         = FirstLoc<index_type>;
   using reduction_value_type = typename reducer_type::value_type;
-  using functor_type =
-      StdMismatchRedFunctor<index_type, IteratorType1, IteratorType2,
-                            reducer_type, BinaryPredicateType>;
 
   // trivial case: note that this is important,
   // for OpenMPTarget, omitting special handling of
@@ -95,7 +97,9 @@ template <class ExecutionSpace, class IteratorType1, class IteratorType2,
   reducer_type reducer(red_result);
   ::Kokkos::parallel_reduce(
       label, RangePolicy<ExecutionSpace>(ex, 0, num_elemen_par_reduce),
-      functor_type(first1, first2, reducer, std::move(predicate)), reducer);
+      // use CTAD
+      StdMismatchRedFunctor(first1, first2, reducer, std::move(predicate)),
+      reducer);
 
   // fence not needed because reducing into scalar
 
@@ -118,13 +122,83 @@ template <class ExecutionSpace, class IteratorType1, class IteratorType2,
 }
 
 template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-::Kokkos::pair<IteratorType1, IteratorType2> mismatch_impl(
+::Kokkos::pair<IteratorType1, IteratorType2> mismatch_exespace_impl(
     const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
     IteratorType1 last1, IteratorType2 first2, IteratorType2 last2) {
   using value_type1 = typename IteratorType1::value_type;
   using value_type2 = typename IteratorType2::value_type;
   using pred_t      = StdAlgoEqualBinaryPredicate<value_type1, value_type2>;
-  return mismatch_impl(label, ex, first1, last1, first2, last2, pred_t());
+  return mismatch_exespace_impl(label, ex, first1, last1, first2, last2,
+                                pred_t());
+}
+
+//
+// team impl
+//
+template <class TeamHandleType, class IteratorType1, class IteratorType2,
+          class BinaryPredicateType>
+KOKKOS_FUNCTION ::Kokkos::pair<IteratorType1, IteratorType2> mismatch_team_impl(
+    const TeamHandleType& teamHandle, IteratorType1 first1, IteratorType1 last1,
+    IteratorType2 first2, IteratorType2 last2, BinaryPredicateType predicate) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first1, first2);
+  Impl::static_assert_iterators_have_matching_difference_type(first1, first2);
+  Impl::expect_valid_range(first1, last1);
+  Impl::expect_valid_range(first2, last2);
+
+  // aliases
+  using return_type          = ::Kokkos::pair<IteratorType1, IteratorType2>;
+  using index_type           = typename IteratorType1::difference_type;
+  using reducer_type         = FirstLoc<index_type>;
+  using reduction_value_type = typename reducer_type::value_type;
+
+  // trivial case: note that this is important,
+  // for OpenMPTarget, omitting special handling of
+  // the trivial case was giving all sorts of strange stuff.
+  const auto num_e1 = last1 - first1;
+  const auto num_e2 = last2 - first2;
+  if (num_e1 == 0 || num_e2 == 0) {
+    return return_type(first1, first2);
+  }
+
+  // run
+  const auto num_elemen_par_reduce = (num_e1 <= num_e2) ? num_e1 : num_e2;
+  reduction_value_type red_result;
+  reducer_type reducer(red_result);
+  ::Kokkos::parallel_reduce(
+      TeamThreadRange(teamHandle, 0, num_elemen_par_reduce),
+      // use CTAD
+      StdMismatchRedFunctor(first1, first2, reducer, std::move(predicate)),
+      reducer);
+
+  teamHandle.team_barrier();
+
+  // decide and return
+  constexpr auto red_min = ::Kokkos::reduction_identity<index_type>::min();
+  if (red_result.min_loc_true == red_min) {
+    // in here means mismatch has not been found
+    if (num_e1 == num_e2) {
+      return return_type(last1, last2);
+    } else if (num_e1 < num_e2) {
+      return return_type(last1, first2 + num_e1);
+    } else {
+      return return_type(first1 + num_e2, last2);
+    }
+  } else {
+    // in here means mismatch has been found
+    return return_type(first1 + red_result.min_loc_true,
+                       first2 + red_result.min_loc_true);
+  }
+}
+
+template <class TeamHandleType, class IteratorType1, class IteratorType2>
+KOKKOS_FUNCTION ::Kokkos::pair<IteratorType1, IteratorType2> mismatch_team_impl(
+    const TeamHandleType& teamHandle, IteratorType1 first1, IteratorType1 last1,
+    IteratorType2 first2, IteratorType2 last2) {
+  using value_type1 = typename IteratorType1::value_type;
+  using value_type2 = typename IteratorType2::value_type;
+  using pred_t      = StdAlgoEqualBinaryPredicate<value_type1, value_type2>;
+  return mismatch_team_impl(teamHandle, first1, last1, first2, last2, pred_t());
 }
 
 }  // namespace Impl
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Move.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Move.hpp
index 01086d1772aaa3c722c84e238f8d97a82812b4ec..5110c51d41427d2daa5782560d4747f7e818d445 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Move.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Move.hpp
@@ -37,14 +37,15 @@ struct StdMoveFunctor {
     m_dest_first[i] = std::move(m_first[i]);
   }
 
-  StdMoveFunctor(InputIterator _first, OutputIterator _dest_first)
+  KOKKOS_FUNCTION StdMoveFunctor(InputIterator _first,
+                                 OutputIterator _dest_first)
       : m_first(std::move(_first)), m_dest_first(std::move(_dest_first)) {}
 };
 
 template <class ExecutionSpace, class InputIterator, class OutputIterator>
-OutputIterator move_impl(const std::string& label, const ExecutionSpace& ex,
-                         InputIterator first, InputIterator last,
-                         OutputIterator d_first) {
+OutputIterator move_exespace_impl(const std::string& label,
+                                  const ExecutionSpace& ex, InputIterator first,
+                                  InputIterator last, OutputIterator d_first) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first, d_first);
   Impl::static_assert_iterators_have_matching_difference_type(first, d_first);
@@ -65,6 +66,30 @@ OutputIterator move_impl(const std::string& label, const ExecutionSpace& ex,
   return d_first + num_elements;
 }
 
+template <class TeamHandleType, class InputIterator, class OutputIterator>
+KOKKOS_FUNCTION OutputIterator move_team_impl(const TeamHandleType& teamHandle,
+                                              InputIterator first,
+                                              InputIterator last,
+                                              OutputIterator d_first) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first, d_first);
+  Impl::static_assert_iterators_have_matching_difference_type(first, d_first);
+  Impl::expect_valid_range(first, last);
+
+  // aliases
+  using index_type = typename InputIterator::difference_type;
+  using func_t     = StdMoveFunctor<index_type, InputIterator, OutputIterator>;
+
+  // run
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_for(TeamThreadRange(teamHandle, 0, num_elements),
+                         func_t(first, d_first));
+  teamHandle.team_barrier();
+
+  // return
+  return d_first + num_elements;
+}
+
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp
index 9a28c3fb4a4df5f01b0e8cc7959297f2c4f46f2a..9075562d460e635ce9c6f2214ce8846ba1847baa 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp
@@ -27,48 +27,66 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
-template <class IndexType, class IteratorType1, class IteratorType2>
+template <class IteratorType1, class IteratorType2>
 struct StdMoveBackwardFunctor {
-  static_assert(std::is_signed<IndexType>::value,
+  using index_type = typename IteratorType1::difference_type;
+  static_assert(std::is_signed<index_type>::value,
                 "Kokkos: StdMoveBackwardFunctor requires signed index type");
 
   IteratorType1 m_last;
   IteratorType2 m_dest_last;
 
   KOKKOS_FUNCTION
-  void operator()(IndexType i) const {
+  void operator()(index_type i) const {
     m_dest_last[-i - 1] = std::move(m_last[-i - 1]);
   }
 
+  KOKKOS_FUNCTION
   StdMoveBackwardFunctor(IteratorType1 _last, IteratorType2 _dest_last)
       : m_last(std::move(_last)), m_dest_last(std::move(_dest_last)) {}
 };
 
 template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-IteratorType2 move_backward_impl(const std::string& label,
-                                 const ExecutionSpace& ex, IteratorType1 first,
-                                 IteratorType1 last, IteratorType2 d_last) {
+IteratorType2 move_backward_exespace_impl(const std::string& label,
+                                          const ExecutionSpace& ex,
+                                          IteratorType1 first,
+                                          IteratorType1 last,
+                                          IteratorType2 d_last) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first, d_last);
   Impl::static_assert_iterators_have_matching_difference_type(first, d_last);
   Impl::expect_valid_range(first, last);
 
-  // aliases
-  using index_type = typename IteratorType1::difference_type;
-  using func_t =
-      StdMoveBackwardFunctor<index_type, IteratorType1, IteratorType2>;
-
   // run
   const auto num_elements = Kokkos::Experimental::distance(first, last);
   ::Kokkos::parallel_for(label,
                          RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                         func_t(last, d_last));
+                         StdMoveBackwardFunctor(last, d_last));
   ex.fence("Kokkos::move_backward: fence after operation");
 
   // return
   return d_last - num_elements;
 }
 
+template <class TeamHandleType, class IteratorType1, class IteratorType2>
+KOKKOS_FUNCTION IteratorType2
+move_backward_team_impl(const TeamHandleType& teamHandle, IteratorType1 first,
+                        IteratorType1 last, IteratorType2 d_last) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first, d_last);
+  Impl::static_assert_iterators_have_matching_difference_type(first, d_last);
+  Impl::expect_valid_range(first, last);
+
+  // run
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_for(TeamThreadRange(teamHandle, 0, num_elements),
+                         StdMoveBackwardFunctor(last, d_last));
+  teamHandle.team_barrier();
+
+  // return
+  return d_last - num_elements;
+}
+
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MustUseKokkosSingleInTeam.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MustUseKokkosSingleInTeam.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..162c72c2db79802da919ed9cc08dbf688a3acacb
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MustUseKokkosSingleInTeam.hpp
@@ -0,0 +1,47 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_STD_ALGORITHMS_MUSTUSEKOKKOSSINGLEINTEAM_HPP
+#define KOKKOS_STD_ALGORITHMS_MUSTUSEKOKKOSSINGLEINTEAM_HPP
+
+#include <Kokkos_Core.hpp>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <typename T>
+struct stdalgo_must_use_kokkos_single_for_team_scan : std::false_type {};
+
+// the following do not support the overload for team-level scan
+// accepting an "out" value to store the scan result
+
+// FIXME_OPENACC
+#if defined(KOKKOS_ENABLE_OPENACC)
+template <>
+struct stdalgo_must_use_kokkos_single_for_team_scan<
+    Kokkos::Experimental::OpenACC> : std::true_type {};
+#endif
+
+template <typename T>
+inline constexpr bool stdalgo_must_use_kokkos_single_for_team_scan_v =
+    stdalgo_must_use_kokkos_single_for_team_scan<T>::value;
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_PartitionCopy.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_PartitionCopy.hpp
index 5457ae25084867af6c2b07a15b1d5e7b4c333ab8..35e9cfa53e2db695378e2926c7bc554fb65fa074 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_PartitionCopy.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_PartitionCopy.hpp
@@ -31,31 +31,13 @@ template <class ValueType>
 struct StdPartitionCopyScalar {
   ValueType true_count_;
   ValueType false_count_;
-
-  // Here we implement the copy assignment operators explicitly for consistency
-  // with how the Scalar structs are implemented inside
-  // Kokkos_Parallel_Reduce.hpp.
-  KOKKOS_FUNCTION
-  void operator=(const StdPartitionCopyScalar& other) {
-    true_count_  = other.true_count_;
-    false_count_ = other.false_count_;
-  }
-
-  // this is needed for
-  // OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp:699:21: error: no viable
-  // overloaded '=' m_returnvalue = 0;
-  //
-  KOKKOS_FUNCTION
-  void operator=(const ValueType value) {
-    true_count_  = value;
-    false_count_ = value;
-  }
 };
 
-template <class IndexType, class FirstFrom, class FirstDestTrue,
-          class FirstDestFalse, class PredType>
+template <class FirstFrom, class FirstDestTrue, class FirstDestFalse,
+          class PredType>
 struct StdPartitionCopyFunctor {
-  using value_type = StdPartitionCopyScalar<IndexType>;
+  using index_type = typename FirstFrom::difference_type;
+  using value_type = StdPartitionCopyScalar<index_type>;
 
   FirstFrom m_first_from;
   FirstDestTrue m_first_dest_true;
@@ -71,7 +53,7 @@ struct StdPartitionCopyFunctor {
         m_pred(std::move(pred)) {}
 
   KOKKOS_FUNCTION
-  void operator()(const IndexType i, value_type& update,
+  void operator()(const index_type i, value_type& update,
                   const bool final_pass) const {
     const auto& myval = m_first_from[i];
     if (final_pass) {
@@ -106,11 +88,12 @@ template <class ExecutionSpace, class InputIteratorType,
           class OutputIteratorTrueType, class OutputIteratorFalseType,
           class PredicateType>
 ::Kokkos::pair<OutputIteratorTrueType, OutputIteratorFalseType>
-partition_copy_impl(const std::string& label, const ExecutionSpace& ex,
-                    InputIteratorType from_first, InputIteratorType from_last,
-                    OutputIteratorTrueType to_first_true,
-                    OutputIteratorFalseType to_first_false,
-                    PredicateType pred) {
+partition_copy_exespace_impl(const std::string& label, const ExecutionSpace& ex,
+                             InputIteratorType from_first,
+                             InputIteratorType from_last,
+                             OutputIteratorTrueType to_first_true,
+                             OutputIteratorFalseType to_first_false,
+                             PredicateType pred) {
   // impl uses a scan, this is similar how we implemented copy_if
 
   // checks
@@ -124,12 +107,9 @@ partition_copy_impl(const std::string& label, const ExecutionSpace& ex,
     return {to_first_true, to_first_false};
   }
 
-  // aliases
-  using index_type = typename InputIteratorType::difference_type;
   using func_type =
-      StdPartitionCopyFunctor<index_type, InputIteratorType,
-                              OutputIteratorTrueType, OutputIteratorFalseType,
-                              PredicateType>;
+      StdPartitionCopyFunctor<InputIteratorType, OutputIteratorTrueType,
+                              OutputIteratorFalseType, PredicateType>;
 
   // run
   const auto num_elements =
@@ -145,6 +125,55 @@ partition_copy_impl(const std::string& label, const ExecutionSpace& ex,
           to_first_false + counts.false_count_};
 }
 
+template <class TeamHandleType, class InputIteratorType,
+          class OutputIteratorTrueType, class OutputIteratorFalseType,
+          class PredicateType>
+KOKKOS_FUNCTION ::Kokkos::pair<OutputIteratorTrueType, OutputIteratorFalseType>
+partition_copy_team_impl(const TeamHandleType& teamHandle,
+                         InputIteratorType from_first,
+                         InputIteratorType from_last,
+                         OutputIteratorTrueType to_first_true,
+                         OutputIteratorFalseType to_first_false,
+                         PredicateType pred) {
+  // impl uses a scan, this is similar how we implemented copy_if
+
+  // checks
+  Impl::static_assert_random_access_and_accessible(
+      teamHandle, from_first, to_first_true, to_first_false);
+  Impl::static_assert_iterators_have_matching_difference_type(
+      from_first, to_first_true, to_first_false);
+  Impl::expect_valid_range(from_first, from_last);
+
+  if (from_first == from_last) {
+    return {to_first_true, to_first_false};
+  }
+
+  const std::size_t num_elements =
+      Kokkos::Experimental::distance(from_first, from_last);
+
+  // FIXME: there is no parallel_scan overload that accepts TeamThreadRange and
+  // return_value, so temporarily serial implementation is used
+  using counts_t  = ::Kokkos::pair<std::size_t, std::size_t>;
+  counts_t counts = {};
+  Kokkos::single(
+      Kokkos::PerTeam(teamHandle),
+      [=](counts_t& lcounts) {
+        lcounts = {};
+        for (std::size_t i = 0; i < num_elements; ++i) {
+          const auto& myval = from_first[i];
+          if (pred(myval)) {
+            to_first_true[lcounts.first++] = myval;
+          } else {
+            to_first_false[lcounts.second++] = myval;
+          }
+        }
+      },
+      counts);
+  // no barrier needed since single above broadcasts to all members
+
+  return {to_first_true + counts.first, to_first_false + counts.second};
+}
+
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_PartitionPoint.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_PartitionPoint.hpp
index 2d0ae2aac615ae6953bd5a3c8ea6c47810f598f7..7ec7061e3166cb9509bb4fcac72aeda4e9cd3622 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_PartitionPoint.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_PartitionPoint.hpp
@@ -39,10 +39,13 @@ struct StdPartitionPointFunctor {
   KOKKOS_FUNCTION
   void operator()(const index_type i, red_value_type& redValue) const {
     const auto predicate_value = m_p(m_first[i]);
-    auto rv =
-        predicate_value
-            ? red_value_type{::Kokkos::reduction_identity<index_type>::min()}
-            : red_value_type{i};
+
+    // FIXME_NVHPC using a ternary operator causes problems
+    red_value_type rv = {i};
+    if (predicate_value) {
+      rv = {::Kokkos::reduction_identity<index_type>::min()};
+    }
+
     m_reducer.join(redValue, rv);
   }
 
@@ -55,9 +58,11 @@ struct StdPartitionPointFunctor {
 };
 
 template <class ExecutionSpace, class IteratorType, class PredicateType>
-IteratorType partition_point_impl(const std::string& label,
-                                  const ExecutionSpace& ex, IteratorType first,
-                                  IteratorType last, PredicateType pred) {
+IteratorType partition_point_exespace_impl(const std::string& label,
+                                           const ExecutionSpace& ex,
+                                           IteratorType first,
+                                           IteratorType last,
+                                           PredicateType pred) {
   // locates the end of the first partition, that is, the first
   // element that does not satisfy p or last if all elements satisfy p.
   // Implementation below finds the first location where p is false.
@@ -97,6 +102,43 @@ IteratorType partition_point_impl(const std::string& label,
   }
 }
 
+template <class TeamHandleType, class IteratorType, class PredicateType>
+KOKKOS_FUNCTION IteratorType
+partition_point_team_impl(const TeamHandleType& teamHandle, IteratorType first,
+                          IteratorType last, PredicateType pred) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first);
+  Impl::expect_valid_range(first, last);
+
+  if (first == last) {
+    return first;
+  }
+
+  // aliases
+  using index_type           = typename IteratorType::difference_type;
+  using reducer_type         = StdPartitionPoint<index_type>;
+  using reduction_value_type = typename reducer_type::value_type;
+
+  // run
+  reduction_value_type red_result;
+  reducer_type reducer(red_result);
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_reduce(TeamThreadRange(teamHandle, 0, num_elements),
+                            StdPartitionPointFunctor(first, reducer, pred),
+                            reducer);
+
+  // fence not needed because reducing into scalar
+
+  // decide and return
+  if (red_result.min_loc_false ==
+      ::Kokkos::reduction_identity<index_type>::min()) {
+    // if all elements are true, return last
+    return last;
+  } else {
+    return first + red_result.min_loc_false;
+  }
+}
+
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reduce.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reduce.hpp
index 45a0de3727d8407e12d465d0bdd59943ea117490..ab01cdd80485205a5a5d07180fa7cb07a406660f 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reduce.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reduce.hpp
@@ -72,16 +72,22 @@ struct StdReduceFunctor {
       : m_first(std::move(first)), m_reducer(std::move(reducer)) {}
 };
 
-//------------------------------
-// reduce_custom_functors_impl
-//------------------------------
+template <typename ValueType>
+using has_reduction_identity_sum_t =
+    decltype(Kokkos::reduction_identity<ValueType>::sum());
+
+//
+// exespace impl
+//
+
+//-------------------------------------
+// reduce_custom_functors_exespace_impl
+//-------------------------------------
 template <class ExecutionSpace, class IteratorType, class ValueType,
           class JoinerType>
-ValueType reduce_custom_functors_impl(const std::string& label,
-                                      const ExecutionSpace& ex,
-                                      IteratorType first, IteratorType last,
-                                      ValueType init_reduction_value,
-                                      JoinerType joiner) {
+ValueType reduce_custom_functors_exespace_impl(
+    const std::string& label, const ExecutionSpace& ex, IteratorType first,
+    IteratorType last, ValueType init_reduction_value, JoinerType joiner) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first);
   Impl::static_assert_is_not_openmptarget(ex);
@@ -95,7 +101,6 @@ ValueType reduce_custom_functors_impl(const std::string& label,
   // aliases
   using reducer_type =
       ReducerWithArbitraryJoinerNoNeutralElement<ValueType, JoinerType>;
-  using functor_type         = StdReduceFunctor<IteratorType, reducer_type>;
   using reduction_value_type = typename reducer_type::value_type;
 
   // run
@@ -104,21 +109,16 @@ ValueType reduce_custom_functors_impl(const std::string& label,
   const auto num_elements = Kokkos::Experimental::distance(first, last);
   ::Kokkos::parallel_reduce(label,
                             RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                            functor_type(first, reducer), reducer);
+                            StdReduceFunctor(first, reducer), reducer);
 
   // fence not needed since reducing into scalar
   return joiner(result.val, init_reduction_value);
 }
 
-template <typename ValueType>
-using has_reduction_identity_sum_t =
-    decltype(Kokkos::reduction_identity<ValueType>::sum());
-
 template <class ExecutionSpace, class IteratorType, class ValueType>
-ValueType reduce_default_functors_impl(const std::string& label,
-                                       const ExecutionSpace& ex,
-                                       IteratorType first, IteratorType last,
-                                       ValueType init_reduction_value) {
+ValueType reduce_default_functors_exespace_impl(
+    const std::string& label, const ExecutionSpace& ex, IteratorType first,
+    IteratorType last, ValueType init_reduction_value) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first);
   Impl::static_assert_is_not_openmptarget(ex);
@@ -146,11 +146,88 @@ ValueType reduce_default_functors_impl(const std::string& label,
     return tmp;
   } else {
     using joiner_type = Impl::StdReduceDefaultJoinFunctor<value_type>;
-    return reduce_custom_functors_impl(
+    return reduce_custom_functors_exespace_impl(
         label, ex, first, last, std::move(init_reduction_value), joiner_type());
   }
 }
 
+//
+// team impl
+//
+
+//---------------------------------
+// reduce_custom_functors_team_impl
+//---------------------------------
+template <class TeamHandleType, class IteratorType, class ValueType,
+          class JoinerType>
+KOKKOS_FUNCTION ValueType reduce_custom_functors_team_impl(
+    const TeamHandleType& teamHandle, IteratorType first, IteratorType last,
+    ValueType init_reduction_value, JoinerType joiner) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first);
+  Impl::static_assert_is_not_openmptarget(teamHandle);
+  Impl::expect_valid_range(first, last);
+
+  if (first == last) {
+    // init is returned, unmodified
+    return init_reduction_value;
+  }
+
+  // aliases
+  using reducer_type =
+      ReducerWithArbitraryJoinerNoNeutralElement<ValueType, JoinerType>;
+  using reduction_value_type = typename reducer_type::value_type;
+
+  // run
+  reduction_value_type result;
+  reducer_type reducer(result, joiner);
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_reduce(TeamThreadRange(teamHandle, 0, num_elements),
+                            StdReduceFunctor(first, reducer), reducer);
+
+  teamHandle.team_barrier();
+
+  return joiner(result.val, init_reduction_value);
+}
+
+template <class TeamHandleType, class IteratorType, class ValueType>
+KOKKOS_FUNCTION ValueType reduce_default_functors_team_impl(
+    const TeamHandleType& teamHandle, IteratorType first, IteratorType last,
+    ValueType init_reduction_value) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first);
+  Impl::static_assert_is_not_openmptarget(teamHandle);
+  Impl::expect_valid_range(first, last);
+
+  using value_type = Kokkos::Impl::remove_cvref_t<ValueType>;
+
+  if (::Kokkos::is_detected<has_reduction_identity_sum_t, value_type>::value) {
+    if (first == last) {
+      // init is returned, unmodified
+      return init_reduction_value;
+    }
+
+    using functor_type =
+        Impl::StdReduceDefaultFunctor<IteratorType, value_type>;
+
+    // run
+    value_type tmp;
+    const auto num_elements = Kokkos::Experimental::distance(first, last);
+    ::Kokkos::parallel_reduce(TeamThreadRange(teamHandle, 0, num_elements),
+                              functor_type{first}, tmp);
+
+    teamHandle.team_barrier();
+
+    tmp += init_reduction_value;
+    return tmp;
+  } else {
+    using joiner_type = Impl::StdReduceDefaultJoinFunctor<value_type>;
+    return reduce_custom_functors_team_impl(teamHandle, first, last,
+                                            std::move(init_reduction_value),
+                                            joiner_type());
+  }
+}
+
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp
index bda647019959c519ab29b1aeb89058d777b2f2b1..50224c8874ed19ebe7041d6f0be8a15e2c5001cb 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp
@@ -76,10 +76,14 @@ struct StdRemoveIfStage2Functor {
   }
 };
 
+//
+// remove if
+//
 template <class ExecutionSpace, class IteratorType, class UnaryPredicateType>
-IteratorType remove_if_impl(const std::string& label, const ExecutionSpace& ex,
-                            IteratorType first, IteratorType last,
-                            UnaryPredicateType pred) {
+IteratorType remove_if_exespace_impl(const std::string& label,
+                                     const ExecutionSpace& ex,
+                                     IteratorType first, IteratorType last,
+                                     UnaryPredicateType pred) {
   Impl::static_assert_random_access_and_accessible(ex, first);
   Impl::expect_valid_range(first, last);
 
@@ -139,19 +143,71 @@ IteratorType remove_if_impl(const std::string& label, const ExecutionSpace& ex,
   }
 }
 
+template <class TeamHandleType, class IteratorType, class UnaryPredicateType>
+KOKKOS_FUNCTION IteratorType
+remove_if_team_impl(const TeamHandleType& teamHandle, IteratorType first,
+                    IteratorType last, UnaryPredicateType pred) {
+  Impl::static_assert_random_access_and_accessible(teamHandle, first);
+  Impl::expect_valid_range(first, last);
+
+  if (first == last) {
+    return last;
+  } else {
+    const auto remove_count =
+        ::Kokkos::Experimental::count_if(teamHandle, first, last, pred);
+    const std::size_t num_elements =
+        ::Kokkos::Experimental::distance(first, last);
+
+    if (remove_count > 0) {
+      std::size_t count = 0;
+      Kokkos::single(
+          Kokkos::PerTeam(teamHandle),
+          [=](std::size_t& lcount) {
+            lcount = 0;
+            for (std::size_t i = 0; i < num_elements; ++i) {
+              if (!pred(first[i])) {
+                first[lcount++] = std::move(first[i]);
+              }
+            }
+          },
+          count);
+    }
+    // no barrier needed since single above broadcasts to all members
+
+    return first + num_elements - remove_count;
+  }
+}
+
+//
+// remove
+//
 template <class ExecutionSpace, class IteratorType, class ValueType>
-auto remove_impl(const std::string& label, const ExecutionSpace& ex,
-                 IteratorType first, IteratorType last,
-                 const ValueType& value) {
+auto remove_exespace_impl(const std::string& label, const ExecutionSpace& ex,
+                          IteratorType first, IteratorType last,
+                          const ValueType& value) {
+  using predicate_type = StdAlgoEqualsValUnaryPredicate<ValueType>;
+  return remove_if_exespace_impl(label, ex, first, last, predicate_type(value));
+}
+
+template <class TeamHandleType, class IteratorType, class ValueType>
+KOKKOS_FUNCTION auto remove_team_impl(const TeamHandleType& teamHandle,
+                                      IteratorType first, IteratorType last,
+                                      const ValueType& value) {
   using predicate_type = StdAlgoEqualsValUnaryPredicate<ValueType>;
-  return remove_if_impl(label, ex, first, last, predicate_type(value));
+  return remove_if_team_impl(teamHandle, first, last, predicate_type(value));
 }
 
+//
+// remove_copy
+//
 template <class ExecutionSpace, class InputIteratorType,
           class OutputIteratorType, class ValueType>
-auto remove_copy_impl(const std::string& label, const ExecutionSpace& ex,
-                      InputIteratorType first_from, InputIteratorType last_from,
-                      OutputIteratorType first_dest, const ValueType& value) {
+auto remove_copy_exespace_impl(const std::string& label,
+                               const ExecutionSpace& ex,
+                               InputIteratorType first_from,
+                               InputIteratorType last_from,
+                               OutputIteratorType first_dest,
+                               const ValueType& value) {
   // this is like copy_if except that we need to *ignore* the elements
   // that match the value, so we can solve this as follows:
 
@@ -160,13 +216,32 @@ auto remove_copy_impl(const std::string& label, const ExecutionSpace& ex,
                                          first_dest, predicate_type(value));
 }
 
+template <class TeamHandleType, class InputIteratorType,
+          class OutputIteratorType, class ValueType>
+KOKKOS_FUNCTION auto remove_copy_team_impl(const TeamHandleType& teamHandle,
+                                           InputIteratorType first_from,
+                                           InputIteratorType last_from,
+                                           OutputIteratorType first_dest,
+                                           const ValueType& value) {
+  // this is like copy_if except that we need to *ignore* the elements
+  // that match the value, so we can solve this as follows:
+
+  using predicate_type = StdAlgoNotEqualsValUnaryPredicate<ValueType>;
+  return ::Kokkos::Experimental::copy_if(teamHandle, first_from, last_from,
+                                         first_dest, predicate_type(value));
+}
+
+//
+// remove_copy_if
+//
 template <class ExecutionSpace, class InputIteratorType,
           class OutputIteratorType, class UnaryPredicate>
-auto remove_copy_if_impl(const std::string& label, const ExecutionSpace& ex,
-                         InputIteratorType first_from,
-                         InputIteratorType last_from,
-                         OutputIteratorType first_dest,
-                         const UnaryPredicate& pred) {
+auto remove_copy_if_exespace_impl(const std::string& label,
+                                  const ExecutionSpace& ex,
+                                  InputIteratorType first_from,
+                                  InputIteratorType last_from,
+                                  OutputIteratorType first_dest,
+                                  const UnaryPredicate& pred) {
   // this is like copy_if except that we need to *ignore* the elements
   // satisfying the pred, so we can solve this as follows:
 
@@ -177,6 +252,20 @@ auto remove_copy_if_impl(const std::string& label, const ExecutionSpace& ex,
                                          first_dest, pred_wrapper_type(pred));
 }
 
+template <class TeamHandleType, class InputIteratorType,
+          class OutputIteratorType, class UnaryPredicate>
+KOKKOS_FUNCTION auto remove_copy_if_team_impl(const TeamHandleType& teamHandle,
+                                              InputIteratorType first_from,
+                                              InputIteratorType last_from,
+                                              OutputIteratorType first_dest,
+                                              const UnaryPredicate& pred) {
+  using value_type = typename InputIteratorType::value_type;
+  using pred_wrapper_type =
+      StdAlgoNegateUnaryPredicateWrapper<value_type, UnaryPredicate>;
+  return ::Kokkos::Experimental::copy_if(teamHandle, first_from, last_from,
+                                         first_dest, pred_wrapper_type(pred));
+}
+
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Replace.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Replace.hpp
index e3a6b538f4b6810f1b758b79c6df8bc727c63b38..5b5e4147d6b1aa40c1d2cde82214d9795c86aef1 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Replace.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Replace.hpp
@@ -50,24 +50,38 @@ struct StdReplaceFunctor {
 };
 
 template <class ExecutionSpace, class IteratorType, class ValueType>
-void replace_impl(const std::string& label, const ExecutionSpace& ex,
-                  IteratorType first, IteratorType last,
-                  const ValueType& old_value, const ValueType& new_value) {
+void replace_exespace_impl(const std::string& label, const ExecutionSpace& ex,
+                           IteratorType first, IteratorType last,
+                           const ValueType& old_value,
+                           const ValueType& new_value) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first);
   Impl::expect_valid_range(first, last);
 
-  // aliases
-  using func_t = StdReplaceFunctor<IteratorType, ValueType>;
-
   // run
   const auto num_elements = Kokkos::Experimental::distance(first, last);
   ::Kokkos::parallel_for(label,
                          RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                         func_t(first, old_value, new_value));
+                         StdReplaceFunctor(first, old_value, new_value));
   ex.fence("Kokkos::replace: fence after operation");
 }
 
+template <class TeamHandleType, class IteratorType, class ValueType>
+KOKKOS_FUNCTION void replace_team_impl(const TeamHandleType& teamHandle,
+                                       IteratorType first, IteratorType last,
+                                       const ValueType& old_value,
+                                       const ValueType& new_value) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first);
+  Impl::expect_valid_range(first, last);
+
+  // run
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_for(TeamThreadRange(teamHandle, 0, num_elements),
+                         StdReplaceFunctor(first, old_value, new_value));
+  teamHandle.team_barrier();
+}
+
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReplaceCopy.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReplaceCopy.hpp
index 729cf8931155daf8ce2a130de6e97f123ac1b7bd..61ffa9fd93d88ccf6ed55f5819ef123026a67fab 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReplaceCopy.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReplaceCopy.hpp
@@ -58,35 +58,56 @@ struct StdReplaceCopyFunctor {
 
 template <class ExecutionSpace, class InputIteratorType,
           class OutputIteratorType, class ValueType>
-OutputIteratorType replace_copy_impl(const std::string& label,
-                                     const ExecutionSpace& ex,
-                                     InputIteratorType first_from,
-                                     InputIteratorType last_from,
-                                     OutputIteratorType first_dest,
-                                     const ValueType& old_value,
-                                     const ValueType& new_value) {
+OutputIteratorType replace_copy_exespace_impl(const std::string& label,
+                                              const ExecutionSpace& ex,
+                                              InputIteratorType first_from,
+                                              InputIteratorType last_from,
+                                              OutputIteratorType first_dest,
+                                              const ValueType& old_value,
+                                              const ValueType& new_value) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
   Impl::static_assert_iterators_have_matching_difference_type(first_from,
                                                               first_dest);
   Impl::expect_valid_range(first_from, last_from);
 
-  // aliases
-  using func_t =
-      StdReplaceCopyFunctor<InputIteratorType, OutputIteratorType, ValueType>;
-
   // run
   const auto num_elements =
       Kokkos::Experimental::distance(first_from, last_from);
-  ::Kokkos::parallel_for(label,
-                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                         func_t(first_from, first_dest, old_value, new_value));
+  ::Kokkos::parallel_for(
+      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+      StdReplaceCopyFunctor(first_from, first_dest, old_value, new_value));
   ex.fence("Kokkos::replace_copy: fence after operation");
 
   // return
   return first_dest + num_elements;
 }
 
+template <class TeamHandleType, class InputIteratorType,
+          class OutputIteratorType, class ValueType>
+KOKKOS_FUNCTION OutputIteratorType replace_copy_team_impl(
+    const TeamHandleType& teamHandle, InputIteratorType first_from,
+    InputIteratorType last_from, OutputIteratorType first_dest,
+    const ValueType& old_value, const ValueType& new_value) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first_from,
+                                                   first_dest);
+  Impl::static_assert_iterators_have_matching_difference_type(first_from,
+                                                              first_dest);
+  Impl::expect_valid_range(first_from, last_from);
+
+  // run
+  const auto num_elements =
+      Kokkos::Experimental::distance(first_from, last_from);
+  ::Kokkos::parallel_for(
+      TeamThreadRange(teamHandle, 0, num_elements),
+      StdReplaceCopyFunctor(first_from, first_dest, old_value, new_value));
+  teamHandle.team_barrier();
+
+  // return
+  return first_dest + num_elements;
+}
+
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReplaceCopyIf.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReplaceCopyIf.hpp
index dca7e7f6a5173d042d15930926f37ae0b52f32d6..2cc38d1d0e72a515877398be7fba495d0dab00ed 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReplaceCopyIf.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReplaceCopyIf.hpp
@@ -27,16 +27,18 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
-template <class IndexType, class InputIterator, class OutputIterator,
-          class PredicateType, class ValueType>
+template <class InputIterator, class OutputIterator, class PredicateType,
+          class ValueType>
 struct StdReplaceIfCopyFunctor {
+  using index_type = typename InputIterator::difference_type;
+
   InputIterator m_first_from;
   OutputIterator m_first_dest;
   PredicateType m_pred;
   ValueType m_new_value;
 
   KOKKOS_FUNCTION
-  void operator()(IndexType i) const {
+  void operator()(index_type i) const {
     const auto& myvalue_from = m_first_from[i];
 
     if (m_pred(myvalue_from)) {
@@ -57,37 +59,62 @@ struct StdReplaceIfCopyFunctor {
 
 template <class ExecutionSpace, class InputIteratorType,
           class OutputIteratorType, class PredicateType, class ValueType>
-OutputIteratorType replace_copy_if_impl(const std::string& label,
-                                        const ExecutionSpace& ex,
-                                        InputIteratorType first_from,
-                                        InputIteratorType last_from,
-                                        OutputIteratorType first_dest,
-                                        PredicateType pred,
-                                        const ValueType& new_value) {
+OutputIteratorType replace_copy_if_exespace_impl(const std::string& label,
+                                                 const ExecutionSpace& ex,
+                                                 InputIteratorType first_from,
+                                                 InputIteratorType last_from,
+                                                 OutputIteratorType first_dest,
+                                                 PredicateType pred,
+                                                 const ValueType& new_value) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
   Impl::static_assert_iterators_have_matching_difference_type(first_from,
                                                               first_dest);
   Impl::expect_valid_range(first_from, last_from);
 
-  // aliases
-  using index_type = typename InputIteratorType::difference_type;
-  using func_t =
-      StdReplaceIfCopyFunctor<index_type, InputIteratorType, OutputIteratorType,
-                              PredicateType, ValueType>;
-
   // run
   const auto num_elements =
       Kokkos::Experimental::distance(first_from, last_from);
-  ::Kokkos::parallel_for(
-      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-      func_t(first_from, first_dest, std::move(pred), new_value));
+  ::Kokkos::parallel_for(label,
+                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                         // use CTAD
+                         StdReplaceIfCopyFunctor(first_from, first_dest,
+                                                 std::move(pred), new_value));
   ex.fence("Kokkos::replace_copy_if: fence after operation");
 
   // return
   return first_dest + num_elements;
 }
 
+//
+// team-level impl
+//
+template <class TeamHandleType, class InputIteratorType,
+          class OutputIteratorType, class PredicateType, class ValueType>
+KOKKOS_FUNCTION OutputIteratorType replace_copy_if_team_impl(
+    const TeamHandleType& teamHandle, InputIteratorType first_from,
+    InputIteratorType last_from, OutputIteratorType first_dest,
+    PredicateType pred, const ValueType& new_value) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first_from,
+                                                   first_dest);
+  Impl::static_assert_iterators_have_matching_difference_type(first_from,
+                                                              first_dest);
+  Impl::expect_valid_range(first_from, last_from);
+
+  // run
+  const auto num_elements =
+      Kokkos::Experimental::distance(first_from, last_from);
+  ::Kokkos::parallel_for(TeamThreadRange(teamHandle, 0, num_elements),
+                         // use CTAD
+                         StdReplaceIfCopyFunctor(first_from, first_dest,
+                                                 std::move(pred), new_value));
+  teamHandle.team_barrier();
+
+  // return
+  return first_dest + num_elements;
+}
+
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReplaceIf.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReplaceIf.hpp
index 54b6e1c5287d863eb58ad2aaf371397db806c328..61e8abf44cb46577ec74b56f1c0e93a3e540adb2 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReplaceIf.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReplaceIf.hpp
@@ -52,24 +52,40 @@ struct StdReplaceIfFunctor {
 
 template <class ExecutionSpace, class IteratorType, class PredicateType,
           class ValueType>
-void replace_if_impl(const std::string& label, const ExecutionSpace& ex,
-                     IteratorType first, IteratorType last, PredicateType pred,
-                     const ValueType& new_value) {
+void replace_if_exespace_impl(const std::string& label,
+                              const ExecutionSpace& ex, IteratorType first,
+                              IteratorType last, PredicateType pred,
+                              const ValueType& new_value) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first);
   Impl::expect_valid_range(first, last);
 
-  // aliases
-  using func_t = StdReplaceIfFunctor<IteratorType, PredicateType, ValueType>;
-
   // run
   const auto num_elements = Kokkos::Experimental::distance(first, last);
-  ::Kokkos::parallel_for(label,
-                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                         func_t(first, std::move(pred), new_value));
+  ::Kokkos::parallel_for(
+      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+      StdReplaceIfFunctor(first, std::move(pred), new_value));
   ex.fence("Kokkos::replace_if: fence after operation");
 }
 
+template <class TeamHandleType, class IteratorType, class PredicateType,
+          class ValueType>
+KOKKOS_FUNCTION void replace_if_team_impl(const TeamHandleType& teamHandle,
+                                          IteratorType first, IteratorType last,
+                                          PredicateType pred,
+                                          const ValueType& new_value) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first);
+  Impl::expect_valid_range(first, last);
+
+  // run
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_for(
+      TeamThreadRange(teamHandle, 0, num_elements),
+      StdReplaceIfFunctor(first, std::move(pred), new_value));
+  teamHandle.team_barrier();
+}
+
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp
index a4aaba26b9866780fc127924c040954b13c3459a..428dc0d744a40f97f73ccd120120bb6795f57643 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp
@@ -39,43 +39,50 @@ struct StdReverseFunctor {
 
   KOKKOS_FUNCTION
   void operator()(index_type i) const {
-    // the swap below is doing the same thing, but
-    // for Intel 18.0.5 does not work.
-    // But putting the impl directly here, it works.
-#ifdef KOKKOS_COMPILER_INTEL
-    typename InputIterator::value_type tmp = std::move(m_first[i]);
-    m_first[i]                             = std::move(m_last[-i - 1]);
-    m_last[-i - 1]                         = std::move(tmp);
-#else
     ::Kokkos::Experimental::swap(m_first[i], m_last[-i - 1]);
-#endif
   }
 
+  KOKKOS_FUNCTION
   StdReverseFunctor(InputIterator first, InputIterator last)
       : m_first(std::move(first)), m_last(std::move(last)) {}
 };
 
 template <class ExecutionSpace, class InputIterator>
-void reverse_impl(const std::string& label, const ExecutionSpace& ex,
-                  InputIterator first, InputIterator last) {
+void reverse_exespace_impl(const std::string& label, const ExecutionSpace& ex,
+                           InputIterator first, InputIterator last) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first);
   Impl::expect_valid_range(first, last);
 
-  // aliases
-  using func_t = StdReverseFunctor<InputIterator>;
-
   // run
   if (last >= first + 2) {
     // only need half
     const auto num_elements = Kokkos::Experimental::distance(first, last) / 2;
     ::Kokkos::parallel_for(label,
                            RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                           func_t(first, last));
+                           StdReverseFunctor(first, last));
     ex.fence("Kokkos::reverse: fence after operation");
   }
 }
 
+template <class TeamHandleType, class InputIterator>
+KOKKOS_FUNCTION void reverse_team_impl(const TeamHandleType& teamHandle,
+                                       InputIterator first,
+                                       InputIterator last) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first);
+  Impl::expect_valid_range(first, last);
+
+  // run
+  if (last >= first + 2) {
+    // only need half
+    const auto num_elements = Kokkos::Experimental::distance(first, last) / 2;
+    ::Kokkos::parallel_for(TeamThreadRange(teamHandle, 0, num_elements),
+                           StdReverseFunctor(first, last));
+    teamHandle.team_barrier();
+  }
+}
+
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp
index 6dd52813e050f717f6a2ce8cb03ef99fb9ee9530..dd20d90e399536fff3dfe939216c58e7aca0ed5d 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp
@@ -27,46 +27,64 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
-template <class IndexType, class InputIterator, class OutputIterator>
+template <class InputIterator, class OutputIterator>
 struct StdReverseCopyFunctor {
-  static_assert(std::is_signed<IndexType>::value,
+  using index_type = typename InputIterator::difference_type;
+  static_assert(std::is_signed<index_type>::value,
                 "Kokkos: StdReverseCopyFunctor requires signed index type");
 
   InputIterator m_last;
   OutputIterator m_dest_first;
 
   KOKKOS_FUNCTION
-  void operator()(IndexType i) const { m_dest_first[i] = m_last[-1 - i]; }
+  void operator()(index_type i) const { m_dest_first[i] = m_last[-1 - i]; }
 
+  KOKKOS_FUNCTION
   StdReverseCopyFunctor(InputIterator _last, OutputIterator _dest_first)
       : m_last(std::move(_last)), m_dest_first(std::move(_dest_first)) {}
 };
 
 template <class ExecutionSpace, class InputIterator, class OutputIterator>
-OutputIterator reverse_copy_impl(const std::string& label,
-                                 const ExecutionSpace& ex, InputIterator first,
-                                 InputIterator last, OutputIterator d_first) {
+OutputIterator reverse_copy_exespace_impl(const std::string& label,
+                                          const ExecutionSpace& ex,
+                                          InputIterator first,
+                                          InputIterator last,
+                                          OutputIterator d_first) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first, d_first);
   Impl::static_assert_iterators_have_matching_difference_type(first, d_first);
   Impl::expect_valid_range(first, last);
 
-  // aliases
-  using index_type = typename InputIterator::difference_type;
-  using func_t =
-      StdReverseCopyFunctor<index_type, InputIterator, OutputIterator>;
-
   // run
   const auto num_elements = Kokkos::Experimental::distance(first, last);
   ::Kokkos::parallel_for(label,
                          RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                         func_t(last, d_first));
+                         StdReverseCopyFunctor(last, d_first));
   ex.fence("Kokkos::reverse_copy: fence after operation");
 
   // return
   return d_first + num_elements;
 }
 
+template <class TeamHandleType, class InputIterator, class OutputIterator>
+KOKKOS_FUNCTION OutputIterator
+reverse_copy_team_impl(const TeamHandleType& teamHandle, InputIterator first,
+                       InputIterator last, OutputIterator d_first) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first, d_first);
+  Impl::static_assert_iterators_have_matching_difference_type(first, d_first);
+  Impl::expect_valid_range(first, last);
+
+  // run
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_for(TeamThreadRange(teamHandle, 0, num_elements),
+                         StdReverseCopyFunctor(last, d_first));
+  teamHandle.team_barrier();
+
+  // return
+  return d_first + num_elements;
+}
+
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Rotate.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Rotate.hpp
index 3aa1ab5d405125bb13a820cc3ab17e3c552c52e1..7a4cb8e3253bf112e2d0dabf3208f6343d44d55b 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Rotate.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Rotate.hpp
@@ -20,6 +20,7 @@
 #include <Kokkos_Core.hpp>
 #include "Kokkos_Constraints.hpp"
 #include "Kokkos_HelperPredicates.hpp"
+#include "Kokkos_Reverse.hpp"
 #include <std_algorithms/Kokkos_Move.hpp>
 #include <std_algorithms/Kokkos_Distance.hpp>
 #include <string>
@@ -165,15 +166,17 @@ IteratorType rotate_with_pivot_in_right_half(const std::string& label,
 }
 
 template <class ExecutionSpace, class IteratorType>
-IteratorType rotate_impl(const std::string& label, const ExecutionSpace& ex,
-                         IteratorType first, IteratorType n_first,
-                         IteratorType last) {
+IteratorType rotate_exespace_impl(const std::string& label,
+                                  const ExecutionSpace& ex, IteratorType first,
+                                  IteratorType n_first, IteratorType last) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first);
   Impl::expect_valid_range(first, last);
   Impl::expect_valid_range(first, n_first);
   Impl::expect_valid_range(n_first, last);
 
+  // might be worth checking if for exespace we should do
+  // something similar to what we do for team since it avoids a new allocation
   namespace KE                     = ::Kokkos::Experimental;
   const auto num_elements          = KE::distance(first, last);
   const auto n_distance_from_first = KE::distance(first, n_first);
@@ -184,6 +187,31 @@ IteratorType rotate_impl(const std::string& label, const ExecutionSpace& ex,
   }
 }
 
+template <class TeamHandleType, class IteratorType>
+KOKKOS_FUNCTION IteratorType rotate_team_impl(const TeamHandleType& teamHandle,
+                                              IteratorType first,
+                                              IteratorType n_first,
+                                              IteratorType last) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first);
+  Impl::expect_valid_range(first, last);
+  Impl::expect_valid_range(first, n_first);
+  Impl::expect_valid_range(n_first, last);
+
+  namespace KE = ::Kokkos::Experimental;
+
+  auto result = first + (last - n_first);
+  // first reverse the whole range
+  KE::Impl::reverse_team_impl(teamHandle, first, last);
+  // re-reverse each piece
+  KE::Impl::reverse_team_impl(teamHandle, first, result);
+  KE::Impl::reverse_team_impl(teamHandle, result, last);
+
+  // no need for barrier here since reverse already calls it
+
+  return result;
+}
+
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RotateCopy.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RotateCopy.hpp
index 28023cc4dfa4cd70b2dc71edac90605c97d560a5..f22dae2de482f2e0cdde129d2a786fdadd5f16d0 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RotateCopy.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RotateCopy.hpp
@@ -27,16 +27,18 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
-template <class IndexType, class InputIterator, class OutputIterator>
+template <class InputIterator, class OutputIterator>
 struct StdRotateCopyFunctor {
+  using index_type = typename InputIterator::difference_type;
+
   InputIterator m_first;
   InputIterator m_last;
   InputIterator m_first_n;
   OutputIterator m_dest_first;
 
   KOKKOS_FUNCTION
-  void operator()(IndexType i) const {
-    const IndexType shift = m_last - m_first_n;
+  void operator()(index_type i) const {
+    const index_type shift = m_last - m_first_n;
 
     if (i < shift) {
       m_dest_first[i] = m_first_n[i];
@@ -45,6 +47,7 @@ struct StdRotateCopyFunctor {
     }
   }
 
+  KOKKOS_FUNCTION
   StdRotateCopyFunctor(InputIterator first, InputIterator last,
                        InputIterator first_n, OutputIterator dest_first)
       : m_first(std::move(first)),
@@ -54,10 +57,9 @@ struct StdRotateCopyFunctor {
 };
 
 template <class ExecutionSpace, class InputIterator, class OutputIterator>
-OutputIterator rotate_copy_impl(const std::string& label,
-                                const ExecutionSpace& ex, InputIterator first,
-                                InputIterator n_first, InputIterator last,
-                                OutputIterator d_first) {
+OutputIterator rotate_copy_exespace_impl(
+    const std::string& label, const ExecutionSpace& ex, InputIterator first,
+    InputIterator n_first, InputIterator last, OutputIterator d_first) {
   /*
     algorithm is implemented as follows:
 
@@ -97,16 +99,11 @@ OutputIterator rotate_copy_impl(const std::string& label,
     return d_first;
   }
 
-  // aliases
-  using index_type = typename InputIterator::difference_type;
-  using func_type =
-      StdRotateCopyFunctor<index_type, InputIterator, OutputIterator>;
-
   // run
   const auto num_elements = Kokkos::Experimental::distance(first, last);
   ::Kokkos::parallel_for(label,
                          RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                         func_type(first, last, n_first, d_first));
+                         StdRotateCopyFunctor(first, last, n_first, d_first));
 
   ex.fence("Kokkos::rotate_copy: fence after operation");
 
@@ -114,6 +111,32 @@ OutputIterator rotate_copy_impl(const std::string& label,
   return d_first + num_elements;
 }
 
+template <class TeamHandleType, class InputIterator, class OutputIterator>
+KOKKOS_FUNCTION OutputIterator rotate_copy_team_impl(
+    const TeamHandleType& teamHandle, InputIterator first,
+    InputIterator n_first, InputIterator last, OutputIterator d_first) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first, d_first);
+  Impl::static_assert_iterators_have_matching_difference_type(first, d_first);
+  Impl::expect_valid_range(first, last);
+  Impl::expect_valid_range(first, n_first);
+  Impl::expect_valid_range(n_first, last);
+
+  if (first == last) {
+    return d_first;
+  }
+
+  // run
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_for(TeamThreadRange(teamHandle, 0, num_elements),
+                         StdRotateCopyFunctor(first, last, n_first, d_first));
+
+  teamHandle.team_barrier();
+
+  // return
+  return d_first + num_elements;
+}
+
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Search.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Search.hpp
index a612a57231f561b5556c8e4fcac315bb504ff099..fa04350eb52b9943aed8428f8618f269c28ae2b0 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Search.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Search.hpp
@@ -60,9 +60,11 @@ struct StdSearchFunctor {
       }
     }
 
-    const auto rv =
-        found ? red_value_type{i}
-              : red_value_type{::Kokkos::reduction_identity<IndexType>::min()};
+    // FIXME_NVHPC using a ternary operator causes problems
+    red_value_type rv = {::Kokkos::reduction_identity<IndexType>::min()};
+    if (found) {
+      rv = {i};
+    }
 
     m_reducer.join(red_value, rv);
   }
@@ -79,12 +81,16 @@ struct StdSearchFunctor {
         m_p(std::move(p)) {}
 };
 
+//
+// exespace impl
+//
 template <class ExecutionSpace, class IteratorType1, class IteratorType2,
           class BinaryPredicateType>
-IteratorType1 search_impl(const std::string& label, const ExecutionSpace& ex,
-                          IteratorType1 first, IteratorType1 last,
-                          IteratorType2 s_first, IteratorType2 s_last,
-                          const BinaryPredicateType& pred) {
+IteratorType1 search_exespace_impl(const std::string& label,
+                                   const ExecutionSpace& ex,
+                                   IteratorType1 first, IteratorType1 last,
+                                   IteratorType2 s_first, IteratorType2 s_last,
+                                   const BinaryPredicateType& pred) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first, s_first);
   Impl::static_assert_iterators_have_matching_difference_type(first, s_first);
@@ -96,7 +102,6 @@ IteratorType1 search_impl(const std::string& label, const ExecutionSpace& ex,
   const auto num_elements = KE::distance(first, last);
   const auto s_count      = KE::distance(s_first, s_last);
   KOKKOS_EXPECTS(num_elements >= s_count);
-  (void)s_count;  // needed when macro above is a no-op
 
   if (s_first == s_last) {
     return first;
@@ -108,7 +113,8 @@ IteratorType1 search_impl(const std::string& label, const ExecutionSpace& ex,
 
   // special case where the two ranges have equal size
   if (num_elements == s_count) {
-    const auto equal_result = equal_impl(label, ex, first, last, s_first, pred);
+    const auto equal_result =
+        equal_exespace_impl(label, ex, first, last, s_first, pred);
     return (equal_result) ? first : last;
   } else {
     using index_type           = typename IteratorType1::difference_type;
@@ -147,13 +153,99 @@ IteratorType1 search_impl(const std::string& label, const ExecutionSpace& ex,
 }
 
 template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-IteratorType1 search_impl(const std::string& label, const ExecutionSpace& ex,
-                          IteratorType1 first, IteratorType1 last,
-                          IteratorType2 s_first, IteratorType2 s_last) {
+IteratorType1 search_exespace_impl(const std::string& label,
+                                   const ExecutionSpace& ex,
+                                   IteratorType1 first, IteratorType1 last,
+                                   IteratorType2 s_first,
+                                   IteratorType2 s_last) {
+  using value_type1    = typename IteratorType1::value_type;
+  using value_type2    = typename IteratorType2::value_type;
+  using predicate_type = StdAlgoEqualBinaryPredicate<value_type1, value_type2>;
+  return search_exespace_impl(label, ex, first, last, s_first, s_last,
+                              predicate_type());
+}
+
+//
+// team impl
+//
+template <class TeamHandleType, class IteratorType1, class IteratorType2,
+          class BinaryPredicateType>
+KOKKOS_FUNCTION IteratorType1
+search_team_impl(const TeamHandleType& teamHandle, IteratorType1 first,
+                 IteratorType1 last, IteratorType2 s_first,
+                 IteratorType2 s_last, const BinaryPredicateType& pred) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first, s_first);
+  Impl::static_assert_iterators_have_matching_difference_type(first, s_first);
+  Impl::expect_valid_range(first, last);
+  Impl::expect_valid_range(s_first, s_last);
+
+  // the target sequence should not be larger than the range [first, last)
+  namespace KE            = ::Kokkos::Experimental;
+  const auto num_elements = KE::distance(first, last);
+  const auto s_count      = KE::distance(s_first, s_last);
+  KOKKOS_EXPECTS(num_elements >= s_count);
+
+  if (s_first == s_last) {
+    return first;
+  }
+
+  if (first == last) {
+    return last;
+  }
+
+  // special case where the two ranges have equal size
+  if (num_elements == s_count) {
+    const auto equal_result =
+        equal_team_impl(teamHandle, first, last, s_first, pred);
+    return (equal_result) ? first : last;
+  } else {
+    using index_type           = typename IteratorType1::difference_type;
+    using reducer_type         = FirstLoc<index_type>;
+    using reduction_value_type = typename reducer_type::value_type;
+    using func_t = StdSearchFunctor<index_type, IteratorType1, IteratorType2,
+                                    reducer_type, BinaryPredicateType>;
+
+    // run
+    reduction_value_type red_result;
+    reducer_type reducer(red_result);
+
+    // decide the size of the range policy of the par_red:
+    // note that the last feasible index to start looking is the index
+    // whose distance from the "last" is equal to the sequence count.
+    // the +1 is because we need to include that location too.
+    const auto range_size = num_elements - s_count + 1;
+
+    // run par reduce
+    ::Kokkos::parallel_reduce(
+        TeamThreadRange(teamHandle, 0, range_size),
+        func_t(first, last, s_first, s_last, reducer, pred), reducer);
+
+    teamHandle.team_barrier();
+
+    // decide and return
+    if (red_result.min_loc_true ==
+        ::Kokkos::reduction_identity<index_type>::min()) {
+      // location has not been found
+      return last;
+    } else {
+      // location has been found
+      return first + red_result.min_loc_true;
+    }
+  }
+}
+
+template <class TeamHandleType, class IteratorType1, class IteratorType2>
+KOKKOS_FUNCTION IteratorType1 search_team_impl(const TeamHandleType& teamHandle,
+                                               IteratorType1 first,
+                                               IteratorType1 last,
+                                               IteratorType2 s_first,
+                                               IteratorType2 s_last) {
   using value_type1    = typename IteratorType1::value_type;
   using value_type2    = typename IteratorType2::value_type;
   using predicate_type = StdAlgoEqualBinaryPredicate<value_type1, value_type2>;
-  return search_impl(label, ex, first, last, s_first, s_last, predicate_type());
+  return search_team_impl(teamHandle, first, last, s_first, s_last,
+                          predicate_type());
 }
 
 }  // namespace Impl
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_SearchN.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_SearchN.hpp
index 0d3b6bc7060f15bd6646479465687e4f618d6cf6..0910f952c0cf7e992f85a54875ca4991b3a1058a 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_SearchN.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_SearchN.hpp
@@ -59,9 +59,11 @@ struct StdSearchNFunctor {
       }
     }
 
-    const auto rv =
-        found ? red_value_type{i}
-              : red_value_type{::Kokkos::reduction_identity<IndexType>::min()};
+    // FIXME_NVHPC using a ternary operator causes problems
+    red_value_type rv = {::Kokkos::reduction_identity<IndexType>::min()};
+    if (found) {
+      rv.min_loc_true = i;
+    }
 
     m_reducer.join(red_value, rv);
   }
@@ -77,12 +79,16 @@ struct StdSearchNFunctor {
         m_p(std::move(p)) {}
 };
 
+//
+// exespace impl
+//
 template <class ExecutionSpace, class IteratorType, class SizeType,
           class ValueType, class BinaryPredicateType>
-IteratorType search_n_impl(const std::string& label, const ExecutionSpace& ex,
-                           IteratorType first, IteratorType last,
-                           SizeType count, const ValueType& value,
-                           const BinaryPredicateType& pred) {
+IteratorType search_n_exespace_impl(const std::string& label,
+                                    const ExecutionSpace& ex,
+                                    IteratorType first, IteratorType last,
+                                    SizeType count, const ValueType& value,
+                                    const BinaryPredicateType& pred) {
   // checks
   static_assert_random_access_and_accessible(ex, first);
   expect_valid_range(first, last);
@@ -102,7 +108,7 @@ IteratorType search_n_impl(const std::string& label, const ExecutionSpace& ex,
   if ((std::size_t)num_elements == (std::size_t)count) {
     using equal_to_value = StdAlgoEqualsValUnaryPredicate<ValueType>;
     const auto satisfies =
-        all_of_impl(label, ex, first, last, equal_to_value(value));
+        all_of_exespace_impl(label, ex, first, last, equal_to_value(value));
     return (satisfies) ? first : last;
   } else {
     // aliases
@@ -144,9 +150,106 @@ IteratorType search_n_impl(const std::string& label, const ExecutionSpace& ex,
 
 template <class ExecutionSpace, class IteratorType, class SizeType,
           class ValueType>
-IteratorType search_n_impl(const std::string& label, const ExecutionSpace& ex,
-                           IteratorType first, IteratorType last,
-                           SizeType count, const ValueType& value) {
+IteratorType search_n_exespace_impl(const std::string& label,
+                                    const ExecutionSpace& ex,
+                                    IteratorType first, IteratorType last,
+                                    SizeType count, const ValueType& value) {
+  using iter_value_type = typename IteratorType::value_type;
+  using predicate_type =
+      StdAlgoEqualBinaryPredicate<iter_value_type, ValueType>;
+
+  /* above we use <iter_value_type, ValueType> for the predicate_type
+     to be consistent with the standard, which says:
+
+     "
+     The signature of the predicate function should be equivalent to:
+
+        bool pred(const Type1 &a, const Type2 &b);
+
+     The type Type1 must be such that an object of type ForwardIt can be
+     dereferenced and then implicitly converted to Type1. The type Type2 must be
+     such that an object of type T can be implicitly converted to Type2.
+     "
+
+     In our case, IteratorType = ForwardIt, and ValueType = T.
+   */
+
+  return search_n_exespace_impl(label, ex, first, last, count, value,
+                                predicate_type());
+}
+
+//
+// team impl
+//
+template <class TeamHandleType, class IteratorType, class SizeType,
+          class ValueType, class BinaryPredicateType>
+KOKKOS_FUNCTION IteratorType search_n_team_impl(
+    const TeamHandleType& teamHandle, IteratorType first, IteratorType last,
+    SizeType count, const ValueType& value, const BinaryPredicateType& pred) {
+  // checks
+  static_assert_random_access_and_accessible(teamHandle, first);
+  expect_valid_range(first, last);
+  KOKKOS_EXPECTS((std::ptrdiff_t)count >= 0);
+
+  // count should not be larger than the range [first, last)
+  namespace KE            = ::Kokkos::Experimental;
+  const auto num_elements = KE::distance(first, last);
+  // cast things to avoid compiler warning
+  KOKKOS_EXPECTS((std::size_t)num_elements >= (std::size_t)count);
+
+  if (first == last) {
+    return first;
+  }
+
+  // special case where num elements in [first, last) == count
+  if ((std::size_t)num_elements == (std::size_t)count) {
+    using equal_to_value = StdAlgoEqualsValUnaryPredicate<ValueType>;
+    const auto satisfies =
+        all_of_team_impl(teamHandle, first, last, equal_to_value(value));
+    return (satisfies) ? first : last;
+  } else {
+    // aliases
+    using index_type           = typename IteratorType::difference_type;
+    using reducer_type         = FirstLoc<index_type>;
+    using reduction_value_type = typename reducer_type::value_type;
+    using func_t =
+        StdSearchNFunctor<index_type, IteratorType, SizeType, ValueType,
+                          reducer_type, BinaryPredicateType>;
+
+    // run
+    reduction_value_type red_result;
+    reducer_type reducer(red_result);
+
+    // decide the size of the range policy of the par_red:
+    // the last feasible index to start looking is the index
+    // whose distance from the "last" is equal to count.
+    // the +1 is because we need to include that location too.
+    const auto range_size = num_elements - count + 1;
+
+    // run par reduce
+    ::Kokkos::parallel_reduce(TeamThreadRange(teamHandle, 0, range_size),
+                              func_t(first, last, count, value, reducer, pred),
+                              reducer);
+
+    teamHandle.team_barrier();
+
+    // decide and return
+    if (red_result.min_loc_true ==
+        ::Kokkos::reduction_identity<index_type>::min()) {
+      // location has not been found
+      return last;
+    } else {
+      // location has been found
+      return first + red_result.min_loc_true;
+    }
+  }
+}
+
+template <class TeamHandleType, class IteratorType, class SizeType,
+          class ValueType>
+KOKKOS_FUNCTION IteratorType
+search_n_team_impl(const TeamHandleType& teamHandle, IteratorType first,
+                   IteratorType last, SizeType count, const ValueType& value) {
   using iter_value_type = typename IteratorType::value_type;
   using predicate_type =
       StdAlgoEqualBinaryPredicate<iter_value_type, ValueType>;
@@ -167,7 +270,8 @@ IteratorType search_n_impl(const std::string& label, const ExecutionSpace& ex,
      In our case, IteratorType = ForwardIt, and ValueType = T.
    */
 
-  return search_n_impl(label, ex, first, last, count, value, predicate_type());
+  return search_n_team_impl(teamHandle, first, last, count, value,
+                            predicate_type());
 }
 
 }  // namespace Impl
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp
index 7a33762f176062bbee756fafebeef0df5219541f..50bc7c8d610aebd3980cceca6c1b73ee8f06112f 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp
@@ -29,9 +29,9 @@ namespace Experimental {
 namespace Impl {
 
 template <class ExecutionSpace, class IteratorType>
-IteratorType shift_left_impl(const std::string& label, const ExecutionSpace& ex,
-                             IteratorType first, IteratorType last,
-                             typename IteratorType::difference_type n) {
+IteratorType shift_left_exespace_impl(
+    const std::string& label, const ExecutionSpace& ex, IteratorType first,
+    IteratorType last, typename IteratorType::difference_type n) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first);
   Impl::expect_valid_range(first, last);
@@ -104,6 +104,40 @@ IteratorType shift_left_impl(const std::string& label, const ExecutionSpace& ex,
   return last - n;
 }
 
+template <class TeamHandleType, class IteratorType>
+KOKKOS_FUNCTION IteratorType shift_left_team_impl(
+    const TeamHandleType& teamHandle, IteratorType first, IteratorType last,
+    typename IteratorType::difference_type n) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first);
+  Impl::expect_valid_range(first, last);
+  KOKKOS_EXPECTS(n >= 0);
+
+  // handle trivial cases
+  if (n == 0) {
+    return last;
+  }
+
+  if (n >= Kokkos::Experimental::distance(first, last)) {
+    return first;
+  }
+
+  // we cannot use here a new allocation like we do for the
+  // execution space impl because for this team impl we are
+  // within a parallel region, so for now we solve serially
+
+  const std::size_t numElementsToMove =
+      ::Kokkos::Experimental::distance(first + n, last);
+  Kokkos::single(Kokkos::PerTeam(teamHandle), [=]() {
+    for (std::size_t i = 0; i < numElementsToMove; ++i) {
+      first[i] = std::move(first[i + n]);
+    }
+  });
+  teamHandle.team_barrier();
+
+  return last - n;
+}
+
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp
index 2b0a4bb524ebea2f528001c0986b361a541d05c1..cac20bfbba6af9d402c3f79c46c00c07e6ac7fb6 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp
@@ -29,10 +29,9 @@ namespace Experimental {
 namespace Impl {
 
 template <class ExecutionSpace, class IteratorType>
-IteratorType shift_right_impl(const std::string& label,
-                              const ExecutionSpace& ex, IteratorType first,
-                              IteratorType last,
-                              typename IteratorType::difference_type n) {
+IteratorType shift_right_exespace_impl(
+    const std::string& label, const ExecutionSpace& ex, IteratorType first,
+    IteratorType last, typename IteratorType::difference_type n) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first);
   Impl::expect_valid_range(first, last);
@@ -104,6 +103,60 @@ IteratorType shift_right_impl(const std::string& label,
   return first + n;
 }
 
+template <class Iterator>
+struct StdShiftRightTeamSingleFunctor {
+  Iterator m_first;
+  Iterator m_last;
+  std::size_t m_shift;
+
+  KOKKOS_FUNCTION
+  void operator()() const {
+    // the impl function calling this functor guarantees that
+    // - m_shift is non-negative
+    // - m_first, m_last identify a valid range with m_last > m_first
+    // - m_shift is less than m_last - m_first
+    // so I can safely use std::size_t here
+  }
+
+  KOKKOS_FUNCTION
+  StdShiftRightTeamSingleFunctor(Iterator _first, Iterator _last, std::size_t n)
+      : m_first(std::move(_first)), m_last(std::move(_last)), m_shift(n) {}
+};
+
+template <class TeamHandleType, class IteratorType>
+KOKKOS_FUNCTION IteratorType shift_right_team_impl(
+    const TeamHandleType& teamHandle, IteratorType first, IteratorType last,
+    typename IteratorType::difference_type n) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first);
+  Impl::expect_valid_range(first, last);
+  KOKKOS_EXPECTS(n >= 0);
+
+  // handle trivial cases
+  if (n == 0) {
+    return first;
+  }
+
+  if (n >= Kokkos::Experimental::distance(first, last)) {
+    return last;
+  }
+
+  // we cannot use here a new allocation like we do for the
+  // execution space impl because for this team impl we are
+  // within a parallel region, so for now we solve serially
+
+  const std::size_t numElementsToMove =
+      ::Kokkos::Experimental::distance(first, last - n);
+  Kokkos::single(Kokkos::PerTeam(teamHandle), [=]() {
+    for (std::size_t i = 0; i < numElementsToMove; ++i) {
+      last[-i - 1] = std::move(last[-n - i - 1]);
+    }
+  });
+  teamHandle.team_barrier();
+
+  return first + n;
+}
+
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp
index 438acb989f9fc924ebb7bf2308b43fa440c1e115..5bc77ed7ddcdbbe5ab6b8d260a489a0f569b1f42 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp
@@ -28,23 +28,15 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
-template <class IndexType, class IteratorType1, class IteratorType2>
+template <class IteratorType1, class IteratorType2>
 struct StdSwapRangesFunctor {
+  using index_type = typename IteratorType1::difference_type;
   IteratorType1 m_first1;
   IteratorType2 m_first2;
 
   KOKKOS_FUNCTION
-  void operator()(IndexType i) const {
-    // the swap below is doing the same thing, but
-    // for Intel 18.0.5 does not work.
-    // But putting the impl directly here, it works.
-#ifdef KOKKOS_COMPILER_INTEL
-    typename IteratorType1::value_type tmp = std::move(m_first1[i]);
-    m_first1[i]                            = std::move(m_first2[i]);
-    m_first2[i]                            = std::move(tmp);
-#else
+  void operator()(index_type i) const {
     ::Kokkos::Experimental::swap(m_first1[i], m_first2[i]);
-#endif
   }
 
   KOKKOS_FUNCTION
@@ -53,30 +45,48 @@ struct StdSwapRangesFunctor {
 };
 
 template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-IteratorType2 swap_ranges_impl(const std::string& label,
-                               const ExecutionSpace& ex, IteratorType1 first1,
-                               IteratorType1 last1, IteratorType2 first2) {
+IteratorType2 swap_ranges_exespace_impl(const std::string& label,
+                                        const ExecutionSpace& ex,
+                                        IteratorType1 first1,
+                                        IteratorType1 last1,
+                                        IteratorType2 first2) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first1, first2);
   Impl::static_assert_iterators_have_matching_difference_type(first1, first2);
   Impl::expect_valid_range(first1, last1);
 
-  // aliases
-  using index_type = typename IteratorType1::difference_type;
-  using func_t = StdSwapRangesFunctor<index_type, IteratorType1, IteratorType2>;
-
   // run
   const auto num_elements_to_swap =
       Kokkos::Experimental::distance(first1, last1);
   ::Kokkos::parallel_for(
       label, RangePolicy<ExecutionSpace>(ex, 0, num_elements_to_swap),
-      func_t(first1, first2));
+      StdSwapRangesFunctor(first1, first2));
   ex.fence("Kokkos::swap_ranges: fence after operation");
 
   // return
   return first2 + num_elements_to_swap;
 }
 
+template <class TeamHandleType, class IteratorType1, class IteratorType2>
+KOKKOS_FUNCTION IteratorType2
+swap_ranges_team_impl(const TeamHandleType& teamHandle, IteratorType1 first1,
+                      IteratorType1 last1, IteratorType2 first2) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first1, first2);
+  Impl::static_assert_iterators_have_matching_difference_type(first1, first2);
+  Impl::expect_valid_range(first1, last1);
+
+  // run
+  const auto num_elements_to_swap =
+      Kokkos::Experimental::distance(first1, last1);
+  ::Kokkos::parallel_for(TeamThreadRange(teamHandle, 0, num_elements_to_swap),
+                         StdSwapRangesFunctor(first1, first2));
+  teamHandle.team_barrier();
+
+  // return
+  return first2 + num_elements_to_swap;
+}
+
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Transform.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Transform.hpp
index 840e9d205441a5d20692c347f5a3e6b1bddc2eaa..c7e0573324379216db732f466b5496977bbd8672 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Transform.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Transform.hpp
@@ -27,15 +27,19 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
-template <class IndexType, class InputIterator, class OutputIterator,
-          class UnaryFunctorType>
+template <class InputIterator, class OutputIterator, class UnaryFunctorType>
 struct StdTransformFunctor {
+  // we can use difference type from InputIterator since
+  // the impl functions calling this functor already
+  // static assert that the iterators have matching difference type
+  using index_type = typename InputIterator::difference_type;
+
   InputIterator m_first;
   OutputIterator m_d_first;
   UnaryFunctorType m_unary_op;
 
   KOKKOS_FUNCTION
-  void operator()(IndexType i) const { m_d_first[i] = m_unary_op(m_first[i]); }
+  void operator()(index_type i) const { m_d_first[i] = m_unary_op(m_first[i]); }
 
   KOKKOS_FUNCTION
   StdTransformFunctor(InputIterator _first, OutputIterator _m_d_first,
@@ -45,16 +49,21 @@ struct StdTransformFunctor {
         m_unary_op(std::move(_functor)) {}
 };
 
-template <class IndexType, class InputIterator1, class InputIterator2,
-          class OutputIterator, class BinaryFunctorType>
+template <class InputIterator1, class InputIterator2, class OutputIterator,
+          class BinaryFunctorType>
 struct StdTransformBinaryFunctor {
+  // we can use difference type from InputIterator1 since
+  // the impl functions calling this functor already
+  // static assert that the iterators have matching difference type
+  using index_type = typename InputIterator1::difference_type;
+
   InputIterator1 m_first1;
   InputIterator2 m_first2;
   OutputIterator m_d_first;
   BinaryFunctorType m_binary_op;
 
   KOKKOS_FUNCTION
-  void operator()(IndexType i) const {
+  void operator()(index_type i) const {
     m_d_first[i] = m_binary_op(m_first1[i], m_first2[i]);
   }
 
@@ -70,25 +79,19 @@ struct StdTransformBinaryFunctor {
 
 template <class ExecutionSpace, class InputIterator, class OutputIterator,
           class UnaryOperation>
-OutputIterator transform_impl(const std::string& label,
-                              const ExecutionSpace& ex, InputIterator first1,
-                              InputIterator last1, OutputIterator d_first,
-                              UnaryOperation unary_op) {
+OutputIterator transform_exespace_impl(
+    const std::string& label, const ExecutionSpace& ex, InputIterator first1,
+    InputIterator last1, OutputIterator d_first, UnaryOperation unary_op) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first1, d_first);
   Impl::static_assert_iterators_have_matching_difference_type(first1, d_first);
   Impl::expect_valid_range(first1, last1);
 
-  // aliases
-  using index_type = typename InputIterator::difference_type;
-  using func_t = StdTransformFunctor<index_type, InputIterator, OutputIterator,
-                                     UnaryOperation>;
-
   // run
   const auto num_elements = Kokkos::Experimental::distance(first1, last1);
   ::Kokkos::parallel_for(label,
                          RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                         func_t(first1, d_first, unary_op));
+                         StdTransformFunctor(first1, d_first, unary_op));
   ex.fence("Kokkos::transform: fence after operation");
 
   // return
@@ -97,32 +100,72 @@ OutputIterator transform_impl(const std::string& label,
 
 template <class ExecutionSpace, class InputIterator1, class InputIterator2,
           class OutputIterator, class BinaryOperation>
-OutputIterator transform_impl(const std::string& label,
-                              const ExecutionSpace& ex, InputIterator1 first1,
-                              InputIterator1 last1, InputIterator2 first2,
-                              OutputIterator d_first,
-                              BinaryOperation binary_op) {
+OutputIterator transform_exespace_impl(
+    const std::string& label, const ExecutionSpace& ex, InputIterator1 first1,
+    InputIterator1 last1, InputIterator2 first2, OutputIterator d_first,
+    BinaryOperation binary_op) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first1, first2, d_first);
   Impl::static_assert_iterators_have_matching_difference_type(first1, first2,
                                                               d_first);
   Impl::expect_valid_range(first1, last1);
 
-  // aliases
-  using index_type = typename InputIterator1::difference_type;
-  using func_t =
-      StdTransformBinaryFunctor<index_type, InputIterator1, InputIterator2,
-                                OutputIterator, BinaryOperation>;
-
   // run
   const auto num_elements = Kokkos::Experimental::distance(first1, last1);
-  ::Kokkos::parallel_for(label,
-                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                         func_t(first1, first2, d_first, binary_op));
+  ::Kokkos::parallel_for(
+      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+      StdTransformBinaryFunctor(first1, first2, d_first, binary_op));
   ex.fence("Kokkos::transform: fence after operation");
   return d_first + num_elements;
 }
 
+//
+// team-level impl
+//
+
+template <class TeamHandleType, class InputIterator, class OutputIterator,
+          class UnaryOperation>
+KOKKOS_FUNCTION OutputIterator transform_team_impl(
+    const TeamHandleType& teamHandle, InputIterator first1, InputIterator last1,
+    OutputIterator d_first, UnaryOperation unary_op) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first1, d_first);
+  Impl::static_assert_iterators_have_matching_difference_type(first1, d_first);
+  Impl::expect_valid_range(first1, last1);
+
+  // run
+  const auto num_elements = Kokkos::Experimental::distance(first1, last1);
+  ::Kokkos::parallel_for(TeamThreadRange(teamHandle, 0, num_elements),
+                         StdTransformFunctor(first1, d_first, unary_op));
+  teamHandle.team_barrier();
+
+  // return
+  return d_first + num_elements;
+}
+
+template <class TeamHandleType, class InputIterator1, class InputIterator2,
+          class OutputIterator, class BinaryOperation>
+KOKKOS_FUNCTION OutputIterator
+transform_team_impl(const TeamHandleType& teamHandle, InputIterator1 first1,
+                    InputIterator1 last1, InputIterator2 first2,
+                    OutputIterator d_first, BinaryOperation binary_op) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first1, first2,
+                                                   d_first);
+  Impl::static_assert_iterators_have_matching_difference_type(first1, first2,
+                                                              d_first);
+  Impl::expect_valid_range(first1, last1);
+
+  // run
+  const auto num_elements = Kokkos::Experimental::distance(first1, last1);
+  ::Kokkos::parallel_for(
+      TeamThreadRange(teamHandle, 0, num_elements),
+      StdTransformBinaryFunctor(first1, first2, d_first, binary_op));
+  teamHandle.team_barrier();
+
+  return d_first + num_elements;
+}
+
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_TransformExclusiveScan.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_TransformExclusiveScan.hpp
index 773e8c2f883556b53d85ed0750cc675422774d76..d832f8849d1c67e4c505a762372d50b478f0db9a 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_TransformExclusiveScan.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_TransformExclusiveScan.hpp
@@ -21,6 +21,7 @@
 #include "Kokkos_Constraints.hpp"
 #include "Kokkos_HelperPredicates.hpp"
 #include "Kokkos_ValueWrapperForNoNeutralElement.hpp"
+#include "Kokkos_FunctorsForExclusiveScan.hpp"
 #include <std_algorithms/Kokkos_Distance.hpp>
 #include <string>
 
@@ -28,67 +29,13 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
-template <class ExeSpace, class IndexType, class ValueType, class FirstFrom,
-          class FirstDest, class BinaryOpType, class UnaryOpType>
-struct TransformExclusiveScanFunctor {
-  using execution_space = ExeSpace;
-  using value_type =
-      ::Kokkos::Experimental::Impl::ValueWrapperForNoNeutralElement<ValueType>;
-
-  ValueType m_init_value;
-  FirstFrom m_first_from;
-  FirstDest m_first_dest;
-  BinaryOpType m_binary_op;
-  UnaryOpType m_unary_op;
-
-  KOKKOS_FUNCTION
-  TransformExclusiveScanFunctor(ValueType init, FirstFrom first_from,
-                                FirstDest first_dest, BinaryOpType bop,
-                                UnaryOpType uop)
-      : m_init_value(std::move(init)),
-        m_first_from(std::move(first_from)),
-        m_first_dest(std::move(first_dest)),
-        m_binary_op(std::move(bop)),
-        m_unary_op(std::move(uop)) {}
-
-  KOKKOS_FUNCTION
-  void operator()(const IndexType i, value_type& update,
-                  const bool final_pass) const {
-    if (final_pass) {
-      if (i == 0) {
-        // for both ExclusiveScan and TransformExclusiveScan,
-        // init is unmodified
-        m_first_dest[i] = m_init_value;
-      } else {
-        m_first_dest[i] = m_binary_op(update.val, m_init_value);
-      }
-    }
-
-    const auto tmp = value_type{m_unary_op(m_first_from[i]), false};
-    this->join(update, tmp);
-  }
-
-  KOKKOS_FUNCTION
-  void init(value_type& update) const {
-    update.val        = {};
-    update.is_initial = true;
-  }
-
-  KOKKOS_FUNCTION
-  void join(value_type& update, const value_type& input) const {
-    if (update.is_initial) {
-      update.val = input.val;
-    } else {
-      update.val = m_binary_op(update.val, input.val);
-    }
-    update.is_initial = false;
-  }
-};
-
+//
+// exespace impl
+//
 template <class ExecutionSpace, class InputIteratorType,
           class OutputIteratorType, class ValueType, class BinaryOpType,
           class UnaryOpType>
-OutputIteratorType transform_exclusive_scan_impl(
+OutputIteratorType transform_exclusive_scan_exespace_impl(
     const std::string& label, const ExecutionSpace& ex,
     InputIteratorType first_from, InputIteratorType last_from,
     OutputIteratorType first_dest, ValueType init_value, BinaryOpType bop,
@@ -101,23 +48,70 @@ OutputIteratorType transform_exclusive_scan_impl(
 
   // aliases
   using index_type = typename InputIteratorType::difference_type;
-  using func_type =
-      TransformExclusiveScanFunctor<ExecutionSpace, index_type, ValueType,
-                                    InputIteratorType, OutputIteratorType,
-                                    BinaryOpType, UnaryOpType>;
+
+  using func_type = std::conditional_t<
+      ::Kokkos::is_detected<ex_scan_has_reduction_identity_sum_t,
+                            ValueType>::value,
+      TransformExclusiveScanFunctorWithoutValueWrapper<
+          ExecutionSpace, index_type, ValueType, InputIteratorType,
+          OutputIteratorType, BinaryOpType, UnaryOpType>,
+      TransformExclusiveScanFunctorWithValueWrapper<
+          ExecutionSpace, index_type, ValueType, InputIteratorType,
+          OutputIteratorType, BinaryOpType, UnaryOpType> >;
 
   // run
   const auto num_elements =
       Kokkos::Experimental::distance(first_from, last_from);
   ::Kokkos::parallel_scan(
       label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-      func_type(init_value, first_from, first_dest, bop, uop));
+      func_type(std::move(init_value), first_from, first_dest, bop, uop));
   ex.fence("Kokkos::transform_exclusive_scan: fence after operation");
 
   // return
   return first_dest + num_elements;
 }
 
+//
+// team impl
+//
+template <class TeamHandleType, class InputIteratorType,
+          class OutputIteratorType, class ValueType, class BinaryOpType,
+          class UnaryOpType>
+KOKKOS_FUNCTION OutputIteratorType transform_exclusive_scan_team_impl(
+    const TeamHandleType& teamHandle, InputIteratorType first_from,
+    InputIteratorType last_from, OutputIteratorType first_dest,
+    ValueType init_value, BinaryOpType bop, UnaryOpType uop) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first_from,
+                                                   first_dest);
+  Impl::static_assert_iterators_have_matching_difference_type(first_from,
+                                                              first_dest);
+  Impl::expect_valid_range(first_from, last_from);
+
+  static_assert(
+      ::Kokkos::is_detected_v<ex_scan_has_reduction_identity_sum_t, ValueType>,
+      "The team-level impl of Kokkos::Experimental::transform_exclusive_scan "
+      "currently does not support types without reduction identity");
+
+  // aliases
+  using exe_space  = typename TeamHandleType::execution_space;
+  using index_type = typename InputIteratorType::difference_type;
+  using func_type  = TransformExclusiveScanFunctorWithoutValueWrapper<
+      exe_space, index_type, ValueType, InputIteratorType, OutputIteratorType,
+      BinaryOpType, UnaryOpType>;
+
+  // run
+  const auto num_elements =
+      Kokkos::Experimental::distance(first_from, last_from);
+  ::Kokkos::parallel_scan(
+      TeamThreadRange(teamHandle, 0, num_elements),
+      func_type(std::move(init_value), first_from, first_dest, bop, uop));
+  teamHandle.team_barrier();
+
+  // return
+  return first_dest + num_elements;
+}
+
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_TransformInclusiveScan.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_TransformInclusiveScan.hpp
index 9dde2b0fb125d3f88806e3bf4437d1bedc947617..dc432573ee3564aa28286582ebc48e587fa0081e 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_TransformInclusiveScan.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_TransformInclusiveScan.hpp
@@ -31,7 +31,7 @@ namespace Impl {
 
 template <class ExeSpace, class IndexType, class ValueType, class FirstFrom,
           class FirstDest, class BinaryOpType, class UnaryOpType>
-struct TransformInclusiveScanNoInitValueFunctor {
+struct ExeSpaceTransformInclusiveScanNoInitValueFunctor {
   using execution_space = ExeSpace;
   using value_type      = ValueWrapperForNoNeutralElement<ValueType>;
 
@@ -41,9 +41,10 @@ struct TransformInclusiveScanNoInitValueFunctor {
   UnaryOpType m_unary_op;
 
   KOKKOS_FUNCTION
-  TransformInclusiveScanNoInitValueFunctor(FirstFrom first_from,
-                                           FirstDest first_dest,
-                                           BinaryOpType bop, UnaryOpType uop)
+  ExeSpaceTransformInclusiveScanNoInitValueFunctor(FirstFrom first_from,
+                                                   FirstDest first_dest,
+                                                   BinaryOpType bop,
+                                                   UnaryOpType uop)
       : m_first_from(std::move(first_from)),
         m_first_dest(std::move(first_dest)),
         m_binary_op(std::move(bop)),
@@ -67,6 +68,8 @@ struct TransformInclusiveScanNoInitValueFunctor {
 
   KOKKOS_FUNCTION
   void join(value_type& update, const value_type& input) const {
+    if (input.is_initial) return;
+
     if (update.is_initial) {
       update.val = input.val;
     } else {
@@ -78,7 +81,7 @@ struct TransformInclusiveScanNoInitValueFunctor {
 
 template <class ExeSpace, class IndexType, class ValueType, class FirstFrom,
           class FirstDest, class BinaryOpType, class UnaryOpType>
-struct TransformInclusiveScanWithInitValueFunctor {
+struct ExeSpaceTransformInclusiveScanWithInitValueFunctor {
   using execution_space = ExeSpace;
   using value_type      = ValueWrapperForNoNeutralElement<ValueType>;
 
@@ -89,10 +92,11 @@ struct TransformInclusiveScanWithInitValueFunctor {
   ValueType m_init;
 
   KOKKOS_FUNCTION
-  TransformInclusiveScanWithInitValueFunctor(FirstFrom first_from,
-                                             FirstDest first_dest,
-                                             BinaryOpType bop, UnaryOpType uop,
-                                             ValueType init)
+  ExeSpaceTransformInclusiveScanWithInitValueFunctor(FirstFrom first_from,
+                                                     FirstDest first_dest,
+                                                     BinaryOpType bop,
+                                                     UnaryOpType uop,
+                                                     ValueType init)
       : m_first_from(std::move(first_from)),
         m_first_dest(std::move(first_dest)),
         m_binary_op(std::move(bop)),
@@ -118,6 +122,8 @@ struct TransformInclusiveScanWithInitValueFunctor {
 
   KOKKOS_FUNCTION
   void join(value_type& update, const value_type& input) const {
+    if (input.is_initial) return;
+
     if (update.is_initial) {
       update.val = input.val;
     } else {
@@ -127,18 +133,20 @@ struct TransformInclusiveScanWithInitValueFunctor {
   }
 };
 
+//
+// exespace impl
+//
+
 // -------------------------------------------------------------
-// transform_inclusive_scan_impl without init_value
+// transform_inclusive_scan_exespace_impl without init_value
 // -------------------------------------------------------------
 template <class ExecutionSpace, class InputIteratorType,
           class OutputIteratorType, class BinaryOpType, class UnaryOpType>
-OutputIteratorType transform_inclusive_scan_impl(const std::string& label,
-                                                 const ExecutionSpace& ex,
-                                                 InputIteratorType first_from,
-                                                 InputIteratorType last_from,
-                                                 OutputIteratorType first_dest,
-                                                 BinaryOpType binary_op,
-                                                 UnaryOpType unary_op) {
+OutputIteratorType transform_inclusive_scan_exespace_impl(
+    const std::string& label, const ExecutionSpace& ex,
+    InputIteratorType first_from, InputIteratorType last_from,
+    OutputIteratorType first_dest, BinaryOpType binary_op,
+    UnaryOpType unary_op) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
   Impl::static_assert_iterators_have_matching_difference_type(first_from,
@@ -149,7 +157,7 @@ OutputIteratorType transform_inclusive_scan_impl(const std::string& label,
   using index_type = typename InputIteratorType::difference_type;
   using value_type =
       std::remove_const_t<typename InputIteratorType::value_type>;
-  using func_type = TransformInclusiveScanNoInitValueFunctor<
+  using func_type = ExeSpaceTransformInclusiveScanNoInitValueFunctor<
       ExecutionSpace, index_type, value_type, InputIteratorType,
       OutputIteratorType, BinaryOpType, UnaryOpType>;
 
@@ -166,12 +174,12 @@ OutputIteratorType transform_inclusive_scan_impl(const std::string& label,
 }
 
 // -------------------------------------------------------------
-// transform_inclusive_scan_impl with init_value
+// transform_inclusive_scan_exespace_impl with init_value
 // -------------------------------------------------------------
 template <class ExecutionSpace, class InputIteratorType,
           class OutputIteratorType, class BinaryOpType, class UnaryOpType,
           class ValueType>
-OutputIteratorType transform_inclusive_scan_impl(
+OutputIteratorType transform_inclusive_scan_exespace_impl(
     const std::string& label, const ExecutionSpace& ex,
     InputIteratorType first_from, InputIteratorType last_from,
     OutputIteratorType first_dest, BinaryOpType binary_op, UnaryOpType unary_op,
@@ -184,22 +192,182 @@ OutputIteratorType transform_inclusive_scan_impl(
 
   // aliases
   using index_type = typename InputIteratorType::difference_type;
-  using func_type  = TransformInclusiveScanWithInitValueFunctor<
+  using func_type  = ExeSpaceTransformInclusiveScanWithInitValueFunctor<
       ExecutionSpace, index_type, ValueType, InputIteratorType,
       OutputIteratorType, BinaryOpType, UnaryOpType>;
 
   // run
   const auto num_elements =
       Kokkos::Experimental::distance(first_from, last_from);
-  ::Kokkos::parallel_scan(
-      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-      func_type(first_from, first_dest, binary_op, unary_op, init_value));
+  ::Kokkos::parallel_scan(label,
+                          RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                          func_type(first_from, first_dest, binary_op, unary_op,
+                                    std::move(init_value)));
   ex.fence("Kokkos::transform_inclusive_scan: fence after operation");
 
   // return
   return first_dest + num_elements;
 }
 
+//
+// team impl
+//
+
+template <class ExeSpace, class ValueType, class FirstFrom, class FirstDest,
+          class BinaryOpType, class UnaryOpType>
+struct TeamTransformInclusiveScanNoInitValueFunctor {
+  using execution_space = ExeSpace;
+  using index_type      = typename FirstFrom::difference_type;
+
+  FirstFrom m_first_from;
+  FirstDest m_first_dest;
+  BinaryOpType m_binary_op;
+  UnaryOpType m_unary_op;
+
+  KOKKOS_FUNCTION
+  TeamTransformInclusiveScanNoInitValueFunctor(FirstFrom first_from,
+                                               FirstDest first_dest,
+                                               BinaryOpType bop,
+                                               UnaryOpType uop)
+      : m_first_from(std::move(first_from)),
+        m_first_dest(std::move(first_dest)),
+        m_binary_op(std::move(bop)),
+        m_unary_op(std::move(uop)) {}
+
+  KOKKOS_FUNCTION
+  void operator()(const index_type i, ValueType& update,
+                  const bool final_pass) const {
+    const auto tmp = ValueType{m_unary_op(m_first_from[i])};
+    this->join(update, tmp);
+    if (final_pass) {
+      m_first_dest[i] = update;
+    }
+  }
+
+  KOKKOS_FUNCTION
+  void init(ValueType& update) const { update = {}; }
+
+  KOKKOS_FUNCTION
+  void join(ValueType& update, const ValueType& input) const {
+    update = m_binary_op(update, input);
+  }
+};
+
+template <class ExeSpace, class ValueType, class FirstFrom, class FirstDest,
+          class BinaryOpType, class UnaryOpType>
+struct TeamTransformInclusiveScanWithInitValueFunctor {
+  using execution_space = ExeSpace;
+  using index_type      = typename FirstFrom::difference_type;
+
+  FirstFrom m_first_from;
+  FirstDest m_first_dest;
+  BinaryOpType m_binary_op;
+  UnaryOpType m_unary_op;
+  ValueType m_init;
+
+  KOKKOS_FUNCTION
+  TeamTransformInclusiveScanWithInitValueFunctor(FirstFrom first_from,
+                                                 FirstDest first_dest,
+                                                 BinaryOpType bop,
+                                                 UnaryOpType uop,
+                                                 ValueType init)
+      : m_first_from(std::move(first_from)),
+        m_first_dest(std::move(first_dest)),
+        m_binary_op(std::move(bop)),
+        m_unary_op(std::move(uop)),
+        m_init(std::move(init)) {}
+
+  KOKKOS_FUNCTION
+  void operator()(const index_type i, ValueType& update,
+                  const bool final_pass) const {
+    const auto tmp = ValueType{m_unary_op(m_first_from[i])};
+    this->join(update, tmp);
+
+    if (final_pass) {
+      m_first_dest[i] = m_binary_op(update, m_init);
+    }
+  }
+
+  KOKKOS_FUNCTION
+  void init(ValueType& update) const { update = {}; }
+
+  KOKKOS_FUNCTION
+  void join(ValueType& update, const ValueType& input) const {
+    update = m_binary_op(update, input);
+  }
+};
+
+// -------------------------------------------------------------
+// transform_inclusive_scan_team_impl without init_value
+// -------------------------------------------------------------
+template <class TeamHandleType, class InputIteratorType,
+          class OutputIteratorType, class BinaryOpType, class UnaryOpType>
+KOKKOS_FUNCTION OutputIteratorType transform_inclusive_scan_team_impl(
+    const TeamHandleType& teamHandle, InputIteratorType first_from,
+    InputIteratorType last_from, OutputIteratorType first_dest,
+    BinaryOpType binary_op, UnaryOpType unary_op) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first_from,
+                                                   first_dest);
+  Impl::static_assert_iterators_have_matching_difference_type(first_from,
+                                                              first_dest);
+  Impl::expect_valid_range(first_from, last_from);
+
+  // aliases
+  using exe_space = typename TeamHandleType::execution_space;
+  using value_type =
+      std::remove_const_t<typename InputIteratorType::value_type>;
+  using func_type = TeamTransformInclusiveScanNoInitValueFunctor<
+      exe_space, value_type, InputIteratorType, OutputIteratorType,
+      BinaryOpType, UnaryOpType>;
+
+  // run
+  const auto num_elements =
+      Kokkos::Experimental::distance(first_from, last_from);
+  ::Kokkos::parallel_scan(
+      TeamThreadRange(teamHandle, 0, num_elements),
+      func_type(first_from, first_dest, binary_op, unary_op));
+  teamHandle.team_barrier();
+
+  // return
+  return first_dest + num_elements;
+}
+
+// -------------------------------------------------------------
+// transform_inclusive_scan_team_impl with init_value
+// -------------------------------------------------------------
+template <class TeamHandleType, class InputIteratorType,
+          class OutputIteratorType, class BinaryOpType, class UnaryOpType,
+          class ValueType>
+KOKKOS_FUNCTION OutputIteratorType transform_inclusive_scan_team_impl(
+    const TeamHandleType& teamHandle, InputIteratorType first_from,
+    InputIteratorType last_from, OutputIteratorType first_dest,
+    BinaryOpType binary_op, UnaryOpType unary_op, ValueType init_value) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first_from,
+                                                   first_dest);
+  Impl::static_assert_iterators_have_matching_difference_type(first_from,
+                                                              first_dest);
+  Impl::expect_valid_range(first_from, last_from);
+
+  // aliases
+  using exe_space = typename TeamHandleType::execution_space;
+  using func_type = TeamTransformInclusiveScanWithInitValueFunctor<
+      exe_space, ValueType, InputIteratorType, OutputIteratorType, BinaryOpType,
+      UnaryOpType>;
+
+  // run
+  const auto num_elements =
+      Kokkos::Experimental::distance(first_from, last_from);
+  ::Kokkos::parallel_scan(TeamThreadRange(teamHandle, 0, num_elements),
+                          func_type(first_from, first_dest, binary_op, unary_op,
+                                    std::move(init_value)));
+  teamHandle.team_barrier();
+
+  // return
+  return first_dest + num_elements;
+}
+
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_TransformReduce.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_TransformReduce.hpp
index 03771056a27b0b82ff138d8f356ff9dd601a326c..79bdf98915b28f994ec43ae8860aa0807df4b07e 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_TransformReduce.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_TransformReduce.hpp
@@ -110,9 +110,13 @@ struct StdTransformReduceTwoIntervalsFunctor {
 //
 //------------------------------
 
+//
+// exespace impl
+//
+
 template <class ExecutionSpace, class IteratorType, class ValueType,
           class JoinerType, class UnaryTransformerType>
-ValueType transform_reduce_custom_functors_impl(
+ValueType transform_reduce_custom_functors_exespace_impl(
     const std::string& label, const ExecutionSpace& ex, IteratorType first,
     IteratorType last, ValueType init_reduction_value, JoinerType joiner,
     UnaryTransformerType transformer) {
@@ -151,7 +155,7 @@ ValueType transform_reduce_custom_functors_impl(
 
 template <class ExecutionSpace, class IteratorType1, class IteratorType2,
           class ValueType, class JoinerType, class BinaryTransformerType>
-ValueType transform_reduce_custom_functors_impl(
+ValueType transform_reduce_custom_functors_exespace_impl(
     const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
     IteratorType1 last1, IteratorType2 first2, ValueType init_reduction_value,
     JoinerType joiner, BinaryTransformerType transformer) {
@@ -191,7 +195,7 @@ ValueType transform_reduce_custom_functors_impl(
 
 template <class ExecutionSpace, class IteratorType1, class IteratorType2,
           class ValueType>
-ValueType transform_reduce_default_functors_impl(
+ValueType transform_reduce_default_functors_exespace_impl(
     const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
     IteratorType1 last1, IteratorType2 first2, ValueType init_reduction_value) {
   // checks
@@ -205,11 +209,115 @@ ValueType transform_reduce_default_functors_impl(
       Impl::StdTranformReduceDefaultBinaryTransformFunctor<ValueType>;
   using joiner_type = Impl::StdTranformReduceDefaultJoinFunctor<ValueType>;
 
-  return transform_reduce_custom_functors_impl(
+  return transform_reduce_custom_functors_exespace_impl(
       label, ex, first1, last1, first2, std::move(init_reduction_value),
       joiner_type(), transformer_type());
 }
 
+//
+// team impl
+//
+
+template <class TeamHandleType, class IteratorType, class ValueType,
+          class JoinerType, class UnaryTransformerType>
+KOKKOS_FUNCTION ValueType transform_reduce_custom_functors_team_impl(
+    const TeamHandleType& teamHandle, IteratorType first, IteratorType last,
+    ValueType init_reduction_value, JoinerType joiner,
+    UnaryTransformerType transformer) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first);
+  Impl::static_assert_is_not_openmptarget(teamHandle);
+  Impl::expect_valid_range(first, last);
+
+  if (first == last) {
+    // init is returned, unmodified
+    return init_reduction_value;
+  }
+
+  // aliases
+  using reducer_type =
+      ReducerWithArbitraryJoinerNoNeutralElement<ValueType, JoinerType>;
+  using functor_type =
+      StdTransformReduceSingleIntervalFunctor<IteratorType, reducer_type,
+                                              UnaryTransformerType>;
+  using reduction_value_type = typename reducer_type::value_type;
+
+  // run
+  reduction_value_type result;
+  reducer_type reducer(result, joiner);
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_reduce(TeamThreadRange(teamHandle, 0, num_elements),
+                            functor_type(first, reducer, transformer), reducer);
+
+  teamHandle.team_barrier();
+
+  // as per standard, transform is not applied to the init value
+  // https://en.cppreference.com/w/cpp/algorithm/transform_reduce
+  return joiner(result.val, init_reduction_value);
+}
+
+template <class TeamHandleType, class IteratorType1, class IteratorType2,
+          class ValueType, class JoinerType, class BinaryTransformerType>
+KOKKOS_FUNCTION ValueType transform_reduce_custom_functors_team_impl(
+    const TeamHandleType& teamHandle, IteratorType1 first1, IteratorType1 last1,
+    IteratorType2 first2, ValueType init_reduction_value, JoinerType joiner,
+    BinaryTransformerType transformer) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first1, first2);
+  Impl::static_assert_is_not_openmptarget(teamHandle);
+  Impl::static_assert_iterators_have_matching_difference_type(first1, first2);
+  Impl::expect_valid_range(first1, last1);
+
+  if (first1 == last1) {
+    // init is returned, unmodified
+    return init_reduction_value;
+  }
+
+  // aliases
+  using index_type = typename IteratorType1::difference_type;
+  using reducer_type =
+      ReducerWithArbitraryJoinerNoNeutralElement<ValueType, JoinerType>;
+  using functor_type =
+      StdTransformReduceTwoIntervalsFunctor<index_type, IteratorType1,
+                                            IteratorType2, reducer_type,
+                                            BinaryTransformerType>;
+  using reduction_value_type = typename reducer_type::value_type;
+
+  // run
+  reduction_value_type result;
+  reducer_type reducer(result, joiner);
+
+  const auto num_elements = Kokkos::Experimental::distance(first1, last1);
+  ::Kokkos::parallel_reduce(TeamThreadRange(teamHandle, 0, num_elements),
+                            functor_type(first1, first2, reducer, transformer),
+                            reducer);
+
+  teamHandle.team_barrier();
+
+  return joiner(result.val, init_reduction_value);
+}
+
+template <class TeamHandleType, class IteratorType1, class IteratorType2,
+          class ValueType>
+KOKKOS_FUNCTION ValueType transform_reduce_default_functors_team_impl(
+    const TeamHandleType& teamHandle, IteratorType1 first1, IteratorType1 last1,
+    IteratorType2 first2, ValueType init_reduction_value) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first1, first2);
+  Impl::static_assert_is_not_openmptarget(teamHandle);
+  Impl::static_assert_iterators_have_matching_difference_type(first1, first2);
+  Impl::expect_valid_range(first1, last1);
+
+  // aliases
+  using transformer_type =
+      Impl::StdTranformReduceDefaultBinaryTransformFunctor<ValueType>;
+  using joiner_type = Impl::StdTranformReduceDefaultJoinFunctor<ValueType>;
+
+  return transform_reduce_custom_functors_team_impl(
+      teamHandle, first1, last1, first2, std::move(init_reduction_value),
+      joiner_type(), transformer_type());
+}
+
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp
index 4bab551563f17e863d8289695fabe56d2d3156e7..11afa8ed6e082c2d292eb68f36a280ccba7d5b33 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp
@@ -29,9 +29,10 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
-template <class IndexType, class InputIt, class OutputIt,
-          class BinaryPredicateType>
+template <class InputIt, class OutputIt, class BinaryPredicateType>
 struct StdUniqueFunctor {
+  using index_type = typename InputIt::difference_type;
+
   InputIt m_first_from;
   InputIt m_last_from;
   OutputIt m_first_dest;
@@ -46,7 +47,7 @@ struct StdUniqueFunctor {
         m_pred(std::move(pred)) {}
 
   KOKKOS_FUNCTION
-  void operator()(const IndexType i, IndexType& update,
+  void operator()(const index_type i, index_type& update,
                   const bool final_pass) const {
     auto& val_i         = m_first_from[i];
     const auto& val_ip1 = m_first_from[i + 1];
@@ -64,9 +65,9 @@ struct StdUniqueFunctor {
 };
 
 template <class ExecutionSpace, class IteratorType, class PredicateType>
-IteratorType unique_impl(const std::string& label, const ExecutionSpace& ex,
-                         IteratorType first, IteratorType last,
-                         PredicateType pred) {
+IteratorType unique_exespace_impl(const std::string& label,
+                                  const ExecutionSpace& ex, IteratorType first,
+                                  IteratorType last, PredicateType pred) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first);
   Impl::expect_valid_range(first, last);
@@ -110,21 +111,17 @@ IteratorType unique_impl(const std::string& label, const ExecutionSpace& ex,
       // for same reason as the one explained in unique_copy
       const auto scan_size = num_elements_to_explore - 1;
       auto tmp_first       = ::Kokkos::Experimental::begin(tmp_view);
-      using output_it      = decltype(tmp_first);
 
       using index_type = typename IteratorType::difference_type;
-      using func_type =
-          StdUniqueFunctor<index_type, IteratorType, output_it, PredicateType>;
       index_type count = 0;
       ::Kokkos::parallel_scan(
           label, RangePolicy<ExecutionSpace>(ex, 0, scan_size),
-          func_type(it_found, last, tmp_first, pred), count);
+          StdUniqueFunctor(it_found, last, tmp_first, pred), count);
 
       // move last element too, for the same reason as the unique_copy
-      auto unused_r =
-          Impl::move_impl("Kokkos::move_from_unique", ex, it_found + scan_size,
-                          last, tmp_first + count);
-      (void)unused_r;  // r1 not used
+      [[maybe_unused]] auto unused_r = Impl::move_exespace_impl(
+          "Kokkos::move_from_unique", ex, it_found + scan_size, last,
+          tmp_first + count);
 
       // ----------
       // step 3
@@ -151,11 +148,69 @@ IteratorType unique_impl(const std::string& label, const ExecutionSpace& ex,
 }
 
 template <class ExecutionSpace, class IteratorType>
-IteratorType unique_impl(const std::string& label, const ExecutionSpace& ex,
-                         IteratorType first, IteratorType last) {
+IteratorType unique_exespace_impl(const std::string& label,
+                                  const ExecutionSpace& ex, IteratorType first,
+                                  IteratorType last) {
   using value_type    = typename IteratorType::value_type;
   using binary_pred_t = StdAlgoEqualBinaryPredicate<value_type>;
-  return unique_impl(label, ex, first, last, binary_pred_t());
+  return unique_exespace_impl(label, ex, first, last, binary_pred_t());
+}
+
+//
+// team level
+//
+template <class TeamHandleType, class IteratorType, class PredicateType>
+KOKKOS_FUNCTION IteratorType unique_team_impl(const TeamHandleType& teamHandle,
+                                              IteratorType first,
+                                              IteratorType last,
+                                              PredicateType pred) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first);
+  Impl::expect_valid_range(first, last);
+
+  // branch for trivial vs non trivial case
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  if (num_elements == 0) {
+    return first;
+  } else if (num_elements == 1) {
+    return last;
+  } else {
+    // FIXME: for the execution-space-based impl we used an auxiliary
+    // allocation, but for the team level we cannot do the same, so do this
+    // serially for now and later figure out if this can be done in parallel
+
+    std::size_t count = 0;
+    Kokkos::single(
+        Kokkos::PerTeam(teamHandle),
+        [=](std::size_t& lcount) {
+          IteratorType result = first;
+          IteratorType lfirst = first;
+          while (++lfirst != last) {
+            if (!pred(*result, *lfirst) && ++result != lfirst) {
+              *result = std::move(*lfirst);
+            }
+          }
+          lcount = Kokkos::Experimental::distance(first, result);
+        },
+        count);
+    // no barrier needed since single above broadcasts to all members
+
+    // +1 is needed because we want one element past the end
+    return first + count + 1;
+  }
+}
+
+template <class TeamHandleType, class IteratorType>
+KOKKOS_FUNCTION IteratorType unique_team_impl(const TeamHandleType& teamHandle,
+                                              IteratorType first,
+                                              IteratorType last) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first);
+  Impl::expect_valid_range(first, last);
+
+  using binary_pred_t =
+      StdAlgoEqualBinaryPredicate<typename IteratorType::value_type>;
+  return unique_team_impl(teamHandle, first, last, binary_pred_t());
 }
 
 }  // namespace Impl
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp
index d0aa1ed1d0e0dcac7ddcc3454b9f7f72a96b54b9..c7c293027862b27b1bc1d5d74d58c6753dc98da3 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp
@@ -20,6 +20,7 @@
 #include <Kokkos_Core.hpp>
 #include "Kokkos_Constraints.hpp"
 #include "Kokkos_HelperPredicates.hpp"
+#include "Kokkos_MustUseKokkosSingleInTeam.hpp"
 #include "Kokkos_CopyCopyN.hpp"
 #include <std_algorithms/Kokkos_Distance.hpp>
 #include <string>
@@ -28,9 +29,9 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
-template <class IndexType, class InputIt, class OutputIt,
-          class BinaryPredicateType>
+template <class InputIt, class OutputIt, class BinaryPredicateType>
 struct StdUniqueCopyFunctor {
+  using index_type = typename InputIt::difference_type;
   InputIt m_first_from;
   InputIt m_last_from;
   OutputIt m_first_dest;
@@ -45,7 +46,7 @@ struct StdUniqueCopyFunctor {
         m_pred(std::move(pred)) {}
 
   KOKKOS_FUNCTION
-  void operator()(const IndexType i, IndexType& update,
+  void operator()(const index_type i, std::size_t& update,
                   const bool final_pass) const {
     const auto& val_i   = m_first_from[i];
     const auto& val_ip1 = m_first_from[i + 1];
@@ -64,10 +65,9 @@ struct StdUniqueCopyFunctor {
 
 template <class ExecutionSpace, class InputIterator, class OutputIterator,
           class PredicateType>
-OutputIterator unique_copy_impl(const std::string& label,
-                                const ExecutionSpace& ex, InputIterator first,
-                                InputIterator last, OutputIterator d_first,
-                                PredicateType pred) {
+OutputIterator unique_copy_exespace_impl(
+    const std::string& label, const ExecutionSpace& ex, InputIterator first,
+    InputIterator last, OutputIterator d_first, PredicateType pred) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first, d_first);
   Impl::static_assert_iterators_have_matching_difference_type(first, d_first);
@@ -78,38 +78,119 @@ OutputIterator unique_copy_impl(const std::string& label,
   if (num_elements == 0) {
     return d_first;
   } else if (num_elements == 1) {
-    return Impl::copy_impl("Kokkos::copy_from_unique_copy", ex, first, last,
-                           d_first);
+    return Impl::copy_exespace_impl("Kokkos::copy_from_unique_copy", ex, first,
+                                    last, d_first);
   } else {
-    // aliases
-    using index_type = typename InputIterator::difference_type;
-    using func_type  = StdUniqueCopyFunctor<index_type, InputIterator,
-                                           OutputIterator, PredicateType>;
-
     // note here that we run scan for num_elements - 1
     // because of the way we implement this, the last element is always needed.
     // We avoid performing checks inside functor that we are within limits
     // and run a "safe" scan and then copy the last element.
     const auto scan_size = num_elements - 1;
-    index_type count     = 0;
-    ::Kokkos::parallel_scan(label,
-                            RangePolicy<ExecutionSpace>(ex, 0, scan_size),
-                            func_type(first, last, d_first, pred), count);
-
-    return Impl::copy_impl("Kokkos::copy_from_unique_copy", ex,
-                           first + scan_size, last, d_first + count);
+    std::size_t count    = 0;
+    ::Kokkos::parallel_scan(
+        label, RangePolicy<ExecutionSpace>(ex, 0, scan_size),
+        // use CTAD
+        StdUniqueCopyFunctor(first, last, d_first, pred), count);
+
+    return Impl::copy_exespace_impl("Kokkos::copy_from_unique_copy", ex,
+                                    first + scan_size, last, d_first + count);
   }
 }
 
 template <class ExecutionSpace, class InputIterator, class OutputIterator>
-OutputIterator unique_copy_impl(const std::string& label,
-                                const ExecutionSpace& ex, InputIterator first,
-                                InputIterator last, OutputIterator d_first) {
+OutputIterator unique_copy_exespace_impl(const std::string& label,
+                                         const ExecutionSpace& ex,
+                                         InputIterator first,
+                                         InputIterator last,
+                                         OutputIterator d_first) {
   // checks
   Impl::static_assert_random_access_and_accessible(ex, first, d_first);
   Impl::static_assert_iterators_have_matching_difference_type(first, d_first);
   Impl::expect_valid_range(first, last);
 
+  // aliases
+  using value_type1   = typename InputIterator::value_type;
+  using value_type2   = typename OutputIterator::value_type;
+  using binary_pred_t = StdAlgoEqualBinaryPredicate<value_type1, value_type2>;
+
+  // run
+  return unique_copy_exespace_impl(label, ex, first, last, d_first,
+                                   binary_pred_t());
+}
+
+//
+// team level
+//
+
+template <class TeamHandleType, class InputIterator, class OutputIterator,
+          class PredicateType>
+KOKKOS_FUNCTION OutputIterator unique_copy_team_impl(
+    const TeamHandleType& teamHandle, InputIterator first, InputIterator last,
+    OutputIterator d_first, PredicateType pred) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first, d_first);
+  Impl::static_assert_iterators_have_matching_difference_type(first, d_first);
+  Impl::expect_valid_range(first, last);
+
+  // branch for trivial vs non trivial case
+  const std::size_t num_elements = Kokkos::Experimental::distance(first, last);
+  if (num_elements == 0) {
+    return d_first;
+  } else if (num_elements == 1) {
+    d_first[0] = first[0];
+    return d_first + 1;
+  }
+
+  else {
+    if constexpr (stdalgo_must_use_kokkos_single_for_team_scan_v<
+                      typename TeamHandleType::execution_space>) {
+      std::size_t count = 0;
+      Kokkos::single(
+          Kokkos::PerTeam(teamHandle),
+          [=](std::size_t& lcount) {
+            lcount = 0;
+            for (std::size_t i = 0; i < num_elements - 1; ++i) {
+              const auto& val_i   = first[i];
+              const auto& val_ip1 = first[i + 1];
+              if (!pred(val_i, val_ip1)) {
+                d_first[lcount++] = val_i;
+              }
+            }
+            // we need to copy the last element always
+            d_first[lcount++] = first[num_elements - 1];
+          },
+          count);
+      // no barrier needed since single above broadcasts to all members
+
+      return d_first + count;
+    } else {
+      const auto scan_size = num_elements - 1;
+      std::size_t count    = 0;
+      ::Kokkos::parallel_scan(TeamThreadRange(teamHandle, 0, scan_size),
+                              StdUniqueCopyFunctor(first, last, d_first, pred),
+                              count);
+      // no barrier needed since reducing into count
+
+      return Impl::copy_team_impl(teamHandle, first + scan_size, last,
+                                  d_first + count);
+    }
+
+#if defined KOKKOS_COMPILER_INTEL || \
+    (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130)
+    __builtin_unreachable();
+#endif
+  }
+}
+
+template <class TeamHandleType, class InputIterator, class OutputIterator>
+KOKKOS_FUNCTION OutputIterator
+unique_copy_team_impl(const TeamHandleType& teamHandle, InputIterator first,
+                      InputIterator last, OutputIterator d_first) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(teamHandle, first, d_first);
+  Impl::static_assert_iterators_have_matching_difference_type(first, d_first);
+  Impl::expect_valid_range(first, last);
+
   // aliases
   using value_type1 = typename InputIterator::value_type;
   using value_type2 = typename OutputIterator::value_type;
@@ -118,7 +199,8 @@ OutputIterator unique_copy_impl(const std::string& label,
   using binary_pred_t = StdAlgoEqualBinaryPredicate<value_type1, value_type2>;
 
   // run
-  return unique_copy_impl(label, ex, first, last, d_first, binary_pred_t());
+  return unique_copy_team_impl(teamHandle, first, last, d_first,
+                               binary_pred_t());
 }
 
 }  // namespace Impl
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ValueWrapperForNoNeutralElement.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ValueWrapperForNoNeutralElement.hpp
index 9b0d4d8244f28f5588274f870d7eed7b52eefc7f..8a73b8e0f1d7d4e07bf24cf3f4dc990fd4163b21 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ValueWrapperForNoNeutralElement.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ValueWrapperForNoNeutralElement.hpp
@@ -29,12 +29,6 @@ template <class Scalar>
 struct ValueWrapperForNoNeutralElement {
   Scalar val;
   bool is_initial = true;
-
-  KOKKOS_FUNCTION
-  void operator=(const ValueWrapperForNoNeutralElement& rhs) {
-    val        = rhs.val;
-    is_initial = rhs.is_initial;
-  }
 };
 
 }  // namespace Impl
diff --git a/packages/kokkos/algorithms/unit_tests/CMakeLists.txt b/packages/kokkos/algorithms/unit_tests/CMakeLists.txt
index 0fe9c2006ee2e526fac1e2018e82165e256215e7..419f5ec1d132564169425f3f90abf195fd801ade 100644
--- a/packages/kokkos/algorithms/unit_tests/CMakeLists.txt
+++ b/packages/kokkos/algorithms/unit_tests/CMakeLists.txt
@@ -16,35 +16,46 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget)
     set(dir ${CMAKE_CURRENT_BINARY_DIR}/${dir})
     file(MAKE_DIRECTORY ${dir})
 
-    # -------------------------
-    # Sort1d,3d, Random
-    # -------------------------
-    set(SOURCES_A)
-    if(Tag STREQUAL "OpenMP")
-      LIST(APPEND SOURCES_A
-	TestOpenMP_Sort1D.cpp
-	TestOpenMP_Sort3D.cpp
-	TestOpenMP_SortDynamicView.cpp
-	)
-    endif()
-
+    # ------------------------------------------
+    # Sort
+    # ------------------------------------------
     # Each of these inputs is an .hpp file.
     # Generate a .cpp file for each one that runs it on the current backend (Tag),
     # and add this .cpp file to the sources for UnitTest_RandomAndSort.
-    foreach(SOURCES_A_Input
-        TestRandomCommon
-        TestSortCommon
-        TestNestedSort
-    )
-      set(file ${dir}/${SOURCES_A_Input}.cpp)
+    set(ALGO_SORT_SOURCES)
+    foreach(SOURCE_Input
+	TestSort
+	TestSortCustomComp
+	TestBinSortA
+	TestBinSortB
+	TestNestedSort
+      )
+      set(file ${dir}/${SOURCE_Input}.cpp)
       # Write to a temporary intermediate file and call configure_file to avoid
       # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs.
       file(WRITE ${dir}/dummy.cpp
         "#include <Test${Tag}_Category.hpp>\n"
-        "#include <${SOURCES_A_Input}.hpp>\n"
+        "#include <${SOURCE_Input}.hpp>\n"
         )
       configure_file(${dir}/dummy.cpp ${file})
-      list(APPEND SOURCES_A ${file})
+      list(APPEND ALGO_SORT_SOURCES ${file})
+    endforeach()
+
+    # ------------------------------------------
+    # Random
+    # ------------------------------------------
+    # do as above
+    set(ALGO_RANDOM_SOURCES)
+    foreach(SOURCE_Input
+	TestRandom
+      )
+      set(file ${dir}/${SOURCE_Input}.cpp)
+      file(WRITE ${dir}/dummy.cpp
+        "#include <Test${Tag}_Category.hpp>\n"
+        "#include <${SOURCE_Input}.hpp>\n"
+        )
+      configure_file(${dir}/dummy.cpp ${file})
+      list(APPEND ALGO_RANDOM_SOURCES ${file})
     endforeach()
 
     # ------------------------------------------
@@ -142,10 +153,256 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget)
       list(APPEND STDALGO_SOURCES_E Test${Name}.cpp)
     endforeach()
 
+    # ------------------------------------------
+    # std team Q
+    # ------------------------------------------
+    set(STDALGO_TEAM_SOURCES_Q)
+    foreach(Name
+	StdAlgorithmsCommon
+	StdAlgorithmsTeamInclusiveScan
+	StdAlgorithmsTeamTransformInclusiveScan
+      )
+      list(APPEND STDALGO_TEAM_SOURCES_Q Test${Name}.cpp)
+    endforeach()
+
+    # ------------------------------------------
+    # std team P
+    # ------------------------------------------
+    set(STDALGO_TEAM_SOURCES_P)
+    foreach(Name
+	StdAlgorithmsCommon
+	StdAlgorithmsTeamExclusiveScan
+	StdAlgorithmsTeamTransformExclusiveScan
+      )
+      list(APPEND STDALGO_TEAM_SOURCES_P Test${Name}.cpp)
+    endforeach()
+
+    # ------------------------------------------
+    # std team M
+    # ------------------------------------------
+    set(STDALGO_TEAM_SOURCES_M)
+    foreach(Name
+	StdAlgorithmsCommon
+	StdAlgorithmsTeamTransformUnaryOp
+	StdAlgorithmsTeamTransformBinaryOp
+	StdAlgorithmsTeamGenerate
+	StdAlgorithmsTeamGenerate_n
+	StdAlgorithmsTeamSwapRanges
+      )
+      list(APPEND STDALGO_TEAM_SOURCES_M Test${Name}.cpp)
+    endforeach()
+
+    # ------------------------------------------
+    # std team L
+    # ------------------------------------------
+    set(STDALGO_TEAM_SOURCES_L)
+    foreach(Name
+	StdAlgorithmsCommon
+	StdAlgorithmsTeamIsSorted
+	StdAlgorithmsTeamIsSortedUntil
+	StdAlgorithmsTeamIsPartitioned
+	StdAlgorithmsTeamPartitionCopy
+	StdAlgorithmsTeamPartitionPoint
+	)
+      list(APPEND STDALGO_TEAM_SOURCES_L Test${Name}.cpp)
+    endforeach()
+
+    # ------------------------------------------
+    # std team I
+    # ------------------------------------------
+    set(STDALGO_TEAM_SOURCES_I)
+    foreach(Name
+	StdAlgorithmsCommon
+	StdAlgorithmsTeamUnique
+	StdAlgorithmsTeamAdjacentDifference
+	StdAlgorithmsTeamReduce
+	StdAlgorithmsTeamTransformReduce
+	)
+      list(APPEND STDALGO_TEAM_SOURCES_I Test${Name}.cpp)
+    endforeach()
+
+    # ------------------------------------------
+    # std team H
+    # ------------------------------------------
+    set(STDALGO_TEAM_SOURCES_H)
+    foreach(Name
+	StdAlgorithmsCommon
+	StdAlgorithmsTeamCopy
+	StdAlgorithmsTeamCopy_n
+	StdAlgorithmsTeamCopyBackward
+	StdAlgorithmsTeamCopyIf
+	StdAlgorithmsTeamUniqueCopy
+	StdAlgorithmsTeamRemove
+	StdAlgorithmsTeamRemoveIf
+	StdAlgorithmsTeamRemoveCopy
+	StdAlgorithmsTeamRemoveCopyIf
+	)
+      list(APPEND STDALGO_TEAM_SOURCES_H Test${Name}.cpp)
+    endforeach()
+
+    # ------------------------------------------
+    # std team G
+    # ------------------------------------------
+    set(STDALGO_TEAM_SOURCES_G)
+    foreach(Name
+	StdAlgorithmsCommon
+	StdAlgorithmsTeamMove
+	StdAlgorithmsTeamMoveBackward
+	StdAlgorithmsTeamShiftLeft
+	StdAlgorithmsTeamShiftRight
+	)
+      list(APPEND STDALGO_TEAM_SOURCES_G Test${Name}.cpp)
+    endforeach()
+
+    # ------------------------------------------
+    # std team F
+    # ------------------------------------------
+    set(STDALGO_TEAM_SOURCES_F)
+    foreach(Name
+	StdAlgorithmsCommon
+	StdAlgorithmsTeamReverse
+	StdAlgorithmsTeamReverseCopy
+	StdAlgorithmsTeamRotate
+	StdAlgorithmsTeamRotateCopy
+      )
+      list(APPEND STDALGO_TEAM_SOURCES_F Test${Name}.cpp)
+    endforeach()
+
+    # ------------------------------------------
+    # std team E
+    # ------------------------------------------
+    set(STDALGO_TEAM_SOURCES_E)
+    foreach(Name
+	StdAlgorithmsCommon
+	StdAlgorithmsTeamFill
+	StdAlgorithmsTeamFill_n
+	StdAlgorithmsTeamReplace
+	StdAlgorithmsTeamReplaceIf
+	StdAlgorithmsTeamReplaceCopy
+	StdAlgorithmsTeamReplaceCopyIf
+	)
+      list(APPEND STDALGO_TEAM_SOURCES_E Test${Name}.cpp)
+    endforeach()
+
+    # ------------------------------------------
+    # std team D
+    # ------------------------------------------
+    set(STDALGO_TEAM_SOURCES_D)
+    foreach(Name
+	StdAlgorithmsCommon
+	StdAlgorithmsTeamMinElement
+	StdAlgorithmsTeamMaxElement
+	StdAlgorithmsTeamMinMaxElement
+	)
+      list(APPEND STDALGO_TEAM_SOURCES_D Test${Name}.cpp)
+    endforeach()
+
+    # ------------------------------------------
+    # std team C
+    # ------------------------------------------
+    set(STDALGO_TEAM_SOURCES_C)
+    foreach(Name
+	StdAlgorithmsCommon
+	StdAlgorithmsTeamFind
+	StdAlgorithmsTeamFindIf
+	StdAlgorithmsTeamFindIfNot
+	StdAlgorithmsTeamAllOf
+	StdAlgorithmsTeamAnyOf
+	StdAlgorithmsTeamNoneOf
+	StdAlgorithmsTeamSearchN
+	)
+      list(APPEND STDALGO_TEAM_SOURCES_C Test${Name}.cpp)
+    endforeach()
+
+    # ------------------------------------------
+    # std team B
+    # ------------------------------------------
+    set(STDALGO_TEAM_SOURCES_B)
+    foreach(Name
+	StdAlgorithmsCommon
+	StdAlgorithmsTeamEqual
+	StdAlgorithmsTeamSearch
+	StdAlgorithmsTeamFindEnd
+	StdAlgorithmsTeamFindFirstOf
+      )
+      list(APPEND STDALGO_TEAM_SOURCES_B Test${Name}.cpp)
+    endforeach()
+
+    # ------------------------------------------
+    # std team A
+    # ------------------------------------------
+    set(STDALGO_TEAM_SOURCES_A)
+    foreach(Name
+	StdAlgorithmsCommon
+	StdAlgorithmsTeamAdjacentFind
+	StdAlgorithmsTeamCount
+	StdAlgorithmsTeamCountIf
+	StdAlgorithmsTeamForEach
+	StdAlgorithmsTeamForEachN
+	StdAlgorithmsTeamLexicographicalCompare
+	StdAlgorithmsTeamMismatch
+      )
+      list(APPEND STDALGO_TEAM_SOURCES_A Test${Name}.cpp)
+    endforeach()
+
   endif()
 endforeach()
 
-# FIXME_OPENMPTARGET These tests cause internal compiler errors as of 09/01/22
+# FIXME_OPENMPTARGET - remove sort test as it leads to ICE with clang/16 and above at compile time.
+if(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0)
+    list(REMOVE_ITEM ALGO_SORT_SOURCES
+    TestSort.cpp
+  )
+endif()
+
+# FIXME_OPENMPTARGET remove tests for OpenMPTarget because in these cases
+# the impl needs to use either Kokkos or tailored reducers
+# which results in runtime memory errors.
+if(KOKKOS_ENABLE_OPENMPTARGET)
+  list(REMOVE_ITEM STDALGO_TEAM_SOURCES_L
+    TestStdAlgorithmsTeamIsPartitioned.cpp
+    TestStdAlgorithmsTeamPartitionPoint.cpp
+    TestStdAlgorithmsTeamPartitionCopy.cpp
+  )
+endif()
+
+# FIXME_OPENMPTARGET need to remove tests for OpenMPTarget because
+# in these cases the impl needs to use either Kokkos or
+# tailored reducers which results in runtime memory errors.
+if(KOKKOS_ENABLE_OPENMPTARGET)
+  list(REMOVE_ITEM STDALGO_TEAM_SOURCES_C
+    TestStdAlgorithmsTeamFind.cpp
+    TestStdAlgorithmsTeamFindIf.cpp
+    TestStdAlgorithmsTeamFindIfNot.cpp
+    TestStdAlgorithmsTeamAllOf.cpp
+    TestStdAlgorithmsTeamAnyOf.cpp
+    TestStdAlgorithmsTeamNoneOf.cpp
+    TestStdAlgorithmsTeamSearchN.cpp
+  )
+endif()
+
+# FIXME_OPENMPTARGET This test causes internal compiler errors as of 09/01/22
+# when compiling for Intel's Xe-HP GPUs.
+# FRIZZI: 04/26/2023: not sure if the compilation error is still applicable
+# but we conservatively leave this guard on
+if(NOT (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM))
+  KOKKOS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_Sort
+    SOURCES
+    UnitTestMain.cpp
+    TestStdAlgorithmsCommon.cpp
+    ${ALGO_SORT_SOURCES}
+  )
+
+  KOKKOS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_Random
+    SOURCES
+    UnitTestMain.cpp
+    ${ALGO_RANDOM_SOURCES}
+  )
+endif()
+
+# FIXME_OPENMPTARGET: These tests cause internal compiler errors as of 09/01/22
 # when compiling for Intel's Xe-HP GPUs.
 if(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM)
   list(REMOVE_ITEM STDALGO_SOURCES_D
@@ -160,31 +417,53 @@ if(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM)
   )
 endif()
 
-# FIXME_OPENMPTARGET This test causes internal compiler errors as of 09/01/22
-# when compiling for Intel's Xe-HP GPUs.
-if(NOT (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM))
-  KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_RandomAndSort
-    SOURCES
-      UnitTestMain.cpp
-      ${SOURCES_A}
+# FIXME_OPENMPTARGET remove tests for OpenMPTarget
+# causing failures for various reasons
+if(KOKKOS_ENABLE_OPENMPTARGET)
+  # the following use either Kokkos or tailored reducers
+  # which results in runtime memory errors.
+  list(REMOVE_ITEM STDALGO_TEAM_SOURCES_B
+    TestStdAlgorithmsTeamFindEnd.cpp
+    TestStdAlgorithmsTeamFindFirstOf.cpp
+    TestStdAlgorithmsTeamSearch.cpp
+  )
+
+  list(REMOVE_ITEM STDALGO_TEAM_SOURCES_A
+    TestStdAlgorithmsTeamAdjacentFind.cpp
+    TestStdAlgorithmsTeamLexicographicalCompare.cpp
+    TestStdAlgorithmsTeamMismatch.cpp
+  )
+
+  # this causes an illegal memory access if team_members_have_matching_result
+  # is called
+  list(REMOVE_ITEM STDALGO_TEAM_SOURCES_M
+    TestStdAlgorithmsTeamTransformBinaryOp.cpp
   )
 endif()
 
 foreach(ID A;B;C;D;E)
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_StdSet_${ID}
+    AlgorithmsUnitTest_StdSet_${ID}
     SOURCES
     UnitTestMain.cpp
     ${STDALGO_SOURCES_${ID}}
     )
 endforeach()
 
+foreach(ID A;B;C;D;E;F;G;H;I;L;M;P;Q)
+  KOKKOS_ADD_EXECUTABLE_AND_TEST(
+    AlgorithmsUnitTest_StdSet_Team_${ID}
+    SOURCES
+    UnitTestMain.cpp
+    ${STDALGO_TEAM_SOURCES_${ID}}
+    )
+endforeach()
+
 # FIXME_OPENMPTARGET This test causes internal compiler errors as of 09/01/22
 # when compiling for Intel's Xe-HP GPUs.
 if(NOT (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM))
   KOKKOS_ADD_EXECUTABLE(
-    UnitTest_StdAlgoCompileOnly
+    AlgorithmsUnitTest_StdAlgoCompileOnly
     SOURCES TestStdAlgorithmsCompileOnly.cpp
   )
 endif()
diff --git a/packages/kokkos/algorithms/unit_tests/Makefile b/packages/kokkos/algorithms/unit_tests/Makefile
index e961e7ba2c18f615a41168e044f9ad7f42d528a6..601217799a88bff73f467d2311bdba313ef32bb4 100644
--- a/packages/kokkos/algorithms/unit_tests/Makefile
+++ b/packages/kokkos/algorithms/unit_tests/Makefile
@@ -27,44 +27,48 @@ TARGETS =
 
 tmp := $(foreach device, $(KOKKOS_DEVICELIST), \
   $(if $(filter Test$(device).cpp, $(shell ls Test$(device).cpp 2>/dev/null)),,\
-    $(shell echo "\#include <Test"${device}"_Category.hpp>" > Test$(device).cpp); \
-    $(shell echo "\#include <TestRandomCommon.hpp>" >> Test$(device).cpp); \
-    $(shell echo "\#include <TestSortCommon.hpp>" >> Test$(device).cpp); \
-  ) \
+     $(shell echo "\#include <Test"${device}"_Category.hpp>" > Test$(device).cpp); \
+     $(shell echo "\#include <TestRandom.hpp>" >> Test$(device).cpp); \
+     $(shell echo "\#include <TestSort.hpp>" >> Test$(device).cpp); \
+     $(shell echo "\#include <TestBinSortA.hpp>" >> Test$(device).cpp); \
+     $(shell echo "\#include <TestBinSortB.hpp>" >> Test$(device).cpp); \
+     $(shell echo "\#include <TestNestedSort.hpp>" >> Test$(device).cpp); \
+     $(shell echo "\#include <TestSortCustomComp.hpp>" >> Test$(device).cpp); \
+   ) \
 )
 
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
-	OBJ_CUDA = TestCuda.o UnitTestMain.o gtest-all.o
+	OBJ_CUDA = TestCuda.o TestStdAlgorithmsCommon.o UnitTestMain.o gtest-all.o
 	TARGETS += KokkosAlgorithms_UnitTest_Cuda
 	TEST_TARGETS += test-cuda
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
-	OBJ_HIP = TestHIP.o UnitTestMain.o gtest-all.o
+	OBJ_HIP = TestHIP.o TestStdAlgorithmsCommon.o UnitTestMain.o gtest-all.o
 	TARGETS += KokkosAlgorithms_UnitTest_HIP
 	TEST_TARGETS += test-hip
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1)
-	OBJ_THREADS = TestThreads.o UnitTestMain.o gtest-all.o
+	OBJ_THREADS = TestThreads.o TestStdAlgorithmsCommon.o UnitTestMain.o gtest-all.o
 	TARGETS += KokkosAlgorithms_UnitTest_Threads
 	TEST_TARGETS += test-threads
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
-	OBJ_OPENMP = TestOpenMP.o TestOpenMP_Sort1D.o TestOpenMP_Sort3D.o TestOpenMP_SortDynamicView.o UnitTestMain.o gtest-all.o
+	OBJ_OPENMP = TestOpenMP.o TestStdAlgorithmsCommon.o UnitTestMain.o gtest-all.o
 	TARGETS += KokkosAlgorithms_UnitTest_OpenMP
 	TEST_TARGETS += test-openmp
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
-	OBJ_HPX = TestHPX.o UnitTestMain.o gtest-all.o
+	OBJ_HPX = TestHPX.o TestStdAlgorithmsCommon.o UnitTestMain.o gtest-all.o
 	TARGETS += KokkosAlgorithms_UnitTest_HPX
 	TEST_TARGETS += test-hpx
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
-	OBJ_SERIAL = TestSerial.o UnitTestMain.o gtest-all.o
+	OBJ_SERIAL = TestSerial.o TestStdAlgorithmsCommon.o UnitTestMain.o gtest-all.o
 	TARGETS += KokkosAlgorithms_UnitTest_Serial
 	TEST_TARGETS += test-serial
 endif
diff --git a/packages/kokkos/algorithms/unit_tests/TestBinSortA.hpp b/packages/kokkos/algorithms/unit_tests/TestBinSortA.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..dd3569e6715a8ef5e8c99b4bb0ba28702b6fe13d
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestBinSortA.hpp
@@ -0,0 +1,270 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_ALGORITHMS_UNITTESTS_TEST_BINSORTA_HPP
+#define KOKKOS_ALGORITHMS_UNITTESTS_TEST_BINSORTA_HPP
+
+#include <gtest/gtest.h>
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Random.hpp>
+#include <Kokkos_Sort.hpp>
+#include <random>
+
+namespace Test {
+namespace BinSortSetA {
+
+template <class ExecutionSpace, class Scalar>
+struct bin3d_is_sorted_struct {
+  using value_type      = unsigned int;
+  using execution_space = ExecutionSpace;
+
+  Kokkos::View<Scalar * [3], ExecutionSpace> keys;
+
+  int max_bins;
+  Scalar min;
+  Scalar max;
+
+  bin3d_is_sorted_struct(Kokkos::View<Scalar * [3], ExecutionSpace> keys_,
+                         int max_bins_, Scalar min_, Scalar max_)
+      : keys(keys_), max_bins(max_bins_), min(min_), max(max_) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int i, unsigned int& count) const {
+    int ix1 = int((keys(i, 0) - min) / max * max_bins);
+    int iy1 = int((keys(i, 1) - min) / max * max_bins);
+    int iz1 = int((keys(i, 2) - min) / max * max_bins);
+    int ix2 = int((keys(i + 1, 0) - min) / max * max_bins);
+    int iy2 = int((keys(i + 1, 1) - min) / max * max_bins);
+    int iz2 = int((keys(i + 1, 2) - min) / max * max_bins);
+
+    if (ix1 > ix2)
+      count++;
+    else if (ix1 == ix2) {
+      if (iy1 > iy2)
+        count++;
+      else if ((iy1 == iy2) && (iz1 > iz2))
+        count++;
+    }
+  }
+};
+
+template <class ExecutionSpace, class Scalar>
+struct sum3D {
+  using value_type      = double;
+  using execution_space = ExecutionSpace;
+
+  Kokkos::View<Scalar * [3], ExecutionSpace> keys;
+
+  sum3D(Kokkos::View<Scalar * [3], ExecutionSpace> keys_) : keys(keys_) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int i, double& count) const {
+    count += keys(i, 0);
+    count += keys(i, 1);
+    count += keys(i, 2);
+  }
+};
+
+template <class ExecutionSpace, typename KeyType>
+void test_3D_sort_impl(unsigned int n) {
+  using KeyViewType = Kokkos::View<KeyType * [3], ExecutionSpace>;
+
+  KeyViewType keys("Keys", n * n * n);
+
+  Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931);
+  Kokkos::fill_random(keys, g, 100.0);
+
+  double sum_before       = 0.0;
+  double sum_after        = 0.0;
+  unsigned int sort_fails = 0;
+
+  ExecutionSpace exec;
+  Kokkos::parallel_reduce(
+      Kokkos::RangePolicy<ExecutionSpace>(exec, 0, keys.extent(0)),
+      sum3D<ExecutionSpace, KeyType>(keys), sum_before);
+
+  int bin_1d = 1;
+  while (bin_1d * bin_1d * bin_1d * 4 < (int)keys.extent(0)) bin_1d *= 2;
+  int bin_max[3]                          = {bin_1d, bin_1d, bin_1d};
+  typename KeyViewType::value_type min[3] = {0, 0, 0};
+  typename KeyViewType::value_type max[3] = {100, 100, 100};
+
+  using BinOp = Kokkos::BinOp3D<KeyViewType>;
+  BinOp bin_op(bin_max, min, max);
+  Kokkos::BinSort<KeyViewType, BinOp> Sorter(keys, bin_op, false);
+  Sorter.create_permute_vector(exec);
+  Sorter.sort(exec, keys);
+
+  Kokkos::parallel_reduce(
+      Kokkos::RangePolicy<ExecutionSpace>(exec, 0, keys.extent(0)),
+      sum3D<ExecutionSpace, KeyType>(keys), sum_after);
+  Kokkos::parallel_reduce(
+      Kokkos::RangePolicy<ExecutionSpace>(exec, 0, keys.extent(0) - 1),
+      bin3d_is_sorted_struct<ExecutionSpace, KeyType>(keys, bin_1d, min[0],
+                                                      max[0]),
+      sort_fails);
+
+  double ratio   = sum_before / sum_after;
+  double epsilon = 1e-10;
+  unsigned int equal_sum =
+      (ratio > (1.0 - epsilon)) && (ratio < (1.0 + epsilon)) ? 1 : 0;
+
+  if (sort_fails)
+    printf("3D Sort Sum: %f %f Fails: %u\n", sum_before, sum_after, sort_fails);
+
+  ASSERT_EQ(sort_fails, 0u);
+  ASSERT_EQ(equal_sum, 1u);
+}
+
+template <class ExecutionSpace>
+void test_issue_1160_impl() {
+  Kokkos::View<int*, ExecutionSpace> element_("element", 10);
+  Kokkos::View<double*, ExecutionSpace> x_("x", 10);
+  Kokkos::View<double*, ExecutionSpace> v_("y", 10);
+
+  auto h_element = Kokkos::create_mirror_view(element_);
+  auto h_x       = Kokkos::create_mirror_view(x_);
+  auto h_v       = Kokkos::create_mirror_view(v_);
+
+  h_element(0) = 9;
+  h_element(1) = 8;
+  h_element(2) = 7;
+  h_element(3) = 6;
+  h_element(4) = 5;
+  h_element(5) = 4;
+  h_element(6) = 3;
+  h_element(7) = 2;
+  h_element(8) = 1;
+  h_element(9) = 0;
+
+  for (int i = 0; i < 10; ++i) {
+    h_v.access(i, 0) = h_x.access(i, 0) = double(h_element(i));
+  }
+  ExecutionSpace exec;
+  Kokkos::deep_copy(exec, element_, h_element);
+  Kokkos::deep_copy(exec, x_, h_x);
+  Kokkos::deep_copy(exec, v_, h_v);
+
+  using KeyViewType = decltype(element_);
+  using BinOp       = Kokkos::BinOp1D<KeyViewType>;
+
+  int begin = 3;
+  int end   = 8;
+  auto max  = h_element(begin);
+  auto min  = h_element(end - 1);
+  BinOp binner(end - begin, min, max);
+
+  Kokkos::BinSort<KeyViewType, BinOp> Sorter(element_, begin, end, binner,
+                                             false);
+  Sorter.create_permute_vector(exec);
+  Sorter.sort(exec, element_, begin, end);
+
+  Sorter.sort(exec, x_, begin, end);
+  Sorter.sort(exec, v_, begin, end);
+
+  Kokkos::deep_copy(exec, h_element, element_);
+  Kokkos::deep_copy(exec, h_x, x_);
+  Kokkos::deep_copy(exec, h_v, v_);
+  exec.fence();
+
+  ASSERT_EQ(h_element(0), 9);
+  ASSERT_EQ(h_element(1), 8);
+  ASSERT_EQ(h_element(2), 7);
+  ASSERT_EQ(h_element(3), 2);
+  ASSERT_EQ(h_element(4), 3);
+  ASSERT_EQ(h_element(5), 4);
+  ASSERT_EQ(h_element(6), 5);
+  ASSERT_EQ(h_element(7), 6);
+  ASSERT_EQ(h_element(8), 1);
+  ASSERT_EQ(h_element(9), 0);
+
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_EQ(h_element(i), int(h_x.access(i, 0)));
+    ASSERT_EQ(h_element(i), int(h_v.access(i, 0)));
+  }
+}
+
+template <class ExecutionSpace, class T>
+void test_sort_integer_overflow() {
+  // FIXME: this test is meant to test something for BinSort,
+  // but actually uses the kokkos::sort API with the assumption
+  // that underneath it calls binsort. I don't think this is correct,
+  // because if the kokkos::sort API chages impl, this test is not testing
+  // what it meants to test... so need to change this to actually use BinSort
+  // directly.
+
+  // array with two extrema in reverse order to expose integer overflow bug in
+  // bin calculation
+  T a[2]  = {Kokkos::Experimental::finite_max<T>::value,
+            Kokkos::Experimental::finite_min<T>::value};
+  auto vd = Kokkos::create_mirror_view_and_copy(
+      ExecutionSpace(), Kokkos::View<T[2], Kokkos::HostSpace>(a));
+  Kokkos::sort(vd);
+  auto vh = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), vd);
+  EXPECT_TRUE(std::is_sorted(vh.data(), vh.data() + 2))
+      << "view (" << vh[0] << ", " << vh[1] << ") is not sorted";
+}
+
+}  // namespace BinSortSetA
+
+TEST(TEST_CATEGORY, BinSortGenericTests) {
+  using ExecutionSpace = TEST_EXECSPACE;
+  using key_type       = unsigned;
+  constexpr int N      = 171;
+
+  BinSortSetA::test_3D_sort_impl<ExecutionSpace, key_type>(N);
+  BinSortSetA::test_issue_1160_impl<ExecutionSpace>();
+  BinSortSetA::test_sort_integer_overflow<ExecutionSpace, long long>();
+  BinSortSetA::test_sort_integer_overflow<ExecutionSpace, unsigned long long>();
+  BinSortSetA::test_sort_integer_overflow<ExecutionSpace, int>();
+}
+
+TEST(TEST_CATEGORY, BinSortEmptyView) {
+  using ExecutionSpace = TEST_EXECSPACE;
+
+  // the bounds and extents used below are totally arbitrary
+  // and, in theory, should have no impact
+
+  using KeyViewType = Kokkos::View<int*, ExecutionSpace>;
+  KeyViewType kv("kv", 20);
+
+  using BinOp_t = Kokkos::BinOp1D<KeyViewType>;
+  BinOp_t binOp(5, 0, 10);
+  Kokkos::BinSort<KeyViewType, BinOp_t> Sorter(ExecutionSpace{}, kv, binOp);
+
+  // does not matter if we use int or something else
+  Kokkos::View<int*, ExecutionSpace> v("v", 0);
+
+  // test all exposed public sort methods
+  ASSERT_NO_THROW(Sorter.sort(ExecutionSpace(), v, 0, 0));
+  ASSERT_NO_THROW(Sorter.sort(v, 0, 0));
+  ASSERT_NO_THROW(Sorter.sort(ExecutionSpace(), v));
+  ASSERT_NO_THROW(Sorter.sort(v));
+}
+
+TEST(TEST_CATEGORY, BinSortEmptyKeysView) {
+  using ExecutionSpace = TEST_EXECSPACE;
+
+  using KeyViewType = Kokkos::View<int*, ExecutionSpace>;
+  KeyViewType kv("kv", 0);
+
+  using BinOp_t = Kokkos::BinOp1D<KeyViewType>;
+  BinOp_t binOp(5, 0, 10);
+  Kokkos::BinSort<KeyViewType, BinOp_t> Sorter(ExecutionSpace{}, kv, binOp);
+
+  ASSERT_NO_THROW(Sorter.create_permute_vector(ExecutionSpace{}));
+}
+
+}  // namespace Test
+#endif
diff --git a/packages/kokkos/algorithms/unit_tests/TestBinSortB.hpp b/packages/kokkos/algorithms/unit_tests/TestBinSortB.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a90224bf31589433ba028e36385b74abdeffd8b5
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestBinSortB.hpp
@@ -0,0 +1,198 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_ALGORITHMS_UNITTESTS_TEST_BINSORTB_HPP
+#define KOKKOS_ALGORITHMS_UNITTESTS_TEST_BINSORTB_HPP
+
+#include <gtest/gtest.h>
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Random.hpp>
+#include <Kokkos_Sort.hpp>
+#include <Kokkos_StdAlgorithms.hpp>
+#include <TestStdAlgorithmsCommon.hpp>
+#include <random>
+#include <numeric>  //needed for iota
+
+namespace Test {
+namespace BinSortSetB {
+
+template <class KeyType, class ExecutionSpace>
+auto create_rank1_dev_and_host_views_of_keys(const ExecutionSpace& exec,
+                                             int N) {
+  namespace KE = Kokkos::Experimental;
+  Kokkos::DefaultHostExecutionSpace defaultHostExeSpace;
+
+  using KeyViewType = Kokkos::View<KeyType*, ExecutionSpace>;
+  KeyViewType keys("keys", N);
+  auto keys_h = Kokkos::create_mirror_view(keys);
+  std::iota(KE::begin(keys_h), KE::end(keys_h), KeyType(0));
+  KE::reverse(defaultHostExeSpace, keys_h);
+  // keys now is = [N-1,N-2,...,2,1,0], shuffle it for avoid trivial case
+  std::random_device rd;
+  std::mt19937 g(rd());
+  std::shuffle(KE::begin(keys_h), KE::end(keys_h), g);
+  Kokkos::deep_copy(exec, keys, keys_h);
+
+  return std::make_pair(keys, keys_h);
+}
+
+template <class ExecutionSpace, class ValueType, int ValuesViewRank,
+          std::enable_if_t<ValuesViewRank == 1, int> = 0>
+auto create_strided_view(std::size_t numRows, std::size_t /*numCols*/) {
+  Kokkos::LayoutStride layout{numRows, 2};
+  using v_t = Kokkos::View<ValueType*, Kokkos::LayoutStride, ExecutionSpace>;
+  v_t v("v", layout);
+  return v;
+}
+
+template <class ExecutionSpace, class ValueType, int ValuesViewRank,
+          std::enable_if_t<ValuesViewRank == 2, int> = 0>
+auto create_strided_view(std::size_t numRows, std::size_t numCols) {
+  Kokkos::LayoutStride layout{numRows, 2, numCols, numRows * 2};
+  using v_t = Kokkos::View<ValueType**, Kokkos::LayoutStride, ExecutionSpace>;
+  v_t v("v", layout);
+  return v;
+}
+
+template <class ExecutionSpace, class KeyType, class ValueType,
+          int ValuesViewRank>
+void test_on_view_with_stride(std::size_t numRows, std::size_t indB,
+                              std::size_t indE, std::size_t numCols = 1) {
+  ExecutionSpace exec;
+  Kokkos::DefaultHostExecutionSpace defaultHostExeSpace;
+  namespace KE = Kokkos::Experimental;
+
+  // 1. generate 1D view of keys
+  auto [keys, keys_h] =
+      create_rank1_dev_and_host_views_of_keys<KeyType>(exec, numRows);
+  using KeyViewType = decltype(keys);
+
+  // need this map key->row to use later for checking
+  std::unordered_map<KeyType, std::size_t> keyToRowBeforeSort;
+  for (std::size_t i = 0; i < numRows; ++i) {
+    keyToRowBeforeSort[keys_h(i)] = i;
+  }
+
+  // 2. create binOp
+  using BinOp = Kokkos::BinOp1D<KeyViewType>;
+  auto itB    = KE::cbegin(keys_h) + indB;
+  auto itE    = itB + indE - indB;
+  auto it     = KE::minmax_element(defaultHostExeSpace, itB, itE);
+  // seems like the behavior is odd when we use # buckets = # keys
+  // so use +5 for using more buckets than keys.
+  // This is something to investigate.
+  BinOp binner(indE - indB + 5, *it.first, *it.second);
+
+  // 3. create sorter
+  Kokkos::BinSort<KeyViewType, BinOp> sorter(keys, indB, indE, binner, false);
+  sorter.create_permute_vector(exec);
+  sorter.sort(exec, keys, indB, indE);
+  Kokkos::deep_copy(exec, keys_h, keys);
+
+  auto v = create_strided_view<ExecutionSpace, ValueType, ValuesViewRank>(
+      numRows, numCols);
+
+  Kokkos::Random_XorShift64_Pool<ExecutionSpace> pool(73931);
+  Kokkos::fill_random(v, pool, ValueType(545));
+  auto v_before_sort_h = stdalgos::create_host_space_copy(v);
+  sorter.sort(exec, v, indB, indE);
+  auto v_after_sort_h = stdalgos::create_host_space_copy(v);
+
+  for (size_t i = 0; i < v.extent(0); ++i) {
+    // if i within [indB,indE), the sorting was done
+    // so we need to do proper checking since rows have changed
+    if (i >= size_t(indB) && i < size_t(indE)) {
+      const KeyType key = keys_h(i);
+      if constexpr (ValuesViewRank == 1) {
+        ASSERT_TRUE(v_before_sort_h(keyToRowBeforeSort.at(key)) ==
+                    v_after_sort_h(i));
+      } else {
+        for (size_t j = 0; j < v.extent(1); ++j) {
+          ASSERT_TRUE(v_before_sort_h(keyToRowBeforeSort.at(key), j) ==
+                      v_after_sort_h(i, j));
+        }
+      }
+    }
+    // outside the target bounds, then the i-th row remains unchanged
+    else {
+      if constexpr (ValuesViewRank == 1) {
+        ASSERT_TRUE(v_before_sort_h(i) == v_after_sort_h(i));
+      } else {
+        for (size_t j = 0; j < v.extent(1); ++j) {
+          ASSERT_TRUE(v_before_sort_h(i, j) == v_after_sort_h(i, j));
+        }
+      }
+    }
+  }
+}
+
+template <class ExecutionSpace, class KeyType, class ValueType>
+void run_for_rank1() {
+  constexpr int rank = 1;
+
+  // trivial case
+  test_on_view_with_stride<ExecutionSpace, KeyType, ValueType, rank>(1, 0, 1);
+
+  // nontrivial cases
+  for (std::size_t N : {311, 710017}) {
+    // various cases for bounds
+    test_on_view_with_stride<ExecutionSpace, KeyType, ValueType, rank>(N, 0, N);
+    test_on_view_with_stride<ExecutionSpace, KeyType, ValueType, rank>(N, 3, N);
+    test_on_view_with_stride<ExecutionSpace, KeyType, ValueType, rank>(N, 0,
+                                                                       N - 4);
+    test_on_view_with_stride<ExecutionSpace, KeyType, ValueType, rank>(N, 4,
+                                                                       N - 3);
+  }
+}
+
+template <class ExecutionSpace, class KeyType, class ValueType>
+void run_for_rank2() {
+  constexpr int rank = 2;
+
+  // trivial case
+  test_on_view_with_stride<ExecutionSpace, KeyType, ValueType, rank>(1, 0, 1,
+                                                                     1);
+
+  // nontrivial cases
+  for (std::size_t Nr : {11, 1157, 710017}) {
+    for (std::size_t Nc : {3, 51}) {
+      // various cases for bounds
+      test_on_view_with_stride<ExecutionSpace, KeyType, ValueType, rank>(
+          Nr, 0, Nr, Nc);
+      test_on_view_with_stride<ExecutionSpace, KeyType, ValueType, rank>(
+          Nr, 3, Nr, Nc);
+      test_on_view_with_stride<ExecutionSpace, KeyType, ValueType, rank>(
+          Nr, 0, Nr - 4, Nc);
+      test_on_view_with_stride<ExecutionSpace, KeyType, ValueType, rank>(
+          Nr, 4, Nr - 3, Nc);
+    }
+  }
+}
+
+}  // namespace BinSortSetB
+
+TEST(TEST_CATEGORY, BinSortUnsignedKeyLayoutStrideValues) {
+  using ExeSpace = TEST_EXECSPACE;
+  using key_type = unsigned;
+  BinSortSetB::run_for_rank1<ExeSpace, key_type, int>();
+  BinSortSetB::run_for_rank1<ExeSpace, key_type, double>();
+
+  BinSortSetB::run_for_rank2<ExeSpace, key_type, int>();
+  BinSortSetB::run_for_rank2<ExeSpace, key_type, double>();
+}
+
+}  // namespace Test
+#endif
diff --git a/packages/kokkos/algorithms/unit_tests/TestNestedSort.hpp b/packages/kokkos/algorithms/unit_tests/TestNestedSort.hpp
index 37ee211b42a7681967ba17c40935e8a9d8699ba4..1b7a3f48fc521fa38f87f7aa077e06efd53e495b 100644
--- a/packages/kokkos/algorithms/unit_tests/TestNestedSort.hpp
+++ b/packages/kokkos/algorithms/unit_tests/TestNestedSort.hpp
@@ -17,14 +17,14 @@
 #ifndef KOKKOS_ALGORITHMS_UNITTESTS_TEST_NESTED_SORT_HPP
 #define KOKKOS_ALGORITHMS_UNITTESTS_TEST_NESTED_SORT_HPP
 
+#include <gtest/gtest.h>
 #include <unordered_set>
 #include <random>
 #include <Kokkos_Random.hpp>
 #include <Kokkos_NestedSort.hpp>
 
 namespace Test {
-
-namespace Impl {
+namespace NestedSortImpl {
 
 // Comparator for sorting in descending order
 template <typename Key>
@@ -383,24 +383,28 @@ void test_nested_sort_by_key(unsigned int N, KeyType minKey, KeyType maxKey,
   test_nested_sort_by_key_impl<ExecutionSpace, KeyType, ValueType>(
       N, N, false, true, minKey, maxKey, minVal, maxVal);
 }
-}  // namespace Impl
+}  // namespace NestedSortImpl
 
 TEST(TEST_CATEGORY, NestedSort) {
-  Impl::test_nested_sort<TEST_EXECSPACE, unsigned>(171, 0U, UINT_MAX);
-  Impl::test_nested_sort<TEST_EXECSPACE, float>(42, -1e6f, 1e6f);
-  Impl::test_nested_sort<TEST_EXECSPACE, char>(67, CHAR_MIN, CHAR_MAX);
+  using ExecutionSpace = TEST_EXECSPACE;
+  NestedSortImpl::test_nested_sort<ExecutionSpace, unsigned>(171, 0U, UINT_MAX);
+  NestedSortImpl::test_nested_sort<ExecutionSpace, float>(42, -1e6f, 1e6f);
+  NestedSortImpl::test_nested_sort<ExecutionSpace, char>(67, CHAR_MIN,
+                                                         CHAR_MAX);
 }
 
 TEST(TEST_CATEGORY, NestedSortByKey) {
+  using ExecutionSpace = TEST_EXECSPACE;
+
   // Second/third template arguments are key and value respectively.
   // In sort_by_key_X functions, a key view and a value view are both permuted
   // to make the keys sorted. This means that the value type doesn't need to be
   // ordered, unlike key
-  Impl::test_nested_sort_by_key<TEST_EXECSPACE, unsigned, unsigned>(
+  NestedSortImpl::test_nested_sort_by_key<ExecutionSpace, unsigned, unsigned>(
       161, 0U, UINT_MAX, 0U, UINT_MAX);
-  Impl::test_nested_sort_by_key<TEST_EXECSPACE, float, char>(
+  NestedSortImpl::test_nested_sort_by_key<ExecutionSpace, float, char>(
       267, -1e6f, 1e6f, CHAR_MIN, CHAR_MAX);
-  Impl::test_nested_sort_by_key<TEST_EXECSPACE, char, double>(
+  NestedSortImpl::test_nested_sort_by_key<ExecutionSpace, char, double>(
       11, CHAR_MIN, CHAR_MAX, 2.718, 3.14);
 }
 
diff --git a/packages/kokkos/algorithms/unit_tests/TestOpenMP_SortDynamicView.cpp b/packages/kokkos/algorithms/unit_tests/TestOpenMP_SortDynamicView.cpp
deleted file mode 100644
index 549d09f1f247e24faf59944b6a2a0de9127d8bce..0000000000000000000000000000000000000000
--- a/packages/kokkos/algorithms/unit_tests/TestOpenMP_SortDynamicView.cpp
+++ /dev/null
@@ -1,37 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#include <Kokkos_Macros.hpp>
-#ifdef KOKKOS_ENABLE_OPENMP
-
-#include <gtest/gtest.h>
-#include <Kokkos_Core.hpp>
-
-//----------------------------------------------------------------------------
-#include <TestRandom.hpp>
-#include <TestSort.hpp>
-#include <iomanip>
-
-namespace Test {
-
-TEST(openmp, SortUnsignedDynamicView) {
-  Impl::test_dynamic_view_sort<Kokkos::OpenMP, unsigned>(171);
-}
-
-}  // namespace Test
-#else
-void KOKKOS_ALGORITHMS_UNITTESTS_TESTOPENMP_PREVENT_LINK_ERROR() {}
-#endif
diff --git a/packages/kokkos/algorithms/unit_tests/TestRandom.hpp b/packages/kokkos/algorithms/unit_tests/TestRandom.hpp
index 607e94c7845ed9529a1b561cdbda13ea0d277f39..472af1403b2de44a502500b2f75fe62bb14f5304 100644
--- a/packages/kokkos/algorithms/unit_tests/TestRandom.hpp
+++ b/packages/kokkos/algorithms/unit_tests/TestRandom.hpp
@@ -14,8 +14,8 @@
 //
 //@HEADER
 
-#ifndef KOKKOS_TEST_DUALVIEW_HPP
-#define KOKKOS_TEST_DUALVIEW_HPP
+#ifndef KOKKOS_ALGORITHMS_UNITTESTS_TEST_RANDOM_HPP
+#define KOKKOS_ALGORITHMS_UNITTESTS_TEST_RANDOM_HPP
 
 #include <gtest/gtest.h>
 #include <iostream>
@@ -27,10 +27,12 @@
 #include <Kokkos_Random.hpp>
 #include <cmath>
 #include <chrono>
+#include <vector>
+#include <algorithm>
+#include <numeric>
 
 namespace Test {
-
-namespace Impl {
+namespace AlgoRandomImpl {
 
 // This test runs the random number generators and uses some statistic tests to
 // check the 'goodness' of the random numbers:
@@ -469,42 +471,135 @@ struct TestDynRankView {
     ASSERT_LE(val.max_val, max);
   }
 };
-}  // namespace Impl
 
-template <typename ExecutionSpace>
-void test_random_xorshift64() {
+template <class ExecutionSpace, class GeneratorPool>
+struct generate_random_stream {
+  using ViewType = Kokkos::View<uint64_t**, ExecutionSpace>;
+
+  ViewType vals;
+  GeneratorPool rand_pool;
+  int samples;
+
+  generate_random_stream(ViewType vals_, GeneratorPool rand_pool_, int samples_)
+      : vals(vals_), rand_pool(rand_pool_), samples(samples_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int i) const {
+    typename GeneratorPool::generator_type rand_gen = rand_pool.get_state();
+
+    for (int k = 0; k < samples; k++) vals(i, k) = rand_gen.urand64();
+
+    rand_pool.free_state(rand_gen);
+  }
+};
+
+// NOTE: this doesn't test the statistical independence of multiple streams
+// generated by a Random pool, it only tests for complete duplicates.
+template <class ExecutionSpace, class Pool>
+void test_duplicate_stream() {
+  using ViewType = Kokkos::View<uint64_t**, ExecutionSpace>;
+
+  // Heuristic to create a "large enough" number of streams.
+  int n_streams = ExecutionSpace{}.concurrency() * 4;
+  int samples   = 8;
+
+  Pool rand_pool(42);
+  ViewType vals_d("Vals", n_streams, samples);
+
+  Kokkos::parallel_for(
+      Kokkos::RangePolicy<ExecutionSpace>(0, n_streams),
+      generate_random_stream<ExecutionSpace, Pool>(vals_d, rand_pool, samples));
+
+  auto vals_h =
+      Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, vals_d);
+
+  /*
+  To quickly find streams that are identical, we sort them by the first number,
+  if that's equal then the second and so on. We then test each neighbor pair
+  for duplicates.
+  */
+  std::vector<size_t> indices(n_streams);
+  std::iota(indices.begin(), indices.end(), 0);
+
+  auto comparator = [&](size_t i, size_t j) {
+    for (int k = 0; k < samples; k++) {
+      if (vals_h(i, k) != vals_h(j, k)) return vals_h(i, k) < vals_h(j, k);
+    }
+    return false;
+  };
+  std::sort(indices.begin(), indices.end(), comparator);
+
+  for (int i = 0; i < n_streams - 1; i++) {
+    int idx1 = indices[i];
+    int idx2 = indices[i + 1];
+
+    int k = 0;
+    while (k < samples && vals_h(idx1, k) == vals_h(idx2, k)) k++;
+    ASSERT_LT(k, samples) << "Duplicate streams found";
+  }
+}
+
+}  // namespace AlgoRandomImpl
+
+TEST(TEST_CATEGORY, Random_XorShift64) {
+  using ExecutionSpace = TEST_EXECSPACE;
+
 #if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_CUDA) || \
     defined(KOKKOS_ENABLE_HIP)
   const int num_draws = 132141141;
 #else  // SERIAL, HPX, OPENMP
   const int num_draws = 10240000;
 #endif
-  Impl::test_random<Kokkos::Random_XorShift64_Pool<ExecutionSpace>>(num_draws);
-  Impl::test_random<Kokkos::Random_XorShift64_Pool<
+  AlgoRandomImpl::test_random<Kokkos::Random_XorShift64_Pool<ExecutionSpace>>(
+      num_draws);
+  AlgoRandomImpl::test_random<Kokkos::Random_XorShift64_Pool<
       Kokkos::Device<ExecutionSpace, typename ExecutionSpace::memory_space>>>(
       num_draws);
-  Impl::TestDynRankView<ExecutionSpace,
-                        Kokkos::Random_XorShift64_Pool<ExecutionSpace>>(10000)
+  AlgoRandomImpl::TestDynRankView<
+      ExecutionSpace, Kokkos::Random_XorShift64_Pool<ExecutionSpace>>(10000)
       .run();
 }
 
-template <typename ExecutionSpace>
-void test_random_xorshift1024() {
+TEST(TEST_CATEGORY, Random_XorShift1024_0) {
+  using ExecutionSpace = TEST_EXECSPACE;
+
 #if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_CUDA) || \
     defined(KOKKOS_ENABLE_HIP)
   const int num_draws = 52428813;
 #else  // SERIAL, HPX, OPENMP
   const int num_draws = 10130144;
 #endif
-  Impl::test_random<Kokkos::Random_XorShift1024_Pool<ExecutionSpace>>(
+  AlgoRandomImpl::test_random<Kokkos::Random_XorShift1024_Pool<ExecutionSpace>>(
       num_draws);
-  Impl::test_random<Kokkos::Random_XorShift1024_Pool<
+  AlgoRandomImpl::test_random<Kokkos::Random_XorShift1024_Pool<
       Kokkos::Device<ExecutionSpace, typename ExecutionSpace::memory_space>>>(
       num_draws);
-  Impl::TestDynRankView<ExecutionSpace,
-                        Kokkos::Random_XorShift1024_Pool<ExecutionSpace>>(10000)
+  AlgoRandomImpl::TestDynRankView<
+      ExecutionSpace, Kokkos::Random_XorShift1024_Pool<ExecutionSpace>>(10000)
       .run();
 }
-}  // namespace Test
 
-#endif  // KOKKOS_TEST_UNORDERED_MAP_HPP
+TEST(TEST_CATEGORY, Multi_streams) {
+  using ExecutionSpace = TEST_EXECSPACE;
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+  if constexpr (std::is_same_v<ExecutionSpace,
+                               Kokkos::Experimental::OpenMPTarget>) {
+    GTEST_SKIP() << "Libomptarget error";  // FIXME_OPENMPTARGET
+  }
+#endif
+
+#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU)
+  if constexpr (std::is_same_v<ExecutionSpace, Kokkos::Experimental::SYCL>) {
+    GTEST_SKIP() << "Failing on NVIDIA GPUs";  // FIXME_SYCL
+  }
+#endif
+
+  using Pool64   = Kokkos::Random_XorShift64_Pool<ExecutionSpace>;
+  using Pool1024 = Kokkos::Random_XorShift1024_Pool<ExecutionSpace>;
+
+  AlgoRandomImpl::test_duplicate_stream<ExecutionSpace, Pool64>();
+  AlgoRandomImpl::test_duplicate_stream<ExecutionSpace, Pool1024>();
+}
+
+}  // namespace Test
+#endif
diff --git a/packages/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp b/packages/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp
index 439d171c8aec4a2ca6146e6e0963d515dd74b725..282d85548c55411e4e5c7bedd3a8c3f12948cd47 100644
--- a/packages/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp
@@ -54,7 +54,7 @@ void test_random_access_it_verify(IteratorType it, ValueType gold_value) {
   Kokkos::parallel_for("_std_algo_copy", 1, cf);
   auto v_h =
       Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), checkView);
-  EXPECT_EQ(v_h(), gold_value);
+  ASSERT_EQ(v_h(), gold_value);
 }
 
 TEST_F(random_access_iterator_test, dereference) {
@@ -96,9 +96,9 @@ void test_random_access_it_subscript_op_verify(IteratorType it) {
 
   auto v_h =
       Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), checkView);
-  EXPECT_EQ(v_h(0), (value_t)0);
-  EXPECT_EQ(v_h(1), (value_t)1);
-  EXPECT_EQ(v_h(2), (value_t)2);
+  ASSERT_EQ(v_h(0), (value_t)0);
+  ASSERT_EQ(v_h(1), (value_t)1);
+  ASSERT_EQ(v_h(2), (value_t)2);
 }
 
 TEST_F(random_access_iterator_test, subscript_operator) {
@@ -188,9 +188,9 @@ TEST_F(random_access_iterator_test, operatorsSet4) {
   auto it7 = KE::Impl::RandomAccessIterator<static_view_t>(m_static_view, 3);
   auto it8 = KE::Impl::RandomAccessIterator<dyn_view_t>(m_dynamic_view, 3);
   auto it9 = KE::Impl::RandomAccessIterator<strided_view_t>(m_strided_view, 3);
-  EXPECT_EQ(it1, it7);
-  EXPECT_EQ(it2, it8);
-  EXPECT_EQ(it3, it9);
+  ASSERT_EQ(it1, it7);
+  ASSERT_EQ(it2, it8);
+  ASSERT_EQ(it3, it9);
   EXPECT_GE(it1, it7);
   EXPECT_GE(it2, it8);
   EXPECT_GE(it3, it9);
@@ -205,16 +205,27 @@ TEST_F(random_access_iterator_test, assignment_operator) {
   EXPECT_NE(it1, it2);
 
   it2 = it1;
-  EXPECT_EQ(it1, it2);
+  ASSERT_EQ(it1, it2);
 }
 
 TEST_F(random_access_iterator_test, distance) {
   auto first = KE::begin(m_dynamic_view);
   auto last  = KE::end(m_dynamic_view);
 
-  EXPECT_EQ(0, KE::distance(first, first));
-  EXPECT_EQ(1, KE::distance(first, first + 1));
-  EXPECT_EQ(m_dynamic_view.extent(0), size_t(KE::distance(first, last)));
+  ASSERT_EQ(0, KE::distance(first, first));
+  ASSERT_EQ(1, KE::distance(first, first + 1));
+  ASSERT_EQ(m_dynamic_view.extent(0), size_t(KE::distance(first, last)));
+}
+
+TEST_F(random_access_iterator_test, traits_helpers) {
+  using T1_t = KE::Impl::RandomAccessIterator<static_view_t>;
+  using T2_t = KE::Impl::RandomAccessIterator<dyn_view_t>;
+  using T3_t = KE::Impl::RandomAccessIterator<strided_view_t>;
+
+  namespace KE = Kokkos::Experimental;
+  static_assert(KE::Impl::are_iterators_v<T1_t, T2_t, T3_t>);
+  static_assert(KE::Impl::are_random_access_iterators_v<T1_t, T2_t, T3_t>);
+  static_assert(!KE::Impl::are_iterators_v<int, T2_t, T3_t>);
 }
 
 }  // namespace stdalgos
diff --git a/packages/kokkos/algorithms/unit_tests/TestSort.hpp b/packages/kokkos/algorithms/unit_tests/TestSort.hpp
index d903888878c93af5a00dd2f2a864cc1f017ebf2f..968fb8950b74892fff7928d00f3a7a0af380732b 100644
--- a/packages/kokkos/algorithms/unit_tests/TestSort.hpp
+++ b/packages/kokkos/algorithms/unit_tests/TestSort.hpp
@@ -14,8 +14,8 @@
 //
 //@HEADER
 
-#ifndef KOKKOS_ALGORITHMS_UNITTESTS_TESTSORT_HPP
-#define KOKKOS_ALGORITHMS_UNITTESTS_TESTSORT_HPP
+#ifndef KOKKOS_ALGORITHMS_UNITTESTS_TEST_SORT_HPP
+#define KOKKOS_ALGORITHMS_UNITTESTS_TEST_SORT_HPP
 
 #include <gtest/gtest.h>
 #include <Kokkos_Core.hpp>
@@ -24,8 +24,7 @@
 #include <Kokkos_Sort.hpp>
 
 namespace Test {
-
-namespace Impl {
+namespace SortImpl {
 
 template <class ExecutionSpace, class Scalar>
 struct is_sorted_struct {
@@ -53,56 +52,6 @@ struct sum {
   void operator()(int i, double& count) const { count += keys(i); }
 };
 
-template <class ExecutionSpace, class Scalar>
-struct bin3d_is_sorted_struct {
-  using value_type      = unsigned int;
-  using execution_space = ExecutionSpace;
-
-  Kokkos::View<Scalar * [3], ExecutionSpace> keys;
-
-  int max_bins;
-  Scalar min;
-  Scalar max;
-
-  bin3d_is_sorted_struct(Kokkos::View<Scalar * [3], ExecutionSpace> keys_,
-                         int max_bins_, Scalar min_, Scalar max_)
-      : keys(keys_), max_bins(max_bins_), min(min_), max(max_) {}
-  KOKKOS_INLINE_FUNCTION
-  void operator()(int i, unsigned int& count) const {
-    int ix1 = int((keys(i, 0) - min) / max * max_bins);
-    int iy1 = int((keys(i, 1) - min) / max * max_bins);
-    int iz1 = int((keys(i, 2) - min) / max * max_bins);
-    int ix2 = int((keys(i + 1, 0) - min) / max * max_bins);
-    int iy2 = int((keys(i + 1, 1) - min) / max * max_bins);
-    int iz2 = int((keys(i + 1, 2) - min) / max * max_bins);
-
-    if (ix1 > ix2)
-      count++;
-    else if (ix1 == ix2) {
-      if (iy1 > iy2)
-        count++;
-      else if ((iy1 == iy2) && (iz1 > iz2))
-        count++;
-    }
-  }
-};
-
-template <class ExecutionSpace, class Scalar>
-struct sum3D {
-  using value_type      = double;
-  using execution_space = ExecutionSpace;
-
-  Kokkos::View<Scalar * [3], ExecutionSpace> keys;
-
-  sum3D(Kokkos::View<Scalar * [3], ExecutionSpace> keys_) : keys(keys_) {}
-  KOKKOS_INLINE_FUNCTION
-  void operator()(int i, double& count) const {
-    count += keys(i, 0);
-    count += keys(i, 1);
-    count += keys(i, 2);
-  }
-};
-
 template <class ExecutionSpace, typename KeyType>
 void test_1D_sort_impl(unsigned int n) {
   using KeyViewType = Kokkos::View<KeyType*, ExecutionSpace>;
@@ -142,57 +91,6 @@ void test_1D_sort_impl(unsigned int n) {
   ASSERT_EQ(equal_sum, 1u);
 }
 
-template <class ExecutionSpace, typename KeyType>
-void test_3D_sort_impl(unsigned int n) {
-  using KeyViewType = Kokkos::View<KeyType * [3], ExecutionSpace>;
-
-  KeyViewType keys("Keys", n * n * n);
-
-  Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931);
-  Kokkos::fill_random(keys, g, 100.0);
-
-  double sum_before       = 0.0;
-  double sum_after        = 0.0;
-  unsigned int sort_fails = 0;
-
-  ExecutionSpace exec;
-  Kokkos::parallel_reduce(
-      Kokkos::RangePolicy<ExecutionSpace>(exec, 0, keys.extent(0)),
-      sum3D<ExecutionSpace, KeyType>(keys), sum_before);
-
-  int bin_1d = 1;
-  while (bin_1d * bin_1d * bin_1d * 4 < (int)keys.extent(0)) bin_1d *= 2;
-  int bin_max[3]                          = {bin_1d, bin_1d, bin_1d};
-  typename KeyViewType::value_type min[3] = {0, 0, 0};
-  typename KeyViewType::value_type max[3] = {100, 100, 100};
-
-  using BinOp = Kokkos::BinOp3D<KeyViewType>;
-  BinOp bin_op(bin_max, min, max);
-  Kokkos::BinSort<KeyViewType, BinOp> Sorter(keys, bin_op, false);
-  Sorter.create_permute_vector(exec);
-  Sorter.sort(exec, keys);
-
-  Kokkos::parallel_reduce(
-      Kokkos::RangePolicy<ExecutionSpace>(exec, 0, keys.extent(0)),
-      sum3D<ExecutionSpace, KeyType>(keys), sum_after);
-  Kokkos::parallel_reduce(
-      Kokkos::RangePolicy<ExecutionSpace>(exec, 0, keys.extent(0) - 1),
-      bin3d_is_sorted_struct<ExecutionSpace, KeyType>(keys, bin_1d, min[0],
-                                                      max[0]),
-      sort_fails);
-
-  double ratio   = sum_before / sum_after;
-  double epsilon = 1e-10;
-  unsigned int equal_sum =
-      (ratio > (1.0 - epsilon)) && (ratio < (1.0 + epsilon)) ? 1 : 0;
-
-  if (sort_fails)
-    printf("3D Sort Sum: %f %f Fails: %u\n", sum_before, sum_after, sort_fails);
-
-  ASSERT_EQ(sort_fails, 0u);
-  ASSERT_EQ(equal_sum, 1u);
-}
-
 //----------------------------------------------------------------------------
 
 template <class ExecutionSpace, typename KeyType>
@@ -259,74 +157,6 @@ void test_dynamic_view_sort_impl(unsigned int n) {
 
 //----------------------------------------------------------------------------
 
-template <class ExecutionSpace>
-void test_issue_1160_impl() {
-  Kokkos::View<int*, ExecutionSpace> element_("element", 10);
-  Kokkos::View<double*, ExecutionSpace> x_("x", 10);
-  Kokkos::View<double*, ExecutionSpace> v_("y", 10);
-
-  auto h_element = Kokkos::create_mirror_view(element_);
-  auto h_x       = Kokkos::create_mirror_view(x_);
-  auto h_v       = Kokkos::create_mirror_view(v_);
-
-  h_element(0) = 9;
-  h_element(1) = 8;
-  h_element(2) = 7;
-  h_element(3) = 6;
-  h_element(4) = 5;
-  h_element(5) = 4;
-  h_element(6) = 3;
-  h_element(7) = 2;
-  h_element(8) = 1;
-  h_element(9) = 0;
-
-  for (int i = 0; i < 10; ++i) {
-    h_v.access(i, 0) = h_x.access(i, 0) = double(h_element(i));
-  }
-  ExecutionSpace exec;
-  Kokkos::deep_copy(exec, element_, h_element);
-  Kokkos::deep_copy(exec, x_, h_x);
-  Kokkos::deep_copy(exec, v_, h_v);
-
-  using KeyViewType = decltype(element_);
-  using BinOp       = Kokkos::BinOp1D<KeyViewType>;
-
-  int begin = 3;
-  int end   = 8;
-  auto max  = h_element(begin);
-  auto min  = h_element(end - 1);
-  BinOp binner(end - begin, min, max);
-
-  Kokkos::BinSort<KeyViewType, BinOp> Sorter(element_, begin, end, binner,
-                                             false);
-  Sorter.create_permute_vector(exec);
-  Sorter.sort(exec, element_, begin, end);
-
-  Sorter.sort(exec, x_, begin, end);
-  Sorter.sort(exec, v_, begin, end);
-
-  Kokkos::deep_copy(exec, h_element, element_);
-  Kokkos::deep_copy(exec, h_x, x_);
-  Kokkos::deep_copy(exec, h_v, v_);
-  exec.fence();
-
-  ASSERT_EQ(h_element(0), 9);
-  ASSERT_EQ(h_element(1), 8);
-  ASSERT_EQ(h_element(2), 7);
-  ASSERT_EQ(h_element(3), 2);
-  ASSERT_EQ(h_element(4), 3);
-  ASSERT_EQ(h_element(5), 4);
-  ASSERT_EQ(h_element(6), 5);
-  ASSERT_EQ(h_element(7), 6);
-  ASSERT_EQ(h_element(8), 1);
-  ASSERT_EQ(h_element(9), 0);
-
-  for (int i = 0; i < 10; ++i) {
-    ASSERT_EQ(h_element(i), int(h_x.access(i, 0)));
-    ASSERT_EQ(h_element(i), int(h_v.access(i, 0)));
-  }
-}
-
 template <class ExecutionSpace>
 void test_issue_4978_impl() {
   Kokkos::View<long long*, ExecutionSpace> element_("element", 9);
@@ -376,55 +206,33 @@ void test_sort_integer_overflow() {
       << "view (" << vh[0] << ", " << vh[1] << ") is not sorted";
 }
 
-//----------------------------------------------------------------------------
+}  // namespace SortImpl
 
-template <class ExecutionSpace, typename KeyType>
-void test_1D_sort(unsigned int N) {
-  test_1D_sort_impl<ExecutionSpace, KeyType>(N * N * N);
-}
+TEST(TEST_CATEGORY, SortUnsignedValueType) {
+  using ExecutionSpace = TEST_EXECSPACE;
+  using key_type       = unsigned;
+  constexpr int N      = 171;
 
-template <class ExecutionSpace, typename KeyType>
-void test_3D_sort(unsigned int N) {
-  test_3D_sort_impl<ExecutionSpace, KeyType>(N);
-}
+  SortImpl::test_1D_sort_impl<ExecutionSpace, key_type>(N * N * N);
 
-template <class ExecutionSpace, typename KeyType>
-void test_dynamic_view_sort(unsigned int N) {
-  test_dynamic_view_sort_impl<ExecutionSpace, KeyType>(N * N);
-}
+#ifndef KOKKOS_ENABLE_OPENMPTARGET
+  // FIXME_OPENMPTARGET: OpenMPTarget doesn't support DynamicView yet.
+  SortImpl::test_dynamic_view_sort_impl<ExecutionSpace, key_type>(N * N);
+#endif
 
-template <class ExecutionSpace>
-void test_issue_1160_sort() {
-  test_issue_1160_impl<ExecutionSpace>();
+  SortImpl::test_issue_4978_impl<ExecutionSpace>();
 }
 
-template <class ExecutionSpace>
-void test_issue_4978_sort() {
-  test_issue_4978_impl<ExecutionSpace>();
-}
+TEST(TEST_CATEGORY, SortEmptyView) {
+  using ExecutionSpace = TEST_EXECSPACE;
 
-template <class ExecutionSpace, typename KeyType>
-void test_sort(unsigned int N) {
-  test_1D_sort<ExecutionSpace, KeyType>(N);
-#if defined(KOKKOS_ENABLE_CUDA) && \
-    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC
-  if (!std::is_same_v<ExecutionSpace, Kokkos::Cuda>)
-#endif
-    test_3D_sort<ExecutionSpace, KeyType>(N);
-// FIXME_OPENMPTARGET: OpenMPTarget doesn't support DynamicView yet.
-#ifndef KOKKOS_ENABLE_OPENMPTARGET
-  test_dynamic_view_sort<ExecutionSpace, KeyType>(N);
-#endif
-#if defined(KOKKOS_ENABLE_CUDA) && \
-    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC
-  if (!std::is_same_v<ExecutionSpace, Kokkos::Cuda>)
-#endif
-    test_issue_1160_sort<ExecutionSpace>();
-  test_issue_4978_sort<ExecutionSpace>();
-  test_sort_integer_overflow<ExecutionSpace, long long>();
-  test_sort_integer_overflow<ExecutionSpace, unsigned long long>();
-  test_sort_integer_overflow<ExecutionSpace, int>();
+  // does not matter if we use int or something else
+  Kokkos::View<int*, ExecutionSpace> v("v", 0);
+
+  // TODO check the synchronous behavior of the calls below
+  ASSERT_NO_THROW(Kokkos::sort(ExecutionSpace(), v));
+  ASSERT_NO_THROW(Kokkos::sort(v));
 }
-}  // namespace Impl
+
 }  // namespace Test
-#endif /* KOKKOS_ALGORITHMS_UNITTESTS_TESTSORT_HPP */
+#endif
diff --git a/packages/kokkos/algorithms/unit_tests/TestSortCustomComp.hpp b/packages/kokkos/algorithms/unit_tests/TestSortCustomComp.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a29ee1edeceec7a0dbe49635b91f7a8b76ae2720
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestSortCustomComp.hpp
@@ -0,0 +1,133 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_ALGORITHMS_UNITTESTS_TEST_SORT_CUSTOM_COMP_HPP
+#define KOKKOS_ALGORITHMS_UNITTESTS_TEST_SORT_CUSTOM_COMP_HPP
+
+#include <gtest/gtest.h>
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Random.hpp>
+#include <Kokkos_Sort.hpp>
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace {
+namespace SortWithComp {
+
+template <class ExecutionSpace, class LayoutTagType, class ValueType>
+auto create_random_view_and_host_clone(
+    LayoutTagType LayoutTag, std::size_t n,
+    Kokkos::pair<ValueType, ValueType> bounds, const std::string& label,
+    std::size_t seedIn = 12371) {
+  using namespace ::Test::stdalgos;
+
+  using mem_space = typename ExecutionSpace::memory_space;
+  auto dataView   = create_view<ValueType, mem_space>(LayoutTag, n, label);
+
+  // dataView might not be deep copyable (e.g. strided layout) so to
+  // randomize it, we make a new view that is for sure deep copyable,
+  // modify it on the host, deep copy to device and then launch
+  // a kernel to copy to dataView
+
+  auto dataView_dc =
+      create_deep_copyable_compatible_view_with_same_extent(dataView);
+  auto dataView_dc_h = create_mirror_view(Kokkos::HostSpace(), dataView_dc);
+
+  // randomly fill the view
+  Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace> pool(
+      seedIn);
+  Kokkos::fill_random(dataView_dc_h, pool, bounds.first, bounds.second);
+
+  // copy to dataView_dc and then to dataView
+  Kokkos::deep_copy(dataView_dc, dataView_dc_h);
+  // use CTAD
+  CopyFunctor F1(dataView_dc, dataView);
+  Kokkos::RangePolicy<ExecutionSpace> policy(0, dataView.extent(0));
+  Kokkos::parallel_for("copy", policy, F1);
+
+  return std::make_pair(dataView, dataView_dc_h);
+}
+
+template <class T>
+struct MyComp {
+  KOKKOS_FUNCTION
+  bool operator()(T a, T b) const {
+    // we return a>b on purpose here, rather than doing a<b
+    return a > b;
+  }
+};
+
+// clang-format off
+template <class ExecutionSpace, class Tag, class ValueType>
+void run_all_scenarios(int api)
+{
+  using comp_t = MyComp<ValueType>;
+
+  const std::vector<std::size_t> my_scenarios = {0, 1, 2, 9, 1003, 51513};
+  for (std::size_t N : my_scenarios)
+  {
+    auto [dataView, dataViewBeforeOp_h] = create_random_view_and_host_clone<ExecutionSpace>(
+        Tag{}, N, Kokkos::pair<ValueType, ValueType>{-1045, 565},
+        "dataView");
+
+    namespace KE = Kokkos::Experimental;
+
+    if (api == 0) {
+      Kokkos::sort(dataView, comp_t{});
+      std::sort(KE::begin(dataViewBeforeOp_h), KE::end(dataViewBeforeOp_h),
+                comp_t{});
+    }
+
+    else if (api == 1) {
+      auto exespace = ExecutionSpace();
+      Kokkos::sort(exespace, dataView, comp_t{});
+      std::sort(KE::begin(dataViewBeforeOp_h), KE::end(dataViewBeforeOp_h),
+                comp_t{});
+      exespace.fence();
+    }
+
+    auto dataView_h = Test::stdalgos::create_host_space_copy(dataView);
+    Test::stdalgos::compare_views(dataViewBeforeOp_h, dataView_h);
+
+    // To actually check that Kokkos::sort used the custom
+    // comparator MyComp, we should have a result in non-ascending order.
+    // We can verify this by running std::is_sorted and if that returns
+    // false, then it means everything ran as expected.
+    // Note: std::is_sorted returns true for ranges of length one,
+    // so this check makes sense only when N >= 2.
+    if (N >= 2){
+      ASSERT_FALSE(std::is_sorted( KE::cbegin(dataView_h), KE::cend(dataView_h)));
+    }
+  }
+}
+
+TEST(TEST_CATEGORY, SortWithCustomComparator) {
+  using ExeSpace = TEST_EXECSPACE;
+  using namespace ::Test::stdalgos;
+  for (int api = 0; api < 2; api++) {
+    run_all_scenarios<ExeSpace, DynamicTag, int>(api);
+    run_all_scenarios<ExeSpace, DynamicTag, double>(api);
+    run_all_scenarios<ExeSpace, DynamicLayoutLeftTag, int>(api);
+    run_all_scenarios<ExeSpace, DynamicLayoutLeftTag, double>(api);
+    run_all_scenarios<ExeSpace, DynamicLayoutRightTag, int>(api);
+    run_all_scenarios<ExeSpace, DynamicLayoutRightTag, double>(api);
+    run_all_scenarios<ExeSpace, StridedThreeTag, int>(api);
+    run_all_scenarios<ExeSpace, StridedThreeTag, double>(api);
+  }
+}
+
+}  // namespace SortWithComp
+}  // namespace anonym
+#endif
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp
index d414d524b61d21828808e3cf7e540ea377df85ec..75ad533f6ee4f2b129801066653739d7352870d7 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp
@@ -157,7 +157,7 @@ void verify_data(TestViewType test_view, GoldViewType gold) {
   const auto gold_h = create_mirror_view_and_copy(Kokkos::HostSpace(), gold);
 
   for (std::size_t i = 0; i < test_view.extent(0); ++i) {
-    EXPECT_EQ(gold_h(i), test_view_dc_h(i));
+    ASSERT_EQ(gold_h(i), test_view_dc_h(i));
   }
 }
 
@@ -197,7 +197,7 @@ void run_single_scenario(const InfoType& scenario_info,
     auto res1 = KE::adjacent_difference(exespace(), KE::cbegin(view_from),
                                         KE::cend(view_from),
                                         KE::begin(view_dest), args...);
-    EXPECT_EQ(res1, KE::end(view_dest));
+    ASSERT_EQ(res1, KE::end(view_dest));
     verify_data(view_dest, gold);
   }
 
@@ -207,7 +207,7 @@ void run_single_scenario(const InfoType& scenario_info,
     auto res2 = KE::adjacent_difference(
         "label", exespace(), KE::cbegin(view_from), KE::cend(view_from),
         KE::begin(view_dest), args...);
-    EXPECT_EQ(res2, KE::end(view_dest));
+    ASSERT_EQ(res2, KE::end(view_dest));
     verify_data(view_dest, gold);
   }
 
@@ -216,7 +216,7 @@ void run_single_scenario(const InfoType& scenario_info,
         create_view<ValueType>(Tag{}, view_ext, "adj_diff_dest_view");
     auto res3 =
         KE::adjacent_difference(exespace(), view_from, view_dest, args...);
-    EXPECT_EQ(res3, KE::end(view_dest));
+    ASSERT_EQ(res3, KE::end(view_dest));
     verify_data(view_dest, gold);
   }
 
@@ -225,7 +225,7 @@ void run_single_scenario(const InfoType& scenario_info,
         create_view<ValueType>(Tag{}, view_ext, "adj_diff_dest_view");
     auto res4 = KE::adjacent_difference("label", exespace(), view_from,
                                         view_dest, args...);
-    EXPECT_EQ(res4, KE::end(view_dest));
+    ASSERT_EQ(res4, KE::end(view_dest));
     verify_data(view_dest, gold);
   }
 
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp
index ee34761265021e404b3848d00e3a2871e752be14..fa4ff48dbef81c6927bfe4760c9dcd995346f6d3 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp
@@ -229,7 +229,7 @@ void verify(DiffType my_diff, ViewType view, Args... args) {
       my_std_adjacent_find(KE::cbegin(view_h), KE::cend(view_h), args...);
   const auto std_diff = std_r - KE::cbegin(view_h);
 
-  EXPECT_EQ(my_diff, std_diff);
+  ASSERT_EQ(my_diff, std_diff);
 }
 
 template <class Tag, class ValueType, class InfoType, class... Args>
@@ -287,12 +287,6 @@ void run_all_scenarios() {
 }
 
 TEST(std_algorithms_nonmod_seq_ops, adjacent_find) {
-#if defined(KOKKOS_ENABLE_CUDA) && \
-    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC
-  if constexpr (std::is_same_v<exespace, Kokkos::Cuda>) {
-    GTEST_SKIP() << "FIXME wrong result";
-  }
-#endif
   run_all_scenarios<DynamicTag, int>();
   run_all_scenarios<DynamicTag, double>();
   run_all_scenarios<StridedThreeTag, int>();
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAllAnyNoneOf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAllAnyNoneOf.cpp
index 1c39a4735e62576fe9ebfb8a3a702f6e9436ed5c..cccc0f6c18b447e25f8aad942981690467684dfb 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAllAnyNoneOf.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAllAnyNoneOf.cpp
@@ -147,12 +147,6 @@ void run_all_scenarios() {
 }
 
 TEST(std_algorithms_all_any_none_of_test, test) {
-#if defined(KOKKOS_ENABLE_CUDA) && \
-    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC
-  if constexpr (std::is_same_v<exespace, Kokkos::Cuda>) {
-    GTEST_SKIP() << "FIXME wrong result";
-  }
-#endif
   run_all_scenarios<DynamicTag, double>();
   run_all_scenarios<StridedTwoTag, int>();
   run_all_scenarios<StridedThreeTag, unsigned>();
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.cpp
index 39a508bb8feacfdf872bd7f0614b74970524d441..833145bdb464725c0c8b807a60048fe11ebff498 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.cpp
@@ -21,9 +21,23 @@ namespace stdalgos {
 
 std::string view_tag_to_string(DynamicTag) { return "dynamic_view"; }
 
+std::string view_tag_to_string(DynamicLayoutLeftTag) {
+  return "dynamic_layout_left_view";
+}
+
+std::string view_tag_to_string(DynamicLayoutRightTag) {
+  return "dynamic_layout_right_view";
+}
+
 std::string view_tag_to_string(StridedTwoTag) { return "stride2_view"; }
 
 std::string view_tag_to_string(StridedThreeTag) { return "stride3_view"; }
 
+std::string view_tag_to_string(StridedTwoRowsTag) { return "stride2rows_view"; }
+
+std::string view_tag_to_string(StridedThreeRowsTag) {
+  return "stride3rows_view";
+}
+
 }  // namespace stdalgos
 }  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp
index 694676a878a2a9c601b5fcc70febe6207903f742..b962218b5f0ca0a832dda9c8c91828ecfd948c11 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp
@@ -18,7 +18,9 @@
 #define KOKKOS_ALGORITHMS_UNITTESTS_TEST_STD_ALGOS_COMMON_HPP
 
 #include <gtest/gtest.h>
+#include <Kokkos_Core.hpp>
 #include <Kokkos_StdAlgorithms.hpp>
+#include <Kokkos_Random.hpp>
 #include <TestStdAlgorithmsHelperFunctors.hpp>
 #include <utility>
 #include <numeric>
@@ -29,10 +31,30 @@ namespace stdalgos {
 
 using exespace = Kokkos::DefaultExecutionSpace;
 
+//
+// tags
+//
 struct DynamicTag {};
+struct DynamicLayoutLeftTag {};
+struct DynamicLayoutRightTag {};
+
+// these are for rank-1
 struct StridedTwoTag {};
 struct StridedThreeTag {};
 
+// these are for rank-2
+struct StridedTwoRowsTag {};
+struct StridedThreeRowsTag {};
+
+#ifndef _WIN32
+const std::vector<int> teamSizesToTest = {1, 2, 23, 77, 123};
+#else
+// avoid timeouts in AppVeyor CI
+const std::vector<int> teamSizesToTest = {1, 2, 23};
+#endif
+
+// map of scenarios where the key is a description
+// and the value is the extent
 const std::map<std::string, std::size_t> default_scenarios = {
     {"empty", 0},          {"one-element", 1}, {"two-elements-a", 2},
     {"two-elements-b", 2}, {"small-a", 9},     {"small-b", 13},
@@ -41,51 +63,467 @@ const std::map<std::string, std::size_t> default_scenarios = {
 
 // see cpp file for these functions
 std::string view_tag_to_string(DynamicTag);
+std::string view_tag_to_string(DynamicLayoutLeftTag);
+std::string view_tag_to_string(DynamicLayoutRightTag);
 std::string view_tag_to_string(StridedTwoTag);
 std::string view_tag_to_string(StridedThreeTag);
+std::string view_tag_to_string(StridedTwoRowsTag);
+std::string view_tag_to_string(StridedThreeRowsTag);
+
+//
+// overload set for create_view for rank1
+//
 
-template <class ValueType>
+// dynamic
+template <class ValueType,
+          class MemSpace = typename Kokkos::DefaultExecutionSpace::memory_space>
 auto create_view(DynamicTag, std::size_t ext, const std::string label) {
-  using view_t = Kokkos::View<ValueType*>;
+  using view_t = Kokkos::View<ValueType*, MemSpace>;
   view_t view{label + "_" + view_tag_to_string(DynamicTag{}), ext};
   return view;
 }
 
-template <class ValueType>
+// dynamic layout left
+template <class ValueType,
+          class MemSpace = typename Kokkos::DefaultExecutionSpace::memory_space>
+auto create_view(DynamicLayoutLeftTag, std::size_t ext,
+                 const std::string label) {
+  using view_t = Kokkos::View<ValueType*, Kokkos::LayoutLeft, MemSpace>;
+  view_t view{label + "_" + view_tag_to_string(DynamicLayoutLeftTag{}), ext};
+  return view;
+}
+
+// dynamic layout right
+template <class ValueType,
+          class MemSpace = typename Kokkos::DefaultExecutionSpace::memory_space>
+auto create_view(DynamicLayoutRightTag, std::size_t ext,
+                 const std::string label) {
+  using view_t = Kokkos::View<ValueType*, Kokkos::LayoutRight, MemSpace>;
+  view_t view{label + "_" + view_tag_to_string(DynamicLayoutRightTag{}), ext};
+  return view;
+}
+
+// stride2
+template <class ValueType,
+          class MemSpace = typename Kokkos::DefaultExecutionSpace::memory_space>
 auto create_view(StridedTwoTag, std::size_t ext, const std::string label) {
-  using view_t = Kokkos::View<ValueType*, Kokkos::LayoutStride>;
+  using view_t = Kokkos::View<ValueType*, Kokkos::LayoutStride, MemSpace>;
   Kokkos::LayoutStride layout{ext, 2};
-  view_t view{label + "_" + view_tag_to_string(DynamicTag{}), layout};
+  view_t view{label + "_" + view_tag_to_string(StridedTwoTag{}), layout};
   return view;
 }
 
-template <class ValueType>
+// stride3
+template <class ValueType,
+          class MemSpace = typename Kokkos::DefaultExecutionSpace::memory_space>
 auto create_view(StridedThreeTag, std::size_t ext, const std::string label) {
-  using view_t = Kokkos::View<ValueType*, Kokkos::LayoutStride>;
+  using view_t = Kokkos::View<ValueType*, Kokkos::LayoutStride, MemSpace>;
   Kokkos::LayoutStride layout{ext, 3};
-  view_t view{label + "_" + view_tag_to_string(DynamicTag{}), layout};
+  view_t view{label + "_" + view_tag_to_string(StridedThreeTag{}), layout};
+  return view;
+}
+
+//
+// overload set for create_view for rank2
+//
+
+// dynamic
+template <class ValueType,
+          class MemSpace = typename Kokkos::DefaultExecutionSpace::memory_space>
+auto create_view(DynamicTag, std::size_t ext0, std::size_t ext1,
+                 const std::string label) {
+  using view_t = Kokkos::View<ValueType**, MemSpace>;
+  view_t view{label + "_" + view_tag_to_string(DynamicTag{}), ext0, ext1};
+  return view;
+}
+
+// dynamic layout left
+template <class ValueType,
+          class MemSpace = typename Kokkos::DefaultExecutionSpace::memory_space>
+auto create_view(DynamicLayoutLeftTag, std::size_t ext0, std::size_t ext1,
+                 const std::string label) {
+  using view_t = Kokkos::View<ValueType**, Kokkos::LayoutLeft, MemSpace>;
+  view_t view{label + "_" + view_tag_to_string(DynamicLayoutLeftTag{}), ext0,
+              ext1};
+  return view;
+}
+
+// dynamic layout right
+template <class ValueType,
+          class MemSpace = typename Kokkos::DefaultExecutionSpace::memory_space>
+auto create_view(DynamicLayoutRightTag, std::size_t ext0, std::size_t ext1,
+                 const std::string label) {
+  using view_t = Kokkos::View<ValueType**, Kokkos::LayoutRight, MemSpace>;
+  view_t view{label + "_" + view_tag_to_string(DynamicLayoutRightTag{}), ext0,
+              ext1};
+  return view;
+}
+
+// stride2rows
+template <class ValueType,
+          class MemSpace = typename Kokkos::DefaultExecutionSpace::memory_space>
+auto create_view(StridedTwoRowsTag, std::size_t ext0, std::size_t ext1,
+                 const std::string label) {
+  using view_t = Kokkos::View<ValueType**, Kokkos::LayoutStride, MemSpace>;
+  Kokkos::LayoutStride layout{ext0, 2, ext1, ext0 * 2};
+  view_t view{label + "_" + view_tag_to_string(StridedTwoRowsTag{}), layout};
+  return view;
+}
+
+// stride3rows
+template <class ValueType,
+          class MemSpace = typename Kokkos::DefaultExecutionSpace::memory_space>
+auto create_view(StridedThreeRowsTag, std::size_t ext0, std::size_t ext1,
+                 const std::string label) {
+  using view_t = Kokkos::View<ValueType**, Kokkos::LayoutStride, MemSpace>;
+  Kokkos::LayoutStride layout{ext0, 3, ext1, ext0 * 3};
+  view_t view{label + "_" + view_tag_to_string(StridedThreeRowsTag{}), layout};
   return view;
 }
 
 template <class ViewType>
 auto create_deep_copyable_compatible_view_with_same_extent(ViewType view) {
-  const std::size_t ext      = view.extent(0);
-  using view_value_type      = typename ViewType::value_type;
-  using view_exespace        = typename ViewType::execution_space;
-  using view_deep_copyable_t = Kokkos::View<view_value_type*, view_exespace>;
-  view_deep_copyable_t view_dc("view_dc", ext);
-  return view_dc;
+  using view_value_type  = typename ViewType::value_type;
+  using view_exespace    = typename ViewType::execution_space;
+  const std::size_t ext0 = view.extent(0);
+  if constexpr (ViewType::rank == 1) {
+    using view_deep_copyable_t = Kokkos::View<view_value_type*, view_exespace>;
+    return view_deep_copyable_t{"view_dc", ext0};
+  } else {
+    static_assert(ViewType::rank == 2, "Only rank 1 or 2 supported.");
+    using view_deep_copyable_t = Kokkos::View<view_value_type**, view_exespace>;
+    const std::size_t ext1     = view.extent(1);
+    return view_deep_copyable_t{"view_dc", ext0, ext1};
+  }
+
+  // this is needed for intel to avoid
+  // error #1011: missing return statement at end of non-void function
+#if defined KOKKOS_COMPILER_INTEL || \
+    (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130)
+  __builtin_unreachable();
+#endif
 }
 
 template <class ViewType>
 auto create_deep_copyable_compatible_clone(ViewType view) {
   auto view_dc    = create_deep_copyable_compatible_view_with_same_extent(view);
   using view_dc_t = decltype(view_dc);
-  CopyFunctor<ViewType, view_dc_t> F1(view, view_dc);
-  Kokkos::parallel_for("copy", view.extent(0), F1);
+  using exe_space = typename view_dc_t::execution_space;
+  if constexpr (ViewType::rank == 1) {
+    CopyFunctor<ViewType, view_dc_t> F1(view, view_dc);
+    Kokkos::RangePolicy<exe_space> policy(0, view.extent(0));
+    Kokkos::parallel_for("copy", policy, F1);
+
+  } else {
+    static_assert(ViewType::rank == 2, "Only rank 1 or 2 supported.");
+    CopyFunctorRank2<ViewType, view_dc_t> F1(view, view_dc);
+    Kokkos::RangePolicy<exe_space> policy(0, view.extent(0) * view.extent(1));
+    Kokkos::parallel_for("copy", policy, F1);
+  }
   return view_dc;
 }
 
+//
+// others
+//
+
+template <class TeamHandleType, class ValueType1, class ValueType2>
+KOKKOS_FUNCTION bool team_members_have_matching_result(
+    const TeamHandleType& teamHandle, const ValueType1 memberValueIn,
+    const ValueType2 targetIn) {
+  using T             = std::common_type_t<ValueType1, ValueType2>;
+  const T memberValue = memberValueIn;
+  const T target      = targetIn;
+
+  // set accum to 1 if a mismach is found
+  const bool mismatch = memberValue != target;
+  int accum           = static_cast<int>(mismatch);
+  // FIXME_OPENMPTARGET: team API does not meet the TeamHandle concept and
+  // ignores the reducer passed
+#if defined KOKKOS_ENABLE_OPENMPTARGET
+  Kokkos::Sum<int> dummyReducer(accum);
+  const auto result = teamHandle.team_reduce(accum, dummyReducer);
+  return (result == 0);
+#else
+  teamHandle.team_reduce(Kokkos::Sum<int>(accum));
+  return (accum == 0);
+#endif
+}
+
+template <class ValueType1, class ValueType2>
+auto make_bounds(const ValueType1& lower, const ValueType2 upper) {
+  return Kokkos::pair<ValueType1, ValueType2>{lower, upper};
+}
+
+// libstdc++ as provided by GCC 8 does not have reduce, transform_reduce,
+// exclusive_scan, inclusive_scan, transform_exclusive_scan,
+// transform_inclusive_scan and for GCC 9.1, 9.2 fails to compile them for
+// missing overload not accepting policy so use here simplified versions of
+// them, only for testing purpose
+#if defined(_GLIBCXX_RELEASE) && (_GLIBCXX_RELEASE <= 9)
+
+template <class InputIterator, class ValueType, class BinaryOp>
+ValueType testing_reduce(InputIterator first, InputIterator last,
+                         ValueType initIn, BinaryOp binOp) {
+  using value_type = std::remove_const_t<ValueType>;
+  value_type init  = initIn;
+
+  while (last - first >= 4) {
+    ValueType v1 = binOp(first[0], first[1]);
+    ValueType v2 = binOp(first[2], first[3]);
+    ValueType v3 = binOp(v1, v2);
+    init         = binOp(init, v3);
+    first += 4;
+  }
+
+  for (; first != last; ++first) {
+    init = binOp(init, *first);
+  }
+
+  return init;
+}
+
+template <class InputIterator, class ValueType>
+ValueType testing_reduce(InputIterator first, InputIterator last,
+                         ValueType init) {
+  return testing_reduce(
+      first, last, init,
+      [](const ValueType& lhs, const ValueType& rhs) { return lhs + rhs; });
+}
+
+template <class InputIterator>
+auto testing_reduce(InputIterator first, InputIterator last) {
+  using ValueType = typename InputIterator::value_type;
+  return testing_reduce(
+      first, last, ValueType{},
+      [](const ValueType& lhs, const ValueType& rhs) { return lhs + rhs; });
+}
+
+template <class InputIterator1, class InputIterator2, class ValueType,
+          class BinaryJoiner, class BinaryTransform>
+ValueType testing_transform_reduce(InputIterator1 first1, InputIterator1 last1,
+                                   InputIterator2 first2, ValueType initIn,
+                                   BinaryJoiner binJoiner,
+                                   BinaryTransform binTransform) {
+  using value_type = std::remove_const_t<ValueType>;
+  value_type init  = initIn;
+
+  while (last1 - first1 >= 4) {
+    ValueType v1 = binJoiner(binTransform(first1[0], first2[0]),
+                             binTransform(first1[1], first2[1]));
+
+    ValueType v2 = binJoiner(binTransform(first1[2], first2[2]),
+                             binTransform(first1[3], first2[3]));
+
+    ValueType v3 = binJoiner(v1, v2);
+    init         = binJoiner(init, v3);
+
+    first1 += 4;
+    first2 += 4;
+  }
+
+  for (; first1 != last1; ++first1, ++first2) {
+    init = binJoiner(init, binTransform(*first1, *first2));
+  }
+
+  return init;
+}
+
+template <class InputIterator1, class InputIterator2, class ValueType>
+ValueType testing_transform_reduce(InputIterator1 first1, InputIterator1 last1,
+                                   InputIterator2 first2, ValueType init) {
+  return testing_transform_reduce(
+      first1, last1, first2, init,
+      [](const ValueType& lhs, const ValueType& rhs) { return lhs + rhs; },
+      [](const ValueType& lhs, const ValueType& rhs) { return lhs * rhs; });
+}
+
+template <class InputIterator, class ValueType, class BinaryJoiner,
+          class UnaryTransform>
+ValueType testing_transform_reduce(InputIterator first, InputIterator last,
+                                   ValueType initIn, BinaryJoiner binJoiner,
+                                   UnaryTransform unaryTransform) {
+  using value_type = std::remove_const_t<ValueType>;
+  value_type init  = initIn;
+
+  while (last - first >= 4) {
+    ValueType v1 =
+        binJoiner(unaryTransform(first[0]), unaryTransform(first[1]));
+    ValueType v2 =
+        binJoiner(unaryTransform(first[2]), unaryTransform(first[3]));
+    ValueType v3 = binJoiner(v1, v2);
+    init         = binJoiner(init, v3);
+    first += 4;
+  }
+
+  for (; first != last; ++first) {
+    init = binJoiner(init, unaryTransform(*first));
+  }
+
+  return init;
+}
+
+/*
+   EXCLUSIVE_SCAN
+ */
+template <class InputIterator, class OutputIterator, class ValueType,
+          class BinaryOp>
+OutputIterator testing_exclusive_scan(InputIterator first, InputIterator last,
+                                      OutputIterator result, ValueType initIn,
+                                      BinaryOp binOp) {
+  using value_type = std::remove_const_t<ValueType>;
+  value_type init  = initIn;
+
+  while (first != last) {
+    auto v = init;
+    init   = binOp(init, *first);
+    ++first;
+    *result++ = v;
+  }
+
+  return result;
+}
+
+template <class InputIterator, class OutputIterator, class ValueType>
+OutputIterator testing_exclusive_scan(InputIterator first, InputIterator last,
+                                      OutputIterator result, ValueType init) {
+  return testing_exclusive_scan(
+      first, last, result, init,
+      [](const ValueType& lhs, const ValueType& rhs) { return lhs + rhs; });
+}
+
+/*
+   INCLUSIVE_SCAN
+ */
+template <class InputIterator, class OutputIterator, class BinaryOp,
+          class ValueType>
+OutputIterator testing_inclusive_scan(InputIterator first, InputIterator last,
+                                      OutputIterator result, BinaryOp binOp,
+                                      ValueType initIn) {
+  using value_type = std::remove_const_t<ValueType>;
+  value_type init  = initIn;
+  for (; first != last; ++first) {
+    init      = binOp(init, *first);
+    *result++ = init;
+  }
+
+  return result;
+}
+
+template <class InputIterator, class OutputIterator, class BinaryOp>
+OutputIterator testing_inclusive_scan(InputIterator first, InputIterator last,
+                                      OutputIterator result, BinaryOp bop) {
+  if (first != last) {
+    auto init = *first;
+    *result++ = init;
+    ++first;
+    if (first != last) {
+      result = testing_inclusive_scan(first, last, result, bop, init);
+    }
+  }
+  return result;
+}
+
+template <class InputIterator, class OutputIterator>
+OutputIterator testing_inclusive_scan(InputIterator first, InputIterator last,
+                                      OutputIterator result) {
+  using ValueType = typename InputIterator::value_type;
+  return testing_inclusive_scan(
+      first, last, result,
+      [](const ValueType& lhs, const ValueType& rhs) { return lhs + rhs; });
+}
+
+/*
+   TRANSFORM_EXCLUSIVE_SCAN
+ */
+template <class InputIterator, class OutputIterator, class ValueType,
+          class BinaryOp, class UnaryOp>
+OutputIterator testing_transform_exclusive_scan(
+    InputIterator first, InputIterator last, OutputIterator result,
+    ValueType initIn, BinaryOp binOp, UnaryOp unaryOp) {
+  using value_type = std::remove_const_t<ValueType>;
+  value_type init  = initIn;
+
+  while (first != last) {
+    auto v = init;
+    init   = binOp(init, unaryOp(*first));
+    ++first;
+    *result++ = v;
+  }
+
+  return result;
+}
+
+template <class InputIterator, class OutputIterator, class BinaryOp,
+          class UnaryOp, class ValueType>
+OutputIterator testing_transform_inclusive_scan(InputIterator first,
+                                                InputIterator last,
+                                                OutputIterator result,
+                                                BinaryOp binOp, UnaryOp unaryOp,
+                                                ValueType initIn) {
+  using value_type = std::remove_const_t<ValueType>;
+  value_type init  = initIn;
+
+  for (; first != last; ++first) {
+    init      = binOp(init, unaryOp(*first));
+    *result++ = init;
+  }
+
+  return result;
+}
+
+template <class InputIterator, class OutputIterator, class BinaryOp,
+          class UnaryOp>
+OutputIterator testing_transform_inclusive_scan(InputIterator first,
+                                                InputIterator last,
+                                                OutputIterator result,
+                                                BinaryOp binOp,
+                                                UnaryOp unaryOp) {
+  if (first != last) {
+    auto init = unaryOp(*first);
+    *result++ = init;
+    ++first;
+    if (first != last) {
+      result = testing_transform_inclusive_scan(first, last, result, binOp,
+                                                unaryOp, init);
+    }
+  }
+
+  return result;
+}
+
+#endif
+
+template <class LayoutTagType, class ValueType>
+auto create_random_view_and_host_clone(
+    LayoutTagType LayoutTag, std::size_t numRows, std::size_t numCols,
+    Kokkos::pair<ValueType, ValueType> bounds, const std::string& label,
+    std::size_t seedIn = 12371) {
+  // construct in memory space associated with default exespace
+  auto dataView = create_view<ValueType>(LayoutTag, numRows, numCols, label);
+
+  // dataView might not deep copyable (e.g. strided layout) so to
+  // randomize it, we make a new view that is for sure deep copyable,
+  // modify it on the host, deep copy to device and then launch
+  // a kernel to copy to dataView
+  auto dataView_dc =
+      create_deep_copyable_compatible_view_with_same_extent(dataView);
+  auto dataView_dc_h = create_mirror_view(Kokkos::HostSpace(), dataView_dc);
+
+  // randomly fill the view
+  Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace> pool(
+      seedIn);
+  Kokkos::fill_random(dataView_dc_h, pool, bounds.first, bounds.second);
+
+  // copy to dataView_dc and then to dataView
+  Kokkos::deep_copy(dataView_dc, dataView_dc_h);
+  // use CTAD
+  CopyFunctorRank2 F1(dataView_dc, dataView);
+  Kokkos::parallel_for("copy", dataView.extent(0) * dataView.extent(1), F1);
+
+  return std::make_pair(dataView, dataView_dc_h);
+}
+
 template <class ViewType>
 auto create_host_space_copy(ViewType view) {
   auto view_dc = create_deep_copyable_compatible_clone(view);
@@ -110,7 +548,7 @@ verify_values(ValueType expected, const ViewType view) {
                 "Non-matching value types of view and reference value");
   auto view_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), view);
   for (std::size_t i = 0; i < view_h.extent(0); i++) {
-    EXPECT_EQ(expected, view_h(i));
+    ASSERT_EQ(expected, view_h(i));
   }
 }
 
@@ -130,7 +568,7 @@ verify_values(ValueType expected, const ViewType view) {
   auto view_h =
       Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), tmpView);
   for (std::size_t i = 0; i < view_h.extent(0); i++) {
-    EXPECT_EQ(expected, view_h(i));
+    ASSERT_EQ(expected, view_h(i));
   }
 }
 
@@ -147,7 +585,7 @@ compare_views(ViewType1 expected, const ViewType2 actual) {
       Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), actual);
 
   for (std::size_t i = 0; i < expected_h.extent(0); i++) {
-    EXPECT_EQ(expected_h(i), actual_h(i));
+    ASSERT_EQ(expected_h(i), actual_h(i));
   }
 }
 
@@ -171,7 +609,32 @@ compare_views(ViewType1 expected, const ViewType2 actual) {
       Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), expected);
 
   for (std::size_t i = 0; i < expected_h.extent(0); i++) {
-    EXPECT_EQ(expected_h(i), actual_h(i));
+    ASSERT_EQ(expected_h(i), actual_h(i));
+  }
+}
+
+template <class ViewType1, class ViewType2>
+void expect_equal_host_views(ViewType1 A, const ViewType2 B) {
+  static_assert(
+      ViewType1::rank == 2 && ViewType2::rank == 2 &&
+          std::is_same_v<typename ViewType1::memory_space, Kokkos::HostSpace> &&
+          std::is_same_v<typename ViewType2::memory_space, Kokkos::HostSpace>,
+      "Expected 2-dimensional host view.");
+  ASSERT_EQ(A.extent(0), B.extent(0));
+  ASSERT_EQ(A.extent(1), B.extent(1));
+
+  constexpr bool values_are_floast =
+      std::is_floating_point_v<typename ViewType1::value_type> ||
+      std::is_floating_point_v<typename ViewType2::value_type>;
+
+  for (std::size_t i = 0; i < A.extent(0); i++) {
+    for (std::size_t j = 0; j < A.extent(1); j++) {
+      if constexpr (values_are_floast) {
+        EXPECT_FLOAT_EQ(A(i, j), B(i, j));
+      } else {
+        ASSERT_EQ(A(i, j), B(i, j));
+      }
+    }
   }
 }
 
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp
index 5d55199801217f0a43ce8d53e94e90d11c8ff445..386d533f7a8308244193bf66ce1b93edaaf8ca69 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp
@@ -42,8 +42,8 @@ TEST(std_algorithms, is_admissible_to_std_algorithms) {
   using strided_view_1d_t = Kokkos::View<value_type*, Kokkos::LayoutStride>;
   Kokkos::LayoutStride layout1d{extent0, 2};
   strided_view_1d_t strided_view_1d{"std-algo-test-1d-strided-view", layout1d};
-  EXPECT_EQ(layout1d.dimension[0], 13u);
-  EXPECT_EQ(layout1d.stride[0], 2u);
+  ASSERT_EQ(layout1d.dimension[0], 13u);
+  ASSERT_EQ(layout1d.stride[0], 2u);
   // they are admissible
   KE::Impl::static_assert_is_admissible_to_kokkos_std_algorithms(
       static_view_1d);
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp
index e21d50f69b9e001685aa62e45f023e6b2ac62782..5778e37be04d160a8947f1669e52639484e0a21f 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp
@@ -135,49 +135,49 @@ void verify_data(const std::string& name, ViewTypeFrom view_from,
   }
 
   else if (name == "one-element-a") {
-    EXPECT_EQ(view_test_h(0), static_cast<value_type>(0));
+    ASSERT_EQ(view_test_h(0), static_cast<value_type>(0));
   }
 
   else if (name == "one-element-b") {
-    EXPECT_EQ(view_test_h(0), static_cast<value_type>(2));
+    ASSERT_EQ(view_test_h(0), static_cast<value_type>(2));
   }
 
   else if (name == "two-elements-a") {
-    EXPECT_EQ(view_test_h(0), static_cast<value_type>(2));
-    EXPECT_EQ(view_test_h(1), static_cast<value_type>(0));
+    ASSERT_EQ(view_test_h(0), static_cast<value_type>(2));
+    ASSERT_EQ(view_test_h(1), static_cast<value_type>(0));
   }
 
   else if (name == "two-elements-b") {
-    EXPECT_EQ(view_test_h(0), static_cast<value_type>(2));
-    EXPECT_EQ(view_test_h(1), static_cast<value_type>(0));
+    ASSERT_EQ(view_test_h(0), static_cast<value_type>(2));
+    ASSERT_EQ(view_test_h(1), static_cast<value_type>(0));
   }
 
   else if (name == "small-a") {
-    EXPECT_EQ(view_test_h(0), static_cast<value_type>(-4));
-    EXPECT_EQ(view_test_h(1), static_cast<value_type>(-2));
-    EXPECT_EQ(view_test_h(2), static_cast<value_type>(0));
-    EXPECT_EQ(view_test_h(3), static_cast<value_type>(2));
-    EXPECT_EQ(view_test_h(4), static_cast<value_type>(4));
-    EXPECT_EQ(view_test_h(5), static_cast<value_type>(0));
-    EXPECT_EQ(view_test_h(6), static_cast<value_type>(0));
-    EXPECT_EQ(view_test_h(7), static_cast<value_type>(0));
-    EXPECT_EQ(view_test_h(8), static_cast<value_type>(0));
+    ASSERT_EQ(view_test_h(0), static_cast<value_type>(-4));
+    ASSERT_EQ(view_test_h(1), static_cast<value_type>(-2));
+    ASSERT_EQ(view_test_h(2), static_cast<value_type>(0));
+    ASSERT_EQ(view_test_h(3), static_cast<value_type>(2));
+    ASSERT_EQ(view_test_h(4), static_cast<value_type>(4));
+    ASSERT_EQ(view_test_h(5), static_cast<value_type>(0));
+    ASSERT_EQ(view_test_h(6), static_cast<value_type>(0));
+    ASSERT_EQ(view_test_h(7), static_cast<value_type>(0));
+    ASSERT_EQ(view_test_h(8), static_cast<value_type>(0));
   }
 
   else if (name == "small-b") {
-    EXPECT_EQ(view_test_h(0), static_cast<value_type>(22));
-    EXPECT_EQ(view_test_h(1), static_cast<value_type>(-12));
-    EXPECT_EQ(view_test_h(2), static_cast<value_type>(22));
-    EXPECT_EQ(view_test_h(3), static_cast<value_type>(-12));
-    EXPECT_EQ(view_test_h(4), static_cast<value_type>(22));
-    EXPECT_EQ(view_test_h(5), static_cast<value_type>(-12));
-    EXPECT_EQ(view_test_h(6), static_cast<value_type>(22));
-    EXPECT_EQ(view_test_h(7), static_cast<value_type>(-12));
-    EXPECT_EQ(view_test_h(8), static_cast<value_type>(22));
-    EXPECT_EQ(view_test_h(9), static_cast<value_type>(-12));
-    EXPECT_EQ(view_test_h(10), static_cast<value_type>(22));
-    EXPECT_EQ(view_test_h(11), static_cast<value_type>(-12));
-    EXPECT_EQ(view_test_h(12), static_cast<value_type>(22));
+    ASSERT_EQ(view_test_h(0), static_cast<value_type>(22));
+    ASSERT_EQ(view_test_h(1), static_cast<value_type>(-12));
+    ASSERT_EQ(view_test_h(2), static_cast<value_type>(22));
+    ASSERT_EQ(view_test_h(3), static_cast<value_type>(-12));
+    ASSERT_EQ(view_test_h(4), static_cast<value_type>(22));
+    ASSERT_EQ(view_test_h(5), static_cast<value_type>(-12));
+    ASSERT_EQ(view_test_h(6), static_cast<value_type>(22));
+    ASSERT_EQ(view_test_h(7), static_cast<value_type>(-12));
+    ASSERT_EQ(view_test_h(8), static_cast<value_type>(22));
+    ASSERT_EQ(view_test_h(9), static_cast<value_type>(-12));
+    ASSERT_EQ(view_test_h(10), static_cast<value_type>(22));
+    ASSERT_EQ(view_test_h(11), static_cast<value_type>(-12));
+    ASSERT_EQ(view_test_h(12), static_cast<value_type>(22));
   }
 
   else if (name == "medium" || name == "large") {
@@ -190,14 +190,14 @@ void verify_data(const std::string& name, ViewTypeFrom view_from,
     std::size_t count = 0;
     for (std::size_t i = 0; i < view_from_h.extent(0); ++i) {
       if (pred(view_from_h(i))) {
-        EXPECT_EQ(view_test_h(count), view_from_h(i));
+        ASSERT_EQ(view_test_h(count), view_from_h(i));
         count++;
       }
     }
     // all other entries of test view should be zero
     for (; count < view_test_h.extent(0); ++count) {
       // std::cout << count << '\n';
-      EXPECT_EQ(view_test_h(count), value_type(0));
+      ASSERT_EQ(view_test_h(count), value_type(0));
     }
   }
 
@@ -226,7 +226,7 @@ void run_single_scenario(const InfoType& scenario_info) {
     auto rit       = KE::copy_if(exespace(), KE::cbegin(view_from),
                            KE::cend(view_from), KE::begin(view_dest), pred);
     verify_data(name, view_from, view_dest, pred);
-    EXPECT_EQ(rit, (KE::begin(view_dest) + n));
+    ASSERT_EQ(rit, (KE::begin(view_dest) + n));
   }
 
   {
@@ -235,7 +235,7 @@ void run_single_scenario(const InfoType& scenario_info) {
     auto rit       = KE::copy_if("label", exespace(), KE::cbegin(view_from),
                            KE::cend(view_from), KE::begin(view_dest), pred);
     verify_data(name, view_from, view_dest, pred);
-    EXPECT_EQ(rit, (KE::begin(view_dest) + n));
+    ASSERT_EQ(rit, (KE::begin(view_dest) + n));
   }
 
   {
@@ -243,7 +243,7 @@ void run_single_scenario(const InfoType& scenario_info) {
     auto view_dest = create_view<ValueType>(Tag{}, view_ext, "copy_if_dest");
     auto rit       = KE::copy_if(exespace(), view_from, view_dest, pred);
     verify_data(name, view_from, view_dest, pred);
-    EXPECT_EQ(rit, (KE::begin(view_dest) + n));
+    ASSERT_EQ(rit, (KE::begin(view_dest) + n));
   }
 
   {
@@ -251,7 +251,7 @@ void run_single_scenario(const InfoType& scenario_info) {
     auto view_dest = create_view<ValueType>(Tag{}, view_ext, "copy_if_dest");
     auto rit = KE::copy_if("label", exespace(), view_from, view_dest, pred);
     verify_data(name, view_from, view_dest, pred);
-    EXPECT_EQ(rit, (KE::begin(view_dest) + n));
+    ASSERT_EQ(rit, (KE::begin(view_dest) + n));
   }
 
   Kokkos::fence();
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCount.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCount.cpp
index 9423d2e15a46fe550282bf2b2106037610c18b6c..32e988370909e850662a4aab61592210fe1c532e 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCount.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCount.cpp
@@ -35,13 +35,13 @@ void test_count(const ViewType view) {
     const value_t count_value = 0;
     const auto std_result =
         std::count(KE::cbegin(expected), KE::cend(expected), count_value);
-    EXPECT_EQ(view.extent(0), size_t(std_result));
+    ASSERT_EQ(view.extent(0), size_t(std_result));
 
     // pass const iterators
-    EXPECT_EQ(std_result, KE::count(exespace(), KE::cbegin(view),
+    ASSERT_EQ(std_result, KE::count(exespace(), KE::cbegin(view),
                                     KE::cend(view), count_value));
     // pass view
-    EXPECT_EQ(std_result, KE::count(exespace(), view, count_value));
+    ASSERT_EQ(std_result, KE::count(exespace(), view, count_value));
   }
 
   {
@@ -50,10 +50,10 @@ void test_count(const ViewType view) {
         std::count(KE::cbegin(expected), KE::cend(expected), count_value);
 
     // pass iterators
-    EXPECT_EQ(std_result, KE::count("label", exespace(), KE::begin(view),
+    ASSERT_EQ(std_result, KE::count("label", exespace(), KE::begin(view),
                                     KE::end(view), count_value));
     // pass view
-    EXPECT_EQ(std_result, KE::count("label", exespace(), view, count_value));
+    ASSERT_EQ(std_result, KE::count("label", exespace(), view, count_value));
   }
 }
 
@@ -67,24 +67,24 @@ void test_count_if(const ViewType view) {
 
   // no positive elements (all zeroes)
   const auto predicate = IsPositiveFunctor<value_type>();
-  EXPECT_EQ(0,
+  ASSERT_EQ(0,
             std::count_if(KE::begin(expected), KE::end(expected), predicate));
 
   // pass iterators
-  EXPECT_EQ(
+  ASSERT_EQ(
       0, KE::count_if(exespace(), KE::begin(view), KE::end(view), predicate));
   // pass view
-  EXPECT_EQ(0, KE::count_if(exespace(), view, predicate));
+  ASSERT_EQ(0, KE::count_if(exespace(), view, predicate));
 
   fill_views_inc(view, expected);
 
   const auto std_result =
       std::count_if(KE::begin(expected), KE::end(expected), predicate);
   // pass const iterators
-  EXPECT_EQ(std_result, KE::count_if("label", exespace(), KE::cbegin(view),
+  ASSERT_EQ(std_result, KE::count_if("label", exespace(), KE::cbegin(view),
                                      KE::cend(view), predicate));
   // pass view
-  EXPECT_EQ(std_result, KE::count_if("label", exespace(), view, predicate));
+  ASSERT_EQ(std_result, KE::count_if("label", exespace(), view, predicate));
 }
 
 template <class Tag, class ValueType>
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp
index 4969541a023bed06076120ff09e6187048041bdb..6ab68a1987df17a837c9895a1a53d71075bbcb5a 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp
@@ -157,7 +157,7 @@ void verify_data(ViewType1 data_view,  // contains data
       //           << gold_h(i) << " " << test_view_h(i) << " "
       //           << std::abs(gold_h(i) - test_view_h(i)) << std::endl;
       if (std::is_same<gold_view_value_type, int>::value) {
-        EXPECT_EQ(gold_h(i), test_view_h(i));
+        ASSERT_EQ(gold_h(i), test_view_h(i));
       } else {
         const auto error =
             std::abs(static_cast<double>(gold_h(i) - test_view_h(i)));
@@ -213,7 +213,7 @@ void run_single_scenario_default_op(const InfoType& scenario_info,
     auto r = KE::exclusive_scan(exespace(), KE::cbegin(view_from),
                                 KE::cend(view_from), KE::begin(view_dest),
                                 init_value);
-    EXPECT_EQ(r, KE::end(view_dest));
+    ASSERT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, init_value, default_op());
   }
 
@@ -222,14 +222,14 @@ void run_single_scenario_default_op(const InfoType& scenario_info,
     auto r = KE::exclusive_scan("label", exespace(), KE::cbegin(view_from),
                                 KE::cend(view_from), KE::begin(view_dest),
                                 init_value);
-    EXPECT_EQ(r, KE::end(view_dest));
+    ASSERT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, init_value, default_op());
   }
 
   {
     fill_zero(view_dest);
     auto r = KE::exclusive_scan(exespace(), view_from, view_dest, init_value);
-    EXPECT_EQ(r, KE::end(view_dest));
+    ASSERT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, init_value, default_op());
   }
 
@@ -237,7 +237,7 @@ void run_single_scenario_default_op(const InfoType& scenario_info,
     fill_zero(view_dest);
     auto r = KE::exclusive_scan("label", exespace(), view_from, view_dest,
                                 init_value);
-    EXPECT_EQ(r, KE::end(view_dest));
+    ASSERT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, init_value, default_op());
   }
 
@@ -263,7 +263,7 @@ void run_single_scenario_custom_op(const InfoType& scenario_info,
     auto r = KE::exclusive_scan(exespace(), KE::cbegin(view_from),
                                 KE::cend(view_from), KE::begin(view_dest),
                                 init_value, bop);
-    EXPECT_EQ(r, KE::end(view_dest));
+    ASSERT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, init_value, bop);
   }
 
@@ -272,7 +272,7 @@ void run_single_scenario_custom_op(const InfoType& scenario_info,
     auto r = KE::exclusive_scan("label", exespace(), KE::cbegin(view_from),
                                 KE::cend(view_from), KE::begin(view_dest),
                                 init_value, bop);
-    EXPECT_EQ(r, KE::end(view_dest));
+    ASSERT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, init_value, bop);
   }
 
@@ -280,7 +280,7 @@ void run_single_scenario_custom_op(const InfoType& scenario_info,
     fill_zero(view_dest);
     auto r =
         KE::exclusive_scan(exespace(), view_from, view_dest, init_value, bop);
-    EXPECT_EQ(r, KE::end(view_dest));
+    ASSERT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, init_value, bop);
   }
 
@@ -288,7 +288,7 @@ void run_single_scenario_custom_op(const InfoType& scenario_info,
     fill_zero(view_dest);
     auto r = KE::exclusive_scan("label", exespace(), view_from, view_dest,
                                 init_value, bop);
-    EXPECT_EQ(r, KE::end(view_dest));
+    ASSERT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, init_value, bop);
   }
 
@@ -344,6 +344,47 @@ TEST(std_algorithms_numeric_ops_test, exclusive_scan) {
   run_exclusive_scan_all_scenarios<StridedThreeTag, CustomValueType>();
 }
 
+TEST(std_algorithms_numeric_ops_test, exclusive_scan_functor) {
+  int dummy       = 0;
+  using view_type = Kokkos::View<int*, exespace>;
+  view_type dummy_view("dummy_view", 0);
+  using functor_type =
+      Kokkos::Experimental::Impl::ExclusiveScanDefaultFunctorWithValueWrapper<
+          exespace, int, int, view_type, view_type>;
+  functor_type functor(dummy, dummy_view, dummy_view);
+  using value_type = functor_type::value_type;
+
+  value_type value1;
+  functor.init(value1);
+  ASSERT_EQ(value1.val, 0);
+  ASSERT_EQ(value1.is_initial, true);
+
+  value_type value2;
+  value2.val        = 1;
+  value2.is_initial = false;
+  functor.join(value1, value2);
+  ASSERT_EQ(value1.val, 1);
+  ASSERT_EQ(value1.is_initial, false);
+
+  functor.init(value1);
+  functor.join(value2, value1);
+  ASSERT_EQ(value2.val, 1);
+  ASSERT_EQ(value2.is_initial, false);
+
+  functor.init(value2);
+  functor.join(value2, value1);
+  ASSERT_EQ(value2.val, 0);
+  ASSERT_EQ(value2.is_initial, true);
+
+  value1.val        = 1;
+  value1.is_initial = false;
+  value2.val        = 2;
+  value2.is_initial = false;
+  functor.join(value2, value1);
+  ASSERT_EQ(value2.val, 3);
+  ASSERT_EQ(value2.is_initial, false);
+}
+
 }  // namespace EScan
 }  // namespace stdalgos
 }  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsFind.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsFind.cpp
index 3b8b5e85af408a023e1ad0033288270496646d3f..2692df698219379102d3a1eb62ea4bfae5ff4973 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsFind.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsFind.cpp
@@ -34,14 +34,14 @@ void test_find(const ViewType view) {
   constexpr value_t find_value = 13;
 
   // value not found, return last
-  EXPECT_EQ(KE::end(expected),
+  ASSERT_EQ(KE::end(expected),
             std::find(KE::begin(expected), KE::end(expected), find_value));
 
   // pass const iterators, returns const iterator
-  EXPECT_EQ(KE::cend(view),
+  ASSERT_EQ(KE::cend(view),
             KE::find(exespace(), KE::cbegin(view), KE::cend(view), find_value));
   // pass view, returns iterator
-  EXPECT_EQ(KE::end(view), KE::find(exespace(), view, find_value));
+  ASSERT_EQ(KE::end(view), KE::find(exespace(), view, find_value));
 
   fill_views_inc(view, expected);
 
@@ -50,10 +50,10 @@ void test_find(const ViewType view) {
   auto distance = std::distance(KE::begin(expected), std_result);
 
   // pass iterators, returns iterator
-  EXPECT_EQ(KE::begin(view) + distance,
+  ASSERT_EQ(KE::begin(view) + distance,
             KE::find(exespace(), KE::begin(view), KE::end(view), find_value));
   // pass view, returns iterator
-  EXPECT_EQ(KE::begin(view) + distance, KE::find(exespace(), view, find_value));
+  ASSERT_EQ(KE::begin(view) + distance, KE::find(exespace(), view, find_value));
 }
 
 template <class ViewType>
@@ -67,15 +67,15 @@ void test_find_if(const ViewType view) {
   const auto not_equals_zero = NotEqualsZeroFunctor<value_type>();
 
   // value not found, return last
-  EXPECT_EQ(
+  ASSERT_EQ(
       KE::end(expected),
       std::find_if(KE::begin(expected), KE::end(expected), not_equals_zero));
 
   // pass iterators, returns iterator
-  EXPECT_EQ(KE::end(view), KE::find_if(exespace(), KE::begin(view),
+  ASSERT_EQ(KE::end(view), KE::find_if(exespace(), KE::begin(view),
                                        KE::end(view), not_equals_zero));
   // pass view, returns iterator
-  EXPECT_EQ(KE::end(view), KE::find_if(exespace(), view, not_equals_zero));
+  ASSERT_EQ(KE::end(view), KE::find_if(exespace(), view, not_equals_zero));
 
   fill_views_inc(view, expected);
 
@@ -86,11 +86,11 @@ void test_find_if(const ViewType view) {
   auto distance = std::distance(KE::begin(expected), std_result);
 
   // pass const iterators, returns const iterator
-  EXPECT_EQ(
+  ASSERT_EQ(
       KE::cbegin(view) + distance,
       KE::find_if(exespace(), KE::cbegin(view), KE::cend(view), equals_val));
   // pass view, returns iterator
-  EXPECT_EQ(KE::begin(view) + distance,
+  ASSERT_EQ(KE::begin(view) + distance,
             KE::find_if(exespace(), view, equals_val));
 }
 
@@ -105,15 +105,15 @@ void test_find_if_not(const ViewType view) {
   const auto not_equals_zero = NotEqualsZeroFunctor<value_type>();
 
   // first value matches
-  EXPECT_EQ(KE::begin(expected),
+  ASSERT_EQ(KE::begin(expected),
             std::find_if_not(KE::begin(expected), KE::end(expected),
                              not_equals_zero));
 
   // pass iterators, returns iterator
-  EXPECT_EQ(KE::begin(view), KE::find_if_not(exespace(), KE::begin(view),
+  ASSERT_EQ(KE::begin(view), KE::find_if_not(exespace(), KE::begin(view),
                                              KE::end(view), not_equals_zero));
   // pass view, returns iterator
-  EXPECT_EQ(KE::begin(view),
+  ASSERT_EQ(KE::begin(view),
             KE::find_if_not(exespace(), view, not_equals_zero));
 
   fill_views_inc(view, expected);
@@ -124,11 +124,11 @@ void test_find_if_not(const ViewType view) {
   auto distance = std::distance(KE::begin(expected), std_result);
 
   // pass const iterators, returns const iterator
-  EXPECT_EQ(KE::cbegin(view) + distance,
+  ASSERT_EQ(KE::cbegin(view) + distance,
             KE::find_if_not(exespace(), KE::cbegin(view), KE::cend(view),
                             equals_zero));
   // pass view, returns const iterator
-  EXPECT_EQ(KE::begin(view) + distance,
+  ASSERT_EQ(KE::begin(view) + distance,
             KE::find_if_not(exespace(), view, equals_zero));
 }
 
@@ -151,12 +151,6 @@ void run_all_scenarios() {
 }
 
 TEST(std_algorithms_find_test, test) {
-#if defined(KOKKOS_ENABLE_CUDA) && \
-    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC
-  if constexpr (std::is_same_v<exespace, Kokkos::Cuda>) {
-    GTEST_SKIP() << "FIXME wrong result";
-  }
-#endif
   run_all_scenarios<DynamicTag, double>();
   run_all_scenarios<StridedTwoTag, int>();
   run_all_scenarios<StridedThreeTag, unsigned>();
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsFindEnd.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsFindEnd.cpp
index ddc4bc1ba676e62b6fc04b07b092b1a39a4a9d49..5a5359b0b2340d682394cd9c26e4f3e1f3555bf5 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsFindEnd.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsFindEnd.cpp
@@ -282,7 +282,7 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t seq_ext,
     const auto mydiff  = myrit - KE::cbegin(view);
     const auto stddiff = stdrit - KE::cbegin(view_h);
     // std::cout << "result : " << mydiff << " " << stddiff << std::endl;
-    EXPECT_EQ(mydiff, stddiff);
+    ASSERT_EQ(mydiff, stddiff);
   }
 
   {
@@ -291,21 +291,21 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t seq_ext,
                      KE::cbegin(s_view), KE::cend(s_view), args...);
     const auto mydiff  = myrit - KE::cbegin(view);
     const auto stddiff = stdrit - KE::cbegin(view_h);
-    EXPECT_EQ(mydiff, stddiff);
+    ASSERT_EQ(mydiff, stddiff);
   }
 
   {
     auto myrit         = KE::find_end(exespace(), view, s_view, args...);
     const auto mydiff  = myrit - KE::begin(view);
     const auto stddiff = stdrit - KE::cbegin(view_h);
-    EXPECT_EQ(mydiff, stddiff);
+    ASSERT_EQ(mydiff, stddiff);
   }
 
   {
     auto myrit = KE::find_end("label", exespace(), view, s_view, args...);
     const auto mydiff  = myrit - KE::begin(view);
     const auto stddiff = stdrit - KE::cbegin(view_h);
-    EXPECT_EQ(mydiff, stddiff);
+    ASSERT_EQ(mydiff, stddiff);
   }
 
   Kokkos::fence();
@@ -348,12 +348,6 @@ void run_all_scenarios() {
 }
 
 TEST(std_algorithms_non_mod_seq_ops, find_end) {
-#if defined(KOKKOS_ENABLE_CUDA) && \
-    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC
-  if constexpr (std::is_same_v<exespace, Kokkos::Cuda>) {
-    GTEST_SKIP() << "FIXME wrong result";
-  }
-#endif
   run_all_scenarios<DynamicTag, int>();
   run_all_scenarios<StridedThreeTag, int>();
 }
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsFindFirstOf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsFindFirstOf.cpp
index c2f7a2fdb8f8287040013a375d7ec52fe9202e1b..d77edb5fed3e7f0e498e606677f64992ecb2a3b6 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsFindFirstOf.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsFindFirstOf.cpp
@@ -201,7 +201,7 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t seq_ext,
                           KE::cbegin(s_view), KE::cend(s_view), args...);
     const auto mydiff  = myrit - KE::cbegin(view);
     const auto stddiff = stdrit - KE::cbegin(view_h);
-    EXPECT_EQ(mydiff, stddiff);
+    ASSERT_EQ(mydiff, stddiff);
   }
 
   {
@@ -210,21 +210,21 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t seq_ext,
                           KE::cbegin(s_view), KE::cend(s_view), args...);
     const auto mydiff  = myrit - KE::cbegin(view);
     const auto stddiff = stdrit - KE::cbegin(view_h);
-    EXPECT_EQ(mydiff, stddiff);
+    ASSERT_EQ(mydiff, stddiff);
   }
 
   {
     auto myrit         = KE::find_first_of(exespace(), view, s_view, args...);
     const auto mydiff  = myrit - KE::begin(view);
     const auto stddiff = stdrit - KE::cbegin(view_h);
-    EXPECT_EQ(mydiff, stddiff);
+    ASSERT_EQ(mydiff, stddiff);
   }
 
   {
     auto myrit = KE::find_first_of("label", exespace(), view, s_view, args...);
     const auto mydiff  = myrit - KE::begin(view);
     const auto stddiff = stdrit - KE::cbegin(view_h);
-    EXPECT_EQ(mydiff, stddiff);
+    ASSERT_EQ(mydiff, stddiff);
   }
 
   Kokkos::fence();
@@ -264,12 +264,6 @@ void run_all_scenarios() {
 }
 
 TEST(std_algorithms_non_mod_seq_ops, find_first_of) {
-#if defined(KOKKOS_ENABLE_CUDA) && \
-    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC
-  if constexpr (std::is_same_v<exespace, Kokkos::Cuda>) {
-    GTEST_SKIP() << "FIXME wrong result";
-  }
-#endif
   run_all_scenarios<DynamicTag, int>();
   run_all_scenarios<StridedThreeTag, int>();
 }
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp
index 83b44f01aa7d825acc1ec72c0945cbedd32a91a8..793b98a67f16a81cc264514e8bfec41e197395cf 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp
@@ -91,23 +91,23 @@ void test_for_each_n(const ViewType view) {
   const auto non_mod_functor = NoOpNonMutableFunctor<value_t>();
 
   // pass const iterators, functor takes const ref
-  EXPECT_EQ(KE::cbegin(view) + n,
+  ASSERT_EQ(KE::cbegin(view) + n,
             KE::for_each_n(exespace(), KE::cbegin(view), n, non_mod_functor));
   verify_values(value_t{0}, view);
 
   // pass view, functor takes const ref
-  EXPECT_EQ(KE::begin(view) + n,
+  ASSERT_EQ(KE::begin(view) + n,
             KE::for_each_n(exespace(), view, n, non_mod_functor));
   verify_values(value_t{0}, view);
 
   // pass iterators, functor takes non-const ref
   const auto mod_functor = IncrementElementWiseFunctor<value_t>();
-  EXPECT_EQ(KE::begin(view) + n,
+  ASSERT_EQ(KE::begin(view) + n,
             KE::for_each_n(exespace(), KE::begin(view), n, mod_functor));
   verify_values(value_t{1}, view);
 
   // pass view, functor takes non-const ref
-  EXPECT_EQ(KE::begin(view) + n,
+  ASSERT_EQ(KE::begin(view) + n,
             KE::for_each_n("label", exespace(), view, n, mod_functor));
   verify_values(value_t{2}, view);
 }
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp
index a2a31f1f65a11b78f41adfc491359c034c183cff..8dbd6cd7e30bf9523ab4ee4902db51ff01fb8d69 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp
@@ -37,6 +37,24 @@ struct CopyFunctor {
   void operator()(int i) const { m_view_to(i) = m_view_from(i); }
 };
 
+template <class ViewTypeFrom, class ViewTypeTo>
+struct CopyFunctorRank2 {
+  ViewTypeFrom m_view_from;
+  ViewTypeTo m_view_to;
+
+  CopyFunctorRank2() = delete;
+
+  CopyFunctorRank2(const ViewTypeFrom view_from, const ViewTypeTo view_to)
+      : m_view_from(view_from), m_view_to(view_to) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int k) const {
+    const auto i    = k / m_view_from.extent(1);
+    const auto j    = k % m_view_from.extent(1);
+    m_view_to(i, j) = m_view_from(i, j);
+  }
+};
+
 template <class ItTypeFrom, class ViewTypeTo>
 struct CopyFromIteratorFunctor {
   ItTypeFrom m_it_from;
@@ -121,7 +139,7 @@ struct EqualsValFunctor {
   bool operator()(const ValueType val) const { return val == m_value; }
 };
 
-template <class ValueType1, class ValueType2>
+template <class ValueType1, class ValueType2 = ValueType1>
 struct CustomLessThanComparator {
   KOKKOS_INLINE_FUNCTION
   bool operator()(const ValueType1& a, const ValueType2& b) const {
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp
index 510f1d195a19b412abfb4baa522f416e5ca6f796..8e60a43e5ffb67e594daadf2b1d7b1a7e2b0e4a9 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp
@@ -171,7 +171,7 @@ void verify_data(ViewType1 data_view,  // contains data
       //           << std::abs(gold_h(i) - test_view_h(i)) << std::endl;
 
       if (std::is_same<gold_view_value_type, int>::value) {
-        EXPECT_EQ(gold_h(i), test_view_h(i));
+        ASSERT_EQ(gold_h(i), test_view_h(i));
       } else {
         const auto error =
             std::abs(static_cast<double>(gold_h(i) - test_view_h(i)));
@@ -224,7 +224,7 @@ void run_single_scenario_default_op(const InfoType& scenario_info) {
     fill_zero(view_dest);
     auto r = KE::inclusive_scan(exespace(), KE::cbegin(view_from),
                                 KE::cend(view_from), KE::begin(view_dest));
-    EXPECT_EQ(r, KE::end(view_dest));
+    ASSERT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, default_op());
   }
 
@@ -232,21 +232,21 @@ void run_single_scenario_default_op(const InfoType& scenario_info) {
     fill_zero(view_dest);
     auto r = KE::inclusive_scan("label", exespace(), KE::cbegin(view_from),
                                 KE::cend(view_from), KE::begin(view_dest));
-    EXPECT_EQ(r, KE::end(view_dest));
+    ASSERT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, default_op());
   }
 
   {
     fill_zero(view_dest);
     auto r = KE::inclusive_scan(exespace(), view_from, view_dest);
-    EXPECT_EQ(r, KE::end(view_dest));
+    ASSERT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, default_op());
   }
 
   {
     fill_zero(view_dest);
     auto r = KE::inclusive_scan("label", exespace(), view_from, view_dest);
-    EXPECT_EQ(r, KE::end(view_dest));
+    ASSERT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, default_op());
   }
 
@@ -279,7 +279,7 @@ void run_single_scenario_custom_op(const InfoType& scenario_info, BinaryOp bop,
     auto r = KE::inclusive_scan(exespace(), KE::cbegin(view_from),
                                 KE::cend(view_from), KE::begin(view_dest), bop,
                                 args...);
-    EXPECT_EQ(r, KE::end(view_dest));
+    ASSERT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, bop, args...);
   }
 
@@ -288,14 +288,14 @@ void run_single_scenario_custom_op(const InfoType& scenario_info, BinaryOp bop,
     auto r = KE::inclusive_scan("label", exespace(), KE::cbegin(view_from),
                                 KE::cend(view_from), KE::begin(view_dest), bop,
                                 args...);
-    EXPECT_EQ(r, KE::end(view_dest));
+    ASSERT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, bop, args...);
   }
 
   {
     fill_zero(view_dest);
     auto r = KE::inclusive_scan(exespace(), view_from, view_dest, bop, args...);
-    EXPECT_EQ(r, KE::end(view_dest));
+    ASSERT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, bop, args...);
   }
 
@@ -303,7 +303,7 @@ void run_single_scenario_custom_op(const InfoType& scenario_info, BinaryOp bop,
     fill_zero(view_dest);
     auto r = KE::inclusive_scan("label", exespace(), view_from, view_dest, bop,
                                 args...);
-    EXPECT_EQ(r, KE::end(view_dest));
+    ASSERT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, bop, args...);
   }
 
@@ -353,6 +353,45 @@ TEST(std_algorithms_numeric_ops_test, inclusive_scan) {
   run_inclusive_scan_all_scenarios<StridedThreeTag, CustomValueType>();
 }
 
+TEST(std_algorithms_numeric_ops_test, inclusive_scan_functor) {
+  using view_type = Kokkos::View<int*, exespace>;
+  view_type dummy_view("dummy_view", 0);
+  using functor_type = Kokkos::Experimental::Impl::InclusiveScanDefaultFunctor<
+      exespace, int, int, view_type, view_type>;
+  functor_type functor(dummy_view, dummy_view);
+  using value_type = functor_type::value_type;
+
+  value_type value1;
+  functor.init(value1);
+  ASSERT_EQ(value1.val, 0);
+  ASSERT_EQ(value1.is_initial, true);
+
+  value_type value2;
+  value2.val        = 1;
+  value2.is_initial = false;
+  functor.join(value1, value2);
+  ASSERT_EQ(value1.val, 1);
+  ASSERT_EQ(value1.is_initial, false);
+
+  functor.init(value1);
+  functor.join(value2, value1);
+  ASSERT_EQ(value2.val, 1);
+  ASSERT_EQ(value2.is_initial, false);
+
+  functor.init(value2);
+  functor.join(value2, value1);
+  ASSERT_EQ(value2.val, 0);
+  ASSERT_EQ(value2.is_initial, true);
+
+  value1.val        = 1;
+  value1.is_initial = false;
+  value2.val        = 2;
+  value2.is_initial = false;
+  functor.join(value2, value1);
+  ASSERT_EQ(value2.val, 3);
+  ASSERT_EQ(value2.is_initial, false);
+}
+
 }  // namespace IncScan
 }  // namespace stdalgos
 }  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp
index ce8669a84f237ec519a310c657e5d14128ca1ab4..dcfe8ad67e115a3d410b12ebf0f2ea10720d39ed 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp
@@ -145,10 +145,10 @@ void run_single_scenario(const InfoType& scenario_info) {
       KE::is_sorted_until("label", exespace(), KE::begin(view), KE::end(view));
   auto r3 = KE::is_sorted_until(exespace(), view);
   auto r4 = KE::is_sorted_until("label", exespace(), view);
-  EXPECT_EQ(r1, gold);
-  EXPECT_EQ(r2, gold);
-  EXPECT_EQ(r3, gold);
-  EXPECT_EQ(r4, gold);
+  ASSERT_EQ(r1, gold);
+  ASSERT_EQ(r2, gold);
+  ASSERT_EQ(r3, gold);
+  ASSERT_EQ(r4, gold);
 
 #if !defined KOKKOS_ENABLE_OPENMPTARGET
   CustomLessThanComparator<ValueType, ValueType> comp;
@@ -160,10 +160,10 @@ void run_single_scenario(const InfoType& scenario_info) {
   auto r8 = KE::is_sorted_until("label", exespace(), view, comp);
 #endif
 
-  EXPECT_EQ(r1, gold);
-  EXPECT_EQ(r2, gold);
-  EXPECT_EQ(r3, gold);
-  EXPECT_EQ(r4, gold);
+  ASSERT_EQ(r1, gold);
+  ASSERT_EQ(r2, gold);
+  ASSERT_EQ(r3, gold);
+  ASSERT_EQ(r4, gold);
 
   Kokkos::fence();
 }
@@ -185,12 +185,6 @@ void run_is_sorted_until_all_scenarios() {
 }
 
 TEST(std_algorithms_sorting_ops_test, is_sorted_until) {
-#if defined(KOKKOS_ENABLE_CUDA) && \
-    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC
-  if constexpr (std::is_same_v<exespace, Kokkos::Cuda>) {
-    GTEST_SKIP() << "FIXME wrong result";
-  }
-#endif
   run_is_sorted_until_all_scenarios<DynamicTag, double>();
   run_is_sorted_until_all_scenarios<StridedTwoTag, double>();
   run_is_sorted_until_all_scenarios<StridedThreeTag, double>();
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsLexicographicalCompare.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsLexicographicalCompare.cpp
index 2acd4934accf621a01e820c9b7c398dee91faf13..5d9e7db803c55eec173cc35fe528258143d3ecd1 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsLexicographicalCompare.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsLexicographicalCompare.cpp
@@ -44,16 +44,16 @@ void test_lexicographical_compare(const ViewType1 view_1, ViewType2 view_2) {
         std::lexicographical_compare(h_first_1, h_last_1, h_first_2, h_last_2);
 
     // pass iterators
-    EXPECT_EQ(std_result, KE::lexicographical_compare(exespace(), first_1,
+    ASSERT_EQ(std_result, KE::lexicographical_compare(exespace(), first_1,
                                                       last_1, first_2, last_2));
-    EXPECT_EQ(std_result,
+    ASSERT_EQ(std_result,
               KE::lexicographical_compare("label", exespace(), first_1, last_1,
                                           first_2, last_2));
 
     // pass views
-    EXPECT_EQ(std_result,
+    ASSERT_EQ(std_result,
               KE::lexicographical_compare(exespace(), view_1, view_2));
-    EXPECT_EQ(std_result,
+    ASSERT_EQ(std_result,
               KE::lexicographical_compare("label", exespace(), view_1, view_2));
   }
 
@@ -67,17 +67,17 @@ void test_lexicographical_compare(const ViewType1 view_1, ViewType2 view_2) {
         h_first_1, h_last_1, h_first_2, h_last_2, custom_comparator);
 
     // pass iterators
-    EXPECT_EQ(std_result,
+    ASSERT_EQ(std_result,
               KE::lexicographical_compare(exespace(), first_1, last_1, first_2,
                                           last_2, custom_comparator));
-    EXPECT_EQ(std_result,
+    ASSERT_EQ(std_result,
               KE::lexicographical_compare("label", exespace(), first_1, last_1,
                                           first_2, last_2, custom_comparator));
 
     // pass views
-    EXPECT_EQ(std_result, KE::lexicographical_compare(
+    ASSERT_EQ(std_result, KE::lexicographical_compare(
                               exespace(), view_1, view_2, custom_comparator));
-    EXPECT_EQ(std_result,
+    ASSERT_EQ(std_result,
               KE::lexicographical_compare("label", exespace(), view_1, view_2,
                                           custom_comparator));
   }
@@ -86,7 +86,7 @@ void test_lexicographical_compare(const ViewType1 view_1, ViewType2 view_2) {
     // empty vs non-empty
     auto std_result =
         std::lexicographical_compare(h_first_1, h_first_1, h_first_2, h_last_2);
-    EXPECT_EQ(std_result, KE::lexicographical_compare(
+    ASSERT_EQ(std_result, KE::lexicographical_compare(
                               exespace(), first_1, first_1, first_2, last_2));
   }
 
@@ -95,7 +95,7 @@ void test_lexicographical_compare(const ViewType1 view_1, ViewType2 view_2) {
     if (view_1.extent(0) > 1) {
       auto std_result = std::lexicographical_compare(h_first_1, h_last_1 - 1,
                                                      h_first_2, h_last_2);
-      EXPECT_EQ(std_result,
+      ASSERT_EQ(std_result,
                 KE::lexicographical_compare(exespace(), first_1, last_1 - 1,
                                             first_2, last_2));
     }
@@ -140,12 +140,6 @@ void run_all_scenarios() {
 }
 
 TEST(std_algorithms_lexicographical_compare_test, test) {
-#if defined(KOKKOS_ENABLE_CUDA) && \
-    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC
-  if constexpr (std::is_same_v<exespace, Kokkos::Cuda>) {
-    GTEST_SKIP() << "FIXME wrong result";
-  }
-#endif
 // FIXME: should this disable only custom comparator tests?
 #if !defined KOKKOS_ENABLE_OPENMPTARGET
   run_all_scenarios<DynamicTag, double>();
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMinMaxElementOps.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMinMaxElementOps.cpp
index f8634ffafe2818b4138927e812e326c5cd9e172f..bc4323178428cc9048507b519dee98484e3eec4b 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMinMaxElementOps.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMinMaxElementOps.cpp
@@ -173,7 +173,7 @@ void std_algo_min_max_test_verify(Kokkos::pair<IndexType, ValueType> goldPair,
                                   const ItType result,
                                   TestedViewType testedView) {
   // check that iterator is pointing to right element
-  EXPECT_EQ(result - KE::begin(testedView), goldPair.first);
+  ASSERT_EQ(result - KE::begin(testedView), goldPair.first);
 
   // create a view for the result to copy into it the iterator's value
   using result_view_t = Kokkos::View<int>;
@@ -184,7 +184,7 @@ void std_algo_min_max_test_verify(Kokkos::pair<IndexType, ValueType> goldPair,
       Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), resultView);
 
   // use the host mirror of the result view to check that the values match
-  EXPECT_EQ(result_v_h(), goldPair.second);
+  ASSERT_EQ(result_v_h(), goldPair.second);
 }
 
 template <class GoldSolutionType, class ItType, class TestedViewType>
@@ -199,39 +199,39 @@ template <class ViewType>
 void test_max_element_trivial_data(ViewType view) {
   /* if we pass empty range, should return last */
   auto result = KE::max_element(exespace(), KE::cbegin(view), KE::cbegin(view));
-  EXPECT_EQ(result, KE::cbegin(view));
+  ASSERT_EQ(result, KE::cbegin(view));
 
   /* if we pass empty range, should return last */
   auto it0     = KE::cbegin(view) + 3;
   auto it1     = it0;
   auto result2 = KE::max_element(exespace(), it0, it1);
-  EXPECT_EQ(result2, it1);
+  ASSERT_EQ(result2, it1);
 }
 
 template <class ViewType>
 void test_min_element_trivial_data(ViewType view) {
   /* if we pass empty range, should return last */
   auto result = KE::min_element(exespace(), KE::cbegin(view), KE::cbegin(view));
-  EXPECT_EQ(result, KE::cbegin(view));
+  ASSERT_EQ(result, KE::cbegin(view));
 
   /* if we pass empty range, should return last */
   auto it0     = KE::cbegin(view) + 3;
   auto it1     = it0;
   auto result2 = KE::min_element(exespace(), it0, it1);
-  EXPECT_EQ(result2, it1);
+  ASSERT_EQ(result2, it1);
 }
 
 template <class ViewType>
 void test_minmax_element_empty_range(ViewType view) {
   auto result =
       KE::minmax_element(exespace(), KE::cbegin(view), KE::cbegin(view));
-  EXPECT_EQ(result.first, KE::cbegin(view));
-  EXPECT_EQ(result.second, KE::cbegin(view));
+  ASSERT_EQ(result.first, KE::cbegin(view));
+  ASSERT_EQ(result.second, KE::cbegin(view));
   auto it0     = KE::cbegin(view) + 3;
   auto it1     = it0;
   auto result2 = KE::minmax_element(exespace(), it0, it1);
-  EXPECT_EQ(result2.first, it1);
-  EXPECT_EQ(result2.second, it1);
+  ASSERT_EQ(result2.first, it1);
+  ASSERT_EQ(result2.second, it1);
 }
 
 template <class ViewType>
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp
index bb4b6fb2a2ac1ecbb078962012d17295c3a421c4..f3b3e269c446658c75363ac141391e8ebb842e94 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp
@@ -120,10 +120,10 @@ void run_single_scenario(ViewType view1, ViewType view2,
     const auto my_diff12 = my_res1.second - f2;
     const auto my_diff21 = my_res2.first - f1;
     const auto my_diff22 = my_res2.second - f2;
-    EXPECT_EQ(my_diff11, std_diff1);
-    EXPECT_EQ(my_diff12, std_diff2);
-    EXPECT_EQ(my_diff21, std_diff1);
-    EXPECT_EQ(my_diff22, std_diff2);
+    ASSERT_EQ(my_diff11, std_diff1);
+    ASSERT_EQ(my_diff12, std_diff2);
+    ASSERT_EQ(my_diff21, std_diff1);
+    ASSERT_EQ(my_diff22, std_diff2);
   }
 
   {
@@ -134,10 +134,10 @@ void run_single_scenario(ViewType view1, ViewType view2,
     const auto my_diff12 = my_res1.second - KE::begin(view2);
     const auto my_diff21 = my_res2.first - KE::begin(view1);
     const auto my_diff22 = my_res2.second - KE::begin(view2);
-    EXPECT_EQ(my_diff11, std_diff1);
-    EXPECT_EQ(my_diff12, std_diff2);
-    EXPECT_EQ(my_diff21, std_diff1);
-    EXPECT_EQ(my_diff22, std_diff2);
+    ASSERT_EQ(my_diff11, std_diff1);
+    ASSERT_EQ(my_diff12, std_diff2);
+    ASSERT_EQ(my_diff21, std_diff1);
+    ASSERT_EQ(my_diff22, std_diff2);
   }
 }
 
@@ -189,12 +189,6 @@ void run_all_scenarios() {
 }
 
 TEST(std_algorithms_mismatch_test, test) {
-#if defined(KOKKOS_ENABLE_CUDA) && \
-    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC
-  if constexpr (std::is_same_v<exespace, Kokkos::Cuda>) {
-    GTEST_SKIP() << "FIXME wrong result";
-  }
-#endif
   run_all_scenarios<DynamicTag, double>();
   run_all_scenarios<StridedThreeTag, int>();
 }
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp
index 4fce044bcf76a4cbe380196f00f50e50ac55f87d..4604764097ebc1ff69f391a44364521acdefeaec 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp
@@ -52,14 +52,14 @@ TEST(std_algorithms_mod_ops_test, move) {
 
   // move constr
   MyMovableType b(std::move(a));
-  EXPECT_EQ(b.m_value, 11);
-  EXPECT_EQ(a.m_value, -2);
+  ASSERT_EQ(b.m_value, 11);
+  ASSERT_EQ(a.m_value, -2);
 
   // move assign
   MyMovableType c;
   c = std::move(b);
-  EXPECT_EQ(c.m_value, 11);
-  EXPECT_EQ(b.m_value, -4);
+  ASSERT_EQ(c.m_value, 11);
+  ASSERT_EQ(b.m_value, -4);
 }
 
 template <class ViewType>
@@ -97,8 +97,8 @@ TEST(std_algorithms_mod_ops_test, swap) {
     int a = 1;
     int b = 2;
     KE::swap(a, b);
-    EXPECT_EQ(a, 2);
-    EXPECT_EQ(b, 1);
+    ASSERT_EQ(a, 2);
+    ASSERT_EQ(b, 1);
   }
 
   {
@@ -151,17 +151,17 @@ void test_iter_swap(ViewType view) {
   using value_type = typename ViewType::value_type;
   auto a_dc        = create_deep_copyable_compatible_clone(view);
   auto a_h         = create_mirror_view_and_copy(Kokkos::HostSpace(), a_dc);
-  EXPECT_EQ(view.extent_int(0), 10);
-  EXPECT_EQ(a_h(0), value_type(3));
-  EXPECT_EQ(a_h(1), value_type(1));
-  EXPECT_EQ(a_h(2), value_type(2));
-  EXPECT_EQ(a_h(3), value_type(0));
-  EXPECT_EQ(a_h(4), value_type(6));
-  EXPECT_EQ(a_h(5), value_type(5));
-  EXPECT_EQ(a_h(6), value_type(4));
-  EXPECT_EQ(a_h(7), value_type(7));
-  EXPECT_EQ(a_h(8), value_type(8));
-  EXPECT_EQ(a_h(9), value_type(9));
+  ASSERT_EQ(view.extent_int(0), 10);
+  ASSERT_EQ(a_h(0), value_type(3));
+  ASSERT_EQ(a_h(1), value_type(1));
+  ASSERT_EQ(a_h(2), value_type(2));
+  ASSERT_EQ(a_h(3), value_type(0));
+  ASSERT_EQ(a_h(4), value_type(6));
+  ASSERT_EQ(a_h(5), value_type(5));
+  ASSERT_EQ(a_h(6), value_type(4));
+  ASSERT_EQ(a_h(7), value_type(7));
+  ASSERT_EQ(a_h(8), value_type(8));
+  ASSERT_EQ(a_h(9), value_type(9));
 }
 
 TEST(std_algorithms_mod_ops_test, iter_swap_static_view) {
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp
index 6b806d7bc5c05bc4250389ee43f4a6cb81e06c3c..f80f30797e43cb5a45cdc255b9ef676a293caf2d 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp
@@ -34,21 +34,21 @@ struct std_algorithms_mod_seq_ops_test : std_algorithms_test {
 TEST_F(std_algorithms_mod_seq_ops_test, copy) {
   auto result = KE::copy(exespace(), KE::begin(m_static_view),
                          KE::end(m_static_view), KE::begin(m_strided_view));
-  EXPECT_EQ(KE::end(m_strided_view), result);
+  ASSERT_EQ(KE::end(m_strided_view), result);
   compare_views(m_static_view, m_strided_view);
 
   auto result2 = KE::copy(exespace(), KE::begin(m_strided_view),
                           KE::end(m_strided_view), KE::begin(m_dynamic_view));
-  EXPECT_EQ(KE::end(m_dynamic_view), result2);
+  ASSERT_EQ(KE::end(m_dynamic_view), result2);
   compare_views(m_dynamic_view, m_strided_view);
 }
 
 TEST_F(std_algorithms_mod_seq_ops_test, copy_view) {
-  EXPECT_EQ(KE::end(m_dynamic_view),
+  ASSERT_EQ(KE::end(m_dynamic_view),
             KE::copy(exespace(), m_static_view, m_dynamic_view));
   compare_views(m_static_view, m_dynamic_view);
 
-  EXPECT_EQ(KE::end(m_strided_view),
+  ASSERT_EQ(KE::end(m_strided_view),
             KE::copy(exespace(), m_dynamic_view, m_strided_view));
   compare_views(m_dynamic_view, m_strided_view);
 }
@@ -70,11 +70,11 @@ TEST_F(std_algorithms_mod_seq_ops_test, copy_n) {
   // pass iterators
   auto first = KE::begin(m_static_view);
   auto dest  = KE::begin(m_dynamic_view);
-  EXPECT_EQ(dest + n, KE::copy_n(exespace(), first, n, dest));
+  ASSERT_EQ(dest + n, KE::copy_n(exespace(), first, n, dest));
   compare_views(expected, m_dynamic_view);
 
   // pass views
-  EXPECT_EQ(KE::begin(m_strided_view) + n,
+  ASSERT_EQ(KE::begin(m_strided_view) + n,
             KE::copy_n(exespace(), m_static_view, n, m_strided_view));
   compare_views(expected, m_strided_view);
 }
@@ -85,12 +85,12 @@ TEST_F(std_algorithms_mod_seq_ops_test, copy_backward) {
   auto dest  = KE::end(m_dynamic_view);
 
   // pass iterators
-  EXPECT_EQ(KE::begin(m_dynamic_view),
+  ASSERT_EQ(KE::begin(m_dynamic_view),
             KE::copy_backward(exespace(), first, last, dest));
   compare_views(m_static_view, m_dynamic_view);
 
   // pass views
-  EXPECT_EQ(KE::begin(m_strided_view),
+  ASSERT_EQ(KE::begin(m_strided_view),
             KE::copy_backward(exespace(), m_static_view, m_strided_view));
   compare_views(m_static_view, m_strided_view);
 }
@@ -112,11 +112,11 @@ TEST_F(std_algorithms_mod_seq_ops_test, reverse_copy) {
   auto last  = KE::end(m_static_view);
   auto dest  = KE::begin(m_dynamic_view);
 
-  EXPECT_EQ(KE::end(m_dynamic_view),
+  ASSERT_EQ(KE::end(m_dynamic_view),
             KE::reverse_copy(exespace(), first, last, dest));
   compare_views(expected, m_dynamic_view);
 
-  EXPECT_EQ(KE::end(m_strided_view),
+  ASSERT_EQ(KE::end(m_strided_view),
             KE::reverse_copy(exespace(), m_static_view, m_strided_view));
   compare_views(expected, m_strided_view);
 }
@@ -151,25 +151,25 @@ TEST_F(std_algorithms_mod_seq_ops_test, fill_n) {
 
   // fill all elements
   // pass iterator
-  EXPECT_EQ(KE::end(m_static_view),
+  ASSERT_EQ(KE::end(m_static_view),
             KE::fill_n(exespace(), KE::begin(m_static_view),
                        m_static_view.extent(0), fill_n_value));
   verify_values(fill_n_value, m_static_view);
 
   // pass view
-  EXPECT_EQ(KE::end(m_strided_view),
+  ASSERT_EQ(KE::end(m_strided_view),
             KE::fill_n(exespace(), m_strided_view, m_strided_view.extent(0),
                        fill_n_value));
   verify_values(fill_n_value, m_strided_view);
 
   // fill zero elements
   // pass view
-  EXPECT_EQ(KE::begin(m_dynamic_view),
+  ASSERT_EQ(KE::begin(m_dynamic_view),
             KE::fill_n(exespace(), m_dynamic_view, 0, fill_n_new_value));
 
   // fill single element
   // pass iterator
-  EXPECT_EQ(
+  ASSERT_EQ(
       KE::begin(m_static_view) + 1,
       KE::fill_n(exespace(), KE::begin(m_static_view), 1, fill_n_new_value));
 
@@ -212,21 +212,21 @@ TEST_F(std_algorithms_mod_seq_ops_test, transform_from_fixture_unary_op) {
   auto r1 = KE::transform(exespace(), KE::begin(m_static_view),
                           KE::end(m_static_view), KE::begin(m_dynamic_view),
                           TransformFunctor());
-  EXPECT_EQ(r1, KE::end(m_dynamic_view));
+  ASSERT_EQ(r1, KE::end(m_dynamic_view));
   compare_views(gold_source, m_static_view);
   verify_values(-1., m_dynamic_view);
 
   // transform dynamic view, store results in strided view
   auto r2 = KE::transform(exespace(), m_dynamic_view, m_strided_view,
                           TransformFunctor());
-  EXPECT_EQ(r2, KE::end(m_strided_view));
+  ASSERT_EQ(r2, KE::end(m_strided_view));
   verify_values(-1., m_dynamic_view);
   verify_values(-1., m_strided_view);
 
   // transform strided view, store results in static view
   auto r3 = KE::transform(exespace(), m_strided_view, m_static_view,
                           TransformFunctor());
-  EXPECT_EQ(r3, KE::end(m_static_view));
+  ASSERT_EQ(r3, KE::end(m_static_view));
   verify_values(-1., m_static_view);
   verify_values(-1., m_strided_view);
 }
@@ -254,7 +254,7 @@ TEST_F(std_algorithms_mod_seq_ops_test, transform_from_fixture_binary_op) {
   auto r1 = KE::transform(exespace(), KE::begin(m_static_view),
                           KE::end(m_static_view), KE::begin(m_dynamic_view),
                           KE::begin(m_strided_view), TransformBinaryFunctor());
-  EXPECT_EQ(r1, KE::end(m_strided_view));
+  ASSERT_EQ(r1, KE::end(m_strided_view));
   compare_views(expected, m_strided_view);
 
   expected(0) = 0;
@@ -269,7 +269,7 @@ TEST_F(std_algorithms_mod_seq_ops_test, transform_from_fixture_binary_op) {
   expected(9) = 18;
   auto r2 = KE::transform("label", exespace(), m_static_view, m_strided_view,
                           m_dynamic_view, TransformBinaryFunctor());
-  EXPECT_EQ(r2, KE::end(m_dynamic_view));
+  ASSERT_EQ(r2, KE::end(m_dynamic_view));
   compare_views(expected, m_dynamic_view);
 }
 
@@ -296,19 +296,19 @@ TEST_F(std_algorithms_mod_seq_ops_test, generate) {
 
 TEST_F(std_algorithms_mod_seq_ops_test, generate_n) {
   // iterator + functor
-  EXPECT_EQ(KE::end(m_static_view),
+  ASSERT_EQ(KE::end(m_static_view),
             KE::generate_n(exespace(), KE::begin(m_static_view),
                            m_static_view.extent(0), GenerateFunctor()));
   verify_values(generated_value, m_static_view);
 
   // view + functor
-  EXPECT_EQ(KE::end(m_dynamic_view),
+  ASSERT_EQ(KE::end(m_dynamic_view),
             KE::generate_n(exespace(), m_dynamic_view, m_dynamic_view.extent(0),
                            GenerateFunctor()));
   verify_values(generated_value, m_dynamic_view);
 
   // view + functor, negative n
-  EXPECT_EQ(KE::begin(m_strided_view),
+  ASSERT_EQ(KE::begin(m_strided_view),
             KE::generate_n(exespace(), m_strided_view, -1, GenerateFunctor()));
 }
 
@@ -352,7 +352,7 @@ void test_swap_ranges(ViewType view) {
   auto last1  = first1 + 4;
   auto first2 = KE::begin(viewB) + 1;
   auto r      = KE::swap_ranges(exespace(), first1, last1, first2);
-  EXPECT_EQ(r, first2 + 4);
+  ASSERT_EQ(r, first2 + 4);
 
   /* check VIEW_A */
   static_view_type checkViewA("tmp");
@@ -360,16 +360,16 @@ void test_swap_ranges(ViewType view) {
   parallel_for(ext, cp_func_a_t(view, checkViewA));
   auto cvA_h =
       Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), checkViewA);
-  EXPECT_EQ(cvA_h(0), 0);
-  EXPECT_EQ(cvA_h(1), 1);
-  EXPECT_EQ(cvA_h(2), 99);
-  EXPECT_EQ(cvA_h(3), 98);
-  EXPECT_EQ(cvA_h(4), 97);
-  EXPECT_EQ(cvA_h(5), 96);
-  EXPECT_EQ(cvA_h(6), 6);
-  EXPECT_EQ(cvA_h(7), 7);
-  EXPECT_EQ(cvA_h(8), 8);
-  EXPECT_EQ(cvA_h(9), 9);
+  ASSERT_EQ(cvA_h(0), 0);
+  ASSERT_EQ(cvA_h(1), 1);
+  ASSERT_EQ(cvA_h(2), 99);
+  ASSERT_EQ(cvA_h(3), 98);
+  ASSERT_EQ(cvA_h(4), 97);
+  ASSERT_EQ(cvA_h(5), 96);
+  ASSERT_EQ(cvA_h(6), 6);
+  ASSERT_EQ(cvA_h(7), 7);
+  ASSERT_EQ(cvA_h(8), 8);
+  ASSERT_EQ(cvA_h(9), 9);
 
   /* check viewB */
   static_view_type checkViewB("tmpB");
@@ -377,16 +377,16 @@ void test_swap_ranges(ViewType view) {
   Kokkos::parallel_for(ext, cp_func_b_t(viewB, checkViewB));
   auto cvB_h =
       Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), checkViewB);
-  EXPECT_EQ(cvB_h(0), 100);
-  EXPECT_EQ(cvB_h(1), 2);
-  EXPECT_EQ(cvB_h(2), 3);
-  EXPECT_EQ(cvB_h(3), 4);
-  EXPECT_EQ(cvB_h(4), 5);
-  EXPECT_EQ(cvB_h(5), 95);
-  EXPECT_EQ(cvB_h(6), 94);
-  EXPECT_EQ(cvB_h(7), 93);
-  EXPECT_EQ(cvB_h(8), 92);
-  EXPECT_EQ(cvB_h(9), 91);
+  ASSERT_EQ(cvB_h(0), 100);
+  ASSERT_EQ(cvB_h(1), 2);
+  ASSERT_EQ(cvB_h(2), 3);
+  ASSERT_EQ(cvB_h(3), 4);
+  ASSERT_EQ(cvB_h(4), 5);
+  ASSERT_EQ(cvB_h(5), 95);
+  ASSERT_EQ(cvB_h(6), 94);
+  ASSERT_EQ(cvB_h(7), 93);
+  ASSERT_EQ(cvB_h(8), 92);
+  ASSERT_EQ(cvB_h(9), 91);
 }
 
 TEST_F(std_algorithms_mod_seq_ops_test, swap_ranges) {
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp
index 635714eb5451f52bbbcfbe5867b555ee826a4db3..b201ab95c1a6b967d1fe17731295c0a086aa5716 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp
@@ -53,20 +53,20 @@ void run_single_scenario(const InfoType& scenario_info, int apiId) {
     auto rit =
         KE::move_backward(exespace(), KE::begin(v), KE::end(v), KE::end(v2));
     const int dist = KE::distance(KE::begin(v2), rit);
-    EXPECT_EQ(dist, 5);
+    ASSERT_EQ(dist, 5);
   } else if (apiId == 1) {
     auto rit       = KE::move_backward("mylabel", exespace(), KE::begin(v),
                                  KE::end(v), KE::end(v2));
     const int dist = KE::distance(KE::begin(v2), rit);
-    EXPECT_EQ(dist, 5);
+    ASSERT_EQ(dist, 5);
   } else if (apiId == 2) {
     auto rit       = KE::move_backward(exespace(), v, v2);
     const int dist = KE::distance(KE::begin(v2), rit);
-    EXPECT_EQ(dist, 5);
+    ASSERT_EQ(dist, 5);
   } else if (apiId == 3) {
     auto rit       = KE::move_backward("mylabel", exespace(), v, v2);
     const int dist = KE::distance(KE::begin(v2), rit);
-    EXPECT_EQ(dist, 5);
+    ASSERT_EQ(dist, 5);
   }
 
   // check
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsNumerics.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsNumerics.cpp
index 288a67c36956d7c939210bd70d6fb30bf1d9d46b..0933c4e135fd147026a733860876049ed38f36be 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsNumerics.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsNumerics.cpp
@@ -151,8 +151,8 @@ void run_and_check_transform_reduce_default(ViewType1 first_view,
   const auto r2 = KE::transform_reduce(
       "MYLABEL", ExecutionSpace(), KE::cbegin(first_view),
       KE::cbegin(first_view), KE::cbegin(second_view), init_value);
-  EXPECT_EQ(r1, init_value);
-  EXPECT_EQ(r2, init_value);
+  ASSERT_EQ(r1, init_value);
+  ASSERT_EQ(r2, init_value);
 
   // non-trivial cases
   const auto r3 = KE::transform_reduce(ExecutionSpace(), KE::cbegin(first_view),
@@ -168,10 +168,10 @@ void run_and_check_transform_reduce_default(ViewType1 first_view,
   const auto r6 = KE::transform_reduce("MYLABEL", ExecutionSpace(), first_view,
                                        second_view, init_value);
 
-  EXPECT_EQ(r3, result_value);
-  EXPECT_EQ(r4, result_value);
-  EXPECT_EQ(r5, result_value);
-  EXPECT_EQ(r6, result_value);
+  ASSERT_EQ(r3, result_value);
+  ASSERT_EQ(r4, result_value);
+  ASSERT_EQ(r5, result_value);
+  ASSERT_EQ(r6, result_value);
 }
 
 TEST_F(std_algorithms_numerics_test,
@@ -254,8 +254,8 @@ void run_and_check_transform_reduce_overloadA(ViewType1 first_view,
                            KE::cbegin(first_view), KE::cbegin(second_view),
                            init_value, std::forward<Args>(args)...);
 
-  EXPECT_EQ(r1, init_value);
-  EXPECT_EQ(r2, init_value);
+  ASSERT_EQ(r1, init_value);
+  ASSERT_EQ(r2, init_value);
 
   // non trivial cases
   const auto r3 = KE::transform_reduce(
@@ -273,10 +273,10 @@ void run_and_check_transform_reduce_overloadA(ViewType1 first_view,
       KE::transform_reduce("MYLABEL", ExecutionSpace(), first_view, second_view,
                            init_value, std::forward<Args>(args)...);
 
-  EXPECT_EQ(r3, result_value);
-  EXPECT_EQ(r4, result_value);
-  EXPECT_EQ(r5, result_value);
-  EXPECT_EQ(r6, result_value);
+  ASSERT_EQ(r3, result_value);
+  ASSERT_EQ(r4, result_value);
+  ASSERT_EQ(r5, result_value);
+  ASSERT_EQ(r6, result_value);
 }
 
 TEST_F(std_algorithms_numerics_test,
@@ -373,8 +373,8 @@ void run_and_check_transform_reduce_overloadB(ViewType view,
                                        KE::cbegin(view), KE::cbegin(view),
                                        init_value, std::forward<Args>(args)...);
 
-  EXPECT_EQ(r1, init_value);
-  EXPECT_EQ(r2, init_value);
+  ASSERT_EQ(r1, init_value);
+  ASSERT_EQ(r2, init_value);
 
   // non trivial
   const auto r3 =
@@ -390,10 +390,10 @@ void run_and_check_transform_reduce_overloadB(ViewType view,
   const auto r6 = KE::transform_reduce("MYLABEL", ExecutionSpace(), view,
                                        init_value, std::forward<Args>(args)...);
 
-  EXPECT_EQ(r3, result_value);
-  EXPECT_EQ(r4, result_value);
-  EXPECT_EQ(r5, result_value);
-  EXPECT_EQ(r6, result_value);
+  ASSERT_EQ(r3, result_value);
+  ASSERT_EQ(r4, result_value);
+  ASSERT_EQ(r5, result_value);
+  ASSERT_EQ(r6, result_value);
 }
 
 TEST_F(std_algorithms_numerics_test,
@@ -447,8 +447,8 @@ void run_and_check_reduce_overloadA(ViewType view, ValueType non_trivial_result,
       KE::reduce(ExecutionSpace(), KE::cbegin(view), KE::cbegin(view));
   const auto r2 = KE::reduce("MYLABEL", ExecutionSpace(), KE::cbegin(view),
                              KE::cbegin(view));
-  EXPECT_EQ(r1, trivial_result);
-  EXPECT_EQ(r2, trivial_result);
+  ASSERT_EQ(r1, trivial_result);
+  ASSERT_EQ(r2, trivial_result);
 
   // non trivial cases
   const auto r3 =
@@ -458,10 +458,10 @@ void run_and_check_reduce_overloadA(ViewType view, ValueType non_trivial_result,
   const auto r5 = KE::reduce(ExecutionSpace(), view);
   const auto r6 = KE::reduce("MYLABEL", ExecutionSpace(), view);
 
-  EXPECT_EQ(r3, non_trivial_result);
-  EXPECT_EQ(r4, non_trivial_result);
-  EXPECT_EQ(r5, non_trivial_result);
-  EXPECT_EQ(r6, non_trivial_result);
+  ASSERT_EQ(r3, non_trivial_result);
+  ASSERT_EQ(r4, non_trivial_result);
+  ASSERT_EQ(r5, non_trivial_result);
+  ASSERT_EQ(r6, non_trivial_result);
 }
 
 TEST_F(std_algorithms_numerics_test,
@@ -503,8 +503,8 @@ void run_and_check_reduce_overloadB(ViewType view, ValueType result_value,
                              KE::cbegin(view), init_value);
   const auto r2 = KE::reduce("MYLABEL", ExecutionSpace(), KE::cbegin(view),
                              KE::cbegin(view), init_value);
-  EXPECT_EQ(r1, init_value);
-  EXPECT_EQ(r2, init_value);
+  ASSERT_EQ(r1, init_value);
+  ASSERT_EQ(r2, init_value);
 
   // non trivial cases
   const auto r3 = KE::reduce(ExecutionSpace(), KE::cbegin(view), KE::cend(view),
@@ -514,10 +514,10 @@ void run_and_check_reduce_overloadB(ViewType view, ValueType result_value,
   const auto r5 = KE::reduce(ExecutionSpace(), view, init_value);
   const auto r6 = KE::reduce("MYLABEL", ExecutionSpace(), view, init_value);
 
-  EXPECT_EQ(r3, result_value);
-  EXPECT_EQ(r4, result_value);
-  EXPECT_EQ(r5, result_value);
-  EXPECT_EQ(r6, result_value);
+  ASSERT_EQ(r3, result_value);
+  ASSERT_EQ(r4, result_value);
+  ASSERT_EQ(r5, result_value);
+  ASSERT_EQ(r6, result_value);
 }
 
 TEST_F(std_algorithms_numerics_test,
@@ -553,8 +553,8 @@ void run_and_check_reduce_overloadC(ViewType view, ValueType result_value,
                              KE::cbegin(view), init_value, joiner);
   const auto r2 = KE::reduce("MYLABEL", ExecutionSpace(), KE::cbegin(view),
                              KE::cbegin(view), init_value, joiner);
-  EXPECT_EQ(r1, init_value);
-  EXPECT_EQ(r2, init_value);
+  ASSERT_EQ(r1, init_value);
+  ASSERT_EQ(r2, init_value);
 
   // non trivial cases
   const auto r3 = KE::reduce(ExecutionSpace(), KE::cbegin(view), KE::cend(view),
@@ -565,10 +565,10 @@ void run_and_check_reduce_overloadC(ViewType view, ValueType result_value,
   const auto r6 =
       KE::reduce("MYLABEL", ExecutionSpace(), view, init_value, joiner);
 
-  EXPECT_EQ(r3, result_value);
-  EXPECT_EQ(r4, result_value);
-  EXPECT_EQ(r5, result_value);
-  EXPECT_EQ(r6, result_value);
+  ASSERT_EQ(r3, result_value);
+  ASSERT_EQ(r4, result_value);
+  ASSERT_EQ(r5, result_value);
+  ASSERT_EQ(r6, result_value);
 }
 
 TEST_F(std_algorithms_numerics_test,
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp
index 0399e9eee4d42290299e254a0cbd40ba941efc1e..f169fd9ce881700b98154e484df9340eee4130a7 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp
@@ -130,12 +130,12 @@ void verify_data(const std::string& name, ResultType my_result,
   const std::size_t my_diff_true = my_result.first - KE::begin(view_dest_true);
   const std::size_t my_diff_false =
       my_result.second - KE::begin(view_dest_false);
-  EXPECT_EQ(std_diff_true, my_diff_true);
-  EXPECT_EQ(std_diff_false, my_diff_false);
+  ASSERT_EQ(std_diff_true, my_diff_true);
+  ASSERT_EQ(std_diff_false, my_diff_false);
 
   auto view_dest_true_h = create_host_space_copy(view_dest_true);
   for (std::size_t i = 0; i < std_diff_true; ++i) {
-    EXPECT_EQ(std_vec_true[i], view_dest_true_h(i));
+    ASSERT_EQ(std_vec_true[i], view_dest_true_h(i));
     // std::cout << "i= " << i << " "
     // 	      << " std_true = " << std_vec_true[i] << " "
     // 	      << " mine     = " << view_dest_true_h(i) << '\n';
@@ -143,45 +143,45 @@ void verify_data(const std::string& name, ResultType my_result,
 
   auto view_dest_false_h = create_host_space_copy(view_dest_false);
   for (std::size_t i = 0; i < std_diff_false; ++i) {
-    EXPECT_EQ(std_vec_false[i], view_dest_false_h(i));
+    ASSERT_EQ(std_vec_false[i], view_dest_false_h(i));
     // std::cout << "i= " << i << " "
     // 	      << " std_false = " << std_vec_false[i] << " "
     // 	      << " mine     = " << view_dest_false_h(i) << '\n';
   }
 
   if (name == "empty") {
-    EXPECT_EQ(my_diff_true, 0u);
-    EXPECT_EQ(my_diff_false, 0u);
+    ASSERT_EQ(my_diff_true, 0u);
+    ASSERT_EQ(my_diff_false, 0u);
   }
 
   else if (name == "one-element-a") {
-    EXPECT_EQ(my_diff_true, 0u);
-    EXPECT_EQ(my_diff_false, 1u);
+    ASSERT_EQ(my_diff_true, 0u);
+    ASSERT_EQ(my_diff_false, 1u);
   }
 
   else if (name == "one-element-b") {
-    EXPECT_EQ(my_diff_true, 1u);
-    EXPECT_EQ(my_diff_false, 0u);
+    ASSERT_EQ(my_diff_true, 1u);
+    ASSERT_EQ(my_diff_false, 0u);
   }
 
   else if (name == "two-elements-a") {
-    EXPECT_EQ(my_diff_true, 1u);
-    EXPECT_EQ(my_diff_false, 1u);
+    ASSERT_EQ(my_diff_true, 1u);
+    ASSERT_EQ(my_diff_false, 1u);
   }
 
   else if (name == "two-elements-b") {
-    EXPECT_EQ(my_diff_true, 1u);
-    EXPECT_EQ(my_diff_false, 1u);
+    ASSERT_EQ(my_diff_true, 1u);
+    ASSERT_EQ(my_diff_false, 1u);
   }
 
   else if (name == "small-b") {
-    EXPECT_EQ(my_diff_true, 13u);
-    EXPECT_EQ(my_diff_false, 0u);
+    ASSERT_EQ(my_diff_true, 13u);
+    ASSERT_EQ(my_diff_false, 0u);
   }
 
   else if (name == "small-c") {
-    EXPECT_EQ(my_diff_true, 0u);
-    EXPECT_EQ(my_diff_false, 15u);
+    ASSERT_EQ(my_diff_true, 0u);
+    ASSERT_EQ(my_diff_false, 15u);
   }
 }
 
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitioningOps.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitioningOps.cpp
index 1bfb536c2c750d582116e840d9a663bfbee8fff3..0c6e9df047a0b94478d02456876d01d87d6a6c8f 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitioningOps.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitioningOps.cpp
@@ -127,7 +127,7 @@ struct std_algorithms_partitioning_test : public std_algorithms_test {
       case Mixed: return false;
       case NegativeFirst: return true;
       case AllNegative: return true;
-      case AllPositive: return false;
+      case AllPositive: return true;
       case NegativeLast: return false;
       case SingleNegative: return true;
       default: return false;
@@ -148,12 +148,6 @@ struct std_algorithms_partitioning_test : public std_algorithms_test {
 };
 
 TEST_F(std_algorithms_partitioning_test, is_partitioned_trivial) {
-#if defined(KOKKOS_ENABLE_CUDA) && \
-    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC
-  if constexpr (std::is_same_v<exespace, Kokkos::Cuda>) {
-    GTEST_SKIP() << "FIXME wrong result";
-  }
-#endif
   IsNegativeFunctor<value_type> p;
   const auto result1 = KE::is_partitioned(exespace(), KE::cbegin(m_static_view),
                                           KE::cbegin(m_static_view), p);
@@ -169,12 +163,6 @@ TEST_F(std_algorithms_partitioning_test, is_partitioned_trivial) {
 }
 
 TEST_F(std_algorithms_partitioning_test, is_partitioned_accepting_iterators) {
-#if defined(KOKKOS_ENABLE_CUDA) && \
-    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC
-  if constexpr (std::is_same_v<exespace, Kokkos::Cuda>) {
-    GTEST_SKIP() << "FIXME wrong result";
-  }
-#endif
   const IsNegativeFunctor<value_type> p;
 
   for (int id = 0; id < FixtureViews::Count; ++id) {
@@ -183,25 +171,19 @@ TEST_F(std_algorithms_partitioning_test, is_partitioned_accepting_iterators) {
         goldSolutionIsPartitioned(static_cast<FixtureViews>(id));
     const auto result1 = KE::is_partitioned(
         exespace(), KE::cbegin(m_static_view), KE::cend(m_static_view), p);
-    EXPECT_EQ(goldBool, result1);
+    ASSERT_EQ(goldBool, result1);
 
     const auto result2 = KE::is_partitioned(
         exespace(), KE::cbegin(m_dynamic_view), KE::cend(m_dynamic_view), p);
-    EXPECT_EQ(goldBool, result2);
+    ASSERT_EQ(goldBool, result2);
 
     const auto result3 = KE::is_partitioned(
         exespace(), KE::cbegin(m_strided_view), KE::cend(m_strided_view), p);
-    EXPECT_EQ(goldBool, result3);
+    ASSERT_EQ(goldBool, result3);
   }
 }
 
 TEST_F(std_algorithms_partitioning_test, is_partitioned_accepting_view) {
-#if defined(KOKKOS_ENABLE_CUDA) && \
-    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC
-  if constexpr (std::is_same_v<exespace, Kokkos::Cuda>) {
-    GTEST_SKIP() << "FIXME wrong result";
-  }
-#endif
   const IsNegativeFunctor<value_type> p;
 
   for (int id = 0; id < FixtureViews::Count; ++id) {
@@ -209,23 +191,17 @@ TEST_F(std_algorithms_partitioning_test, is_partitioned_accepting_view) {
     const bool goldBool =
         goldSolutionIsPartitioned(static_cast<FixtureViews>(id));
     const auto result1 = KE::is_partitioned(exespace(), m_static_view, p);
-    EXPECT_EQ(goldBool, result1);
+    ASSERT_EQ(goldBool, result1);
 
     const auto result2 = KE::is_partitioned(exespace(), m_dynamic_view, p);
-    EXPECT_EQ(goldBool, result2);
+    ASSERT_EQ(goldBool, result2);
 
     const auto result3 = KE::is_partitioned(exespace(), m_strided_view, p);
-    EXPECT_EQ(goldBool, result3);
+    ASSERT_EQ(goldBool, result3);
   }
 }
 
 TEST_F(std_algorithms_partitioning_test, partition_point) {
-#if defined(KOKKOS_ENABLE_CUDA) && \
-    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC
-  if constexpr (std::is_same_v<exespace, Kokkos::Cuda>) {
-    GTEST_SKIP() << "FIXME wrong result";
-  }
-#endif
   const IsNegativeFunctor<value_type> p;
 
   for (int id = 0; id < FixtureViews::Count; ++id) {
@@ -235,17 +211,17 @@ TEST_F(std_algorithms_partitioning_test, partition_point) {
     auto first1        = KE::cbegin(m_static_view);
     auto last1         = KE::cend(m_static_view);
     const auto result1 = KE::partition_point(exespace(), first1, last1, p);
-    EXPECT_EQ(goldIndex, result1 - first1);
+    ASSERT_EQ(goldIndex, result1 - first1);
 
     auto first2        = KE::cbegin(m_dynamic_view);
     auto last2         = KE::cend(m_dynamic_view);
     const auto result2 = KE::partition_point(exespace(), first2, last2, p);
-    EXPECT_EQ(goldIndex, result2 - first2);
+    ASSERT_EQ(goldIndex, result2 - first2);
 
     auto first3        = KE::cbegin(m_strided_view);
     auto last3         = KE::cend(m_strided_view);
     const auto result3 = KE::partition_point(exespace(), first3, last3, p);
-    EXPECT_EQ(goldIndex, result3 - first3);
+    ASSERT_EQ(goldIndex, result3 - first3);
   }
 }
 
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp
index 8832d71f9537163d4bb5697888202ee98f5504e3..c35fc5c24b20687d5d410c1d0ed61bb3b547ce66 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp
@@ -117,12 +117,12 @@ void verify_data(ViewTypeData view_data_h, ViewTypeTest view_test,
   // check that returned iterators are correct
   const std::size_t std_diff = std_result - KE::begin(view_data_h);
   const std::size_t my_diff  = my_result - KE::begin(view_test);
-  EXPECT_EQ(std_diff, my_diff);
+  ASSERT_EQ(std_diff, my_diff);
 
   // check the actual data after algo has been applied
   auto view_test_h = create_host_space_copy(view_test);
   for (std::size_t i = 0; i < my_diff; ++i) {
-    EXPECT_EQ(view_test_h(i), view_data_h[i]);
+    ASSERT_EQ(view_test_h(i), view_data_h[i]);
     // std::cout << "i= " << i << " "
     // 	      << "mine: " << view_test_h(i) << " "
     // 	      << "std: " << view_data_h(i)
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp
index 949f8f60c938cfe315ea32ea4f04941551a2ac77..3d7c52108be0578943dc0580fbe948f16cc73d35 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp
@@ -135,12 +135,12 @@ void verify_data(ViewFromType view_from, ViewDestType view_dest,
   // check that returned iterators are correct
   const std::size_t std_diff = std_result - gold_dest_std.begin();
   const std::size_t my_diff  = my_result - KE::begin(view_dest);
-  EXPECT_EQ(std_diff, my_diff);
+  ASSERT_EQ(std_diff, my_diff);
 
   // check the actual data after algo has been applied
   auto view_dest_h = create_host_space_copy(view_dest);
   for (std::size_t i = 0; i < my_diff; ++i) {
-    EXPECT_EQ(view_dest_h(i), gold_dest_std[i]);
+    ASSERT_EQ(view_dest_h(i), gold_dest_std[i]);
     // std::cout << "i= " << i << " "
     // 	      << "mine: " << view_dest_h(i) << " "
     // 	      << "std: " << gold_dest_std[i]
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp
index 9dc1e4a7e164299693f83407b0736cdcd4e2909d..cb699aa923568e6c2c2079cae096fe92650f7332 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp
@@ -119,12 +119,12 @@ void verify_data(ViewTypeFrom view_from, ViewTypeDest view_dest,
   // check that returned iterators are correct
   const std::size_t std_diff = std_result - gold_dest_std.begin();
   const std::size_t my_diff  = my_result - KE::begin(view_dest);
-  EXPECT_EQ(std_diff, my_diff);
+  ASSERT_EQ(std_diff, my_diff);
 
   // check the actual data after algo has been applied
   auto view_dest_h = create_host_space_copy(view_dest);
   for (std::size_t i = 0; i < my_diff; ++i) {
-    EXPECT_EQ(view_dest_h(i), gold_dest_std[i]);
+    ASSERT_EQ(view_dest_h(i), gold_dest_std[i]);
     // std::cout << "i= " << i << " "
     // 	      << "mine: " << view_dest_h(i) << " "
     // 	      << "std: " << gold_dest_std[i]
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp
index e9d15f29d88415c03c9d66866375fa781e371ecb..f06f2234eedb6bde1a17804fb3a4309a1632278f 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp
@@ -112,12 +112,12 @@ void verify_data(ViewTypeData view_data_h, ViewTypeTest view_test,
   // check that returned iterators are correct
   const std::size_t std_diff = std_result - KE::begin(view_data_h);
   const std::size_t my_diff  = my_result - KE::begin(view_test);
-  EXPECT_EQ(std_diff, my_diff);
+  ASSERT_EQ(std_diff, my_diff);
 
   // check the actual data after algo has been applied
   auto view_test_h = create_host_space_copy(view_test);
   for (std::size_t i = 0; i < my_diff; ++i) {
-    EXPECT_EQ(view_test_h(i), view_data_h[i]);
+    ASSERT_EQ(view_test_h(i), view_data_h[i]);
     // std::cout << "i= " << i << " "
     // 	      << "mine: " << view_test_h(i) << " "
     // 	      << "std: " << view_data_h(i)
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp
index b226de5535fee97c9f1679407a27c02fc06f5f52..a22ab32d764ae46783c8a3f9d4b794df95733bb8 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp
@@ -104,30 +104,30 @@ void verify_data(const std::string& name, ViewType1 test_view,
   }
 
   else if (name == "one-element-a") {
-    EXPECT_EQ(view_h(0), ValueType{1});
+    ASSERT_EQ(view_h(0), ValueType{1});
   }
 
   else if (name == "one-element-b") {
-    EXPECT_EQ(view_h(0), new_value);
+    ASSERT_EQ(view_h(0), new_value);
   }
 
   else if (name == "two-elements-a") {
-    EXPECT_EQ(view_h(0), ValueType{1});
-    EXPECT_EQ(view_h(1), new_value);
+    ASSERT_EQ(view_h(0), ValueType{1});
+    ASSERT_EQ(view_h(1), new_value);
   }
 
   else if (name == "two-elements-b") {
-    EXPECT_EQ(view_h(0), new_value);
-    EXPECT_EQ(view_h(1), ValueType{-1});
+    ASSERT_EQ(view_h(0), new_value);
+    ASSERT_EQ(view_h(1), ValueType{-1});
   }
 
   else if (name == "small-a") {
     for (std::size_t i = 0; i < view_h.extent(0); ++i) {
       if (i == 0 || i == 3 || i == 5 || i == 6) {
-        EXPECT_EQ(view_h(i), new_value);
+        ASSERT_EQ(view_h(i), new_value);
       } else {
         const auto gold = ValueType{-5} + static_cast<ValueType>(i + 1);
-        EXPECT_EQ(view_h(i), gold);
+        ASSERT_EQ(view_h(i), gold);
       }
     }
   }
@@ -135,9 +135,9 @@ void verify_data(const std::string& name, ViewType1 test_view,
   else if (name == "small-b") {
     for (std::size_t i = 0; i < view_h.extent(0); ++i) {
       if (i < 4) {
-        EXPECT_EQ(view_h(i), ValueType{-1});
+        ASSERT_EQ(view_h(i), ValueType{-1});
       } else {
-        EXPECT_EQ(view_h(i), new_value);
+        ASSERT_EQ(view_h(i), new_value);
       }
     }
   }
@@ -145,9 +145,9 @@ void verify_data(const std::string& name, ViewType1 test_view,
   else if (name == "medium" || name == "large") {
     for (std::size_t i = 0; i < view_h.extent(0); ++i) {
       if (i % 2 == 0) {
-        EXPECT_EQ(view_h(i), ValueType{-1});
+        ASSERT_EQ(view_h(i), ValueType{-1});
       } else {
-        EXPECT_EQ(view_h(i), new_value);
+        ASSERT_EQ(view_h(i), new_value);
       }
     }
   }
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp
index 16b181fdd220a026ad5e62646e2e6af7e9e6cab5..a964ec8e173e7656a4b69fbfeb0a4e9adfa65da8 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp
@@ -112,40 +112,40 @@ void verify_data(const std::string& name, ViewTypeFrom view_from,
   }
 
   else if (name == "one-element-a") {
-    EXPECT_EQ(view_from_h(0), ValueType{1});
-    EXPECT_EQ(view_test_h(0), view_from_h(0));
+    ASSERT_EQ(view_from_h(0), ValueType{1});
+    ASSERT_EQ(view_test_h(0), view_from_h(0));
   }
 
   else if (name == "one-element-b") {
-    EXPECT_EQ(view_from_h(0), ValueType{2});
-    EXPECT_EQ(view_test_h(0), new_value);
+    ASSERT_EQ(view_from_h(0), ValueType{2});
+    ASSERT_EQ(view_test_h(0), new_value);
   }
 
   else if (name == "two-elements-a") {
-    EXPECT_EQ(view_from_h(0), ValueType{1});
-    EXPECT_EQ(view_from_h(1), ValueType{2});
+    ASSERT_EQ(view_from_h(0), ValueType{1});
+    ASSERT_EQ(view_from_h(1), ValueType{2});
 
-    EXPECT_EQ(view_test_h(0), view_from_h(0));
-    EXPECT_EQ(view_test_h(1), new_value);
+    ASSERT_EQ(view_test_h(0), view_from_h(0));
+    ASSERT_EQ(view_test_h(1), new_value);
   }
 
   else if (name == "two-elements-b") {
-    EXPECT_EQ(view_from_h(0), ValueType{2});
-    EXPECT_EQ(view_from_h(1), ValueType{-1});
+    ASSERT_EQ(view_from_h(0), ValueType{2});
+    ASSERT_EQ(view_from_h(1), ValueType{-1});
 
-    EXPECT_EQ(view_test_h(0), new_value);
-    EXPECT_EQ(view_test_h(1), view_from_h(1));
+    ASSERT_EQ(view_test_h(0), new_value);
+    ASSERT_EQ(view_test_h(1), view_from_h(1));
   }
 
   else if (name == "small-a") {
     for (std::size_t i = 0; i < view_test_h.extent(0); ++i) {
       if (i == 0 || i == 3 || i == 5 || i == 6) {
-        EXPECT_EQ(view_from_h(i), ValueType{2});
-        EXPECT_EQ(view_test_h(i), new_value);
+        ASSERT_EQ(view_from_h(i), ValueType{2});
+        ASSERT_EQ(view_test_h(i), new_value);
       } else {
         const auto gold = ValueType{-5} + static_cast<ValueType>(i + 1);
-        EXPECT_EQ(view_from_h(i), gold);
-        EXPECT_EQ(view_test_h(i), gold);
+        ASSERT_EQ(view_from_h(i), gold);
+        ASSERT_EQ(view_test_h(i), gold);
       }
     }
   }
@@ -153,11 +153,11 @@ void verify_data(const std::string& name, ViewTypeFrom view_from,
   else if (name == "small-b") {
     for (std::size_t i = 0; i < view_test_h.extent(0); ++i) {
       if (i < 4) {
-        EXPECT_EQ(view_from_h(i), ValueType{-1});
-        EXPECT_EQ(view_test_h(i), view_from_h(i));
+        ASSERT_EQ(view_from_h(i), ValueType{-1});
+        ASSERT_EQ(view_test_h(i), view_from_h(i));
       } else {
-        EXPECT_EQ(view_from_h(i), ValueType{2});
-        EXPECT_EQ(view_test_h(i), new_value);
+        ASSERT_EQ(view_from_h(i), ValueType{2});
+        ASSERT_EQ(view_test_h(i), new_value);
       }
     }
   }
@@ -165,11 +165,11 @@ void verify_data(const std::string& name, ViewTypeFrom view_from,
   else if (name == "medium" || name == "large") {
     for (std::size_t i = 0; i < view_test_h.extent(0); ++i) {
       if (i % 2 == 0) {
-        EXPECT_EQ(view_from_h(i), ValueType{-1});
-        EXPECT_EQ(view_test_h(i), view_from_h(i));
+        ASSERT_EQ(view_from_h(i), ValueType{-1});
+        ASSERT_EQ(view_test_h(i), view_from_h(i));
       } else {
-        EXPECT_EQ(view_from_h(i), ValueType{2});
-        EXPECT_EQ(view_test_h(i), new_value);
+        ASSERT_EQ(view_from_h(i), ValueType{2});
+        ASSERT_EQ(view_test_h(i), new_value);
       }
     }
   }
@@ -202,7 +202,7 @@ void run_single_scenario(const InfoType& scenario_info) {
         KE::replace_copy(exespace(), KE::cbegin(view_from), KE::cend(view_from),
                          KE::begin(view_dest), old_value, new_value);
     verify_data(name, view_from, view_dest, new_value);
-    EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext));
+    ASSERT_EQ(rit, (KE::begin(view_dest) + view_ext));
   }
 
   {
@@ -215,7 +215,7 @@ void run_single_scenario(const InfoType& scenario_info) {
                                 KE::cend(view_from), KE::begin(view_dest),
                                 old_value, new_value);
     verify_data(name, view_from, view_dest, new_value);
-    EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext));
+    ASSERT_EQ(rit, (KE::begin(view_dest) + view_ext));
   }
 
   {
@@ -227,7 +227,7 @@ void run_single_scenario(const InfoType& scenario_info) {
     auto rit = KE::replace_copy(exespace(), view_from, view_dest, old_value,
                                 new_value);
     verify_data(name, view_from, view_dest, new_value);
-    EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext));
+    ASSERT_EQ(rit, (KE::begin(view_dest) + view_ext));
   }
 
   {
@@ -239,7 +239,7 @@ void run_single_scenario(const InfoType& scenario_info) {
     auto rit = KE::replace_copy("label", exespace(), view_from, view_dest,
                                 old_value, new_value);
     verify_data(name, view_from, view_dest, new_value);
-    EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext));
+    ASSERT_EQ(rit, (KE::begin(view_dest) + view_ext));
   }
 
   Kokkos::fence();
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp
index a402e30ad9c02066ef2a7f7e263bc430fc709512..ceeba889711953979009726e7e174ee1e996a4a0 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp
@@ -112,40 +112,40 @@ void verify_data(const std::string& name, ViewTypeFrom view_from,
   }
 
   else if (name == "one-element-a") {
-    EXPECT_EQ(view_from_h(0), ValueType{1});
-    EXPECT_EQ(view_test_h(0), view_from_h(0));
+    ASSERT_EQ(view_from_h(0), ValueType{1});
+    ASSERT_EQ(view_test_h(0), view_from_h(0));
   }
 
   else if (name == "one-element-b") {
-    EXPECT_EQ(view_from_h(0), ValueType{2});
-    EXPECT_EQ(view_test_h(0), new_value);
+    ASSERT_EQ(view_from_h(0), ValueType{2});
+    ASSERT_EQ(view_test_h(0), new_value);
   }
 
   else if (name == "two-elements-a") {
-    EXPECT_EQ(view_from_h(0), ValueType{1});
-    EXPECT_EQ(view_from_h(1), ValueType{2});
+    ASSERT_EQ(view_from_h(0), ValueType{1});
+    ASSERT_EQ(view_from_h(1), ValueType{2});
 
-    EXPECT_EQ(view_test_h(0), view_from_h(0));
-    EXPECT_EQ(view_test_h(1), new_value);
+    ASSERT_EQ(view_test_h(0), view_from_h(0));
+    ASSERT_EQ(view_test_h(1), new_value);
   }
 
   else if (name == "two-elements-b") {
-    EXPECT_EQ(view_from_h(0), ValueType{2});
-    EXPECT_EQ(view_from_h(1), ValueType{-1});
+    ASSERT_EQ(view_from_h(0), ValueType{2});
+    ASSERT_EQ(view_from_h(1), ValueType{-1});
 
-    EXPECT_EQ(view_test_h(0), new_value);
-    EXPECT_EQ(view_test_h(1), view_from_h(1));
+    ASSERT_EQ(view_test_h(0), new_value);
+    ASSERT_EQ(view_test_h(1), view_from_h(1));
   }
 
   else if (name == "small-a") {
     for (std::size_t i = 0; i < view_test_h.extent(0); ++i) {
       if (i == 0 || i == 3 || i == 5 || i == 6) {
-        EXPECT_EQ(view_from_h(i), ValueType{2});
-        EXPECT_EQ(view_test_h(i), new_value);
+        ASSERT_EQ(view_from_h(i), ValueType{2});
+        ASSERT_EQ(view_test_h(i), new_value);
       } else {
         const auto gold = ValueType{-5} + static_cast<ValueType>(i + 1);
-        EXPECT_EQ(view_from_h(i), gold);
-        EXPECT_EQ(view_test_h(i), gold);
+        ASSERT_EQ(view_from_h(i), gold);
+        ASSERT_EQ(view_test_h(i), gold);
       }
     }
   }
@@ -153,11 +153,11 @@ void verify_data(const std::string& name, ViewTypeFrom view_from,
   else if (name == "small-b") {
     for (std::size_t i = 0; i < view_test_h.extent(0); ++i) {
       if (i < 4) {
-        EXPECT_EQ(view_from_h(i), ValueType{-1});
-        EXPECT_EQ(view_test_h(i), view_from_h(i));
+        ASSERT_EQ(view_from_h(i), ValueType{-1});
+        ASSERT_EQ(view_test_h(i), view_from_h(i));
       } else {
-        EXPECT_EQ(view_from_h(i), ValueType{2});
-        EXPECT_EQ(view_test_h(i), new_value);
+        ASSERT_EQ(view_from_h(i), ValueType{2});
+        ASSERT_EQ(view_test_h(i), new_value);
       }
     }
   }
@@ -165,11 +165,11 @@ void verify_data(const std::string& name, ViewTypeFrom view_from,
   else if (name == "medium" || name == "large") {
     for (std::size_t i = 0; i < view_test_h.extent(0); ++i) {
       if (i % 2 == 0) {
-        EXPECT_EQ(view_from_h(i), ValueType{-1});
-        EXPECT_EQ(view_test_h(i), view_from_h(i));
+        ASSERT_EQ(view_from_h(i), ValueType{-1});
+        ASSERT_EQ(view_test_h(i), view_from_h(i));
       } else {
-        EXPECT_EQ(view_from_h(i), ValueType{2});
-        EXPECT_EQ(view_test_h(i), new_value);
+        ASSERT_EQ(view_from_h(i), ValueType{2});
+        ASSERT_EQ(view_test_h(i), new_value);
       }
     }
   }
@@ -209,7 +209,7 @@ void run_single_scenario(const InfoType& scenario_info) {
                                    KE::cend(view_from), KE::begin(view_dest),
                                    pred_type(), new_value);
     verify_data(name, view_from, view_dest, new_value);
-    EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext));
+    ASSERT_EQ(rit, (KE::begin(view_dest) + view_ext));
   }
 
   {
@@ -220,7 +220,7 @@ void run_single_scenario(const InfoType& scenario_info) {
                                    KE::cend(view_from), KE::begin(view_dest),
                                    pred_type(), new_value);
     verify_data(name, view_from, view_dest, new_value);
-    EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext));
+    ASSERT_EQ(rit, (KE::begin(view_dest) + view_ext));
   }
 
   {
@@ -230,7 +230,7 @@ void run_single_scenario(const InfoType& scenario_info) {
     auto rit = KE::replace_copy_if(exespace(), view_from, view_dest,
                                    pred_type(), new_value);
     verify_data(name, view_from, view_dest, new_value);
-    EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext));
+    ASSERT_EQ(rit, (KE::begin(view_dest) + view_ext));
   }
 
   {
@@ -240,7 +240,7 @@ void run_single_scenario(const InfoType& scenario_info) {
     auto rit = KE::replace_copy_if("label", exespace(), view_from, view_dest,
                                    pred_type(), new_value);
     verify_data(name, view_from, view_dest, new_value);
-    EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext));
+    ASSERT_EQ(rit, (KE::begin(view_dest) + view_ext));
   }
 
   Kokkos::fence();
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp
index f481144e1ce36a4884a0b83f295681c8ed27b29b..802c0093c5ccc92c7d15da032854606793ae2fb9 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp
@@ -138,7 +138,7 @@ void verify_data(ViewType1 data_view,  // contains data
       // 		<< data_view_dc(i) << " "
       // 		<< data_view_h(i) << " "
       // 		<< test_view_h(i) << std::endl;
-      EXPECT_EQ(data_view_h(i), test_view_h(i));
+      ASSERT_EQ(data_view_h(i), test_view_h(i));
     }
   }
 }
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp
index 7d16e54029d35885bec3e71108d463812aee57e7..6e6ca72783003d6f407b793b7fec1989c5b4c72a 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp
@@ -77,7 +77,7 @@ void verify_data(ViewType1 test_view, ViewType2 orig_view) {
 
   const std::size_t ext = test_view.extent(0);
   for (std::size_t i = 0; i < ext; ++i) {
-    EXPECT_EQ(tv_h(i), ov_h(ext - i - 1));
+    ASSERT_EQ(tv_h(i), ov_h(ext - i - 1));
   }
 }
 
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp
index a5a6f99bac36f2f1e30f44831bbfab47a1a61a5c..5638cbee4a621aec4dfcb80966674d233533e4c6 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp
@@ -136,13 +136,13 @@ void verify_data(ResultIt result_it, ViewType view, ViewHostType data_view_host,
   // make sure results match
   const auto my_diff  = result_it - KE::begin(view);
   const auto std_diff = std_rit - KE::begin(data_view_host);
-  EXPECT_EQ(my_diff, std_diff);
+  ASSERT_EQ(my_diff, std_diff);
 
   // check views match
   auto view_h           = create_host_space_copy(view);
   const std::size_t ext = view_h.extent(0);
   for (std::size_t i = 0; i < ext; ++i) {
-    EXPECT_EQ(view_h(i), data_view_host[i]);
+    ASSERT_EQ(view_h(i), data_view_host[i]);
     // std::cout << "i= " << i << " "
     // 	      << "mine: " << view_h(i) << " "
     // 	      << "std: " << data_view_host(i)
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp
index 27451a1d04953384b488b06ea0086db343f088a5..d0caca7cea3f9576ff6814ef79d40ae0ef8db06f 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp
@@ -139,7 +139,7 @@ void verify_data(ViewTypeFrom view_from, ViewTypeTest view_test,
                    std_gold_h.begin());
 
   for (std::size_t i = 0; i < ext; ++i) {
-    EXPECT_EQ(view_test_h(i), std_gold_h[i]);
+    ASSERT_EQ(view_test_h(i), std_gold_h[i]);
     // std::cout << "i= " << i << " "
     // 	      << "from: " << view_from_h(i) << " "
     // 	      << "mine: " << view_test_h(i) << " "
@@ -177,7 +177,7 @@ void run_single_scenario(const InfoType& scenario_info,
     auto rit  = KE::rotate_copy(exespace(), KE::cbegin(view_from), n_it,
                                KE::cend(view_from), KE::begin(view_dest));
     verify_data(view_from, view_dest, rotation_point);
-    EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext));
+    ASSERT_EQ(rit, (KE::begin(view_dest) + view_ext));
   }
 
   {
@@ -187,7 +187,7 @@ void run_single_scenario(const InfoType& scenario_info,
     auto rit = KE::rotate_copy("label", exespace(), KE::cbegin(view_from), n_it,
                                KE::cend(view_from), KE::begin(view_dest));
     verify_data(view_from, view_dest, rotation_point);
-    EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext));
+    ASSERT_EQ(rit, (KE::begin(view_dest) + view_ext));
   }
 
   {
@@ -196,7 +196,7 @@ void run_single_scenario(const InfoType& scenario_info,
     auto rit =
         KE::rotate_copy(exespace(), view_from, rotation_point, view_dest);
     verify_data(view_from, view_dest, rotation_point);
-    EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext));
+    ASSERT_EQ(rit, (KE::begin(view_dest) + view_ext));
   }
 
   {
@@ -205,7 +205,7 @@ void run_single_scenario(const InfoType& scenario_info,
     auto rit = KE::rotate_copy("label", exespace(), view_from, rotation_point,
                                view_dest);
     verify_data(view_from, view_dest, rotation_point);
-    EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext));
+    ASSERT_EQ(rit, (KE::begin(view_dest) + view_ext));
   }
 
   Kokkos::fence();
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp
index ab4bf50713633a81b98d5745f054225644200cc6..021609c444d276dd7ba659262815f2057ad04582 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp
@@ -259,7 +259,7 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t seq_ext,
                             KE::cbegin(s_view), KE::cend(s_view), args...);
     const auto mydiff = myrit - KE::cbegin(view);
     const auto stddiff = stdrit - KE::cbegin(view_h);
-    EXPECT_EQ(mydiff, stddiff);
+    ASSERT_EQ(mydiff, stddiff);
   }
 
   {
@@ -268,21 +268,21 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t seq_ext,
                    KE::cbegin(s_view), KE::cend(s_view), args...);
     const auto mydiff  = myrit - KE::cbegin(view);
     const auto stddiff = stdrit - KE::cbegin(view_h);
-    EXPECT_EQ(mydiff, stddiff);
+    ASSERT_EQ(mydiff, stddiff);
   }
 
   {
     auto myrit         = KE::search(exespace(), view, s_view, args...);
     const auto mydiff  = myrit - KE::begin(view);
     const auto stddiff = stdrit - KE::cbegin(view_h);
-    EXPECT_EQ(mydiff, stddiff);
+    ASSERT_EQ(mydiff, stddiff);
   }
 
   {
     auto myrit         = KE::search("label", exespace(), view, s_view, args...);
     const auto mydiff  = myrit - KE::begin(view);
     const auto stddiff = stdrit - KE::cbegin(view_h);
-    EXPECT_EQ(mydiff, stddiff);
+    ASSERT_EQ(mydiff, stddiff);
   }
 
   Kokkos::fence();
@@ -325,12 +325,6 @@ void run_all_scenarios() {
 }
 
 TEST(std_algorithms_non_mod_seq_ops, search) {
-#if defined(KOKKOS_ENABLE_CUDA) && \
-    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC
-  if constexpr (std::is_same_v<exespace, Kokkos::Cuda>) {
-    GTEST_SKIP() << "FIXME wrong result";
-  }
-#endif
   run_all_scenarios<DynamicTag, int>();
   run_all_scenarios<StridedThreeTag, int>();
 }
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp
index a6fe9c1e896c5f92055dfed6230dea00999c3dd7..53ad8daa2ec93779c3ee877827125fd1378d674c 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp
@@ -203,26 +203,26 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t count,
     auto myrit = KE::search_n(exespace(), KE::cbegin(view), KE::cend(view),
                               count, value, args...);
     const auto mydiff = myrit - KE::cbegin(view);
-    EXPECT_EQ(mydiff, stddiff);
+    ASSERT_EQ(mydiff, stddiff);
   }
 
   {
     auto myrit        = KE::search_n("label", exespace(), KE::cbegin(view),
                               KE::cend(view), count, value, args...);
     const auto mydiff = myrit - KE::cbegin(view);
-    EXPECT_EQ(mydiff, stddiff);
+    ASSERT_EQ(mydiff, stddiff);
   }
 
   {
     auto myrit = KE::search_n("label", exespace(), view, count, value, args...);
     const auto mydiff = myrit - KE::begin(view);
-    EXPECT_EQ(mydiff, stddiff);
+    ASSERT_EQ(mydiff, stddiff);
   }
 
   {
     auto myrit        = KE::search_n(exespace(), view, count, value, args...);
     const auto mydiff = myrit - KE::begin(view);
-    EXPECT_EQ(mydiff, stddiff);
+    ASSERT_EQ(mydiff, stddiff);
   }
 
   Kokkos::fence();
@@ -297,12 +297,6 @@ void run_all_scenarios() {
 }
 
 TEST(std_algorithms_non_mod_seq_ops, search_n) {
-#if defined(KOKKOS_ENABLE_CUDA) && \
-    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC
-  if constexpr (std::is_same_v<exespace, Kokkos::Cuda>) {
-    GTEST_SKIP() << "FIXME wrong result";
-  }
-#endif
   run_all_scenarios<DynamicTag, int>();
   run_all_scenarios<StridedThreeTag, int>();
 }
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp
index 8e4ced963584b1afc7650bcbd0a5241902f3f212..0b5fe9216eac36e5f3422efa61d3bd22b322d017 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp
@@ -103,12 +103,12 @@ void verify_data(ResultIt result_it, ViewType view, ViewHostType data_view_host,
   // make sure results match
   const auto my_diff  = result_it - KE::begin(view);
   const auto std_diff = std_rit - KE::begin(data_view_host);
-  EXPECT_EQ(my_diff, std_diff);
+  ASSERT_EQ(my_diff, std_diff);
 
   // check views match
   auto view_h = create_host_space_copy(view);
   for (std::size_t i = 0; i < (std::size_t)my_diff; ++i) {
-    EXPECT_EQ(view_h(i), data_view_host[i]);
+    ASSERT_EQ(view_h(i), data_view_host[i]);
     // std::cout << "i= " << i << " "
     // 	      << "mine: " << view_h(i) << " "
     // 	      << "std: " << data_view_host(i)
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp
index a1614be027b474537199975052965929ab3016af..8e4ae94375902ccf9a8f7e0d899b8ab7cecb2844 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp
@@ -101,14 +101,14 @@ void verify_data(ResultIt result_it, ViewType view, ViewHostType data_view_host,
   // make sure results match
   const auto my_diff  = KE::end(view) - result_it;
   const auto std_diff = KE::end(data_view_host) - std_rit;
-  EXPECT_EQ(my_diff, std_diff);
+  ASSERT_EQ(my_diff, std_diff);
 
   // check views match
   auto view_h = create_host_space_copy(view);
   auto it1    = KE::cbegin(view_h);
   auto it2    = KE::cbegin(data_view_host);
   for (std::size_t i = 0; i < (std::size_t)my_diff; ++i) {
-    EXPECT_EQ(it1[i], it2[i]);
+    ASSERT_EQ(it1[i], it2[i]);
     // std::cout << "i= " << i << " "
     // 	      << "mine: " << it1[i] << " "
     // 	      << "std:  " << it2[i]
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentDifference.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentDifference.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c388cadc9bba4a1095515d75cb452232474dea83
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentDifference.cpp
@@ -0,0 +1,220 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamAdjacentDifference {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct PlusFunctor {
+  KOKKOS_INLINE_FUNCTION constexpr ValueType operator()(
+      const ValueType& lhs, const ValueType& rhs) const {
+    return lhs + rhs;
+  }
+};
+
+template <class SourceViewType, class DestViewType, class DistancesViewType,
+          class IntraTeamSentinelView, class BinaryOp>
+struct TestFunctorA {
+  SourceViewType m_sourceView;
+  DestViewType m_destView;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+  BinaryOp m_binaryOp;
+
+  TestFunctorA(const SourceViewType sourceView, const DestViewType destView,
+               const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView, int apiPick,
+               BinaryOp binaryOp)
+      : m_sourceView(sourceView),
+        m_destView(destView),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick),
+        m_binaryOp(std::move(binaryOp)) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowViewFrom =
+        Kokkos::subview(m_sourceView, myRowIndex, Kokkos::ALL());
+    auto myRowViewDest = Kokkos::subview(m_destView, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist = 0;
+
+    switch (m_apiPick) {
+      case 0: {
+        auto it    = KE::adjacent_difference(member, KE::cbegin(myRowViewFrom),
+                                          KE::cend(myRowViewFrom),
+                                          KE::begin(myRowViewDest));
+        resultDist = KE::distance(KE::begin(myRowViewDest), it);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_distancesView(myRowIndex) = resultDist;
+        });
+        break;
+      }
+
+      case 1: {
+        auto it    = KE::adjacent_difference(member, KE::cbegin(myRowViewFrom),
+                                          KE::cend(myRowViewFrom),
+                                          KE::begin(myRowViewDest), m_binaryOp);
+        resultDist = KE::distance(KE::begin(myRowViewDest), it);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_distancesView(myRowIndex) = resultDist;
+        });
+        break;
+      }
+
+      case 2: {
+        auto it = KE::adjacent_difference(member, myRowViewFrom, myRowViewDest);
+        resultDist = KE::distance(KE::begin(myRowViewDest), it);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_distancesView(myRowIndex) = resultDist;
+        });
+        break;
+      }
+
+      case 3: {
+        auto it = KE::adjacent_difference(member, myRowViewFrom, myRowViewDest,
+                                          m_binaryOp);
+        resultDist = KE::distance(KE::begin(myRowViewDest), it);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_distancesView(myRowIndex) = resultDist;
+        });
+        break;
+      }
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     use a rank-2 view randomly filled with values,
+     and run a team-level adjacent_difference
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range.
+  constexpr ValueType lowerBound = 5;
+  constexpr ValueType upperBound = 523;
+  const auto bounds              = make_bounds(lowerBound, upperBound);
+
+  auto [sourceView, sourceViewBeforeOp_h] = create_random_view_and_host_clone(
+      LayoutTag{}, numTeams, numCols, bounds, "sourceView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // to verify that things work, each team stores the result
+  // of its adjacent_difference call, and then we check
+  // that these match what we expect
+  Kokkos::View<ValueType**> destView("destView", numTeams, numCols);
+
+  // adjacent_difference returns an iterator so to verify that it is correct
+  // each team stores the distance of the returned iterator from the beginning
+  // of the interval that team operates on and then we check that these
+  // distances match the std result
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView,
+                   apiId, PlusFunctor<ValueType>{});
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run cpp-std kernel and check
+  // -----------------------------------------------
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestView("stdDestView",
+                                                           numTeams, numCols);
+
+  for (std::size_t i = 0; i < sourceView.extent(0); ++i) {
+    auto rowFrom = Kokkos::subview(sourceViewBeforeOp_h, i, Kokkos::ALL());
+    auto rowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL());
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+
+    switch (apiId) {
+      case 0:
+      case 2: {
+        auto it = std::adjacent_difference(KE::begin(rowFrom), KE::end(rowFrom),
+                                           KE::begin(rowDest));
+
+        const std::size_t stdDistance = KE::distance(KE::begin(rowDest), it);
+        ASSERT_EQ(stdDistance, distancesView_h(i));
+        break;
+      }
+
+      case 1:
+      case 3: {
+        auto it = std::adjacent_difference(KE::begin(rowFrom), KE::end(rowFrom),
+                                           KE::begin(rowDest),
+                                           PlusFunctor<ValueType>{});
+
+        const std::size_t stdDistance = KE::distance(KE::begin(rowDest), it);
+        ASSERT_EQ(stdDistance, distancesView_h(i));
+        break;
+      }
+    }
+  }
+
+  auto dataViewAfterOp_h = create_host_space_copy(destView);
+  expect_equal_host_views(stdDestView, dataViewAfterOp_h);
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) {
+      for (int apiId : {0, 1, 2, 3}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_adjacent_difference_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamAdjacentDifference
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentFind.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentFind.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..95f2934e01b15d73ebd8fd879c90d96bc9d08f5e
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentFind.cpp
@@ -0,0 +1,256 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamAdjacentFind {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct IsEqualFunctor {
+  KOKKOS_INLINE_FUNCTION constexpr bool operator()(const ValueType& lhs,
+                                                   const ValueType& rhs) const {
+    return lhs == rhs;
+  }
+};
+
+template <class DataViewType, class DistancesViewType,
+          class IntraTeamSentinelView, class BinaryPredType>
+struct TestFunctorA {
+  DataViewType m_dataView;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+  BinaryPredType m_binaryPred;
+
+  TestFunctorA(const DataViewType dataView,
+               const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView, int apiPick,
+               BinaryPredType binaryPred)
+      : m_dataView(dataView),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick),
+        m_binaryPred(binaryPred) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowViewFrom = Kokkos::subview(m_dataView, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist = 0;
+
+    switch (m_apiPick) {
+      case 0: {
+        const auto it = KE::adjacent_find(member, KE::cbegin(myRowViewFrom),
+                                          KE::cend(myRowViewFrom));
+        resultDist    = KE::distance(KE::cbegin(myRowViewFrom), it);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_distancesView(myRowIndex) = resultDist;
+        });
+        break;
+      }
+
+      case 1: {
+        const auto it = KE::adjacent_find(member, myRowViewFrom);
+        resultDist    = KE::distance(KE::begin(myRowViewFrom), it);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_distancesView(myRowIndex) = resultDist;
+        });
+        break;
+      }
+
+      case 2: {
+        const auto it =
+            KE::adjacent_find(member, KE::cbegin(myRowViewFrom),
+                              KE::cend(myRowViewFrom), m_binaryPred);
+        resultDist = KE::distance(KE::cbegin(myRowViewFrom), it);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_distancesView(myRowIndex) = resultDist;
+        });
+        break;
+      }
+
+      case 3: {
+        const auto it = KE::adjacent_find(member, myRowViewFrom, m_binaryPred);
+        resultDist    = KE::distance(KE::begin(myRowViewFrom), it);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_distancesView(myRowIndex) = resultDist;
+        });
+        break;
+      }
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(const bool ensureAdjacentFindCanFind, std::size_t numTeams,
+            std::size_t numCols, int apiId) {
+  /* description:
+     use a rank-2 view randomly filled with values,
+     and run a team-level adjacent_find
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range.
+  constexpr ValueType lowerBound = 5;
+  constexpr ValueType upperBound = 523;
+  const auto bounds              = make_bounds(lowerBound, upperBound);
+
+  auto [dataView, dataViewBeforeOp_h] = create_random_view_and_host_clone(
+      LayoutTag{}, numTeams, numCols, bounds, "dataView");
+
+  // If ensureAdjacentFindCanFind == true ensure there are two consecutive equal
+  // elements in each row
+
+  // dataView might not deep copyable (e.g. strided layout) so to prepare it
+  // correctly, we make a new view that is for sure deep copyable, modify it on
+  // the host, deep copy to device and then launch a kernel to copy to dataView
+  auto dataView_dc =
+      create_deep_copyable_compatible_view_with_same_extent(dataView);
+  auto dataView_dc_h = create_mirror_view(Kokkos::HostSpace(), dataView_dc);
+
+  if (ensureAdjacentFindCanFind && numCols > 1) {
+    for (std::size_t i = 0; i < numTeams; ++i) {
+      const auto j = numCols / 2;
+
+      dataView_dc_h(i, j - 1) = dataView_dc_h(i, j);
+    }
+  }
+
+  // copy to dataView_dc and then to dataView
+  Kokkos::deep_copy(dataView_dc, dataView_dc_h);
+
+  CopyFunctorRank2 cpFun(dataView_dc, dataView);
+  Kokkos::parallel_for("copy", dataView.extent(0) * dataView.extent(1), cpFun);
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // adjacent_find returns an iterator so to verify that it is correct
+  // each team stores the distance of the returned iterator from the beginning
+  // of the interval that team operates on and then we check that these
+  // distances match the std result
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  IsEqualFunctor<ValueType> binaryPred;
+  TestFunctorA fnc(dataView, distancesView, intraTeamSentinelView, apiId,
+                   binaryPred);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run cpp-std kernel and check
+  // -----------------------------------------------
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+
+  for (std::size_t i = 0; i < dataView.extent(0); ++i) {
+    auto rowFrom            = Kokkos::subview(dataView_dc_h, i, Kokkos::ALL());
+    const auto rowFromBegin = KE::cbegin(rowFrom);
+    const auto rowFromEnd   = KE::cend(rowFrom);
+    const std::size_t beginEndDist = KE::distance(rowFromBegin, rowFromEnd);
+
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+    switch (apiId) {
+      case 0:
+      case 1: {
+        const auto it = std::adjacent_find(rowFromBegin, rowFromEnd);
+        const std::size_t stdDistance = KE::distance(rowFromBegin, it);
+        ASSERT_EQ(stdDistance, distancesView_h(i));
+
+        if (numCols == 1) {
+          ASSERT_EQ(distancesView_h(i), beginEndDist);
+        } else if (ensureAdjacentFindCanFind) {
+          EXPECT_NE(distancesView_h(i), beginEndDist);
+        }
+
+        break;
+      }
+
+      case 2:
+      case 3: {
+        const auto it =
+            std::adjacent_find(rowFromBegin, rowFromEnd, binaryPred);
+        const std::size_t stdDistance = KE::distance(rowFromBegin, it);
+
+        ASSERT_EQ(stdDistance, distancesView_h(i));
+
+        if (numCols == 1) {
+          ASSERT_EQ(distancesView_h(i), beginEndDist);
+        } else if (ensureAdjacentFindCanFind) {
+          EXPECT_NE(distancesView_h(i), beginEndDist);
+        }
+
+        break;
+      }
+    }
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios(const bool ensureAdjacentFindCanFind) {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {1, 2, 13, 101, 1444, 8153}) {
+      for (int apiId : {0, 1, 2, 3}) {
+        test_A<LayoutTag, ValueType>(ensureAdjacentFindCanFind, numTeams,
+                                     numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_adjacent_find_team_test,
+     two_consecutive_equal_elements_exist) {
+  constexpr bool ensureAdjacentFindCanFind = true;
+
+  run_all_scenarios<DynamicTag, double>(ensureAdjacentFindCanFind);
+  run_all_scenarios<StridedTwoRowsTag, int>(ensureAdjacentFindCanFind);
+  run_all_scenarios<StridedThreeRowsTag, unsigned>(ensureAdjacentFindCanFind);
+}
+
+TEST(std_algorithms_adjacent_find_team_test,
+     two_consecutive_equal_elements_might_exist) {
+  constexpr bool ensureAdjacentFindCanFind = false;
+
+  run_all_scenarios<DynamicTag, double>(ensureAdjacentFindCanFind);
+  run_all_scenarios<StridedTwoRowsTag, int>(ensureAdjacentFindCanFind);
+  run_all_scenarios<StridedThreeRowsTag, unsigned>(ensureAdjacentFindCanFind);
+}
+
+}  // namespace TeamAdjacentFind
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAllOf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAllOf.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7b3dca330af43c354d11bdda17ffbd9b99669766
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAllOf.cpp
@@ -0,0 +1,165 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamAllOf {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct GreaterThanValueFunctor {
+  ValueType m_val;
+
+  KOKKOS_INLINE_FUNCTION
+  GreaterThanValueFunctor(ValueType val) : m_val(val) {}
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator()(ValueType val) const { return (val > m_val); }
+};
+
+template <class DataViewType, class AlllOfResultsViewType,
+          class IntraTeamSentinelView, class UnaryPredType>
+struct TestFunctorA {
+  DataViewType m_dataView;
+  AlllOfResultsViewType m_allOfResultsView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+  UnaryPredType m_unaryPred;
+
+  TestFunctorA(const DataViewType dataView,
+               const AlllOfResultsViewType allOfResultsView,
+               const IntraTeamSentinelView intraTeamSentinelView, int apiPick,
+               UnaryPredType unaryPred)
+      : m_dataView(dataView),
+        m_allOfResultsView(allOfResultsView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick),
+        m_unaryPred(unaryPred) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+
+    auto myRowViewFrom = Kokkos::subview(m_dataView, myRowIndex, Kokkos::ALL());
+    bool result        = false;
+
+    switch (m_apiPick) {
+      case 0: {
+        result = KE::all_of(member, KE::cbegin(myRowViewFrom),
+                            KE::cend(myRowViewFrom), m_unaryPred);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_allOfResultsView(myRowIndex) = result;
+        });
+        break;
+      }
+
+      case 1: {
+        result = KE::all_of(member, myRowViewFrom, m_unaryPred);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_allOfResultsView(myRowIndex) = result;
+        });
+        break;
+      }
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, result, m_allOfResultsView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     use a rank-2 view randomly filled with values,
+     and run a team-level all_of
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range.
+  constexpr ValueType lowerBound = 5;
+  constexpr ValueType upperBound = 523;
+  const auto bounds              = make_bounds(lowerBound, upperBound);
+
+  auto [dataView, dataViewBeforeOp_h] = create_random_view_and_host_clone(
+      LayoutTag{}, numTeams, numCols, bounds, "dataView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // to verify that things work, each team stores the result of its all_of call,
+  // and then we check that these match what we expect
+  Kokkos::View<bool*> allOfResultsView("allOfResultsView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  GreaterThanValueFunctor unaryPred{lowerBound - 1};
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, allOfResultsView, intraTeamSentinelView, apiId,
+                   unaryPred);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run cpp-std kernel and check
+  // -----------------------------------------------
+  auto allOfResultsView_h      = create_host_space_copy(allOfResultsView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+
+  for (std::size_t i = 0; i < dataView.extent(0); ++i) {
+    auto rowFrom = Kokkos::subview(dataViewBeforeOp_h, i, Kokkos::ALL());
+    const bool result =
+        std::all_of(KE::cbegin(rowFrom), KE::cend(rowFrom), unaryPred);
+    ASSERT_EQ(result, allOfResultsView_h(i));
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_all_of_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamAllOf
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAnyOf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAnyOf.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f99617008eecefc6df64ba0387542420bf86f0e0
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAnyOf.cpp
@@ -0,0 +1,165 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamAnyOf {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct GreaterThanValueFunctor {
+  ValueType m_val;
+
+  KOKKOS_INLINE_FUNCTION
+  GreaterThanValueFunctor(ValueType val) : m_val(val) {}
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator()(ValueType val) const { return (val > m_val); }
+};
+
+template <class DataViewType, class AnyOfResultsViewType,
+          class IntraTeamSentinelView, class UnaryPredType>
+struct TestFunctorA {
+  DataViewType m_dataView;
+  AnyOfResultsViewType m_anyOfResultsView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+  UnaryPredType m_unaryPred;
+
+  TestFunctorA(const DataViewType dataView,
+               const AnyOfResultsViewType anyOfResultsView,
+               const IntraTeamSentinelView intraTeamSentinelView, int apiPick,
+               UnaryPredType unaryPred)
+      : m_dataView(dataView),
+        m_anyOfResultsView(anyOfResultsView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick),
+        m_unaryPred(unaryPred) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+
+    auto myRowViewFrom = Kokkos::subview(m_dataView, myRowIndex, Kokkos::ALL());
+    bool result        = false;
+
+    switch (m_apiPick) {
+      case 0: {
+        result = KE::any_of(member, KE::cbegin(myRowViewFrom),
+                            KE::cend(myRowViewFrom), m_unaryPred);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_anyOfResultsView(myRowIndex) = result;
+        });
+        break;
+      }
+
+      case 1: {
+        result = KE::any_of(member, myRowViewFrom, m_unaryPred);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_anyOfResultsView(myRowIndex) = result;
+        });
+        break;
+      }
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, result, m_anyOfResultsView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     use a rank-2 view randomly filled with values,
+     and run a team-level any_of
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range.
+  constexpr ValueType lowerBound = 5;
+  constexpr ValueType upperBound = 523;
+  const auto bounds              = make_bounds(lowerBound, upperBound);
+
+  auto [dataView, dataViewBeforeOp_h] = create_random_view_and_host_clone(
+      LayoutTag{}, numTeams, numCols, bounds, "dataView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // to verify that things work, each team stores the result of its any_of call,
+  // and then we check that these match what we expect
+  Kokkos::View<bool*> anyOfResultsView("anyOfResultsView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  GreaterThanValueFunctor unaryPred{lowerBound};
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, anyOfResultsView, intraTeamSentinelView, apiId,
+                   unaryPred);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run cpp-std kernel and check
+  // -----------------------------------------------
+  auto anyOfResultsView_h      = create_host_space_copy(anyOfResultsView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+
+  for (std::size_t i = 0; i < dataView.extent(0); ++i) {
+    auto rowFrom = Kokkos::subview(dataViewBeforeOp_h, i, Kokkos::ALL());
+    const bool result =
+        std::any_of(KE::cbegin(rowFrom), KE::cend(rowFrom), unaryPred);
+    ASSERT_EQ(result, anyOfResultsView_h(i));
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_any_of_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamAnyOf
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e24ac37bf0122dbf9dfc51dd2e5bc2c121c97144
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy.cpp
@@ -0,0 +1,157 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamCopy {
+
+namespace KE = Kokkos::Experimental;
+
+template <class SourceViewType, class DestViewType, class DistancesViewType,
+          class IntraTeamSentinelView>
+struct TestFunctorA {
+  SourceViewType m_sourceView;
+  DestViewType m_destView;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+
+  TestFunctorA(const SourceViewType sourceView, const DestViewType destView,
+               const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView, int apiPick)
+      : m_sourceView(sourceView),
+        m_destView(destView),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowViewFrom =
+        Kokkos::subview(m_sourceView, myRowIndex, Kokkos::ALL());
+    auto myRowViewDest = Kokkos::subview(m_destView, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist = 0;
+
+    if (m_apiPick == 0) {
+      auto it    = KE::copy(member, KE::begin(myRowViewFrom),
+                         KE::end(myRowViewFrom), KE::begin(myRowViewDest));
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    } else if (m_apiPick == 1) {
+      auto it    = KE::copy(member, myRowViewFrom, myRowViewDest);
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     randomly fill a source view and copy into a destination view.
+     The operation is done via a team parfor with one row per team.
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range
+  auto [sourceView, cloneOfSourceViewBeforeOp_h] =
+      create_random_view_and_host_clone(
+          LayoutTag{}, numTeams, numCols,
+          Kokkos::pair<ValueType, ValueType>{11, 523}, "sourceView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+  // create the destination view
+  Kokkos::View<ValueType**> destView("destView", numTeams, numCols);
+  // make a host copy of the destination view that should be unchanged after the
+  // op
+  auto destViewBeforeOp_h = create_host_space_copy(destView);
+
+  // KE::copy returns an iterator so to verify that it is correct
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the expectation
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView,
+                   apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // check
+  // -----------------------------------------------
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  auto destViewAfterOp_h       = create_host_space_copy(destView);
+  for (std::size_t i = 0; i < destViewBeforeOp_h.extent(0); ++i) {
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+    for (std::size_t j = 0; j < destViewBeforeOp_h.extent(1); ++j) {
+      ASSERT_EQ(destViewBeforeOp_h(i, j), ValueType(0));
+      EXPECT_TRUE(destViewAfterOp_h(i, j) != destViewBeforeOp_h(i, j));
+    }
+    // each team should return an iterator past the last column
+    EXPECT_TRUE(distancesView_h(i) == numCols);
+  }
+
+  expect_equal_host_views(cloneOfSourceViewBeforeOp_h, destViewAfterOp_h);
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 11113}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_copy_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamCopy
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyBackward.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyBackward.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c17c1810deaa45c1e343e251451d5138066fd65b
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyBackward.cpp
@@ -0,0 +1,168 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamCopybackward {
+
+namespace KE = Kokkos::Experimental;
+
+template <class SourceViewType, class DestViewType, class DistancesViewType,
+          class IntraTeamSentinelView>
+struct TestFunctorA {
+  SourceViewType m_sourceView;
+  DestViewType m_destView;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+
+  TestFunctorA(const SourceViewType sourceView, const DestViewType destView,
+               const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView, int apiPick)
+      : m_sourceView(sourceView),
+        m_destView(destView),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowViewFrom =
+        Kokkos::subview(m_sourceView, myRowIndex, Kokkos::ALL());
+    auto myRowViewDest = Kokkos::subview(m_destView, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist = 0;
+
+    if (m_apiPick == 0) {
+      auto it =
+          KE::copy_backward(member, KE::cbegin(myRowViewFrom),
+                            KE::cend(myRowViewFrom), KE::end(myRowViewDest));
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    } else if (m_apiPick == 1) {
+      auto it    = KE::copy_backward(member, myRowViewFrom, myRowViewDest);
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     randomly fill a source view and copy_backward into a destination view.
+     The operation is done via a team parfor with one row per team.
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range
+  auto [sourceView, cloneOfSourceViewBeforeOp_h] =
+      create_random_view_and_host_clone(
+          LayoutTag{}, numTeams, numCols,
+          Kokkos::pair<ValueType, ValueType>{11, 523}, "sourceView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+  // create the destination view: for a meaningful test, the destination
+  // view must have more columns that than the source view so that we
+  // can check that the elements are copied into the right place
+  constexpr std::size_t extra = 10;
+  Kokkos::View<ValueType**> destView("destView", numTeams, numCols + extra);
+  // make a host copy of the destination view that should be unchanged after the
+  // op
+  auto destViewBeforeOp_h = create_host_space_copy(destView);
+
+  // copy_backward returns an iterator so to verify that it is correct
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the expectation
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView,
+                   apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // check
+  // -----------------------------------------------
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  auto destViewAfterOp_h       = create_host_space_copy(destView);
+  for (std::size_t i = 0; i < destViewAfterOp_h.extent(0); ++i) {
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+
+    // first extra num of columns should be unchanged
+    for (std::size_t j = 0; j < extra; ++j) {
+      EXPECT_TRUE(destViewAfterOp_h(i, j) == destViewBeforeOp_h(i, j));
+    }
+
+    // after extra # of column (inclusive) should match the source view
+    for (std::size_t j = extra; j < destViewBeforeOp_h.extent(1); ++j) {
+      EXPECT_TRUE(cloneOfSourceViewBeforeOp_h(i, j - extra) ==
+                  destViewAfterOp_h(i, j));
+    }
+
+    // each team should have returned an interator whose distance
+    // from the beginning of the row should satisfy this
+    EXPECT_TRUE(distancesView_h(i) == extra);
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 11113}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_copy_backward_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamCopybackward
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b32a9be3a1788c7b842ba9b8013371cf56816e4e
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp
@@ -0,0 +1,176 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamCopyIf {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct GreaterThanValueFunctor {
+  ValueType m_val;
+
+  KOKKOS_INLINE_FUNCTION
+  GreaterThanValueFunctor(ValueType val) : m_val(val) {}
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator()(ValueType val) const { return (val > m_val); }
+};
+
+template <class SourceViewType, class DestViewType, class DistancesViewType,
+          class IntraTeamSentinelView, class ValueType>
+struct TestFunctorA {
+  SourceViewType m_sourceView;
+  DestViewType m_destView;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  ValueType m_threshold;
+  int m_apiPick;
+
+  TestFunctorA(const SourceViewType sourceView, const DestViewType destView,
+               const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView,
+               ValueType threshold, int apiPick)
+      : m_sourceView(sourceView),
+        m_destView(destView),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_threshold(threshold),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowViewFrom =
+        Kokkos::subview(m_sourceView, myRowIndex, Kokkos::ALL());
+    auto myRowViewDest = Kokkos::subview(m_destView, myRowIndex, Kokkos::ALL());
+
+    GreaterThanValueFunctor predicate(m_threshold);
+    ptrdiff_t resultDist = 0;
+
+    if (m_apiPick == 0) {
+      auto it =
+          KE::copy_if(member, KE::begin(myRowViewFrom), KE::end(myRowViewFrom),
+                      KE::begin(myRowViewDest), predicate);
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    } else if (m_apiPick == 1) {
+      auto it    = KE::copy_if(member, myRowViewFrom, myRowViewDest, predicate);
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     use a rank-2 view randomly filled with values,
+     and run a team-level copy_if where only the values strictly
+     greater than a threshold are copied into a new view
+   */
+
+  const auto threshold = static_cast<ValueType>(151);
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range.
+  auto [sourceView, sourceViewBeforeOp_h] = create_random_view_and_host_clone(
+      LayoutTag{}, numTeams, numCols,
+      Kokkos::pair<ValueType, ValueType>{5, 523}, "sourceView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+  // create the destination view
+  Kokkos::View<ValueType**> destView("destView", numTeams, numCols);
+
+  // copy_if returns an iterator so to verify that it is correct
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the std result
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView,
+                   threshold, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run cpp-std kernel and check
+  // -----------------------------------------------
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestView("stdDestView",
+                                                           numTeams, numCols);
+  GreaterThanValueFunctor predicate(threshold);
+  for (std::size_t i = 0; i < sourceView.extent(0); ++i) {
+    auto rowFrom = Kokkos::subview(sourceViewBeforeOp_h, i, Kokkos::ALL());
+    auto rowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL());
+    auto it      = std::copy_if(KE::cbegin(rowFrom), KE::cend(rowFrom),
+                           KE::begin(rowDest), predicate);
+    const std::size_t stdDistance = KE::distance(KE::begin(rowDest), it);
+    ASSERT_EQ(stdDistance, distancesView_h(i));
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+
+  auto dataViewAfterOp_h = create_host_space_copy(destView);
+  expect_equal_host_views(stdDestView, dataViewAfterOp_h);
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_copy_if_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamCopyIf
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy_n.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy_n.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7cbc788f8e3c1dae46fe408062e50dbb460e9cf5
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy_n.cpp
@@ -0,0 +1,176 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamCopy_n {
+
+namespace KE = Kokkos::Experimental;
+
+template <class SourceViewType, class DestViewType, class DistancesViewType,
+          class IntraTeamSentinelView>
+struct TestFunctorA {
+  SourceViewType m_sourceView;
+  DestViewType m_destView;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  std::size_t m_copyCount;
+  int m_apiPick;
+
+  TestFunctorA(const SourceViewType fromView, const DestViewType destView,
+               const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView,
+               std::size_t copyCount, int apiPick)
+      : m_sourceView(fromView),
+        m_destView(destView),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_copyCount(copyCount),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowViewFrom =
+        Kokkos::subview(m_sourceView, myRowIndex, Kokkos::ALL());
+    auto myRowViewDest = Kokkos::subview(m_destView, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist = 0;
+
+    if (m_apiPick == 0) {
+      auto it    = KE::copy_n(member, KE::begin(myRowViewFrom), m_copyCount,
+                           KE::begin(myRowViewDest));
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    } else if (m_apiPick == 1) {
+      auto it = KE::copy_n(member, myRowViewFrom, m_copyCount, myRowViewDest);
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, std::size_t copyCount,
+            int apiId) {
+  /* description:
+     randomly fill a source view and copy a copyCount set of values
+     for each row into a destination view. The operation is done via
+     a team parfor with one row per team.
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range
+  auto [sourceView, cloneOfSourceViewBeforeOp_h] =
+      create_random_view_and_host_clone(
+          LayoutTag{}, numTeams, numCols,
+          Kokkos::pair<ValueType, ValueType>{11, 523}, "sourceView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+  // create the destination view
+  Kokkos::View<ValueType**> destView("destView", numTeams, numCols);
+  // make a host copy of destView, should be unchanged after the op
+  auto destViewBeforeOp_h = create_host_space_copy(destView);
+
+  // copy_n returns an iterator so to verify that it is correct
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the expectation
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView,
+                   copyCount, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // check
+  // -----------------------------------------------
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  auto destViewAfterOp_h       = create_host_space_copy(destView);
+  for (std::size_t i = 0; i < destViewBeforeOp_h.extent(0); ++i) {
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+    for (std::size_t j = 0; j < copyCount; ++j) {
+      ASSERT_EQ(destViewAfterOp_h(i, j), cloneOfSourceViewBeforeOp_h(i, j));
+    }
+    for (std::size_t j = copyCount; j < numCols; ++j) {
+      EXPECT_TRUE(destViewAfterOp_h(i, j) == destViewBeforeOp_h(i, j));
+    }
+    // each team should return an iterator past the last column
+    EXPECT_TRUE(distancesView_h(i) == copyCount);
+  }
+}
+
+template <class Tag, class ValueType>
+void run_all_scenarios() {
+  // prepare a map where, for a given set of num cols
+  // we provide a list of counts of elements to copy
+  // key = num of columns,
+  // value = list of num of elemenents to copy
+  const std::map<int, std::vector<int>> scenarios = {
+      {0, {0}},
+      {2, {0, 1, 2}},
+      {6, {0, 1, 2, 5}},
+      {13, {0, 1, 2, 8, 11}},
+      {56, {0, 1, 2, 8, 11, 33, 56}},
+      {123, {0, 1, 11, 33, 56, 89, 112}}};
+
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& scenario : scenarios) {
+      const std::size_t numCols = scenario.first;
+      for (int copyCount : scenario.second) {
+        for (int apiId : {0, 1}) {
+          test_A<Tag, ValueType>(numTeams, numCols, copyCount, apiId);
+        }
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_copy_n_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamCopy_n
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCount.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCount.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..922424afbd98bdd1ee32f79c134215a1b8193ed0
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCount.cpp
@@ -0,0 +1,201 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamCount {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ViewType, class ValuesViewType, class CountsViewType,
+          class IntraTeamSentinelView>
+struct TestFunctorA {
+  ViewType m_view;
+  ValuesViewType m_valuesView;
+  CountsViewType m_countsView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+
+  TestFunctorA(const ViewType view, const ValuesViewType valuesView,
+               const CountsViewType countsView,
+               const IntraTeamSentinelView intraTeamSentinelView, int apiPick)
+      : m_view(view),
+        m_valuesView(valuesView),
+        m_countsView(countsView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto rowIndex = member.league_rank();
+    const auto value    = m_valuesView(rowIndex);
+    auto rowView        = Kokkos::subview(m_view, rowIndex, Kokkos::ALL());
+    std::size_t result  = 0;
+
+    switch (m_apiPick) {
+      case 0: {
+        result =
+            KE::count(member, KE::cbegin(rowView), KE::cend(rowView), value);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this]() { m_countsView(rowIndex) = result; });
+
+        break;
+      }
+
+      case 1: {
+        result = KE::count(member, rowView, value);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this]() { m_countsView(rowIndex) = result; });
+
+        break;
+      }
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, result, m_countsView(rowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(rowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(const bool searched_value_exist, std::size_t numTeams,
+            std::size_t numCols, int apiId) {
+  /* description:
+     use a rank-2 view randomly filled with values,
+     and run a team-level count
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range.
+
+  // Boundaries choosen so that every drawn number is at least once in the given
+  // row
+  const ValueType lowerBound = numCols / 4;
+  const ValueType upperBound = 1 + numCols * 3 / 4;
+  const auto bounds          = make_bounds(lowerBound, upperBound);
+
+  auto [dataView, dataViewBeforeOp_h] = create_random_view_and_host_clone(
+      LayoutTag{}, numTeams, numCols, bounds, "dataView");
+
+  // If searched_value_exist == true, we want to ensure that count result is >
+  // 0, so we randomly pick a value to look for from a given row.
+  //
+  // If searched_value_exist == false, we want to ensure that count returns 0,
+  // so we pick a value that's outside of view boundaries.
+  Kokkos::View<ValueType*> valuesView("valuesView", numTeams);
+  auto valuesView_h = create_mirror_view(Kokkos::HostSpace(), valuesView);
+
+  using rand_pool =
+      Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace>;
+  rand_pool pool(lowerBound * upperBound);
+
+  if (searched_value_exist) {
+    Kokkos::View<std::size_t*, Kokkos::DefaultHostExecutionSpace> randomIndices(
+        "randomIndices", numTeams);
+    Kokkos::fill_random(randomIndices, pool, 0, numCols);
+
+    for (std::size_t i = 0; i < numTeams; ++i) {
+      const std::size_t j = randomIndices(i);
+      valuesView_h(i)     = dataViewBeforeOp_h(i, j);
+    }
+  } else {
+    Kokkos::fill_random(valuesView_h, pool, upperBound, upperBound * 2);
+  }
+
+  Kokkos::deep_copy(valuesView, valuesView_h);
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // to verify that things work, each team stores the result of its count
+  // call, and then we check that these match what we expect
+  Kokkos::View<std::size_t*> countsView("countsView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, valuesView, countsView, intraTeamSentinelView,
+                   apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // check
+  // -----------------------------------------------
+  auto countsView_h            = create_host_space_copy(countsView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  if (searched_value_exist) {
+    for (std::size_t i = 0; i < dataView.extent(0); ++i) {
+      auto rowFrom = Kokkos::subview(dataViewBeforeOp_h, i, Kokkos::ALL());
+      const auto rowFromBegin = KE::cbegin(rowFrom);
+      const auto rowFromEnd   = KE::cend(rowFrom);
+      const auto val          = valuesView_h(i);
+
+      const std::size_t result = std::count(rowFromBegin, rowFromEnd, val);
+      ASSERT_EQ(result, countsView_h(i));
+      ASSERT_TRUE(intraTeamSentinelView_h(i));
+    }
+  } else {
+    for (std::size_t i = 0; i < countsView.extent(0); ++i) {
+      constexpr std::size_t zero = 0;
+      ASSERT_EQ(countsView_h(i), zero);
+      ASSERT_TRUE(intraTeamSentinelView_h(i));
+    }
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios(const bool searchedValueExist) {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {1, 2, 13, 101, 1444, 8153}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(searchedValueExist, numTeams, numCols,
+                                     apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_count_team_test, count_returns_nonzero) {
+  constexpr bool searchedValueExist = true;
+  run_all_scenarios<DynamicTag, double>(searchedValueExist);
+  run_all_scenarios<StridedTwoRowsTag, int>(searchedValueExist);
+  run_all_scenarios<StridedThreeRowsTag, unsigned>(searchedValueExist);
+}
+
+TEST(std_algorithms_count_team_test, count_returns_zero) {
+  constexpr bool searchedValueExist = false;
+  run_all_scenarios<DynamicTag, double>(searchedValueExist);
+  run_all_scenarios<StridedTwoRowsTag, int>(searchedValueExist);
+  run_all_scenarios<StridedThreeRowsTag, unsigned>(searchedValueExist);
+}
+
+}  // namespace TeamCount
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCountIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCountIf.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5a9be4cd111d1a13e97f93edecdd605046cf6f65
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCountIf.cpp
@@ -0,0 +1,162 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamCountIf {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct GreaterThanValueFunctor {
+  ValueType m_val;
+
+  KOKKOS_INLINE_FUNCTION
+  GreaterThanValueFunctor(ValueType val) : m_val(val) {}
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator()(ValueType val) const { return (val > m_val); }
+};
+
+template <class ViewType, class CountsViewType, class IntraTeamSentinelView,
+          class ValueType>
+struct TestFunctorA {
+  ViewType m_view;
+  CountsViewType m_countsView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  ValueType m_threshold;
+  int m_apiPick;
+
+  TestFunctorA(const ViewType view, const CountsViewType countsView,
+               const IntraTeamSentinelView intraTeamSentinelView,
+               ValueType threshold, int apiPick)
+      : m_view(view),
+        m_countsView(countsView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_threshold(threshold),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowView        = Kokkos::subview(m_view, myRowIndex, Kokkos::ALL());
+    std::size_t myCount   = 0;
+
+    GreaterThanValueFunctor predicate(m_threshold);
+    if (m_apiPick == 0) {
+      myCount = KE::count_if(member, KE::begin(myRowView), KE::end(myRowView),
+                             predicate);
+
+      Kokkos::single(Kokkos::PerTeam(member),
+                     [=, *this]() { m_countsView(myRowIndex) = myCount; });
+    } else if (m_apiPick == 1) {
+      myCount = KE::count_if(member, myRowView, predicate);
+      Kokkos::single(Kokkos::PerTeam(member),
+                     [=, *this]() { m_countsView(myRowIndex) = myCount; });
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, myCount, m_countsView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     use a rank-2 view randomly filled with values,
+     and run a team-level count_if where only the values
+     strictly greater than a threshold are counted
+   */
+
+  const auto threshold = static_cast<ValueType>(151);
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range.
+  constexpr ValueType lowerBound = 5;
+  constexpr ValueType upperBound = 523;
+  const auto bounds              = make_bounds(lowerBound, upperBound);
+
+  auto [dataView, cloneOfDataViewBeforeOp_h] =
+      create_random_view_and_host_clone(LayoutTag{}, numTeams, numCols, bounds,
+                                        "dataView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // to verify that things work, each team stores the result
+  // of its count_if call, and then we check
+  // that these match what we expect
+  Kokkos::View<std::size_t*> countsView("countsView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, countsView, intraTeamSentinelView, threshold,
+                   apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // check
+  // -----------------------------------------------
+  auto countsView_h            = create_host_space_copy(countsView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  for (std::size_t i = 0; i < cloneOfDataViewBeforeOp_h.extent(0); ++i) {
+    std::size_t goldCountForRow = 0;
+    for (std::size_t j = 0; j < cloneOfDataViewBeforeOp_h.extent(1); ++j) {
+      if (cloneOfDataViewBeforeOp_h(i, j) > threshold) {
+        goldCountForRow++;
+      }
+    }
+    ASSERT_EQ(goldCountForRow, countsView_h(i));
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_count_if_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamCountIf
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamEqual.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamEqual.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..82cce0b384e5980953baaaf3f86e66b0a732a6d1
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamEqual.cpp
@@ -0,0 +1,278 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamEqual {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct EqualFunctor {
+  KOKKOS_INLINE_FUNCTION bool operator()(const ValueType& lhs,
+                                         const ValueType& rhs) const {
+    return lhs == rhs;
+  }
+};
+
+template <class DataViewType, class CompViewType, class ResultsViewType,
+          class IntraTeamSentinelView, class BinaryPredType>
+struct TestFunctorA {
+  DataViewType m_dataView;
+  CompViewType m_compView;
+  ResultsViewType m_resultsView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+  BinaryPredType m_binaryPred;
+
+  TestFunctorA(const DataViewType dataView, const CompViewType compView,
+               const ResultsViewType resultsView,
+               const IntraTeamSentinelView intraTeamSentinelView, int apiPick,
+               BinaryPredType binaryPred)
+      : m_dataView(dataView),
+        m_compView(compView),
+        m_resultsView(resultsView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick),
+        m_binaryPred(binaryPred) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto rowIndex = member.league_rank();
+
+    auto rowData         = Kokkos::subview(m_dataView, rowIndex, Kokkos::ALL());
+    const auto dataBegin = KE::cbegin(rowData);
+    const auto dataEnd   = KE::cend(rowData);
+
+    auto rowComp         = Kokkos::subview(m_compView, rowIndex, Kokkos::ALL());
+    const auto compBegin = KE::cbegin(rowComp);
+    const auto compEnd   = KE::cend(rowComp);
+
+    bool result = false;
+    switch (m_apiPick) {
+      case 0: {
+        result = KE::equal(member, dataBegin, dataEnd, compBegin);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this]() { m_resultsView(rowIndex) = result; });
+        break;
+      }
+
+      case 1: {
+        result = KE::equal(member, dataBegin, dataEnd, compBegin, m_binaryPred);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this]() { m_resultsView(rowIndex) = result; });
+        break;
+      }
+
+      case 2: {
+        result = KE::equal(member, rowData, rowComp);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this]() { m_resultsView(rowIndex) = result; });
+        break;
+      }
+
+      case 3: {
+        result = KE::equal(member, rowData, rowComp, m_binaryPred);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this]() { m_resultsView(rowIndex) = result; });
+
+        break;
+      }
+
+      case 4: {
+        result = KE::equal(member, dataBegin, dataEnd, compBegin, compEnd);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this]() { m_resultsView(rowIndex) = result; });
+        break;
+      }
+
+      case 5: {
+        result = KE::equal(member, dataBegin, dataEnd, compBegin, compEnd,
+                           m_binaryPred);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this]() { m_resultsView(rowIndex) = result; });
+        break;
+      }
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, result, m_resultsView(rowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(rowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(const bool viewsAreEqual, std::size_t numTeams, std::size_t numCols,
+            int apiId) {
+  /* description:
+     use a rank-2 view randomly filled with values,
+     and run a team-level equal
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range.
+  constexpr ValueType lowerBound = 5;
+  constexpr ValueType upperBound = 523;
+  const auto bounds              = make_bounds(lowerBound, upperBound);
+
+  auto [dataView, dataViewBeforeOp_h] = create_random_view_and_host_clone(
+      LayoutTag{}, numTeams, numCols, bounds, "dataView");
+
+  // create a view to compare it with dataView. If viewsAreEqual == true,
+  // compView is a copy of dataView. If viewsAreEqual == false, compView is
+  // randomly filled
+  auto compView   = create_deep_copyable_compatible_clone(dataView);
+  auto compView_h = create_mirror_view(Kokkos::HostSpace(), compView);
+  if (viewsAreEqual) {
+    Kokkos::deep_copy(compView_h, dataViewBeforeOp_h);
+  } else {
+    using rand_pool =
+        Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace>;
+    rand_pool pool(lowerBound * upperBound);
+    Kokkos::fill_random(compView_h, pool, lowerBound, upperBound);
+  }
+
+  Kokkos::deep_copy(compView, compView_h);
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // create the view to store results of equal()
+  Kokkos::View<bool*> resultsView("resultsView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  EqualFunctor<ValueType> binaryPred{};
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, compView, resultsView, intraTeamSentinelView,
+                   apiId, binaryPred);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run cpp-std kernel and check
+  // -----------------------------------------------
+  auto resultsView_h           = create_host_space_copy(resultsView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+
+  for (std::size_t i = 0; i < dataView.extent(0); ++i) {
+    auto rowData = Kokkos::subview(dataViewBeforeOp_h, i, Kokkos::ALL());
+    const auto dataBegin = KE::cbegin(rowData);
+    const auto dataEnd   = KE::cend(rowData);
+
+    auto rowComp         = Kokkos::subview(compView_h, i, Kokkos::ALL());
+    const auto compBegin = KE::cbegin(rowComp);
+    const auto compEnd   = KE::cend(rowComp);
+
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+    switch (apiId) {
+      case 0:
+      case 2: {
+        const bool result = std::equal(dataBegin, dataEnd, compBegin);
+
+        if (viewsAreEqual) {
+          EXPECT_TRUE(resultsView_h(i));
+        } else {
+          ASSERT_EQ(result, resultsView_h(i));
+        }
+
+        break;
+      }
+
+      case 1:
+      case 3: {
+        const bool result =
+            std::equal(dataBegin, dataEnd, compBegin, binaryPred);
+
+        if (viewsAreEqual) {
+          EXPECT_TRUE(resultsView_h(i));
+        } else {
+          ASSERT_EQ(result, resultsView_h(i));
+        }
+
+        break;
+      }
+
+      case 4: {
+        const bool result = std::equal(dataBegin, dataEnd, compBegin, compEnd);
+
+        if (viewsAreEqual) {
+          EXPECT_TRUE(resultsView_h(i));
+        } else {
+          ASSERT_EQ(result, resultsView_h(i));
+        }
+
+        break;
+      }
+
+      case 5: {
+        const bool result =
+            std::equal(dataBegin, dataEnd, compBegin, compEnd, binaryPred);
+
+        if (viewsAreEqual) {
+          EXPECT_TRUE(resultsView_h(i));
+        } else {
+          ASSERT_EQ(result, resultsView_h(i));
+        }
+
+        break;
+      }
+    }
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios(const bool viewsAreEqual) {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) {
+      for (int apiId : {0, 1, 2, 3, 4, 5}) {
+        test_A<LayoutTag, ValueType>(viewsAreEqual, numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_equal_team_test, views_are_equal) {
+  constexpr bool viewsAreEqual = true;
+  run_all_scenarios<DynamicTag, double>(viewsAreEqual);
+  run_all_scenarios<StridedTwoRowsTag, int>(viewsAreEqual);
+  run_all_scenarios<StridedThreeRowsTag, unsigned>(viewsAreEqual);
+}
+
+TEST(std_algorithms_equal_team_test, views_are_not_equal) {
+  constexpr bool viewsAreEqual = false;
+  run_all_scenarios<DynamicTag, double>(viewsAreEqual);
+  run_all_scenarios<StridedTwoRowsTag, int>(viewsAreEqual);
+  run_all_scenarios<StridedThreeRowsTag, unsigned>(viewsAreEqual);
+}
+
+}  // namespace TeamEqual
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c6b2566c6cfb301d8a0ac9802054df99e37ab145
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp
@@ -0,0 +1,253 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamExclusiveScan {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct PlusFunctor {
+  KOKKOS_INLINE_FUNCTION constexpr ValueType operator()(
+      const ValueType& lhs, const ValueType& rhs) const {
+    return lhs + rhs;
+  }
+};
+
+template <class SourceViewType, class DestViewType, class DistancesViewType,
+          class IntraTeamSentinelView, class InitValuesViewType,
+          class BinaryOpType>
+struct TestFunctorA {
+  SourceViewType m_sourceView;
+  DestViewType m_destView;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  InitValuesViewType m_initValuesView;
+  BinaryOpType m_binaryOp;
+  int m_apiPick;
+
+  TestFunctorA(const SourceViewType sourceView, const DestViewType destView,
+               const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView,
+               const InitValuesViewType initValuesView, BinaryOpType binaryOp,
+               int apiPick)
+      : m_sourceView(sourceView),
+        m_destView(destView),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_initValuesView(initValuesView),
+        m_binaryOp(binaryOp),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto rowIndex = member.league_rank();
+
+    auto rowViewSrc    = Kokkos::subview(m_sourceView, rowIndex, Kokkos::ALL());
+    auto rowViewDest   = Kokkos::subview(m_destView, rowIndex, Kokkos::ALL());
+    const auto initVal = m_initValuesView(rowIndex);
+    ptrdiff_t resultDist = 0;
+
+    switch (m_apiPick) {
+      case 0: {
+        auto it    = KE::exclusive_scan(member, KE::cbegin(rowViewSrc),
+                                     KE::cend(rowViewSrc),
+                                     KE::begin(rowViewDest), initVal);
+        resultDist = KE::distance(KE::begin(rowViewDest), it);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this] { m_distancesView(rowIndex) = resultDist; });
+
+        break;
+      }
+
+      case 1: {
+        auto it = KE::exclusive_scan(member, rowViewSrc, rowViewDest, initVal);
+        resultDist = KE::distance(KE::begin(rowViewDest), it);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this] { m_distancesView(rowIndex) = resultDist; });
+
+        break;
+      }
+
+#if not defined KOKKOS_ENABLE_OPENMPTARGET
+
+      case 2: {
+        auto it = KE::exclusive_scan(
+            member, KE::cbegin(rowViewSrc), KE::cend(rowViewSrc),
+            KE::begin(rowViewDest), initVal, m_binaryOp);
+        resultDist = KE::distance(KE::begin(rowViewDest), it);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this] { m_distancesView(rowIndex) = resultDist; });
+
+        break;
+      }
+
+      case 3: {
+        auto it = KE::exclusive_scan(member, rowViewSrc, rowViewDest, initVal,
+                                     m_binaryOp);
+        resultDist = KE::distance(KE::begin(rowViewDest), it);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this] { m_distancesView(rowIndex) = resultDist; });
+
+        break;
+      }
+#endif
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(rowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(rowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     use a rank-2 view randomly filled with values,
+     and run a team-level exclusive_scan
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range.
+  constexpr ValueType lowerBound = 5;
+  constexpr ValueType upperBound = 523;
+  const auto bounds              = make_bounds(lowerBound, upperBound);
+
+  auto [sourceView, sourceViewBeforeOp_h] = create_random_view_and_host_clone(
+      LayoutTag{}, numTeams, numCols, bounds, "sourceView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // create the destination view
+  Kokkos::View<ValueType**> destView("destView", numTeams, numCols);
+
+  // exclusive_scan returns an iterator so to verify that it is correct
+  // each team stores the distance of the returned iterator from the beginning
+  // of the interval that team operates on and then we check that these
+  // distances match the std result
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  PlusFunctor<ValueType> binaryOp;
+
+  // Create view of reduce init values to be used by test cases
+  Kokkos::View<ValueType*, Kokkos::DefaultHostExecutionSpace> initValuesView_h(
+      "initValuesView_h", numTeams);
+  using rand_pool =
+      Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace>;
+  rand_pool pool(lowerBound * upperBound);
+  Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound);
+
+  // use CTAD for functor
+  auto initValuesView =
+      Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h);
+  TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView,
+                   initValuesView, binaryOp, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run cpp-std kernel and check
+  // -----------------------------------------------
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestView("stdDestView",
+                                                           numTeams, numCols);
+
+  for (std::size_t i = 0; i < sourceView.extent(0); ++i) {
+    auto rowFrom   = Kokkos::subview(sourceViewBeforeOp_h, i, Kokkos::ALL());
+    auto rowDest   = Kokkos::subview(stdDestView, i, Kokkos::ALL());
+    auto initValue = initValuesView_h(i);
+
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+
+// libstdc++ as provided by GCC 8 does not have exclusive_scan and
+// for GCC 9.1, 9.2 fails to compile for missing overload not accepting policy
+#if defined(_GLIBCXX_RELEASE) && (_GLIBCXX_RELEASE <= 9)
+#define exclusive_scan testing_exclusive_scan
+#else
+#define exclusive_scan std::exclusive_scan
+#endif
+    switch (apiId) {
+      case 0:
+      case 1: {
+        auto it = exclusive_scan(KE::cbegin(rowFrom), KE::cend(rowFrom),
+                                 KE::begin(rowDest), initValue);
+        const std::size_t stdDistance = KE::distance(KE::begin(rowDest), it);
+        ASSERT_EQ(stdDistance, distancesView_h(i));
+        break;
+      }
+
+#if not defined KOKKOS_ENABLE_OPENMPTARGET
+      case 2:
+      case 3: {
+        auto it = exclusive_scan(KE::cbegin(rowFrom), KE::cend(rowFrom),
+                                 KE::begin(rowDest), initValue, binaryOp);
+        const std::size_t stdDistance = KE::distance(KE::begin(rowDest), it);
+        ASSERT_EQ(stdDistance, distancesView_h(i));
+
+        break;
+      }
+#endif
+    }
+
+#undef exclusive_scan
+  }
+
+  auto dataViewAfterOp_h = create_host_space_copy(destView);
+  expect_equal_host_views(stdDestView, dataViewAfterOp_h);
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) {
+#if not defined KOKKOS_ENABLE_OPENMPTARGET
+      for (int apiId : {0, 1, 2, 3}) {
+#else
+      for (int apiId : {0, 1}) {
+#endif
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_exclusive_scan_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamExclusiveScan
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFill.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFill.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bd3ef3bd67fba6b371eebd8dcc963085377f448f
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFill.cpp
@@ -0,0 +1,106 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamFill {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ViewType>
+struct TestFunctorA {
+  ViewType m_view;
+  int m_apiPick;
+
+  TestFunctorA(const ViewType view, int apiPick)
+      : m_view(view), m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto leagueRank = member.league_rank();
+    const auto myRowIndex = leagueRank;
+    auto myRowView        = Kokkos::subview(m_view, myRowIndex, Kokkos::ALL());
+
+    if (m_apiPick == 0) {
+      KE::fill(member, KE::begin(myRowView), KE::end(myRowView), leagueRank);
+    } else if (m_apiPick == 1) {
+      KE::fill(member, myRowView, leagueRank);
+    }
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     create a rank-2 view, randomly fill with non trivial numbers
+     and do a team-level KE::fill where each team fills
+     with its league_rank value the row it is responsible for
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range
+  auto [dataView, _] = create_random_view_and_host_clone(
+      LayoutTag{}, numTeams, numCols,
+      Kokkos::pair<ValueType, ValueType>{11, 523}, "dataView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // check
+  // -----------------------------------------------
+  // each row should be filled with the row index
+  // since the league_rank of a team here coincides with row index
+  auto dataViewAfterOp_h = create_host_space_copy(dataView);
+  for (std::size_t i = 0; i < dataViewAfterOp_h.extent(0); ++i) {
+    for (std::size_t j = 0; j < dataViewAfterOp_h.extent(1); ++j) {
+      EXPECT_TRUE(dataViewAfterOp_h(i, j) == static_cast<ValueType>(i));
+    }
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_fill_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamFill
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFill_n.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFill_n.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0a97f15c176f8897139aef09b6009e4f10415d90
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFill_n.cpp
@@ -0,0 +1,176 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamFill_n {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ViewType, class DistancesViewType, class IntraTeamSentinelView>
+struct TestFunctorA {
+  ViewType m_view;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  std::size_t m_fillCount;
+  int m_apiPick;
+
+  TestFunctorA(const ViewType view, const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView,
+               std::size_t fillCount, int apiPick)
+      : m_view(view),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_fillCount(fillCount),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto leagueRank = member.league_rank();
+    const auto myRowIndex = leagueRank;
+    auto myRowView        = Kokkos::subview(m_view, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist  = 0;
+
+    if (m_apiPick == 0) {
+      auto it =
+          KE::fill_n(member, KE::begin(myRowView), m_fillCount, leagueRank);
+      resultDist = KE::distance(KE::begin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    } else if (m_apiPick == 1) {
+      auto it    = KE::fill_n(member, myRowView, m_fillCount, leagueRank);
+      resultDist = KE::distance(KE::begin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, std::size_t fillCount,
+            int apiId) {
+  /* description:
+     create a rank-2 view, run a team parfor with one row per team,
+     such that n elements of each row are filled up with the league_rank value
+     of the team in charge of it, while the other elements in the row
+     are left unchanged
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range
+  auto [dataView, cloneOfDataViewBeforeOp_h] =
+      create_random_view_and_host_clone(
+          LayoutTag{}, numTeams, numCols,
+          Kokkos::pair<ValueType, ValueType>{5, 523}, "dataView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // fill_n returns an iterator so to verify that it is correct
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the expected value
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, distancesView, intraTeamSentinelView, fillCount,
+                   apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // check
+  // -----------------------------------------------
+  auto dataViewAfterOp_h       = create_host_space_copy(dataView);
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  for (std::size_t i = 0; i < dataView.extent(0); ++i) {
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+
+    // check that values match what we expect
+    for (std::size_t j = 0; j < fillCount; ++j) {
+      ASSERT_EQ(dataViewAfterOp_h(i, j), ValueType(i));
+    }
+    // all other elements should be unchanged from before op
+    for (std::size_t j = fillCount; j < numCols; ++j) {
+      ASSERT_EQ(dataViewAfterOp_h(i, j), cloneOfDataViewBeforeOp_h(i, j));
+    }
+
+    // check that returned iterators are correct
+    if (fillCount > 0) {
+      ASSERT_EQ(distancesView_h(i), std::size_t(fillCount));
+    } else {
+      ASSERT_EQ(distancesView_h(i), std::size_t(0));
+    }
+  }
+}
+
+template <class Tag, class ValueType>
+void run_all_scenarios() {
+  // prepare a map where, for a given set of num cols
+  // we provide a list of counts of elements to fill
+  // key = num of columns,
+  // value = list of num of elemenents to fill
+  const std::map<int, std::vector<int>> scenarios = {
+      {0, {0}},
+      {2, {0, 1, 2}},
+      {6, {0, 1, 2, 5}},
+      {13, {0, 1, 2, 8, 11}},
+      {56, {0, 1, 2, 8, 11, 33, 56}},
+      {123, {0, 1, 11, 33, 56, 89, 112}}};
+
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& scenario : scenarios) {
+      const std::size_t numCols = scenario.first;
+      for (int numFills : scenario.second) {
+        for (int apiId : {0, 1}) {
+          test_A<Tag, ValueType>(numTeams, numCols, numFills, apiId);
+        }
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_fill_n_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamFill_n
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFind.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFind.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..430e4917e06e1afec8afd6fecd310cac7f0f7894
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFind.cpp
@@ -0,0 +1,212 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamFind {
+
+namespace KE = Kokkos::Experimental;
+
+template <class DataViewType, class SearchedValuesViewType,
+          class DistancesViewType, class IntraTeamSentinelView>
+struct TestFunctorA {
+  DataViewType m_dataView;
+  SearchedValuesViewType m_searchedValuesView;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+
+  TestFunctorA(const DataViewType dataView,
+               const SearchedValuesViewType searchedValuesView,
+               const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView, int apiPick)
+      : m_dataView(dataView),
+        m_searchedValuesView(searchedValuesView),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowViewFrom = Kokkos::subview(m_dataView, myRowIndex, Kokkos::ALL());
+    const auto searchedValue = m_searchedValuesView(myRowIndex);
+    ptrdiff_t resultDist     = 0;
+
+    switch (m_apiPick) {
+      case 0: {
+        auto it    = KE::find(member, KE::cbegin(myRowViewFrom),
+                           KE::cend(myRowViewFrom), searchedValue);
+        resultDist = KE::distance(KE::cbegin(myRowViewFrom), it);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_distancesView(myRowIndex) = resultDist;
+        });
+
+        break;
+      }
+
+      case 1: {
+        auto it    = KE::find(member, myRowViewFrom, searchedValue);
+        resultDist = KE::distance(KE::begin(myRowViewFrom), it);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_distancesView(myRowIndex) = resultDist;
+        });
+
+        break;
+      }
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(const bool searchedValuesExist, std::size_t numTeams,
+            std::size_t numCols, int apiId) {
+  /* description:
+     use a rank-2 view randomly filled with values,
+     and run a team-level find
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range.
+  constexpr ValueType lowerBound = 5;
+  constexpr ValueType upperBound = 523;
+  const auto bounds              = make_bounds(lowerBound, upperBound);
+
+  auto [dataView, dataViewBeforeOp_h] = create_random_view_and_host_clone(
+      LayoutTag{}, numTeams, numCols, bounds, "dataView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // find returns an iterator so to verify that it is correct
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the std result
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // If searchedValuesExist == true we want to ensure that each value we're
+  // looking for exists in dataView. To do that, for each numTeams, a random j
+  // index from a range [0, numCols) is used to obtain a value from dataView.
+  //
+  // If searchedValuesExist == false we want to ensure the opposite, so every
+  // value is less than a lower bound of dataView.
+  Kokkos::View<ValueType*> searchedValuesView("searchValuesView", numTeams);
+  auto searchedValuesView_h =
+      create_mirror_view(Kokkos::HostSpace(), searchedValuesView);
+
+  using rand_pool =
+      Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace>;
+  rand_pool pool(lowerBound * upperBound);
+
+  if (searchedValuesExist) {
+    Kokkos::View<std::size_t*, Kokkos::DefaultHostExecutionSpace> randomIndices(
+        "randomIndices", numTeams);
+
+    Kokkos::fill_random(randomIndices, pool, 0, numCols);
+
+    for (std::size_t i = 0; i < numTeams; ++i) {
+      const std::size_t j     = randomIndices(i);
+      searchedValuesView_h(i) = dataViewBeforeOp_h(i, j);
+    }
+  } else {
+    Kokkos::fill_random(searchedValuesView_h, pool, 0, lowerBound);
+  }
+
+  Kokkos::deep_copy(searchedValuesView, searchedValuesView_h);
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, searchedValuesView, distancesView,
+                   intraTeamSentinelView, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run cpp-std kernel and check
+  // -----------------------------------------------
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+
+  for (std::size_t i = 0; i < dataView.extent(0); ++i) {
+    auto rowFrom = Kokkos::subview(dataViewBeforeOp_h, i, Kokkos::ALL());
+    const auto rowFromBegin = KE::cbegin(rowFrom);
+    const auto rowFromEnd   = KE::cend(rowFrom);
+
+    auto it = std::find(rowFromBegin, rowFromEnd, searchedValuesView_h(i));
+
+    const std::size_t stdDistance      = KE::distance(rowFromBegin, it);
+    const std::size_t beginEndDistance = KE::distance(rowFromBegin, rowFromEnd);
+
+    if (searchedValuesExist) {
+      EXPECT_LT(stdDistance, beginEndDistance);
+    } else {
+      ASSERT_EQ(stdDistance, beginEndDistance);
+    }
+
+    ASSERT_EQ(stdDistance, distancesView_h(i));
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios(const bool searchedValuesExist) {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {1, 2, 13, 101, 1444, 8153}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(searchedValuesExist, numTeams, numCols,
+                                     apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_find_team_test, searched_values_exist) {
+  constexpr bool searchedValuesExist = true;
+
+  run_all_scenarios<DynamicTag, double>(searchedValuesExist);
+  run_all_scenarios<StridedTwoRowsTag, int>(searchedValuesExist);
+  run_all_scenarios<StridedThreeRowsTag, unsigned>(searchedValuesExist);
+}
+
+TEST(std_algorithms_find_team_test, searched_values_do_not_exist) {
+  constexpr bool searchedValuesExist = false;
+
+  run_all_scenarios<DynamicTag, double>(searchedValuesExist);
+  run_all_scenarios<StridedTwoRowsTag, int>(searchedValuesExist);
+  run_all_scenarios<StridedThreeRowsTag, unsigned>(searchedValuesExist);
+}
+
+}  // namespace TeamFind
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindEnd.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindEnd.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..83eca33569e1fecae2a5751ccf09ec8bd5626743
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindEnd.cpp
@@ -0,0 +1,271 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamFindEnd {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct EqualFunctor {
+  KOKKOS_INLINE_FUNCTION
+  bool operator()(const ValueType& lhs, const ValueType& rhs) const {
+    return lhs == rhs;
+  }
+};
+
+template <class DataViewType, class SearchedSequencesViewType,
+          class DistancesViewType, class IntraTeamSentinelView,
+          class BinaryPredType>
+struct TestFunctorA {
+  DataViewType m_dataView;
+  SearchedSequencesViewType m_searchedSequencesView;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  BinaryPredType m_binaryPred;
+  int m_apiPick;
+
+  TestFunctorA(const DataViewType dataView,
+               const SearchedSequencesViewType searchedSequencesView,
+               const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView,
+               BinaryPredType binaryPred, int apiPick)
+      : m_dataView(dataView),
+        m_searchedSequencesView(searchedSequencesView),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_binaryPred(binaryPred),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowViewFrom = Kokkos::subview(m_dataView, myRowIndex, Kokkos::ALL());
+    auto myRowSearchedSeqView =
+        Kokkos::subview(m_searchedSequencesView, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist = 0;
+
+    switch (m_apiPick) {
+      case 0: {
+        auto it = KE::find_end(
+            member, KE::cbegin(myRowViewFrom), KE::cend(myRowViewFrom),
+            KE::cbegin(myRowSearchedSeqView), KE::cend(myRowSearchedSeqView));
+        resultDist = KE::distance(KE::cbegin(myRowViewFrom), it);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_distancesView(myRowIndex) = resultDist;
+        });
+
+        break;
+      }
+
+      case 1: {
+        auto it    = KE::find_end(member, myRowViewFrom, myRowSearchedSeqView);
+        resultDist = KE::distance(KE::begin(myRowViewFrom), it);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_distancesView(myRowIndex) = resultDist;
+        });
+
+        break;
+      }
+
+      case 2: {
+        auto it    = KE::find_end(member, KE::cbegin(myRowViewFrom),
+                               KE::cend(myRowViewFrom),
+                               KE::cbegin(myRowSearchedSeqView),
+                               KE::cend(myRowSearchedSeqView), m_binaryPred);
+        resultDist = KE::distance(KE::cbegin(myRowViewFrom), it);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_distancesView(myRowIndex) = resultDist;
+        });
+
+        break;
+      }
+
+      case 3: {
+        auto it    = KE::find_end(member, myRowViewFrom, myRowSearchedSeqView,
+                               m_binaryPred);
+        resultDist = KE::distance(KE::begin(myRowViewFrom), it);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_distancesView(myRowIndex) = resultDist;
+        });
+
+        break;
+      }
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(const bool sequencesExist, std::size_t numTeams,
+            std::size_t numCols, int apiId) {
+  /* description:
+     use a rank-2 view randomly filled with values,
+     and run a team-level find_end
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range.
+  constexpr ValueType lowerBound = 5;
+  constexpr ValueType upperBound = 523;
+  const auto bounds              = make_bounds(lowerBound, upperBound);
+
+  auto [dataView, dataViewBeforeOp_h] = create_random_view_and_host_clone(
+      LayoutTag{}, numTeams, numCols, bounds, "dataView");
+
+  // create a view that stores a sequence to found in dataView. If
+  // sequencesExist == true it is filled base on dataView content, to allow
+  // find_end to actually find anything. If sequencesExist == false it is filled
+  // with random values greater than upperBound
+  const auto halfCols    = (numCols + 1) / 2;
+  const auto quarterCols = halfCols / 2;
+
+  Kokkos::View<ValueType**> searchedSequencesView(
+      "searchedSequencesView", numTeams, halfCols - quarterCols);
+  auto searchedSequencesView_h = create_host_space_copy(searchedSequencesView);
+
+  if (sequencesExist) {
+    for (std::size_t i = 0; i < searchedSequencesView_h.extent(0); ++i) {
+      for (std::size_t js = 0, jd = quarterCols; jd < halfCols; ++js, ++jd) {
+        searchedSequencesView_h(i, js) = dataViewBeforeOp_h(i, jd);
+      }
+    }
+  } else {
+    using rand_pool =
+        Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace>;
+    rand_pool pool(lowerBound * upperBound);
+    Kokkos::fill_random(searchedSequencesView_h, pool, upperBound,
+                        upperBound * 2);
+  }
+
+  Kokkos::deep_copy(searchedSequencesView, searchedSequencesView_h);
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // find_end returns an iterator so to verify that it is correct
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the std result
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  EqualFunctor<ValueType> binaryPred;
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, searchedSequencesView, distancesView,
+                   intraTeamSentinelView, binaryPred, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run cpp-std kernel and check
+  // -----------------------------------------------
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+
+  for (std::size_t i = 0; i < dataView.extent(0); ++i) {
+    auto rowFrom = Kokkos::subview(dataViewBeforeOp_h, i, Kokkos::ALL());
+    const auto rowFromBegin = KE::cbegin(rowFrom);
+    const auto rowFromEnd   = KE::cend(rowFrom);
+
+    auto rowSearchedSeq =
+        Kokkos::subview(searchedSequencesView_h, i, Kokkos::ALL());
+
+    std::size_t stdDistance = std::numeric_limits<std::size_t>::max();
+    const std::size_t beginEndDistance = KE::distance(rowFromBegin, rowFromEnd);
+
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+    switch (apiId) {
+      case 0:
+      case 1: {
+        auto it =
+            std::find_end(rowFromBegin, rowFromEnd, KE::cbegin(rowSearchedSeq),
+                          KE::cend(rowSearchedSeq));
+        stdDistance = KE::distance(rowFromBegin, it);
+
+        break;
+      }
+
+      case 2:
+      case 3: {
+        auto it =
+            std::find_end(rowFromBegin, rowFromEnd, KE::cbegin(rowSearchedSeq),
+                          KE::cend(rowSearchedSeq), binaryPred);
+        stdDistance = KE::distance(rowFromBegin, it);
+
+        break;
+      }
+    }
+
+    if (sequencesExist) {
+      EXPECT_LT(stdDistance, beginEndDistance);
+    } else {
+      ASSERT_EQ(stdDistance, beginEndDistance);
+    }
+
+    ASSERT_EQ(stdDistance, distancesView_h(i));
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios(const bool sequencesExist) {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {1, 2, 13, 101, 1444, 8153}) {
+      for (int apiId : {0, 1, 2, 3}) {
+        test_A<LayoutTag, ValueType>(sequencesExist, numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_find_end_team_test, sequences_exist) {
+  constexpr bool sequencesExist = true;
+
+  run_all_scenarios<DynamicTag, double>(sequencesExist);
+  run_all_scenarios<StridedTwoRowsTag, int>(sequencesExist);
+  run_all_scenarios<StridedThreeRowsTag, unsigned>(sequencesExist);
+}
+
+TEST(std_algorithms_find_end_team_test, sequences_do_not_exist) {
+  constexpr bool sequencesExist = false;
+
+  run_all_scenarios<DynamicTag, double>(sequencesExist);
+  run_all_scenarios<StridedTwoRowsTag, int>(sequencesExist);
+  run_all_scenarios<StridedThreeRowsTag, unsigned>(sequencesExist);
+}
+
+}  // namespace TeamFindEnd
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindFirstOf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindFirstOf.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e992882e91dbb3177b92e55b32e59a4e75f7b886
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindFirstOf.cpp
@@ -0,0 +1,280 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamFindFirstOf {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct EqualFunctor {
+  KOKKOS_INLINE_FUNCTION
+  bool operator()(const ValueType& lhs, const ValueType& rhs) const {
+    return lhs == rhs;
+  }
+};
+
+template <class DataViewType, class SearchedSequencesViewType,
+          class DistancesViewType, class IntraTeamSentinelView,
+          class BinaryPredType>
+struct TestFunctorA {
+  DataViewType m_dataView;
+  SearchedSequencesViewType m_searchedSequencesView;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  BinaryPredType m_binaryPred;
+  int m_apiPick;
+
+  TestFunctorA(const DataViewType dataView,
+               const SearchedSequencesViewType searchedSequencesView,
+               const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView,
+               BinaryPredType binaryPred, int apiPick)
+      : m_dataView(dataView),
+        m_searchedSequencesView(searchedSequencesView),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_binaryPred(binaryPred),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowViewFrom = Kokkos::subview(m_dataView, myRowIndex, Kokkos::ALL());
+    auto myRowSearchedSeqView =
+        Kokkos::subview(m_searchedSequencesView, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist = 0;
+
+    switch (m_apiPick) {
+      case 0: {
+        auto it = KE::find_first_of(
+            member, KE::cbegin(myRowViewFrom), KE::cend(myRowViewFrom),
+            KE::cbegin(myRowSearchedSeqView), KE::cend(myRowSearchedSeqView));
+        resultDist = KE::distance(KE::cbegin(myRowViewFrom), it);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_distancesView(myRowIndex) = resultDist;
+        });
+
+        break;
+      }
+
+      case 1: {
+        auto it =
+            KE::find_first_of(member, myRowViewFrom, myRowSearchedSeqView);
+        resultDist = KE::distance(KE::begin(myRowViewFrom), it);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_distancesView(myRowIndex) = resultDist;
+        });
+
+        break;
+      }
+
+      case 2: {
+        auto it = KE::find_first_of(
+            member, KE::cbegin(myRowViewFrom), KE::cend(myRowViewFrom),
+            KE::cbegin(myRowSearchedSeqView), KE::cend(myRowSearchedSeqView),
+            m_binaryPred);
+        resultDist = KE::distance(KE::cbegin(myRowViewFrom), it);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_distancesView(myRowIndex) = resultDist;
+        });
+
+        break;
+      }
+
+      case 3: {
+        auto it = KE::find_first_of(member, myRowViewFrom, myRowSearchedSeqView,
+                                    m_binaryPred);
+        resultDist = KE::distance(KE::begin(myRowViewFrom), it);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_distancesView(myRowIndex) = resultDist;
+        });
+
+        break;
+      }
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(const bool sequencesExist, std::size_t numTeams,
+            std::size_t numCols, int apiId) {
+  /* description:
+     use a rank-2 view randomly filled with values,
+     and run a team-level find_first_of
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range.
+  constexpr ValueType lowerBound = 5;
+  constexpr ValueType upperBound = 523;
+  const auto bounds              = make_bounds(lowerBound, upperBound);
+
+  auto [dataView, dataViewBeforeOp_h] = create_random_view_and_host_clone(
+      LayoutTag{}, numTeams, numCols, bounds, "dataView");
+
+  // create a view that stores a sequence to found a value from in dataView. If
+  // sequencesExist == true it is filled base on dataView content, to allow
+  // find_first_of to actually find anything. If sequencesExist == false it is
+  // filled with random values greater than upperBound
+  const std::size_t halfCols = (numCols > 1) ? ((numCols + 1) / 2) : (1);
+  const std::size_t seqSize  = (numCols > 1) ? (std::log2(numCols)) : (1);
+
+  Kokkos::View<ValueType**> searchedSequencesView("searchedSequencesView",
+                                                  numTeams, seqSize);
+  auto searchedSequencesView_h = create_host_space_copy(searchedSequencesView);
+
+  if (sequencesExist) {
+    const std::size_t dataBegin = halfCols - seqSize;
+    for (std::size_t i = 0; i < searchedSequencesView_h.extent(0); ++i) {
+      for (std::size_t js = 0, jd = dataBegin; js < seqSize; ++js, ++jd) {
+        searchedSequencesView_h(i, js) = dataViewBeforeOp_h(i, jd);
+      }
+    }
+  } else {
+    using rand_pool =
+        Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace>;
+    rand_pool pool(lowerBound * upperBound);
+    Kokkos::fill_random(searchedSequencesView_h, pool, upperBound,
+                        upperBound * 2);
+  }
+
+  Kokkos::deep_copy(searchedSequencesView, searchedSequencesView_h);
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // find_first_of returns an iterator so to verify that it is correct
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the std result
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  EqualFunctor<ValueType> binaryPred;
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, searchedSequencesView, distancesView,
+                   intraTeamSentinelView, binaryPred, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run cpp-std kernel and check
+  // -----------------------------------------------
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+
+  for (std::size_t i = 0; i < dataView.extent(0); ++i) {
+    auto rowFrom = Kokkos::subview(dataViewBeforeOp_h, i, Kokkos::ALL());
+    auto rowSearchedSeq =
+        Kokkos::subview(searchedSequencesView_h, i, Kokkos::ALL());
+
+    const auto rowFromBegin     = KE::cbegin(rowFrom);
+    const auto rowFromEnd       = KE::cend(rowFrom);
+    const auto rowSearchedBegin = KE::cbegin(rowSearchedSeq);
+    const auto rowSearchedEnd   = KE::cend(rowSearchedSeq);
+
+    const std::size_t beginEndDistance = KE::distance(rowFromBegin, rowFromEnd);
+
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+    switch (apiId) {
+      case 0:
+      case 1: {
+        auto it = std::find_first_of(rowFromBegin, rowFromEnd, rowSearchedBegin,
+                                     rowSearchedEnd);
+        const std::size_t stdDistance = KE::distance(rowFromBegin, it);
+
+        if (sequencesExist) {
+          EXPECT_LT(distancesView_h(i), beginEndDistance);
+        } else {
+          ASSERT_EQ(distancesView_h(i), beginEndDistance);
+        }
+
+        ASSERT_EQ(stdDistance, distancesView_h(i));
+
+        break;
+      }
+
+      case 2:
+      case 3: {
+        auto it = std::find_first_of(rowFromBegin, rowFromEnd, rowSearchedBegin,
+                                     rowSearchedEnd, binaryPred);
+        const std::size_t stdDistance = KE::distance(rowFromBegin, it);
+
+        if (sequencesExist) {
+          EXPECT_LT(distancesView_h(i), beginEndDistance);
+        } else {
+          ASSERT_EQ(distancesView_h(i), beginEndDistance);
+        }
+
+        ASSERT_EQ(stdDistance, distancesView_h(i));
+
+        break;
+      }
+    }
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios(const bool sequencesExist) {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {1, 2, 13, 101, 1444, 8153}) {
+      for (int apiId : {0, 1, 2, 3}) {
+        test_A<LayoutTag, ValueType>(sequencesExist, numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_find_first_of_team_test, sequences_exist) {
+  constexpr bool sequencesExist = true;
+
+  run_all_scenarios<DynamicTag, double>(sequencesExist);
+  run_all_scenarios<StridedTwoRowsTag, int>(sequencesExist);
+  run_all_scenarios<StridedThreeRowsTag, unsigned>(sequencesExist);
+}
+
+TEST(std_algorithms_find_first_of_team_test, sequences_do_not_exist) {
+  constexpr bool sequencesExist = false;
+
+  run_all_scenarios<DynamicTag, double>(sequencesExist);
+  run_all_scenarios<StridedTwoRowsTag, int>(sequencesExist);
+  run_all_scenarios<StridedThreeRowsTag, unsigned>(sequencesExist);
+}
+
+}  // namespace TeamFindFirstOf
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIf.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ee4bbed7a30d36ac7d36e94b1a2607c26aab609f
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIf.cpp
@@ -0,0 +1,241 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamFindIf {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct GreaterEqualFunctor {
+  ValueType m_val;
+
+  KOKKOS_INLINE_FUNCTION
+  GreaterEqualFunctor(ValueType val) : m_val(val) {}
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator()(ValueType val) const { return (val >= m_val); }
+};
+
+template <class DataViewType, class GreaterThanValuesViewType,
+          class DistancesViewType, class IntraTeamSentinelView>
+struct TestFunctorA {
+  DataViewType m_dataView;
+  GreaterThanValuesViewType m_greaterThanValuesView;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+
+  TestFunctorA(const DataViewType dataView,
+               const GreaterThanValuesViewType greaterThanValuesView,
+               const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView, int apiPick)
+      : m_dataView(dataView),
+        m_greaterThanValuesView(greaterThanValuesView),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowViewFrom = Kokkos::subview(m_dataView, myRowIndex, Kokkos::ALL());
+    const auto val     = m_greaterThanValuesView(myRowIndex);
+    // FIXME_INTEL
+#if defined(KOKKOS_COMPILER_INTEL) && (1900 == KOKKOS_COMPILER_INTEL)
+    GreaterEqualFunctor<
+        typename GreaterThanValuesViewType::non_const_value_type>
+        unaryPred{val};
+#else
+    GreaterEqualFunctor unaryPred{val};
+#endif
+    ptrdiff_t resultDist = 0;
+
+    switch (m_apiPick) {
+      case 0: {
+        auto it    = KE::find_if(member, KE::cbegin(myRowViewFrom),
+                              KE::cend(myRowViewFrom), unaryPred);
+        resultDist = KE::distance(KE::cbegin(myRowViewFrom), it);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_distancesView(myRowIndex) = resultDist;
+        });
+
+        break;
+      }
+
+      case 1: {
+        auto it    = KE::find_if(member, myRowViewFrom, unaryPred);
+        resultDist = KE::distance(KE::begin(myRowViewFrom), it);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_distancesView(myRowIndex) = resultDist;
+        });
+
+        break;
+      }
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(const bool predicatesReturnTrue, std::size_t numTeams,
+            std::size_t numCols, int apiId) {
+  /* description:
+     use a rank-2 view randomly filled with values,
+     and run a team-level find_if
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range.
+  constexpr ValueType lowerBound = 5;
+  constexpr ValueType upperBound = 523;
+  const auto bounds              = make_bounds(lowerBound, upperBound);
+
+  auto [dataView, dataViewBeforeOp_h] = create_random_view_and_host_clone(
+      LayoutTag{}, numTeams, numCols, bounds, "dataView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // find_if returns an iterator so to verify that it is correct
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the std result
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // If predicatesReturnTrue == true, we want to ensure that eventually, for
+  // some value from each of dataView's row, predicate GreaterEqualFunctor will
+  // return true. To do that, for each numTeams, a random j index from a range
+  // [0, numCols) is used to obtain a value from dataView, and later that value
+  // is used for creating concrete GreaterEqualFunctor predicate.
+  //
+  // If predicatesReturnTrue == false we want to ensure the opposite, so every
+  // value is randomly picked from range [upperBound, upperBound*2).
+  Kokkos::View<ValueType*> greaterEqualValuesView("greaterEqualValuesView",
+                                                  numTeams);
+  auto greaterEqualValuesView_h =
+      create_mirror_view(Kokkos::HostSpace(), greaterEqualValuesView);
+
+  using rand_pool =
+      Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace>;
+  rand_pool pool(lowerBound * upperBound);
+
+  if (predicatesReturnTrue) {
+    Kokkos::View<std::size_t*, Kokkos::DefaultHostExecutionSpace> randomIndices(
+        "randomIndices", numTeams);
+    Kokkos::fill_random(randomIndices, pool, 0, numCols);
+
+    for (std::size_t i = 0; i < numTeams; ++i) {
+      const std::size_t j         = randomIndices(i);
+      greaterEqualValuesView_h(i) = dataViewBeforeOp_h(i, j);
+    }
+  } else {
+    Kokkos::fill_random(greaterEqualValuesView_h, pool, upperBound,
+                        upperBound * 2);
+  }
+
+  Kokkos::deep_copy(greaterEqualValuesView, greaterEqualValuesView_h);
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, greaterEqualValuesView, distancesView,
+                   intraTeamSentinelView, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run cpp-std kernel and check
+  // -----------------------------------------------
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+
+  for (std::size_t i = 0; i < dataView.extent(0); ++i) {
+    auto rowFrom = Kokkos::subview(dataViewBeforeOp_h, i, Kokkos::ALL());
+    const auto rowFromBegin = KE::cbegin(rowFrom);
+    const auto rowFromEnd   = KE::cend(rowFrom);
+    const auto val          = greaterEqualValuesView_h(i);
+    // FIXME_INTEL
+#if defined(KOKKOS_COMPILER_INTEL) && (1900 == KOKKOS_COMPILER_INTEL)
+    const GreaterEqualFunctor<ValueType> unaryPred{val};
+#else
+    const GreaterEqualFunctor unaryPred{val};
+#endif
+
+    auto it = std::find_if(rowFromBegin, rowFromEnd, unaryPred);
+
+    const std::size_t stdDistance      = KE::distance(rowFromBegin, it);
+    const std::size_t beginEndDistance = KE::distance(rowFromBegin, rowFromEnd);
+
+    if (predicatesReturnTrue) {
+      EXPECT_LT(stdDistance, beginEndDistance);
+    } else {
+      ASSERT_EQ(stdDistance, beginEndDistance);
+    }
+
+    ASSERT_EQ(stdDistance, distancesView_h(i));
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios(const bool predicatesReturnTrue) {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {1, 2, 13, 101, 1444, 8153}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(predicatesReturnTrue, numTeams, numCols,
+                                     apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_find_if_team_test, predicates_return_true) {
+  constexpr bool predicatesReturnTrue = true;
+
+  run_all_scenarios<DynamicTag, double>(predicatesReturnTrue);
+  run_all_scenarios<StridedTwoRowsTag, int>(predicatesReturnTrue);
+  run_all_scenarios<StridedThreeRowsTag, unsigned>(predicatesReturnTrue);
+}
+
+TEST(std_algorithms_find_if_team_test, predicates_return_false) {
+  constexpr bool predicatesReturnTrue = false;
+
+  run_all_scenarios<DynamicTag, double>(predicatesReturnTrue);
+  run_all_scenarios<StridedTwoRowsTag, int>(predicatesReturnTrue);
+  run_all_scenarios<StridedThreeRowsTag, unsigned>(predicatesReturnTrue);
+}
+
+}  // namespace TeamFindIf
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIfNot.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIfNot.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b9448c1a3e688f2bf9879bb20fc42a205ab98fb8
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIfNot.cpp
@@ -0,0 +1,236 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamFindIfNot {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct GreaterEqualFunctor {
+  ValueType m_val;
+
+  KOKKOS_INLINE_FUNCTION
+  GreaterEqualFunctor(ValueType val) : m_val(val) {}
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator()(ValueType val) const { return (val >= m_val); }
+};
+
+template <class DataViewType, class GreaterThanValuesViewType,
+          class DistancesViewType, class IntraTeamSentinelView>
+struct TestFunctorA {
+  DataViewType m_dataView;
+  GreaterThanValuesViewType m_greaterThanValuesView;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+
+  TestFunctorA(const DataViewType dataView,
+               const GreaterThanValuesViewType greaterThanValuesView,
+               DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView, int apiPick)
+      : m_dataView(dataView),
+        m_greaterThanValuesView(greaterThanValuesView),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowViewFrom = Kokkos::subview(m_dataView, myRowIndex, Kokkos::ALL());
+    const auto val     = m_greaterThanValuesView(myRowIndex);
+    // FIXME_INTEL
+#if defined(KOKKOS_COMPILER_INTEL) && (1900 == KOKKOS_COMPILER_INTEL)
+    GreaterEqualFunctor<
+        typename GreaterThanValuesViewType::non_const_value_type>
+        unaryPred{val};
+#else
+    GreaterEqualFunctor unaryPred{val};
+#endif
+    ptrdiff_t resultDist = 0;
+
+    switch (m_apiPick) {
+      case 0: {
+        auto it    = KE::find_if_not(member, KE::cbegin(myRowViewFrom),
+                                  KE::cend(myRowViewFrom), unaryPred);
+        resultDist = KE::distance(KE::cbegin(myRowViewFrom), it);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_distancesView(myRowIndex) = resultDist;
+        });
+
+        break;
+      }
+
+      case 1: {
+        auto it    = KE::find_if_not(member, myRowViewFrom, unaryPred);
+        resultDist = KE::distance(KE::begin(myRowViewFrom), it);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_distancesView(myRowIndex) = resultDist;
+        });
+
+        break;
+      }
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(const bool predicatesReturnTrue, std::size_t numTeams,
+            std::size_t numCols, int apiId) {
+  /* description:
+     use a rank-2 view randomly filled with values,
+     and run a team-level find_if_not
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range.
+  constexpr ValueType lowerBound = 5;
+  constexpr ValueType upperBound = 523;
+  const auto bounds              = make_bounds(lowerBound, upperBound);
+
+  auto [dataView, dataViewBeforeOp_h] = create_random_view_and_host_clone(
+      LayoutTag{}, numTeams, numCols, bounds, "dataView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // find_if_not returns an iterator so to verify that it is correct
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the std result
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // If predicatesReturnTrue == true, we want to ensure that for each dataView's
+  // row find_if_not always returns end iterator. To do that,
+  // GreaterEqualFunctor predicate created for each row must return true for
+  // every value in that row, so it needs to compare each value with value
+  // smaller than lowerBound.
+  //
+  // If predicatesReturnTrue == false we want to ensure the opposite -
+  // GreaterEqualFunctor needs to return false for every value of each
+  // dataView's row, so the predicate is constructed with value randomly picked
+  // from range [upperBound, upperBound*2).
+  Kokkos::View<ValueType*> greaterEqualValuesView("greaterEqualValuesView",
+                                                  numTeams);
+  auto greaterEqualValuesView_h =
+      create_mirror_view(Kokkos::HostSpace(), greaterEqualValuesView);
+
+  using rand_pool =
+      Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace>;
+  rand_pool pool(lowerBound * upperBound);
+
+  if (predicatesReturnTrue) {
+    Kokkos::fill_random(greaterEqualValuesView_h, pool, 0, lowerBound);
+  } else {
+    Kokkos::fill_random(greaterEqualValuesView_h, pool, upperBound,
+                        upperBound * 2);
+  }
+
+  Kokkos::deep_copy(greaterEqualValuesView, greaterEqualValuesView_h);
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, greaterEqualValuesView, distancesView,
+                   intraTeamSentinelView, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run cpp-std kernel and check
+  // -----------------------------------------------
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+
+  for (std::size_t i = 0; i < dataView.extent(0); ++i) {
+    auto rowFrom = Kokkos::subview(dataViewBeforeOp_h, i, Kokkos::ALL());
+    const auto rowFromBegin = KE::cbegin(rowFrom);
+    const auto rowFromEnd   = KE::cend(rowFrom);
+    const auto val          = greaterEqualValuesView_h(i);
+    // FIXME_INTEL
+#if defined(KOKKOS_COMPILER_INTEL) && (1900 == KOKKOS_COMPILER_INTEL)
+    const GreaterEqualFunctor<ValueType> unaryPred{val};
+#else
+    const GreaterEqualFunctor unaryPred{val};
+#endif
+
+    auto it = std::find_if_not(rowFromBegin, rowFromEnd, unaryPred);
+
+    const std::size_t stdDistance      = KE::distance(rowFromBegin, it);
+    const std::size_t beginEndDistance = KE::distance(rowFromBegin, rowFromEnd);
+
+    if (predicatesReturnTrue) {
+      ASSERT_EQ(stdDistance, beginEndDistance);
+    } else {
+      EXPECT_LT(stdDistance, beginEndDistance);
+    }
+
+    ASSERT_EQ(stdDistance, distancesView_h(i));
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios(const bool predicatesReturnTrue) {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {1, 2, 13, 101, 1444, 8153}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(predicatesReturnTrue, numTeams, numCols,
+                                     apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_find_if_not_team_test, predicates_return_true) {
+  constexpr bool predicatesReturnTrue = true;
+
+  run_all_scenarios<DynamicTag, double>(predicatesReturnTrue);
+  run_all_scenarios<StridedTwoRowsTag, int>(predicatesReturnTrue);
+  run_all_scenarios<StridedThreeRowsTag, unsigned>(predicatesReturnTrue);
+}
+
+TEST(std_algorithms_find_if_not_team_test, predicates_return_false) {
+  constexpr bool predicatesReturnTrue = false;
+
+  run_all_scenarios<DynamicTag, double>(predicatesReturnTrue);
+  run_all_scenarios<StridedTwoRowsTag, int>(predicatesReturnTrue);
+  run_all_scenarios<StridedThreeRowsTag, unsigned>(predicatesReturnTrue);
+}
+
+}  // namespace TeamFindIfNot
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamForEach.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamForEach.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..26e00c21e53087261c68c0849f557798c8c013bd
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamForEach.cpp
@@ -0,0 +1,126 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamForEach {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct PrefixIncrementFunctor {
+  KOKKOS_INLINE_FUNCTION
+  void operator()(ValueType& val) const { ++val; }
+};
+
+template <class DataViewType, class UnaryPredType>
+struct TestFunctorA {
+  DataViewType m_dataView;
+  int m_apiPick;
+  UnaryPredType m_unaryPred;
+
+  TestFunctorA(const DataViewType dataView, int apiPick,
+               UnaryPredType unaryPred)
+      : m_dataView(dataView), m_apiPick(apiPick), m_unaryPred(unaryPred) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowViewFrom = Kokkos::subview(m_dataView, myRowIndex, Kokkos::ALL());
+
+    switch (m_apiPick) {
+      case 0: {
+        KE::for_each(member, KE::begin(myRowViewFrom), KE::end(myRowViewFrom),
+                     m_unaryPred);
+        break;
+      }
+
+      case 1: {
+        KE::for_each(member, myRowViewFrom, m_unaryPred);
+        break;
+      }
+    }
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     use a rank-2 view randomly filled with values,
+     and run a team-level for_each
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range.
+  constexpr ValueType lowerBound = 5;
+  constexpr ValueType upperBound = 523;
+  const auto bounds              = make_bounds(lowerBound, upperBound);
+
+  auto [dataView, _] = create_random_view_and_host_clone(
+      LayoutTag{}, numTeams, numCols, bounds, "dataView");
+
+  // for_each modifies dataView, so make a separated host copy of if
+  auto dataViewBeforeOp_h = create_host_space_copy(dataView);
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  PrefixIncrementFunctor<ValueType> unaryPred;
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, apiId, unaryPred);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run cpp-std kernel and check
+  // -----------------------------------------------
+  auto dataViewAfterOp_h = create_host_space_copy(dataView);
+  for (std::size_t i = 0; i < dataViewAfterOp_h.extent(0); ++i) {
+    for (std::size_t j = 0; j < dataViewAfterOp_h.extent(1); ++j) {
+      ASSERT_DOUBLE_EQ(dataViewBeforeOp_h(i, j) + 1, dataViewAfterOp_h(i, j));
+    }
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_for_each_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamForEach
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamForEachN.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamForEachN.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0f4793490f2a1205344c9f9a6acab7b60955e930
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamForEachN.cpp
@@ -0,0 +1,144 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamForEachN {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct PrefixIncrementFunctor {
+  KOKKOS_INLINE_FUNCTION
+  void operator()(ValueType& val) const { ++val; }
+};
+
+template <class DataViewType, class NViewType, class UnaryPredType>
+struct TestFunctorA {
+  DataViewType m_dataView;
+  NViewType m_nView;
+  int m_apiPick;
+  UnaryPredType m_unaryPred;
+
+  TestFunctorA(const DataViewType dataView, const NViewType nView, int apiPick,
+               UnaryPredType unaryPred)
+      : m_dataView(dataView),
+        m_nView(nView),
+        m_apiPick(apiPick),
+        m_unaryPred(unaryPred) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    const auto n          = m_nView(myRowIndex);
+
+    auto myRowViewFrom = Kokkos::subview(m_dataView, myRowIndex, Kokkos::ALL());
+
+    switch (m_apiPick) {
+      case 0: {
+        KE::for_each_n(member, KE::begin(myRowViewFrom), n, m_unaryPred);
+        break;
+      }
+
+      case 1: {
+        KE::for_each_n(member, myRowViewFrom, n, m_unaryPred);
+        break;
+      }
+    }
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     use a rank-2 view randomly filled with values,
+     and run a team-level for_each_n
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range.
+  constexpr ValueType lowerBound = 5;
+  constexpr ValueType upperBound = 523;
+  const auto bounds              = make_bounds(lowerBound, upperBound);
+
+  auto [dataView, _] = create_random_view_and_host_clone(
+      LayoutTag{}, numTeams, numCols, bounds, "dataView");
+
+  // for_each modifies dataView, so make a separated host copy of if
+  auto dataViewBeforeOp_h = create_host_space_copy(dataView);
+
+  Kokkos::View<std::size_t*> nView("nView", numTeams);
+  auto nView_h = create_host_space_copy(nView);
+  using rand_pool =
+      Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace>;
+  rand_pool pool(lowerBound * upperBound);
+  Kokkos::fill_random(nView_h, pool, 0, numCols);
+
+  Kokkos::deep_copy(nView, nView_h);
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  PrefixIncrementFunctor<ValueType> unaryPred;
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, nView, apiId, unaryPred);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run cpp-std kernel and check
+  // -----------------------------------------------
+  auto dataViewAfterOp_h = create_host_space_copy(dataView);
+  for (std::size_t i = 0; i < dataViewAfterOp_h.extent(0); ++i) {
+    for (std::size_t j = 0, n = 0; j < dataViewAfterOp_h.extent(1); ++j, ++n) {
+      if (n < nView_h(i)) {
+        ASSERT_DOUBLE_EQ(dataViewBeforeOp_h(i, j) + 1, dataViewAfterOp_h(i, j));
+      } else {
+        ASSERT_DOUBLE_EQ(dataViewBeforeOp_h(i, j), dataViewAfterOp_h(i, j));
+      }
+    }
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {1, 2, 13, 101, 1444, 8153}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_for_each_n_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamForEachN
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamGenerate.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamGenerate.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..28c10c95d2f842e8eb14de07ce2c202af9d05fcd
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamGenerate.cpp
@@ -0,0 +1,116 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamGenerate {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct Generator {
+  KOKKOS_INLINE_FUNCTION
+  ValueType operator()() const { return static_cast<ValueType>(23); }
+};
+
+template <class ViewType>
+struct TestFunctorA {
+  ViewType m_view;
+  int m_apiPick;
+
+  TestFunctorA(const ViewType view, int apiPick)
+      : m_view(view), m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowView        = Kokkos::subview(m_view, myRowIndex, Kokkos::ALL());
+
+    using value_type = typename ViewType::value_type;
+    if (m_apiPick == 0) {
+      KE::generate(member, KE::begin(myRowView), KE::end(myRowView),
+                   Generator<value_type>());
+    } else if (m_apiPick == 1) {
+      KE::generate(member, myRowView, Generator<value_type>());
+    }
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     randomly fill a view and then do a team-level generate
+     with one team per row to assign to each element a value
+     produced via a generator functor
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range. Pick range so that it does NOT
+  // contain the value produced by the generator (see top of file)
+  // otherwise test check below is ill-posed
+  auto [dataView, cloneOfDataViewBeforeOp_h] =
+      create_random_view_and_host_clone(
+          LayoutTag{}, numTeams, numCols,
+          Kokkos::pair<ValueType, ValueType>{105, 523}, "dataView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // check
+  // -----------------------------------------------
+  auto dataViewAfterOp_h = create_host_space_copy(dataView);
+  for (std::size_t i = 0; i < dataViewAfterOp_h.extent(0); ++i) {
+    for (std::size_t j = 0; j < dataViewAfterOp_h.extent(1); ++j) {
+      EXPECT_TRUE(dataViewAfterOp_h(i, j) == static_cast<ValueType>(23));
+      EXPECT_TRUE(dataViewAfterOp_h(i, j) != cloneOfDataViewBeforeOp_h(i, j));
+    }
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 51153}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_generate_team_test, test_unary_op) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamGenerate
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamGenerate_n.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamGenerate_n.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4b66dd9131fa128d7e2995e0cdb2f64d32e6e043
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamGenerate_n.cpp
@@ -0,0 +1,179 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamGenerate_n {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct GenerateFunctor {
+  KOKKOS_INLINE_FUNCTION
+  ValueType operator()() const { return static_cast<ValueType>(23); }
+};
+
+template <class ViewType, class DistancesViewType, class IntraTeamSentinelView>
+struct TestFunctorA {
+  ViewType m_view;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  std::size_t m_count;
+  int m_apiPick;
+
+  TestFunctorA(const ViewType view, const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView,
+               std::size_t count, int apiPick)
+      : m_view(view),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_count(count),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto leagueRank = member.league_rank();
+    const auto myRowIndex = leagueRank;
+    auto myRowView        = Kokkos::subview(m_view, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist  = 0;
+
+    using value_type = typename ViewType::value_type;
+    if (m_apiPick == 0) {
+      auto it = KE::generate_n(member, KE::begin(myRowView), m_count,
+                               GenerateFunctor<value_type>());
+
+      resultDist = KE::distance(KE::begin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    } else if (m_apiPick == 1) {
+      auto it    = KE::generate_n(member, myRowView, m_count,
+                               GenerateFunctor<value_type>());
+      resultDist = KE::distance(KE::begin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, std::size_t count,
+            int apiId) {
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range. Pick range so that it does NOT
+  // contain the value produced by the generator (see top of file)
+  // otherwise test check below is ill-posed
+  auto [dataView, cloneOfDataViewBeforeOp_h] =
+      create_random_view_and_host_clone(
+          LayoutTag{}, numTeams, numCols,
+          Kokkos::pair<ValueType, ValueType>{105, 523}, "dataView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the expected value
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, distancesView, intraTeamSentinelView, count,
+                   apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // check
+  // -----------------------------------------------
+  auto dataViewAfterOp_h       = create_host_space_copy(dataView);
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  for (std::size_t i = 0; i < dataView.extent(0); ++i) {
+    // check that values match what we expect
+    for (std::size_t j = 0; j < count; ++j) {
+      ASSERT_EQ(dataViewAfterOp_h(i, j), static_cast<ValueType>(23));
+      EXPECT_TRUE(dataViewAfterOp_h(i, j) != cloneOfDataViewBeforeOp_h(i, j));
+    }
+    // all other elements should be unchanged from before op
+    for (std::size_t j = count; j < numCols; ++j) {
+      ASSERT_EQ(dataViewAfterOp_h(i, j), cloneOfDataViewBeforeOp_h(i, j));
+    }
+
+    // check that returned iterators are correct
+    if (count > 0) {
+      ASSERT_EQ(distancesView_h(i), std::size_t(count));
+    } else {
+      ASSERT_EQ(distancesView_h(i), std::size_t(0));
+    }
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+}
+
+template <class Tag, class ValueType>
+void run_all_scenarios() {
+  // prepare a map where, for a given set of num cols
+  // we provide a list of counts of elements to generate
+  // key = num of columns
+  // value = list of num of elemenents to generate
+  const std::map<int, std::vector<int>> scenarios = {
+      {0, {0}},
+      {2, {0, 1, 2}},
+      {6, {0, 1, 2, 5}},
+      {13, {0, 1, 2, 8, 11}},
+      {56, {0, 1, 2, 8, 11, 33, 56}},
+      {123, {0, 1, 11, 33, 56, 89, 112}}};
+
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& scenario : scenarios) {
+      const std::size_t numCols = scenario.first;
+      for (int countToGenerate : scenario.second) {
+        for (int apiId : {0, 1}) {
+          test_A<Tag, ValueType>(numTeams, numCols, countToGenerate, apiId);
+        }
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_generate_n_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamGenerate_n
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0daf9dbfe824f1f40cebfaf87b1a37efad36c445
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp
@@ -0,0 +1,277 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+#include "std_algorithms/Kokkos_BeginEnd.hpp"
+
+namespace Test {
+namespace stdalgos {
+namespace TeamInclusiveScan {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct PlusFunctor {
+  KOKKOS_INLINE_FUNCTION constexpr ValueType operator()(
+      const ValueType& lhs, const ValueType& rhs) const {
+    return lhs + rhs;
+  }
+};
+
+template <class SourceViewType, class DestViewType, class DistancesViewType,
+          class IntraTeamSentinelView, class InitValuesViewType,
+          class BinaryOpType>
+struct TestFunctorA {
+  SourceViewType m_sourceView;
+  DestViewType m_destView;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  InitValuesViewType m_initValuesView;
+  BinaryOpType m_binaryOp;
+  int m_apiPick;
+
+  TestFunctorA(const SourceViewType sourceView, const DestViewType destView,
+               const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView,
+               const InitValuesViewType initValuesView, BinaryOpType binaryOp,
+               int apiPick)
+      : m_sourceView(sourceView),
+        m_destView(destView),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_initValuesView(initValuesView),
+        m_binaryOp(binaryOp),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto rowIndex = member.league_rank();
+
+    auto srcRow      = Kokkos::subview(m_sourceView, rowIndex, Kokkos::ALL());
+    const auto first = KE::cbegin(srcRow);
+    const auto last  = KE::cend(srcRow);
+    auto destRow     = Kokkos::subview(m_destView, rowIndex, Kokkos::ALL());
+    const auto firstDest = KE::begin(destRow);
+
+    const auto initVal   = m_initValuesView(rowIndex);
+    ptrdiff_t resultDist = 0;
+
+    switch (m_apiPick) {
+      case 0: {
+        auto it    = KE::inclusive_scan(member, first, last, firstDest);
+        resultDist = KE::distance(firstDest, it);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this] { m_distancesView(rowIndex) = resultDist; });
+
+        break;
+      }
+
+      case 1: {
+        auto it    = KE::inclusive_scan(member, srcRow, destRow);
+        resultDist = KE::distance(firstDest, it);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this] { m_distancesView(rowIndex) = resultDist; });
+
+        break;
+      }
+
+      case 2: {
+        auto it =
+            KE::inclusive_scan(member, first, last, firstDest, m_binaryOp);
+        resultDist = KE::distance(firstDest, it);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this] { m_distancesView(rowIndex) = resultDist; });
+
+        break;
+      }
+
+      case 3: {
+        auto it    = KE::inclusive_scan(member, srcRow, destRow, m_binaryOp);
+        resultDist = KE::distance(firstDest, it);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this] { m_distancesView(rowIndex) = resultDist; });
+
+        break;
+      }
+
+      case 4: {
+        auto it = KE::inclusive_scan(member, first, last, firstDest, m_binaryOp,
+                                     initVal);
+        resultDist = KE::distance(firstDest, it);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this] { m_distancesView(rowIndex) = resultDist; });
+
+        break;
+      }
+
+      case 5: {
+        auto it =
+            KE::inclusive_scan(member, srcRow, destRow, m_binaryOp, initVal);
+        resultDist = KE::distance(firstDest, it);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this] { m_distancesView(rowIndex) = resultDist; });
+
+        break;
+      }
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(rowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(rowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     use a rank-2 view randomly filled with values,
+     and run a team-level inclusive_scan
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range.
+  constexpr ValueType lowerBound = 5;
+  constexpr ValueType upperBound = 523;
+  const auto bounds              = make_bounds(lowerBound, upperBound);
+
+  auto [sourceView, sourceViewBeforeOp_h] = create_random_view_and_host_clone(
+      LayoutTag{}, numTeams, numCols, bounds, "sourceView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // create the destination view
+  Kokkos::View<ValueType**> destView("destView", numTeams, numCols);
+
+  // inclusive_scan returns an iterator so to verify that it is correct
+  // each team stores the distance of the returned iterator from the beginning
+  // of the interval that team operates on and then we check that these
+  // distances match the std result
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  PlusFunctor<ValueType> binaryOp;
+
+  // Create view of reduce init values to be used by test cases
+  Kokkos::View<ValueType*, Kokkos::DefaultHostExecutionSpace> initValuesView_h(
+      "initValuesView_h", numTeams);
+  using rand_pool =
+      Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace>;
+  rand_pool pool(lowerBound * upperBound);
+  Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound);
+
+  // use CTAD for functor
+  auto initValuesView =
+      Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h);
+  TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView,
+                   initValuesView, binaryOp, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run cpp-std kernel and check
+  // -----------------------------------------------
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestView("stdDestView",
+                                                           numTeams, numCols);
+
+  for (std::size_t i = 0; i < sourceView.extent(0); ++i) {
+    auto srcRow    = Kokkos::subview(sourceViewBeforeOp_h, i, Kokkos::ALL());
+    auto first     = KE::begin(srcRow);
+    auto last      = KE::end(srcRow);
+    auto destRow   = Kokkos::subview(stdDestView, i, Kokkos::ALL());
+    auto firstDest = KE::begin(destRow);
+    auto initValue = initValuesView_h(i);
+
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+
+// libstdc++ as provided by GCC 8 does not have inclusive_scan and
+// for GCC 9.1, 9.2 fails to compile for missing overload not accepting policy
+#if defined(_GLIBCXX_RELEASE) && (_GLIBCXX_RELEASE <= 9)
+#define inclusive_scan testing_inclusive_scan
+#else
+#define inclusive_scan std::inclusive_scan
+#endif
+
+    switch (apiId) {
+      case 0:
+      case 1: {
+        auto it                       = inclusive_scan(first, last, firstDest);
+        const std::size_t stdDistance = KE::distance(firstDest, it);
+        ASSERT_EQ(stdDistance, distancesView_h(i));
+
+        break;
+      }
+
+      case 2:
+      case 3: {
+        auto it = inclusive_scan(first, last, firstDest, binaryOp);
+        const std::size_t stdDistance = KE::distance(firstDest, it);
+        ASSERT_EQ(stdDistance, distancesView_h(i));
+
+        break;
+      }
+
+      case 4:
+      case 5: {
+        auto it = inclusive_scan(first, last, firstDest, binaryOp, initValue);
+        const std::size_t stdDistance = KE::distance(firstDest, it);
+        ASSERT_EQ(stdDistance, distancesView_h(i));
+
+        break;
+      }
+    }
+
+#undef inclusive_scan
+  }
+
+  auto dataViewAfterOp_h = create_host_space_copy(destView);
+  expect_equal_host_views(stdDestView, dataViewAfterOp_h);
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) {
+      for (int apiId : {0, 1, 2, 3, 4, 5}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_inclusive_scan_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamInclusiveScan
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsPartitioned.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsPartitioned.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1928f9558806524596b929144b167037ab38f618
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsPartitioned.cpp
@@ -0,0 +1,255 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+#include <algorithm>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamIsPartitioned {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct UnifDist;
+
+template <>
+struct UnifDist<int> {
+  using dist_type = std::uniform_int_distribution<int>;
+  std::mt19937 m_gen;
+  dist_type m_dist;
+
+  UnifDist(int a, int b, std::size_t seedIn) : m_dist(a, b) {
+    m_gen.seed(seedIn);
+  }
+
+  int operator()() { return m_dist(m_gen); }
+};
+
+template <class ValueType>
+struct GreaterThanValueFunctor {
+  ValueType m_val;
+
+  KOKKOS_INLINE_FUNCTION
+  GreaterThanValueFunctor(ValueType val) : m_val(val) {}
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator()(ValueType val) const { return (val > m_val); }
+};
+
+template <class ViewType, class ReturnViewType, class IntraTeamSentinelView,
+          class ValueType>
+struct TestFunctorA {
+  ViewType m_view;
+  ReturnViewType m_returnsView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  ValueType m_threshold;
+  int m_apiPick;
+
+  TestFunctorA(const ViewType view, const ReturnViewType returnsView,
+               const IntraTeamSentinelView intraTeamSentinelView,
+               ValueType threshold, int apiPick)
+      : m_view(view),
+        m_returnsView(returnsView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_threshold(threshold),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowView        = Kokkos::subview(m_view, myRowIndex, Kokkos::ALL());
+    bool result           = false;
+
+    GreaterThanValueFunctor predicate(m_threshold);
+    if (m_apiPick == 0) {
+      result = KE::is_partitioned(member, KE::cbegin(myRowView),
+                                  KE::cend(myRowView), predicate);
+
+      Kokkos::single(Kokkos::PerTeam(member),
+                     [=, *this]() { m_returnsView(myRowIndex) = result; });
+    } else if (m_apiPick == 1) {
+      result = KE::is_partitioned(member, myRowView, predicate);
+
+      Kokkos::single(Kokkos::PerTeam(member),
+                     [=, *this]() { m_returnsView(myRowIndex) = result; });
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, result, m_returnsView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId,
+            const std::string& sIn) {
+  /* description:
+     use a rank-2 view randomly filled with values in a range (a,b)
+     and run a team-level is_partitioned with predicate = IsGreaterThanValue
+     where threshold is set to a number larger than b above
+   */
+  const auto threshold           = static_cast<ValueType>(1103);
+  const auto valueForSureGreater = static_cast<ValueType>(2103);
+  const auto valueForSureSmaller = static_cast<ValueType>(111);
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // construct in memory space associated with default exespace
+  auto dataView =
+      create_view<ValueType>(LayoutTag{}, numTeams, numCols, "dataView");
+
+  // dataView might not deep copyable (e.g. strided layout) so to
+  // randomize it, we make a new view that is for sure deep copyable,
+  // modify it on the host, deep copy to device and then launch
+  // a kernel to copy to dataView
+  auto dataView_dc =
+      create_deep_copyable_compatible_view_with_same_extent(dataView);
+  auto dataView_dc_h = create_mirror_view(Kokkos::HostSpace(), dataView_dc);
+
+  if (sIn == "trivialEmpty") {
+    // do nothing
+  }
+
+  else if (sIn == "allTrue") {
+    // randomly fill with values greater than threshold
+    // so that all elements in each row satisfy the predicate
+    // so this counts as being partitioned
+    Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace> pool(
+        452377);
+    Kokkos::fill_random(dataView_dc_h, pool, ValueType(2001), ValueType(2501));
+  }
+
+  else if (sIn == "allFalse") {
+    // randomly fill the view with values smaller than threshold
+    // and even in this case each row counts as partitioned
+    Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace> pool(
+        452377);
+    Kokkos::fill_random(dataView_dc_h, pool, ValueType(0), ValueType(101));
+  }
+
+  else if (sIn == "random") {
+    // randomly select a location and make all values before that
+    // larger than threshol and all values after to be smaller than threshold
+    // so that this picked location does partition the range
+    UnifDist<int> indexProducer(0, numCols - 1, 3432779);
+    for (std::size_t i = 0; i < dataView_dc_h.extent(0); ++i) {
+      const std::size_t a = indexProducer();
+      for (std::size_t j = 0; j < a; ++j) {
+        dataView_dc_h(i, j) = valueForSureGreater;
+      }
+      for (std::size_t j = a; j < numCols; ++j) {
+        dataView_dc_h(i, j) = valueForSureSmaller;
+      }
+    }
+  }
+
+  // copy to dataView_dc and then to dataView
+  Kokkos::deep_copy(dataView_dc, dataView_dc_h);
+  // use CTAD
+  CopyFunctorRank2 F1(dataView_dc, dataView);
+  Kokkos::parallel_for("copy", dataView.extent(0) * dataView.extent(1), F1);
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // to verify that things work, each team stores the result
+  // and then we check that these match what we expect
+  Kokkos::View<bool*> returnView("returnView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, returnView, intraTeamSentinelView, threshold,
+                   apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // check
+  // -----------------------------------------------
+  auto returnView_h            = create_host_space_copy(returnView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  GreaterThanValueFunctor predicate(threshold);
+
+  for (std::size_t i = 0; i < dataView_dc_h.extent(0); ++i) {
+    auto myRow = Kokkos::subview(dataView_dc_h, i, Kokkos::ALL());
+    const bool stdResult =
+        std::is_partitioned(KE::cbegin(myRow), KE::cend(myRow), predicate);
+    // our result must match std
+    EXPECT_TRUE(stdResult == returnView_h(i));
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+
+  // dataView should remain unchanged
+  auto dataViewAfterOp_h = create_host_space_copy(dataView);
+  expect_equal_host_views(dataView_dc_h, dataViewAfterOp_h);
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios(const std::string& name, const std::vector<int>& cols) {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : cols) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId, name);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_is_partitioned_team_test, empty) {
+  const std::string name      = "trivialEmpty";
+  const std::vector<int> cols = {0};
+  run_all_scenarios<DynamicTag, double>(name, cols);
+  run_all_scenarios<StridedTwoRowsTag, double>(name, cols);
+  run_all_scenarios<StridedThreeRowsTag, int>(name, cols);
+}
+
+TEST(std_algorithms_is_partitioned_team_test, all_true) {
+  const std::string name      = "allTrue";
+  const std::vector<int> cols = {13, 101, 1444, 5153};
+  run_all_scenarios<DynamicTag, double>(name, cols);
+  run_all_scenarios<StridedTwoRowsTag, double>(name, cols);
+  run_all_scenarios<StridedThreeRowsTag, int>(name, cols);
+}
+
+TEST(std_algorithms_is_partitioned_team_test, all_false) {
+  const std::string name      = "allFalse";
+  const std::vector<int> cols = {13, 101, 1444, 5153};
+  run_all_scenarios<DynamicTag, double>(name, cols);
+  run_all_scenarios<StridedTwoRowsTag, double>(name, cols);
+  run_all_scenarios<StridedThreeRowsTag, int>(name, cols);
+}
+
+TEST(std_algorithms_is_partitioned_team_test, random) {
+  const std::string name      = "random";
+  const std::vector<int> cols = {13, 101, 1444, 5153};
+  run_all_scenarios<DynamicTag, double>(name, cols);
+  run_all_scenarios<StridedTwoRowsTag, double>(name, cols);
+  run_all_scenarios<StridedThreeRowsTag, int>(name, cols);
+}
+
+}  // namespace TeamIsPartitioned
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f9adeb0654b83ae6ac101eb1d69f2695c6c794ff
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp
@@ -0,0 +1,209 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamIsSorted {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ViewType, class ReturnViewType, class IntraTeamSentinelView>
+struct TestFunctorA {
+  ViewType m_view;
+  ReturnViewType m_returnsView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+
+  TestFunctorA(const ViewType view, const ReturnViewType returnsView,
+               const IntraTeamSentinelView intraTeamSentinelView, int apiPick)
+      : m_view(view),
+        m_returnsView(returnsView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowView        = Kokkos::subview(m_view, myRowIndex, Kokkos::ALL());
+    bool result           = false;
+
+    if (m_apiPick == 0) {
+      result =
+          KE::is_sorted(member, KE::cbegin(myRowView), KE::cend(myRowView));
+      Kokkos::single(Kokkos::PerTeam(member),
+                     [=, *this]() { m_returnsView(myRowIndex) = result; });
+    } else if (m_apiPick == 1) {
+      result = KE::is_sorted(member, myRowView);
+      Kokkos::single(Kokkos::PerTeam(member),
+                     [=, *this]() { m_returnsView(myRowIndex) = result; });
+    }
+#if not defined KOKKOS_ENABLE_OPENMPTARGET
+    else if (m_apiPick == 2) {
+      using value_type = typename ViewType::value_type;
+      result = KE::is_sorted(member, KE::cbegin(myRowView), KE::cend(myRowView),
+                             CustomLessThanComparator<value_type>{});
+      Kokkos::single(Kokkos::PerTeam(member),
+                     [=, *this]() { m_returnsView(myRowIndex) = result; });
+    } else if (m_apiPick == 3) {
+      using value_type = typename ViewType::value_type;
+      result           = KE::is_sorted(member, myRowView,
+                             CustomLessThanComparator<value_type>{});
+      Kokkos::single(Kokkos::PerTeam(member),
+                     [=, *this]() { m_returnsView(myRowIndex) = result; });
+    }
+#endif
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, result, m_returnsView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId,
+            bool makeDataSortedOnPurpose) {
+  /* description:
+     use a rank-2 view randomly filled with values,
+     and run a team-level is_sorted
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // construct in memory space associated with default exespace
+  auto dataView =
+      create_view<ValueType>(LayoutTag{}, numTeams, numCols, "dataView");
+
+  // dataView might not deep copyable (e.g. strided layout) so to
+  // randomize it, we make a new view that is for sure deep copyable,
+  // modify it on the host, deep copy to device and then launch
+  // a kernel to copy to dataView
+  auto dataView_dc =
+      create_deep_copyable_compatible_view_with_same_extent(dataView);
+  auto dataView_dc_h = create_mirror_view(Kokkos::HostSpace(), dataView_dc);
+
+  if (makeDataSortedOnPurpose) {
+    for (std::size_t i = 0; i < dataView_dc_h.extent(0); ++i) {
+      for (std::size_t j = 0; j < dataView_dc_h.extent(1); ++j) {
+        dataView_dc_h(i, j) = ValueType(j);
+      }
+    }
+  } else {
+    // randomly fill the view
+    using rand_pool =
+        Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace>;
+    rand_pool pool(45234977);
+    Kokkos::fill_random(dataView_dc_h, pool, ValueType{5}, ValueType{1545});
+  }
+
+  // copy to dataView_dc and then to dataView
+  Kokkos::deep_copy(dataView_dc, dataView_dc_h);
+  // use CTAD
+  CopyFunctorRank2 F1(dataView_dc, dataView);
+  Kokkos::parallel_for("copy", dataView.extent(0) * dataView.extent(1), F1);
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // to verify that things work, each team stores the result
+  // and then we check that these match what we expect
+  Kokkos::View<bool*> returnView("returnView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, returnView, intraTeamSentinelView, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // check
+  // -----------------------------------------------
+  auto returnView_h            = create_host_space_copy(returnView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  for (std::size_t i = 0; i < dataView_dc_h.extent(0); ++i) {
+    auto myRow = Kokkos::subview(dataView_dc_h, i, Kokkos::ALL());
+
+    bool stdResult;
+    if (apiId <= 1) {
+      stdResult = std::is_sorted(KE::cbegin(myRow), KE::cend(myRow));
+    } else {
+      stdResult = std::is_sorted(KE::cbegin(myRow), KE::cend(myRow),
+                                 CustomLessThanComparator<ValueType>{});
+    }
+
+    // our result must match std
+    EXPECT_TRUE(stdResult == returnView_h(i));
+
+    // check also since we know in advance when data is really sorted.
+    // note that we have to be careful because when we have only
+    // 0, 1 columns, then the data is sorted by definition
+    // and when we have 2 columns it is very likely it is sorted
+    // so only do the following check for large enough cols count
+    if (numCols <= 1) {
+      EXPECT_TRUE(stdResult == true);
+    } else if (numCols > 10) {
+      EXPECT_TRUE(stdResult == makeDataSortedOnPurpose);
+    }
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+
+  // dataView should remain unchanged
+  auto dataViewAfterOp_h = create_host_space_copy(dataView);
+  expect_equal_host_views(dataView_dc_h, dataViewAfterOp_h);
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios(bool makeDataSortedOnPurpose) {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 5153}) {
+#if not defined KOKKOS_ENABLE_OPENMPTARGET
+      for (int apiId : {0, 1, 2, 3}) {
+#else
+      for (int apiId : {0, 1}) {
+#endif
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId,
+                                     makeDataSortedOnPurpose);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_is_sorted_team_test,
+     test_data_almost_certainly_not_sorted) {
+  run_all_scenarios<DynamicTag, double>(false);
+  run_all_scenarios<StridedTwoRowsTag, double>(false);
+  run_all_scenarios<StridedThreeRowsTag, unsigned>(false);
+}
+
+TEST(std_algorithms_is_sorted_team_test, test_data_certainly_sorted) {
+  run_all_scenarios<DynamicTag, double>(true);
+  run_all_scenarios<StridedTwoRowsTag, double>(true);
+  run_all_scenarios<StridedThreeRowsTag, unsigned>(true);
+}
+
+}  // namespace TeamIsSorted
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..33af5f99def666bf9af9ddc00408c638b582a7e0
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp
@@ -0,0 +1,275 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamIsSortedUntil {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct UnifDist;
+
+template <>
+struct UnifDist<int> {
+  using dist_type = std::uniform_int_distribution<int>;
+  std::mt19937 m_gen;
+  dist_type m_dist;
+
+  UnifDist(int a, int b, std::size_t seedIn) : m_dist(a, b) {
+    m_gen.seed(seedIn);
+  }
+
+  int operator()() { return m_dist(m_gen); }
+};
+
+template <class ViewType, class DistancesViewType, class IntraTeamSentinelView>
+struct TestFunctorA {
+  ViewType m_view;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+
+  TestFunctorA(const ViewType view, const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView, int apiPick)
+      : m_view(view),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto leagueRank = member.league_rank();
+    const auto myRowIndex = leagueRank;
+    auto myRowView        = Kokkos::subview(m_view, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist  = 0;
+
+    if (m_apiPick == 0) {
+      auto it    = KE::is_sorted_until(member, KE::cbegin(myRowView),
+                                    KE::cend(myRowView));
+      resultDist = KE::distance(KE::cbegin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    } else if (m_apiPick == 1) {
+      auto it    = KE::is_sorted_until(member, myRowView);
+      resultDist = KE::distance(KE::begin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+#if not defined KOKKOS_ENABLE_OPENMPTARGET
+    else if (m_apiPick == 2) {
+      using value_type = typename ViewType::value_type;
+      auto it          = KE::is_sorted_until(member, KE::cbegin(myRowView),
+                                    KE::cend(myRowView),
+                                    CustomLessThanComparator<value_type>{});
+      resultDist       = KE::distance(KE::cbegin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    else if (m_apiPick == 3) {
+      using value_type = typename ViewType::value_type;
+      auto it          = KE::is_sorted_until(member, myRowView,
+                                    CustomLessThanComparator<value_type>{});
+      resultDist       = KE::distance(KE::begin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+#endif
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId,
+            const std::string& sIn) {
+  /* description:
+     use a rank-2 view and run a team-level is_sorted_until
+     for various trivial and non trivial scenarios
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // construct in memory space associated with default exespace
+  auto dataView =
+      create_view<ValueType>(LayoutTag{}, numTeams, numCols, "dataView");
+
+  // dataView might not deep copyable (e.g. strided layout) so to
+  // randomize it, we make a new view that is for sure deep copyable,
+  // modify it on the host, deep copy to device and then launch
+  // a kernel to copy to dataView
+  auto dataView_dc =
+      create_deep_copyable_compatible_view_with_same_extent(dataView);
+  auto dataView_dc_h = create_mirror_view(Kokkos::HostSpace(), dataView_dc);
+
+  if (sIn == "trivialEmpty" || sIn == "trivialOneElement") {
+    // do not do anything
+  }
+
+  else if (sIn == "nontrivialUntilLast") {
+    for (std::size_t i = 0; i < dataView_dc_h.extent(0); ++i) {
+      for (std::size_t j = 0; j < dataView_dc_h.extent(1); ++j) {
+        dataView_dc_h(i, j) = ValueType(j);
+      }
+    }
+  }
+
+  else if (sIn == "nontrivialRandom") {
+    // randomly fill the view
+    Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace> pool(
+        452377);
+    // make the range tight so that we have low likelihood of having
+    // sorted data by chance for small num of cols
+    Kokkos::fill_random(dataView_dc_h, pool, ValueType(113), ValueType(120));
+
+    /* pick randomly a location 0 < a < numCols-1/2
+       annd fill data so that:
+       - from 0 to a: is sorted
+       - from a to a+3: not sorted
+       - from a+3 to numCols-1: is sorted
+       this allows us to exercise that the algorithm returns
+       the larest sorted interval starting from 0
+    */
+    assert(numCols > 10);
+    const std::size_t midPoint = numCols / 2;
+
+    UnifDist<int> randPoolA(0, midPoint, 3432779);
+    for (std::size_t i = 0; i < dataView_dc_h.extent(0); ++i) {
+      const std::size_t a = randPoolA();
+      for (std::size_t j = 0; j < a; ++j) {
+        dataView_dc_h(i, j) = ValueType(j);
+      }
+      for (std::size_t j = a + 3; j < numCols; ++j) {
+        dataView_dc_h(i, j) = ValueType(j);
+      }
+    }
+  }
+
+  // copy to dataView_dc and then to dataView
+  Kokkos::deep_copy(dataView_dc, dataView_dc_h);
+  // use CTAD
+  CopyFunctorRank2 F1(dataView_dc, dataView);
+  Kokkos::parallel_for("copy", dataView.extent(0) * dataView.extent(1), F1);
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the std result
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, distancesView, intraTeamSentinelView, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // check
+  // -----------------------------------------------
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  for (std::size_t i = 0; i < dataView_dc_h.extent(0); ++i) {
+    auto myRow = Kokkos::subview(dataView_dc_h, i, Kokkos::ALL());
+
+    std::size_t stdDistance = 0;
+    if (apiId <= 1) {
+      auto it     = std::is_sorted_until(KE::cbegin(myRow), KE::cend(myRow));
+      stdDistance = KE::distance(KE::cbegin(myRow), it);
+    } else {
+      auto it     = std::is_sorted_until(KE::cbegin(myRow), KE::cend(myRow),
+                                     CustomLessThanComparator<ValueType>{});
+      stdDistance = KE::distance(KE::cbegin(myRow), it);
+    }
+    ASSERT_EQ(stdDistance, distancesView_h(i));
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+
+  // dataView should remain unchanged
+  auto dataViewAfterOp_h = create_host_space_copy(dataView);
+  expect_equal_host_views(dataView_dc_h, dataViewAfterOp_h);
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios(const std::string& name, const std::vector<int>& cols) {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : cols) {
+#if not defined KOKKOS_ENABLE_OPENMPTARGET
+      for (int apiId : {0, 1, 2, 3}) {
+#else
+      for (int apiId : {0, 1}) {
+#endif
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId, name);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_is_sorted_until_team_test, test_trivialA) {
+  const std::string name      = "trivialEmpty";
+  const std::vector<int> cols = {0};
+
+  run_all_scenarios<DynamicTag, double>(name, cols);
+  run_all_scenarios<StridedTwoRowsTag, double>(name, cols);
+  run_all_scenarios<StridedThreeRowsTag, int>(name, cols);
+}
+
+TEST(std_algorithms_is_sorted_until_team_test, test_trivialB) {
+  const std::string name      = "trivialOneElement";
+  const std::vector<int> cols = {1};
+  run_all_scenarios<DynamicTag, double>(name, cols);
+  run_all_scenarios<StridedTwoRowsTag, double>(name, cols);
+  run_all_scenarios<StridedThreeRowsTag, int>(name, cols);
+}
+
+TEST(std_algorithms_is_sorted_until_team_test, test_nontrivialA) {
+  const std::string name      = "nontrivialUntilLast";
+  const std::vector<int> cols = {13, 101, 1444, 5153};
+  run_all_scenarios<DynamicTag, double>(name, cols);
+  run_all_scenarios<StridedTwoRowsTag, double>(name, cols);
+  run_all_scenarios<StridedThreeRowsTag, int>(name, cols);
+}
+
+TEST(std_algorithms_is_sorted_until_team_test, test_nontrivialB) {
+  const std::string name      = "nontrivialRandom";
+  const std::vector<int> cols = {13, 101, 1444, 5153};
+  run_all_scenarios<DynamicTag, double>(name, cols);
+  run_all_scenarios<StridedTwoRowsTag, double>(name, cols);
+  run_all_scenarios<StridedThreeRowsTag, int>(name, cols);
+}
+
+}  // namespace TeamIsSortedUntil
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamLexicographicalCompare.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamLexicographicalCompare.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c377b9fec89a4ce9fc8051642ac20d38b0ab04a7
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamLexicographicalCompare.cpp
@@ -0,0 +1,286 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamLexicographicalCompare {
+
+namespace KE = Kokkos::Experimental;
+
+enum class TestCaseType { ViewsAreEqual, FirstIsLess, FirstIsGreater };
+
+template <class ValueType>
+struct LessFunctor {
+  KOKKOS_INLINE_FUNCTION bool operator()(const ValueType& lhs,
+                                         const ValueType& rhs) const {
+    return lhs < rhs;
+  }
+};
+
+template <class DataViewType, class CompViewType, class ResultsViewType,
+          class IntraTeamSentinelView, class BinaryCompType>
+struct TestFunctorA {
+  DataViewType m_dataView;
+  CompViewType m_compView;
+  ResultsViewType m_resultsView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+  BinaryCompType m_binaryComp;
+
+  TestFunctorA(const DataViewType dataView, const CompViewType compView,
+               const ResultsViewType resultsView,
+               const IntraTeamSentinelView intraTeamSentinelView, int apiPick,
+               BinaryCompType binaryComp)
+      : m_dataView(dataView),
+        m_compView(compView),
+        m_resultsView(resultsView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick),
+        m_binaryComp(binaryComp) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto rowIndex = member.league_rank();
+
+    auto rowData         = Kokkos::subview(m_dataView, rowIndex, Kokkos::ALL());
+    const auto dataBegin = KE::cbegin(rowData);
+    const auto dataEnd   = KE::cend(rowData);
+
+    auto rowComp         = Kokkos::subview(m_compView, rowIndex, Kokkos::ALL());
+    const auto compBegin = KE::cbegin(rowComp);
+    const auto compEnd   = KE::cend(rowComp);
+
+    bool result = false;
+    switch (m_apiPick) {
+      case 0: {
+        result = KE::lexicographical_compare(member, dataBegin, dataEnd,
+                                             compBegin, compEnd);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this]() { m_resultsView(rowIndex) = result; });
+        break;
+      }
+
+      case 1: {
+        result = KE::lexicographical_compare(member, rowData, rowComp);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this]() { m_resultsView(rowIndex) = result; });
+        break;
+      }
+
+      case 2: {
+        result = KE::lexicographical_compare(member, dataBegin, dataEnd,
+                                             compBegin, compEnd, m_binaryComp);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this]() { m_resultsView(rowIndex) = result; });
+        break;
+      }
+
+      case 3: {
+        result =
+            KE::lexicographical_compare(member, rowData, rowComp, m_binaryComp);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this]() { m_resultsView(rowIndex) = result; });
+        break;
+      }
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, result, m_resultsView(rowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(rowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(const TestCaseType testCase, std::size_t numTeams,
+            std::size_t numCols, int apiId) {
+  /* description:
+     use a rank-2 view randomly filled with values,
+     and run a team-level lexicographical_compare
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range.
+  constexpr ValueType lowerBound = 5;
+  constexpr ValueType upperBound = 523;
+  const auto bounds              = make_bounds(lowerBound, upperBound);
+
+  auto [dataView, dataViewBeforeOp_h] = create_random_view_and_host_clone(
+      LayoutTag{}, numTeams, numCols, bounds, "dataView");
+
+  // create a view to compare it with dataView. If testCase == ViewsAreEqual,
+  // compView is a copy of dataView. If testCase == FirstIsLess, we want the
+  // dataView to be lexicographically less (and compView - greater). If testCase
+  // == FirstIsGreater, we want the dataView to be lexicographically greater
+  // (and compView - less).
+  auto compEqualView   = create_deep_copyable_compatible_clone(dataView);
+  auto compEqualView_h = create_mirror_view(Kokkos::HostSpace(), compEqualView);
+  Kokkos::deep_copy(compEqualView_h, dataViewBeforeOp_h);
+  const auto middle = numCols / 2;
+  switch (testCase) {
+    case TestCaseType::ViewsAreEqual: {
+      // Do nothing - deep_copy was already done
+      break;
+    }
+
+    case TestCaseType::FirstIsLess: {
+      for (std::size_t i = 0; i < compEqualView_h.extent(0); ++i) {
+        compEqualView_h(i, middle) += 1;
+      }
+
+      break;
+    }
+
+    case TestCaseType::FirstIsGreater: {
+      for (std::size_t i = 0; i < compEqualView_h.extent(0); ++i) {
+        compEqualView_h(i, middle) -= 1;
+      }
+
+      break;
+    }
+  }
+
+  Kokkos::deep_copy(compEqualView, compEqualView_h);
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // create the view to store results of equal()
+  Kokkos::View<bool*> resultsView("resultsView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  LessFunctor<ValueType> binaryComp{};
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, compEqualView, resultsView, intraTeamSentinelView,
+                   apiId, binaryComp);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run cpp-std kernel and check
+  // -----------------------------------------------
+  auto resultsView_h           = create_host_space_copy(resultsView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+
+  for (std::size_t i = 0; i < dataView.extent(0); ++i) {
+    auto rowData = Kokkos::subview(dataViewBeforeOp_h, i, Kokkos::ALL());
+    const auto dataBegin = KE::cbegin(rowData);
+    const auto dataEnd   = KE::cend(rowData);
+
+    auto rowComp         = Kokkos::subview(compEqualView_h, i, Kokkos::ALL());
+    const auto compBegin = KE::cbegin(rowComp);
+    const auto compEnd   = KE::cend(rowComp);
+
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+    switch (apiId) {
+      case 0:
+      case 1: {
+        const bool result = std::lexicographical_compare(dataBegin, dataEnd,
+                                                         compBegin, compEnd);
+
+        switch (testCase) {
+          case TestCaseType::ViewsAreEqual:
+          case TestCaseType::FirstIsGreater: {
+            EXPECT_FALSE(resultsView_h(i));
+            ASSERT_EQ(result, resultsView_h(i));
+            break;
+          }
+
+          case TestCaseType::FirstIsLess: {
+            EXPECT_TRUE(resultsView_h(i));
+            ASSERT_EQ(result, resultsView_h(i));
+            break;
+          }
+        }
+
+        break;
+      }
+
+      case 2:
+      case 3: {
+        const bool result = std::lexicographical_compare(
+            dataBegin, dataEnd, compBegin, compEnd, binaryComp);
+
+        switch (testCase) {
+          case TestCaseType::ViewsAreEqual:
+          case TestCaseType::FirstIsGreater: {
+            EXPECT_FALSE(resultsView_h(i));
+            ASSERT_EQ(result, resultsView_h(i));
+            break;
+          }
+
+          case TestCaseType::FirstIsLess: {
+            EXPECT_TRUE(resultsView_h(i));
+            ASSERT_EQ(result, resultsView_h(i));
+            break;
+          }
+        }
+
+        break;
+      }
+    }
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios(const TestCaseType testCase) {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {1, 2, 13, 101, 1444, 8153}) {
+      for (int apiId : {0, 1, 2, 3}) {
+        test_A<LayoutTag, ValueType>(testCase, numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_lexicographical_compare_team_test, views_are_equal) {
+  constexpr TestCaseType testCaseType = TestCaseType::ViewsAreEqual;
+  run_all_scenarios<DynamicTag, double>(testCaseType);
+  run_all_scenarios<StridedTwoRowsTag, int>(testCaseType);
+  run_all_scenarios<StridedThreeRowsTag, unsigned>(testCaseType);
+}
+
+TEST(std_algorithms_lexicographical_compare_team_test, first_view_is_less) {
+  constexpr TestCaseType testCaseType = TestCaseType::FirstIsLess;
+  run_all_scenarios<DynamicTag, double>(testCaseType);
+  run_all_scenarios<StridedTwoRowsTag, int>(testCaseType);
+  run_all_scenarios<StridedThreeRowsTag, unsigned>(testCaseType);
+}
+
+TEST(std_algorithms_lexicographical_compare_team_test, first_view_is_greater) {
+  constexpr TestCaseType testCaseType = TestCaseType::FirstIsGreater;
+  run_all_scenarios<DynamicTag, double>(testCaseType);
+  run_all_scenarios<StridedTwoRowsTag, int>(testCaseType);
+  run_all_scenarios<StridedThreeRowsTag, unsigned>(testCaseType);
+}
+
+}  // namespace TeamLexicographicalCompare
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fb891a8780fbe2dbeadbdff9f5fc586dc293afb9
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp
@@ -0,0 +1,182 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+#include <algorithm>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamMaxElement {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ViewType, class DistancesViewType, class IntraTeamSentinelView>
+struct TestFunctorA {
+  ViewType m_view;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+
+  TestFunctorA(const ViewType view, const DistancesViewType distancesView,
+               IntraTeamSentinelView intraTeamSentinelView, int apiPick)
+      : m_view(view),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowView        = Kokkos::subview(m_view, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist  = 0;
+
+    if (m_apiPick == 0) {
+      auto it =
+          KE::max_element(member, KE::cbegin(myRowView), KE::cend(myRowView));
+      resultDist = KE::distance(KE::cbegin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    else if (m_apiPick == 1) {
+      auto it    = KE::max_element(member, myRowView);
+      resultDist = KE::distance(KE::begin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+#if not defined KOKKOS_ENABLE_OPENMPTARGET
+    else if (m_apiPick == 2) {
+      using value_type = typename ViewType::value_type;
+      auto it =
+          KE::max_element(member, KE::cbegin(myRowView), KE::cend(myRowView),
+                          CustomLessThanComparator<value_type>{});
+      resultDist = KE::distance(KE::cbegin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    else if (m_apiPick == 3) {
+      using value_type = typename ViewType::value_type;
+      auto it          = KE::max_element(member, myRowView,
+                                CustomLessThanComparator<value_type>{});
+      resultDist       = KE::distance(KE::begin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+#endif
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     team-level KE::max_element on a rank-2 view where
+     data is filled randomly and we use one team per row.
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  auto [dataView, cloneOfDataViewBeforeOp_h] =
+      create_random_view_and_host_clone(
+          LayoutTag{}, numTeams, numCols,
+          Kokkos::pair<ValueType, ValueType>{0, 1153}, "dataView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the expectation
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, distancesView, intraTeamSentinelView, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run std algo and check
+  // -----------------------------------------------
+  // here I can use cloneOfDataViewBeforeOp_h to run std algo on
+  // since that contains a valid copy of the data
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  auto dataViewAfterOp_h       = create_host_space_copy(dataView);
+  for (std::size_t i = 0; i < cloneOfDataViewBeforeOp_h.extent(0); ++i) {
+    auto myRow = Kokkos::subview(cloneOfDataViewBeforeOp_h, i, Kokkos::ALL());
+
+    std::size_t stdDistance = 0;
+    if (apiId <= 1) {
+      auto it     = std::max_element(KE::cbegin(myRow), KE::cend(myRow));
+      stdDistance = KE::distance(KE::cbegin(myRow), it);
+    } else {
+      auto it     = std::max_element(KE::cbegin(myRow), KE::cend(myRow),
+                                 CustomLessThanComparator<value_type>{});
+      stdDistance = KE::distance(KE::cbegin(myRow), it);
+    }
+
+    ASSERT_EQ(stdDistance, distancesView_h(i));
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+
+  // dataView should remain unchanged
+  expect_equal_host_views(cloneOfDataViewBeforeOp_h, dataViewAfterOp_h);
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 5113}) {
+      // for OpenMPTarget we need to avod api accepting a custom
+      // comparator because it is not supported
+      for (int apiId : {0, 1, 2, 3}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_max_element_team_test, test) {
+#if not defined KOKKOS_ENABLE_OPENMPTARGET
+  run_all_scenarios<DynamicTag, int>();
+  run_all_scenarios<StridedTwoRowsTag, double>();
+  run_all_scenarios<StridedThreeRowsTag, int>();
+#endif
+}
+
+}  // namespace TeamMaxElement
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4ba1b6f968bcf7ac1b1fa292e4d8c06bb469328e
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp
@@ -0,0 +1,181 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+#include <algorithm>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamMinElement {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ViewType, class DistancesViewType, class IntraTeamSentinelView>
+struct TestFunctorA {
+  ViewType m_view;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+
+  TestFunctorA(const ViewType view, const DistancesViewType distancesView,
+               IntraTeamSentinelView intraTeamSentinelView, int apiPick)
+      : m_view(view),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowView        = Kokkos::subview(m_view, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist  = 0;
+
+    if (m_apiPick == 0) {
+      auto it =
+          KE::min_element(member, KE::cbegin(myRowView), KE::cend(myRowView));
+      resultDist = KE::distance(KE::cbegin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    else if (m_apiPick == 1) {
+      auto it    = KE::min_element(member, myRowView);
+      resultDist = KE::distance(KE::begin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+#if not defined KOKKOS_ENABLE_OPENMPTARGET
+    else if (m_apiPick == 2) {
+      using value_type = typename ViewType::value_type;
+      auto it =
+          KE::min_element(member, KE::cbegin(myRowView), KE::cend(myRowView),
+                          CustomLessThanComparator<value_type>{});
+      resultDist = KE::distance(KE::cbegin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    else if (m_apiPick == 3) {
+      using value_type = typename ViewType::value_type;
+      auto it          = KE::min_element(member, myRowView,
+                                CustomLessThanComparator<value_type>{});
+      resultDist       = KE::distance(KE::begin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+#endif
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     team-level KE::min_element on a rank-2 view where
+     data is filled randomly and we use one team per row.
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  auto [dataView, cloneOfDataViewBeforeOp_h] =
+      create_random_view_and_host_clone(
+          LayoutTag{}, numTeams, numCols,
+          Kokkos::pair<ValueType, ValueType>{0, 1153}, "dataView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the expectation
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, distancesView, intraTeamSentinelView, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run std algo and check
+  // -----------------------------------------------
+  // here I can use cloneOfDataViewBeforeOp_h to run std algo on
+  // since that contains a valid copy of the data
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  auto dataViewAfterOp_h       = create_host_space_copy(dataView);
+  for (std::size_t i = 0; i < cloneOfDataViewBeforeOp_h.extent(0); ++i) {
+    auto myRow = Kokkos::subview(cloneOfDataViewBeforeOp_h, i, Kokkos::ALL());
+
+    std::size_t stdDistance = 0;
+    if (apiId <= 1) {
+      auto it     = std::min_element(KE::cbegin(myRow), KE::cend(myRow));
+      stdDistance = KE::distance(KE::cbegin(myRow), it);
+    } else {
+      auto it     = std::min_element(KE::cbegin(myRow), KE::cend(myRow),
+                                 CustomLessThanComparator<value_type>{});
+      stdDistance = KE::distance(KE::cbegin(myRow), it);
+    }
+    ASSERT_EQ(stdDistance, distancesView_h(i));
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+
+  // dataView should remain unchanged
+  expect_equal_host_views(cloneOfDataViewBeforeOp_h, dataViewAfterOp_h);
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 5113}) {
+      // for OpenMPTarget we need to avod api accepting a custom
+      // comparator because it is not supported
+      for (int apiId : {0, 1, 2, 3}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_min_element_team_test, test) {
+#if not defined KOKKOS_ENABLE_OPENMPTARGET
+  run_all_scenarios<DynamicTag, int>();
+  run_all_scenarios<StridedTwoRowsTag, double>();
+  run_all_scenarios<StridedThreeRowsTag, int>();
+#endif
+}
+
+}  // namespace TeamMinElement
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..17562a55727b89f6561a9d0730b72830acfd21df
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp
@@ -0,0 +1,200 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+#include <algorithm>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamMinMaxElement {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ViewType, class DistancesViewType, class IntraTeamSentinelView>
+struct TestFunctorA {
+  ViewType m_view;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+
+  TestFunctorA(const ViewType view, const DistancesViewType distancesView,
+               IntraTeamSentinelView intraTeamSentinelView, int apiPick)
+      : m_view(view),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowView        = Kokkos::subview(m_view, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist1 = 0;
+    ptrdiff_t resultDist2 = 0;
+
+    if (m_apiPick == 0) {
+      auto itPair = KE::minmax_element(member, KE::cbegin(myRowView),
+                                       KE::cend(myRowView));
+      resultDist1 = KE::distance(KE::cbegin(myRowView), itPair.first);
+      resultDist2 = KE::distance(KE::cbegin(myRowView), itPair.second);
+
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex, 0) = resultDist1;
+        m_distancesView(myRowIndex, 1) = resultDist2;
+      });
+    }
+
+    else if (m_apiPick == 1) {
+      auto itPair = KE::minmax_element(member, myRowView);
+      resultDist1 = KE::distance(KE::begin(myRowView), itPair.first);
+      resultDist2 = KE::distance(KE::begin(myRowView), itPair.second);
+
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex, 0) = resultDist1;
+        m_distancesView(myRowIndex, 1) = resultDist2;
+      });
+    }
+#if not defined KOKKOS_ENABLE_OPENMPTARGET
+    else if (m_apiPick == 2) {
+      using value_type = typename ViewType::value_type;
+      auto itPair =
+          KE::minmax_element(member, KE::cbegin(myRowView), KE::cend(myRowView),
+                             CustomLessThanComparator<value_type>{});
+      resultDist1 = KE::distance(KE::cbegin(myRowView), itPair.first);
+      resultDist2 = KE::distance(KE::cbegin(myRowView), itPair.second);
+
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex, 0) = resultDist1;
+        m_distancesView(myRowIndex, 1) = resultDist2;
+      });
+    }
+
+    else if (m_apiPick == 3) {
+      using value_type = typename ViewType::value_type;
+      auto itPair      = KE::minmax_element(member, myRowView,
+                                       CustomLessThanComparator<value_type>{});
+      resultDist1      = KE::distance(KE::begin(myRowView), itPair.first);
+      resultDist2      = KE::distance(KE::begin(myRowView), itPair.second);
+
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex, 0) = resultDist1;
+        m_distancesView(myRowIndex, 1) = resultDist2;
+      });
+    }
+#endif
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck1 = team_members_have_matching_result(
+        member, resultDist1, m_distancesView(myRowIndex, 0));
+    const bool intraTeamCheck2 = team_members_have_matching_result(
+        member, resultDist2, m_distancesView(myRowIndex, 1));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck1 && intraTeamCheck2;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     team-level KE::minmax_element on a rank-2 view where
+     data is filled randomly and we use one team per row.
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  auto [dataView, cloneOfDataViewBeforeOp_h] =
+      create_random_view_and_host_clone(
+          LayoutTag{}, numTeams, numCols,
+          Kokkos::pair<ValueType, ValueType>{0, 1153}, "dataView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // each team stores the distance of the returned value from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the expectation
+  Kokkos::View<std::size_t**> distancesView("distancesView", numTeams, 2);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, distancesView, intraTeamSentinelView, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run std algo and check
+  // -----------------------------------------------
+  // here I can use cloneOfDataViewBeforeOp_h to run std algo on
+  // since that contains a valid copy of the data
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  auto dataViewAfterOp_h       = create_host_space_copy(dataView);
+  for (std::size_t i = 0; i < cloneOfDataViewBeforeOp_h.extent(0); ++i) {
+    auto myRow = Kokkos::subview(cloneOfDataViewBeforeOp_h, i, Kokkos::ALL());
+
+    std::size_t stdDistance[2];
+    if (apiId <= 1) {
+      auto itPair    = std::minmax_element(KE::cbegin(myRow), KE::cend(myRow));
+      stdDistance[0] = KE::distance(KE::cbegin(myRow), itPair.first);
+      stdDistance[1] = KE::distance(KE::cbegin(myRow), itPair.second);
+    } else {
+      auto itPair    = std::minmax_element(KE::cbegin(myRow), KE::cend(myRow),
+                                        CustomLessThanComparator<value_type>{});
+      stdDistance[0] = KE::distance(KE::cbegin(myRow), itPair.first);
+      stdDistance[1] = KE::distance(KE::cbegin(myRow), itPair.second);
+    }
+
+    ASSERT_EQ(stdDistance[0], distancesView_h(i, 0));
+    ASSERT_EQ(stdDistance[1], distancesView_h(i, 1));
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+
+  // dataView should remain unchanged
+  expect_equal_host_views(cloneOfDataViewBeforeOp_h, dataViewAfterOp_h);
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 5113}) {
+      // for OpenMPTarget we need to avod api accepting a custom
+      // comparator because it is not supported
+      for (int apiId : {0, 1, 2, 3}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_minmax_element_team_test, test) {
+#if not defined KOKKOS_ENABLE_OPENMPTARGET
+  run_all_scenarios<DynamicTag, int>();
+  run_all_scenarios<StridedTwoRowsTag, double>();
+  run_all_scenarios<StridedThreeRowsTag, int>();
+#endif
+}
+
+}  // namespace TeamMinMaxElement
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMismatch.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMismatch.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..84269511d8334b746112de20a58df5538435c783
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMismatch.cpp
@@ -0,0 +1,283 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamMismatch {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct EqualFunctor {
+  KOKKOS_INLINE_FUNCTION bool operator()(const ValueType& lhs,
+                                         const ValueType& rhs) const {
+    return lhs == rhs;
+  }
+};
+
+template <class DataViewType, class CompViewType, class ResultsViewType,
+          class IntraTeamSentinelView, class BinaryOpType>
+struct TestFunctorA {
+  DataViewType m_dataView;
+  CompViewType m_compView;
+  ResultsViewType m_resultsView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+  BinaryOpType m_binaryOp;
+
+  TestFunctorA(const DataViewType dataView, const CompViewType compView,
+               const ResultsViewType resultsView,
+               const IntraTeamSentinelView intraTeamSentinelView, int apiPick,
+               BinaryOpType binaryOp)
+      : m_dataView(dataView),
+        m_compView(compView),
+        m_resultsView(resultsView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick),
+        m_binaryOp(binaryOp) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto rowIndex = member.league_rank();
+
+    auto rowData   = Kokkos::subview(m_dataView, rowIndex, Kokkos::ALL());
+    auto dataBegin = KE::begin(rowData);
+    auto dataEnd   = KE::end(rowData);
+
+    auto rowComp   = Kokkos::subview(m_compView, rowIndex, Kokkos::ALL());
+    auto compBegin = KE::begin(rowComp);
+    auto compEnd   = KE::end(rowComp);
+
+    ptrdiff_t dataDist = 0;
+    ptrdiff_t compDist = 0;
+
+    switch (m_apiPick) {
+      case 0: {
+        auto [dataIt, compIt] =
+            KE::mismatch(member, dataBegin, dataEnd, compBegin, compEnd);
+
+        dataDist = KE::distance(dataBegin, dataIt);
+        compDist = KE::distance(compBegin, compIt);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_resultsView(rowIndex) = Kokkos::make_pair(dataDist, compDist);
+        });
+
+        break;
+      }
+
+      case 1: {
+        const auto [dataIt, compIt] = KE::mismatch(
+            member, dataBegin, dataEnd, compBegin, compEnd, m_binaryOp);
+
+        dataDist = KE::distance(dataBegin, dataIt);
+        compDist = KE::distance(compBegin, compIt);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_resultsView(rowIndex) = Kokkos::make_pair(dataDist, compDist);
+        });
+
+        break;
+      }
+
+      case 2: {
+        const auto [dataIt, compIt] = KE::mismatch(member, rowData, rowComp);
+
+        dataDist = KE::distance(dataBegin, dataIt);
+        compDist = KE::distance(compBegin, compIt);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_resultsView(rowIndex) = Kokkos::make_pair(dataDist, compDist);
+        });
+
+        break;
+      }
+
+      case 3: {
+        const auto [dataIt, compIt] =
+            KE::mismatch(member, rowData, rowComp, m_binaryOp);
+
+        dataDist = KE::distance(dataBegin, dataIt);
+        compDist = KE::distance(compBegin, compIt);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_resultsView(rowIndex) = Kokkos::make_pair(dataDist, compDist);
+        });
+
+        break;
+      }
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck1 = team_members_have_matching_result(
+        member, dataDist, m_resultsView(rowIndex).first);
+    const bool intraTeamCheck2 = team_members_have_matching_result(
+        member, compDist, m_resultsView(rowIndex).second);
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(rowIndex) = intraTeamCheck1 && intraTeamCheck2;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(const bool viewsAreEqual, std::size_t numTeams, std::size_t numCols,
+            int apiId) {
+  /* description:
+     use a rank-2 view randomly filled with values,
+     and run a team-level mismatch
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range.
+  constexpr ValueType lowerBound = 5;
+  constexpr ValueType upperBound = 523;
+  const auto bounds              = make_bounds(lowerBound, upperBound);
+
+  auto [dataView, dataViewBeforeOp_h] = create_random_view_and_host_clone(
+      LayoutTag{}, numTeams, numCols, bounds, "dataView");
+
+  // create a view to compare it with dataView. If viewsAreEqual == true,
+  // compView is a copy of dataView. If viewsAreEqual == false, compView is
+  // randomly filled
+  auto compView   = create_deep_copyable_compatible_clone(dataView);
+  auto compView_h = create_mirror_view(Kokkos::HostSpace(), compView);
+  if (viewsAreEqual) {
+    Kokkos::deep_copy(compView_h, dataViewBeforeOp_h);
+  } else {
+    using rand_pool =
+        Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace>;
+    rand_pool pool(lowerBound * upperBound);
+    Kokkos::fill_random(compView_h, pool, lowerBound, upperBound);
+  }
+
+  Kokkos::deep_copy(compView, compView_h);
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // create the view to store results of mismatch()
+  Kokkos::View<Kokkos::pair<std::size_t, std::size_t>*> resultsView(
+      "resultsView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  EqualFunctor<ValueType> binaryPred{};
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, compView, resultsView, intraTeamSentinelView,
+                   apiId, binaryPred);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run cpp-std kernel and check
+  // -----------------------------------------------
+  auto resultsView_h           = create_host_space_copy(resultsView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+
+  for (std::size_t i = 0; i < dataView.extent(0); ++i) {
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+
+    auto rowData = Kokkos::subview(dataViewBeforeOp_h, i, Kokkos::ALL());
+
+    const auto dataBegin = KE::cbegin(rowData);
+    const auto dataEnd   = KE::cend(rowData);
+
+    const std::size_t dataBeginEndDist = KE::distance(dataBegin, dataEnd);
+
+    auto rowComp = Kokkos::subview(compView_h, i, Kokkos::ALL());
+
+    const auto compBegin = KE::cbegin(rowComp);
+    const auto compEnd   = KE::cend(rowComp);
+
+    const std::size_t compBeginEndDist = KE::distance(compBegin, compEnd);
+
+    switch (apiId) {
+      case 0:
+      case 2: {
+        const auto [dataIt, compIt] =
+            std::mismatch(dataBegin, dataEnd, compBegin, compEnd);
+
+        const std::size_t dataDist = KE::distance(dataBegin, dataIt);
+        const std::size_t compDist = KE::distance(compBegin, compIt);
+
+        if (viewsAreEqual) {
+          ASSERT_EQ(dataBeginEndDist, resultsView_h(i).first);
+          ASSERT_EQ(compBeginEndDist, resultsView_h(i).second);
+        } else {
+          ASSERT_EQ(dataDist, resultsView_h(i).first);
+          ASSERT_EQ(compDist, resultsView_h(i).second);
+        }
+
+        break;
+      }
+
+      case 1:
+      case 3: {
+        const auto [dataIt, compIt] =
+            std::mismatch(dataBegin, dataEnd, compBegin, compEnd, binaryPred);
+
+        const std::size_t dataDist = KE::distance(dataBegin, dataIt);
+        const std::size_t compDist = KE::distance(compBegin, compIt);
+
+        if (viewsAreEqual) {
+          ASSERT_EQ(dataBeginEndDist, resultsView_h(i).first);
+          ASSERT_EQ(compBeginEndDist, resultsView_h(i).second);
+        } else {
+          ASSERT_EQ(dataDist, resultsView_h(i).first);
+          ASSERT_EQ(compDist, resultsView_h(i).second);
+        }
+
+        break;
+      }
+    }
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios(const bool viewsAreEqual) {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) {
+      for (int apiId : {0, 1, 2, 3}) {
+        test_A<LayoutTag, ValueType>(viewsAreEqual, numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_mismatch_team_test, views_are_equal) {
+  constexpr bool viewsAreEqual = true;
+  run_all_scenarios<DynamicTag, double>(viewsAreEqual);
+  run_all_scenarios<StridedTwoRowsTag, int>(viewsAreEqual);
+  run_all_scenarios<StridedThreeRowsTag, unsigned>(viewsAreEqual);
+}
+
+TEST(std_algorithms_mismatch_team_test, views_are_not_equal) {
+  constexpr bool viewsAreEqual = false;
+  run_all_scenarios<DynamicTag, double>(viewsAreEqual);
+  run_all_scenarios<StridedTwoRowsTag, int>(viewsAreEqual);
+  run_all_scenarios<StridedThreeRowsTag, unsigned>(viewsAreEqual);
+}
+
+}  // namespace TeamMismatch
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMove.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMove.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1122d6d554ac270911cf4305fd242592df96294a
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMove.cpp
@@ -0,0 +1,161 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamMove {
+
+namespace KE = Kokkos::Experimental;
+
+template <class SourceViewType, class DestViewType, class DistancesViewType,
+          class IntraTeamSentinelView>
+struct TestFunctorA {
+  SourceViewType m_sourceView;
+  DestViewType m_destView;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+
+  TestFunctorA(const SourceViewType sourceView, const DestViewType destView,
+               const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView, int apiPick)
+      : m_sourceView(sourceView),
+        m_destView(destView),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowViewFrom =
+        Kokkos::subview(m_sourceView, myRowIndex, Kokkos::ALL());
+    auto myRowViewDest = Kokkos::subview(m_destView, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist = 0;
+
+    if (m_apiPick == 0) {
+      auto it    = KE::move(member, KE::begin(myRowViewFrom),
+                         KE::end(myRowViewFrom), KE::begin(myRowViewDest));
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    else if (m_apiPick == 1) {
+      auto it    = KE::move(member, myRowViewFrom, myRowViewDest);
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     randomly fill a source view, do team level KE::move into a destination
+     view.
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range
+  auto [sourceView, cloneOfSourceViewBeforeOp_h] =
+      create_random_view_and_host_clone(
+          LayoutTag{}, numTeams, numCols,
+          Kokkos::pair<ValueType, ValueType>{11, 523}, "sourceView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+  // create the destination view
+  Kokkos::View<ValueType**> destView("destView", numTeams, numCols);
+  // make host copy of destView that should remain unchanged
+  auto destViewBeforeOp_h = create_host_space_copy(destView);
+
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the expectation
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView,
+                   apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // check
+  // -----------------------------------------------
+  // KE::move has been applied to sourceView, so we should
+  // NOT use sourceView henceforth, becuase all its elements
+  // have been moved from.
+
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto destViewAfterOp_h       = create_host_space_copy(destView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  for (std::size_t i = 0; i < destViewBeforeOp_h.extent(0); ++i) {
+    for (std::size_t j = 0; j < destViewBeforeOp_h.extent(1); ++j) {
+      ASSERT_EQ(destViewBeforeOp_h(i, j), ValueType(0));
+      EXPECT_TRUE(destViewAfterOp_h(i, j) != destViewBeforeOp_h(i, j));
+    }
+    // each team should return an iterator past the last column
+    EXPECT_TRUE(distancesView_h(i) == numCols);
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+
+  expect_equal_host_views(cloneOfSourceViewBeforeOp_h, destViewAfterOp_h);
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8113}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_move_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamMove
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMoveBackward.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMoveBackward.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..eb46f0301f8115ac4c7fda25451dcf75c8f4e470
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMoveBackward.cpp
@@ -0,0 +1,170 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamMovebackward {
+
+namespace KE = Kokkos::Experimental;
+
+template <class SourceViewType, class DestViewType, class DistancesViewType,
+          class IntraTeamSentinelView>
+struct TestFunctorA {
+  SourceViewType m_sourceView;
+  DestViewType m_destView;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+
+  TestFunctorA(const SourceViewType sourceView, const DestViewType destView,
+               const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView, int apiPick)
+      : m_sourceView(sourceView),
+        m_destView(destView),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowViewFrom =
+        Kokkos::subview(m_sourceView, myRowIndex, Kokkos::ALL());
+    auto myRowViewDest = Kokkos::subview(m_destView, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist = 0;
+
+    if (m_apiPick == 0) {
+      auto it =
+          KE::move_backward(member, KE::cbegin(myRowViewFrom),
+                            KE::cend(myRowViewFrom), KE::end(myRowViewDest));
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    } else if (m_apiPick == 1) {
+      auto it    = KE::move_backward(member, myRowViewFrom, myRowViewDest);
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     randomly fill a source view and do team KE::move_backward
+     into a destination view. The operation is done via a
+     team parfor with one row per team.
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range
+  auto [sourceView, cloneOfSourceViewBeforeOp_h] =
+      create_random_view_and_host_clone(
+          LayoutTag{}, numTeams, numCols,
+          Kokkos::pair<ValueType, ValueType>{11, 523}, "sourceView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+  // create the destination view: for a meaningful test, the destination
+  // view must have more columns that than the source view so that we
+  // can check that the elements are copied into the right place
+  constexpr std::size_t extra = 10;
+  Kokkos::View<ValueType**> destView("destView", numTeams, numCols + extra);
+  // make host copy of destView that should remain unchanged
+  auto destViewBeforeOp_h = create_host_space_copy(destView);
+
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the expectation
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView,
+                   apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // check
+  // -----------------------------------------------
+  // KE::move_backwar has been applied to sourceView, so we should
+  // NOT use sourceView henceforth, becuase all its elements
+  // have been moved from.
+
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto destViewAfterOp_h       = create_host_space_copy(destView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  for (std::size_t i = 0; i < destViewAfterOp_h.extent(0); ++i) {
+    // first extra num of columns should be unchanged
+    for (std::size_t j = 0; j < extra; ++j) {
+      EXPECT_TRUE(destViewAfterOp_h(i, j) == destViewBeforeOp_h(i, j));
+    }
+
+    // after extra # of column (inclusive) should match the source view
+    for (std::size_t j = extra; j < destViewBeforeOp_h.extent(1); ++j) {
+      EXPECT_TRUE(cloneOfSourceViewBeforeOp_h(i, j - extra) ==
+                  destViewAfterOp_h(i, j));
+    }
+
+    // each team should have returned an interator whose distance
+    // from the beginning of the row should satisfy this
+    EXPECT_TRUE(distancesView_h(i) == extra);
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 11113}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_move_backward_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamMovebackward
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamNoneOf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamNoneOf.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..373c6c662b9dbe3281d114b6286654d3d6faf66e
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamNoneOf.cpp
@@ -0,0 +1,165 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamNoneOf {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct GreaterThanValueFunctor {
+  ValueType m_val;
+
+  KOKKOS_INLINE_FUNCTION
+  GreaterThanValueFunctor(ValueType val) : m_val(val) {}
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator()(ValueType val) const { return (val > m_val); }
+};
+
+template <class DataViewType, class NoneOfResultsViewType,
+          class IntraTeamSentinelView, class UnaryPredType>
+struct TestFunctorA {
+  DataViewType m_dataView;
+  NoneOfResultsViewType m_noneOfResultsView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+  UnaryPredType m_unaryPred;
+
+  TestFunctorA(const DataViewType dataView,
+               const NoneOfResultsViewType noneOfResultsView,
+               const IntraTeamSentinelView intraTeamSentinelView, int apiPick,
+               UnaryPredType unaryPred)
+      : m_dataView(dataView),
+        m_noneOfResultsView(noneOfResultsView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick),
+        m_unaryPred(unaryPred) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+
+    auto myRowViewFrom = Kokkos::subview(m_dataView, myRowIndex, Kokkos::ALL());
+    bool result        = false;
+
+    switch (m_apiPick) {
+      case 0: {
+        result = KE::none_of(member, KE::cbegin(myRowViewFrom),
+                             KE::cend(myRowViewFrom), m_unaryPred);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_noneOfResultsView(myRowIndex) = result;
+        });
+        break;
+      }
+
+      case 1: {
+        result = KE::none_of(member, myRowViewFrom, m_unaryPred);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_noneOfResultsView(myRowIndex) = result;
+        });
+        break;
+      }
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, result, m_noneOfResultsView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     use a rank-2 view randomly filled with values,
+     and run a team-level none_of
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range.
+  constexpr ValueType lowerBound = 5;
+  constexpr ValueType upperBound = 523;
+  const auto bounds              = make_bounds(lowerBound, upperBound);
+
+  auto [dataView, dataViewBeforeOp_h] = create_random_view_and_host_clone(
+      LayoutTag{}, numTeams, numCols, bounds, "dataView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // to verify that things work, each team stores the result of its none_of
+  // call, and then we check that these match what we expect
+  Kokkos::View<bool*> noneOfResultsView("noneOfResultsView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  GreaterThanValueFunctor unaryPred{upperBound};
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, noneOfResultsView, intraTeamSentinelView, apiId,
+                   unaryPred);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run cpp-std kernel and check
+  // -----------------------------------------------
+  auto noneOfResultsView_h     = create_host_space_copy(noneOfResultsView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+
+  for (std::size_t i = 0; i < dataView.extent(0); ++i) {
+    auto rowFrom = Kokkos::subview(dataViewBeforeOp_h, i, Kokkos::ALL());
+    const bool result =
+        std::none_of(KE::cbegin(rowFrom), KE::cend(rowFrom), unaryPred);
+    ASSERT_EQ(result, noneOfResultsView_h(i));
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_none_of_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamNoneOf
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamPartitionCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamPartitionCopy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c0bbdfa39041e0ef758564788eff7d70f184b8ef
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamPartitionCopy.cpp
@@ -0,0 +1,313 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+#include <algorithm>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamPartitionCopy {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct UnifDist;
+
+template <>
+struct UnifDist<int> {
+  using dist_type = std::uniform_int_distribution<int>;
+  std::mt19937 m_gen;
+  dist_type m_dist;
+
+  UnifDist(int a, int b, std::size_t seedIn) : m_dist(a, b) {
+    m_gen.seed(seedIn);
+  }
+
+  int operator()() { return m_dist(m_gen); }
+};
+
+template <class ValueType>
+struct GreaterThanValueFunctor {
+  ValueType m_val;
+
+  KOKKOS_INLINE_FUNCTION
+  GreaterThanValueFunctor(ValueType val) : m_val(val) {}
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator()(ValueType val) const { return (val > m_val); }
+};
+
+template <class SourceViewType, class DestViewType, class DistancesViewType,
+          class IntraTeamSentinelView, class ValueType>
+struct TestFunctorA {
+  SourceViewType m_sourceView;
+
+  DestViewType m_destTrueView;
+  DestViewType m_destFalseView;
+
+  DistancesViewType m_distancesTrueView;
+  DistancesViewType m_distancesFalseView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+
+  ValueType m_threshold;
+  int m_apiPick;
+
+  TestFunctorA(const SourceViewType sourceView, const DestViewType destTrueView,
+               const DestViewType destFalseView,
+               const DistancesViewType distancesTrueView,
+               const DistancesViewType distancesFalseView,
+               const IntraTeamSentinelView intraTeamSentinelView,
+               ValueType threshold, int apiPick)
+      : m_sourceView(sourceView),
+        m_destTrueView(destTrueView),
+        m_destFalseView(destFalseView),
+        m_distancesTrueView(distancesTrueView),
+        m_distancesFalseView(distancesFalseView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_threshold(threshold),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowViewFrom =
+        Kokkos::subview(m_sourceView, myRowIndex, Kokkos::ALL());
+
+    auto myRowViewDestTrue =
+        Kokkos::subview(m_destTrueView, myRowIndex, Kokkos::ALL());
+    auto myRowViewDestFalse =
+        Kokkos::subview(m_destFalseView, myRowIndex, Kokkos::ALL());
+
+    ptrdiff_t resultDist1 = 0;
+    ptrdiff_t resultDist2 = 0;
+
+    GreaterThanValueFunctor predicate(m_threshold);
+    if (m_apiPick == 0) {
+      const auto result = KE::partition_copy(
+          member, KE::cbegin(myRowViewFrom), KE::cend(myRowViewFrom),
+          KE::begin(myRowViewDestTrue), KE::begin(myRowViewDestFalse),
+          predicate);
+      resultDist1 = KE::distance(KE::begin(myRowViewDestTrue), result.first);
+      resultDist2 = KE::distance(KE::begin(myRowViewDestFalse), result.second);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesTrueView(myRowIndex)  = resultDist1;
+        m_distancesFalseView(myRowIndex) = resultDist2;
+      });
+    }
+
+    else if (m_apiPick == 1) {
+      const auto result =
+          KE::partition_copy(member, myRowViewFrom, myRowViewDestTrue,
+                             myRowViewDestFalse, predicate);
+      resultDist1 = KE::distance(KE::begin(myRowViewDestTrue), result.first);
+      resultDist2 = KE::distance(KE::begin(myRowViewDestFalse), result.second);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesTrueView(myRowIndex)  = resultDist1;
+        m_distancesFalseView(myRowIndex) = resultDist2;
+      });
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck1 = team_members_have_matching_result(
+        member, resultDist1, m_distancesTrueView(myRowIndex));
+    const bool intraTeamCheck2 = team_members_have_matching_result(
+        member, resultDist2, m_distancesFalseView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck1 && intraTeamCheck2;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId,
+            const std::string& sIn) {
+  /* description:
+     use a rank-2 view randomly filled with values in a range (a,b)
+     and run a team-level partition_copy with predicate = IsGreaterThanValue
+     where threshold is set to a number larger than b above
+   */
+  const auto threshold           = static_cast<ValueType>(1103);
+  const auto valueForSureGreater = static_cast<ValueType>(2103);
+  const auto valueForSureSmaller = static_cast<ValueType>(111);
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // construct in memory space associated with default exespace
+  auto sourceView =
+      create_view<ValueType>(LayoutTag{}, numTeams, numCols, "sourceView");
+
+  // sourceView might not deep copyable (e.g. strided layout) so to
+  // randomize it, we make a new view that is for sure deep copyable,
+  // modify it on the host, deep copy to device and then launch
+  // a kernel to copy to sourceView
+  auto sourceView_dc =
+      create_deep_copyable_compatible_view_with_same_extent(sourceView);
+  auto sourceView_dc_h = create_mirror_view(Kokkos::HostSpace(), sourceView_dc);
+
+  if (sIn == "trivialEmpty") {
+    // do nothing
+  }
+
+  else if (sIn == "allTrue") {
+    // randomly fill with values greater than threshold
+    // so that all elements in each row satisfy the predicate
+    // so this counts as being partitioned
+    Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace> pool(
+        452377);
+    Kokkos::fill_random(sourceView_dc_h, pool, ValueType(2001),
+                        ValueType(2501));
+  }
+
+  else if (sIn == "allFalse") {
+    // randomly fill the view with values smaller than threshold
+    // and even in this case each row counts as partitioned
+    Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace> pool(
+        452377);
+    Kokkos::fill_random(sourceView_dc_h, pool, ValueType(0), ValueType(101));
+  }
+
+  else if (sIn == "random") {
+    // randomly select a location and make all values before that
+    // larger than threshol and all values after to be smaller than threshold
+    // so that this picked location does partition the range
+    UnifDist<int> indexProducer(0, numCols - 1, 3432779);
+    for (std::size_t i = 0; i < sourceView_dc_h.extent(0); ++i) {
+      const std::size_t a = indexProducer();
+      for (std::size_t j = 0; j < a; ++j) {
+        sourceView_dc_h(i, j) = valueForSureGreater;
+      }
+      for (std::size_t j = a; j < numCols; ++j) {
+        sourceView_dc_h(i, j) = valueForSureSmaller;
+      }
+    }
+  }
+
+  // copy to sourceView_dc and then to sourceView
+  Kokkos::deep_copy(sourceView_dc, sourceView_dc_h);
+  // use CTAD
+  CopyFunctorRank2 F1(sourceView_dc, sourceView);
+  Kokkos::parallel_for("copy", sourceView.extent(0) * sourceView.extent(1), F1);
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // create the destination views
+  Kokkos::View<ValueType**> destTrueView("destViewTrue", numTeams, numCols);
+  Kokkos::View<ValueType**> destFalseView("destViewFalse", numTeams, numCols);
+
+  // to verify that things work, each team stores the result
+  // and then we check that these match what we expect
+  Kokkos::View<std::size_t*> distancesTrueView("distancesTrue", numTeams);
+  Kokkos::View<std::size_t*> distancesFalseView("distancesFalse", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(sourceView, destTrueView, destFalseView, distancesTrueView,
+                   distancesFalseView, intraTeamSentinelView, threshold, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // check
+  // -----------------------------------------------
+  auto distancesTrueView_h     = create_host_space_copy(distancesTrueView);
+  auto distancesFalseView_h    = create_host_space_copy(distancesFalseView);
+  auto sourceViewAfterOp_h     = create_host_space_copy(sourceView);
+  auto destTrueViewAfterOp_h   = create_host_space_copy(destTrueView);
+  auto destFalseViewAfterOp_h  = create_host_space_copy(destFalseView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+
+  Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestTrueView(
+      "stdDestTrueView", numTeams, numCols);
+  Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestFalseView(
+      "stdDestFalseView", numTeams, numCols);
+  GreaterThanValueFunctor predicate(threshold);
+
+  for (std::size_t i = 0; i < sourceView_dc_h.extent(0); ++i) {
+    auto myRowSource    = Kokkos::subview(sourceView_dc_h, i, Kokkos::ALL());
+    auto myRowDestTrue  = Kokkos::subview(stdDestTrueView, i, Kokkos::ALL());
+    auto myRowDestFalse = Kokkos::subview(stdDestFalseView, i, Kokkos::ALL());
+
+    const auto stdResult = std::partition_copy(
+        KE::cbegin(myRowSource), KE::cend(myRowSource),
+        KE::begin(myRowDestTrue), KE::begin(myRowDestFalse), predicate);
+    // our result must match std
+    const std::size_t stdDistanceTrue =
+        KE::distance(KE::begin(myRowDestTrue), stdResult.first);
+    const std::size_t stdDistanceFalse =
+        KE::distance(KE::begin(myRowDestFalse), stdResult.second);
+    ASSERT_EQ(stdDistanceTrue, distancesTrueView_h(i));
+    ASSERT_EQ(stdDistanceFalse, distancesFalseView_h(i));
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+
+  expect_equal_host_views(sourceView_dc_h, sourceViewAfterOp_h);
+  expect_equal_host_views(destTrueViewAfterOp_h, stdDestTrueView);
+  expect_equal_host_views(destFalseViewAfterOp_h, stdDestFalseView);
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios(const std::string& name, const std::vector<int>& cols) {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : cols) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId, name);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_partition_copy_team_test, empty) {
+  const std::string name      = "trivialEmpty";
+  const std::vector<int> cols = {0};
+  run_all_scenarios<DynamicTag, double>(name, cols);
+  run_all_scenarios<StridedTwoRowsTag, double>(name, cols);
+  run_all_scenarios<StridedThreeRowsTag, int>(name, cols);
+}
+
+TEST(std_algorithms_partition_copy_team_test, all_true) {
+  const std::string name      = "allTrue";
+  const std::vector<int> cols = {13, 101, 1444, 5153};
+  run_all_scenarios<DynamicTag, double>(name, cols);
+  run_all_scenarios<StridedTwoRowsTag, double>(name, cols);
+  run_all_scenarios<StridedThreeRowsTag, int>(name, cols);
+}
+
+TEST(std_algorithms_partition_copy_team_test, all_false) {
+  const std::string name      = "allFalse";
+  const std::vector<int> cols = {13, 101, 1444, 5153};
+  run_all_scenarios<DynamicTag, double>(name, cols);
+  run_all_scenarios<StridedTwoRowsTag, double>(name, cols);
+  run_all_scenarios<StridedThreeRowsTag, int>(name, cols);
+}
+
+TEST(std_algorithms_partition_copy_team_test, random) {
+  const std::string name      = "random";
+  const std::vector<int> cols = {13, 101, 1444, 5153};
+  run_all_scenarios<DynamicTag, double>(name, cols);
+  run_all_scenarios<StridedTwoRowsTag, double>(name, cols);
+  run_all_scenarios<StridedThreeRowsTag, int>(name, cols);
+}
+
+}  // namespace TeamPartitionCopy
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamPartitionPoint.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamPartitionPoint.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..954d4612468d39f5754ec2e46e34ab59cf46d5fb
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamPartitionPoint.cpp
@@ -0,0 +1,260 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+#include <algorithm>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamPartitionPoint {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct UnifDist;
+
+template <>
+struct UnifDist<int> {
+  using dist_type = std::uniform_int_distribution<int>;
+  std::mt19937 m_gen;
+  dist_type m_dist;
+
+  UnifDist(int a, int b, std::size_t seedIn) : m_dist(a, b) {
+    m_gen.seed(seedIn);
+  }
+
+  int operator()() { return m_dist(m_gen); }
+};
+
+template <class ValueType>
+struct GreaterThanValueFunctor {
+  ValueType m_val;
+
+  KOKKOS_INLINE_FUNCTION
+  GreaterThanValueFunctor(ValueType val) : m_val(val) {}
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator()(ValueType val) const { return (val > m_val); }
+};
+
+template <class ViewType, class DistancesViewType, class IntraTeamSentinelView,
+          class ValueType>
+struct TestFunctorA {
+  ViewType m_view;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  ValueType m_threshold;
+  int m_apiPick;
+
+  TestFunctorA(const ViewType view, const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView,
+               ValueType threshold, int apiPick)
+      : m_view(view),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_threshold(threshold),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowView        = Kokkos::subview(m_view, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist  = 0;
+
+    GreaterThanValueFunctor predicate(m_threshold);
+    if (m_apiPick == 0) {
+      const auto it = KE::partition_point(member, KE::cbegin(myRowView),
+                                          KE::cend(myRowView), predicate);
+      resultDist    = KE::distance(KE::cbegin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    else if (m_apiPick == 1) {
+      const auto it = KE::partition_point(member, myRowView, predicate);
+      resultDist    = KE::distance(KE::begin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId,
+            const std::string& sIn) {
+  /* description:
+     use a rank-2 view randomly filled with values in a range (a,b)
+     and run a team-level (one team per row) partition_point with
+     predicate = IsGreaterThanValue
+     where threshold is set to a number larger than b above
+   */
+  const auto threshold           = static_cast<ValueType>(1103);
+  const auto valueForSureGreater = static_cast<ValueType>(2103);
+  const auto valueForSureSmaller = static_cast<ValueType>(111);
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // construct in memory space associated with default exespace
+  auto dataView =
+      create_view<ValueType>(LayoutTag{}, numTeams, numCols, "dataView");
+
+  // dataView might not deep copyable (e.g. strided layout) so to
+  // randomize it, we make a new view that is for sure deep copyable,
+  // modify it on the host, deep copy to device and then launch
+  // a kernel to copy to dataView
+  auto dataView_dc =
+      create_deep_copyable_compatible_view_with_same_extent(dataView);
+  auto dataView_dc_h = create_mirror_view(Kokkos::HostSpace(), dataView_dc);
+
+  if (sIn == "trivialEmpty") {
+    // do nothing
+  }
+
+  else if (sIn == "allTrue") {
+    // randomly fill with values greater than threshold
+    // so that all elements in each row satisfy the predicate
+    // so this counts as being partitioned
+    Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace> pool(
+        452377);
+    Kokkos::fill_random(dataView_dc_h, pool, ValueType(2001), ValueType(2501));
+  }
+
+  else if (sIn == "allFalse") {
+    // randomly fill the view with values smaller than threshold
+    // and even in this case each row counts as partitioned
+    Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace> pool(
+        452377);
+    Kokkos::fill_random(dataView_dc_h, pool, ValueType(0), ValueType(101));
+  }
+
+  else if (sIn == "random") {
+    // randomly select a location and make all values before that
+    // larger than threshol and all values after to be smaller than threshold
+    // so that this picked location does partition the range
+    UnifDist<int> indexProducer(0, numCols - 1, 3432779);
+    for (std::size_t i = 0; i < dataView_dc_h.extent(0); ++i) {
+      const std::size_t a = indexProducer();
+      for (std::size_t j = 0; j < a; ++j) {
+        dataView_dc_h(i, j) = valueForSureGreater;
+      }
+      for (std::size_t j = a; j < numCols; ++j) {
+        dataView_dc_h(i, j) = valueForSureSmaller;
+      }
+    }
+  }
+
+  // copy to dataView_dc and then to dataView
+  Kokkos::deep_copy(dataView_dc, dataView_dc_h);
+  // use CTAD
+  CopyFunctorRank2 F1(dataView_dc, dataView);
+  Kokkos::parallel_for("copy", dataView.extent(0) * dataView.extent(1), F1);
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // to verify that things work, each team stores the result
+  // and then we check that these match what we expect
+  Kokkos::View<std::size_t*> distancesView("distances", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, distancesView, intraTeamSentinelView, threshold,
+                   apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // check
+  // -----------------------------------------------
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto dataViewAfterOp_h       = create_host_space_copy(dataView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  GreaterThanValueFunctor predicate(threshold);
+
+  for (std::size_t i = 0; i < dataView_dc_h.extent(0); ++i) {
+    auto myRow = Kokkos::subview(dataView_dc_h, i, Kokkos::ALL());
+    const auto stdResult =
+        std::partition_point(KE::cbegin(myRow), KE::cend(myRow), predicate);
+
+    // our result must match std
+    const std::size_t stdDistance = KE::distance(KE::cbegin(myRow), stdResult);
+    ASSERT_EQ(stdDistance, distancesView_h(i));
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+  expect_equal_host_views(dataView_dc_h, dataViewAfterOp_h);
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios(const std::string& name, const std::vector<int>& cols) {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : cols) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId, name);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_partition_point_team_test, empty) {
+  const std::string name      = "trivialEmpty";
+  const std::vector<int> cols = {0};
+  run_all_scenarios<DynamicTag, double>(name, cols);
+  run_all_scenarios<StridedTwoRowsTag, double>(name, cols);
+  run_all_scenarios<StridedThreeRowsTag, int>(name, cols);
+}
+
+TEST(std_algorithms_partition_point_team_test, all_true) {
+  const std::string name      = "allTrue";
+  const std::vector<int> cols = {13, 101, 1444, 5153};
+  run_all_scenarios<DynamicTag, double>(name, cols);
+  run_all_scenarios<StridedTwoRowsTag, double>(name, cols);
+  run_all_scenarios<StridedThreeRowsTag, int>(name, cols);
+}
+
+TEST(std_algorithms_partition_point_team_test, all_false) {
+  const std::string name      = "allFalse";
+  const std::vector<int> cols = {13, 101, 1444, 5153};
+  run_all_scenarios<DynamicTag, double>(name, cols);
+  run_all_scenarios<StridedTwoRowsTag, double>(name, cols);
+  run_all_scenarios<StridedThreeRowsTag, int>(name, cols);
+}
+
+TEST(std_algorithms_partition_point_team_test, random) {
+  const std::string name      = "random";
+  const std::vector<int> cols = {13, 101, 1444, 5153};
+  run_all_scenarios<DynamicTag, double>(name, cols);
+  run_all_scenarios<StridedTwoRowsTag, double>(name, cols);
+  run_all_scenarios<StridedThreeRowsTag, int>(name, cols);
+}
+
+}  // namespace TeamPartitionPoint
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReduce.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReduce.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..94c2a8f1f9a7259c2870132a7a4f0dc4a71b77ea
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReduce.cpp
@@ -0,0 +1,272 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+#if not defined KOKKOS_ENABLE_OPENMPTARGET
+
+namespace Test {
+namespace stdalgos {
+namespace TeamReduce {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct PlusFunctor {
+  KOKKOS_INLINE_FUNCTION
+  ValueType operator()(const ValueType& lhs, const ValueType& rhs) const {
+    return lhs + rhs;
+  }
+};
+
+template <class DataViewType, class ReductionInitValuesViewType,
+          class ReduceResultsViewType, class IntraTeamSentinelView,
+          class BinaryPredType>
+struct TestFunctorA {
+  DataViewType m_dataView;
+  ReductionInitValuesViewType m_reductionInitValuesView;
+  ReduceResultsViewType m_reduceResultsView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+  BinaryPredType m_binaryPred;
+
+  TestFunctorA(const DataViewType dataView,
+               const ReductionInitValuesViewType reductionInitValuesView,
+               const ReduceResultsViewType reduceResultsView,
+               const IntraTeamSentinelView intraTeamSentinelView, int apiPick,
+               BinaryPredType binaryPred)
+      : m_dataView(dataView),
+        m_reductionInitValuesView(reductionInitValuesView),
+        m_reduceResultsView(reduceResultsView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick),
+        m_binaryPred(binaryPred) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+
+    auto myRowViewFrom = Kokkos::subview(m_dataView, myRowIndex, Kokkos::ALL());
+
+    const auto rowFromBegin     = KE::cbegin(myRowViewFrom);
+    const auto rowFromEnd       = KE::cend(myRowViewFrom);
+    const auto initReductionVal = m_reductionInitValuesView(myRowIndex);
+    typename ReduceResultsViewType::non_const_value_type result = 0;
+
+    switch (m_apiPick) {
+      case 0: {
+        result = KE::reduce(member, rowFromBegin, rowFromEnd);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_reduceResultsView(myRowIndex) = result;
+        });
+        break;
+      }
+
+      case 1: {
+        result = KE::reduce(member, myRowViewFrom);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_reduceResultsView(myRowIndex) = result;
+        });
+        break;
+      }
+
+      case 2: {
+        result = KE::reduce(member, rowFromBegin, rowFromEnd, initReductionVal);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_reduceResultsView(myRowIndex) = result;
+        });
+        break;
+      }
+
+      case 3: {
+        result = KE::reduce(member, myRowViewFrom, initReductionVal);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_reduceResultsView(myRowIndex) = result;
+        });
+        break;
+      }
+
+      case 4: {
+        result = KE::reduce(member, rowFromBegin, rowFromEnd, initReductionVal,
+                            m_binaryPred);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_reduceResultsView(myRowIndex) = result;
+        });
+        break;
+      }
+
+      case 5: {
+        result =
+            KE::reduce(member, myRowViewFrom, initReductionVal, m_binaryPred);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_reduceResultsView(myRowIndex) = result;
+        });
+        break;
+      }
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, result, m_reduceResultsView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     use a rank-2 view randomly filled with values,
+     and run a team-level reduce
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range.
+  constexpr ValueType lowerBound = 5;
+  constexpr ValueType upperBound = 523;
+  const auto bounds              = make_bounds(lowerBound, upperBound);
+
+  auto [dataView, dataViewBeforeOp_h] = create_random_view_and_host_clone(
+      LayoutTag{}, numTeams, numCols, bounds, "dataView");
+
+  // Create view of reduce init values to be used by test cases
+  Kokkos::View<ValueType*, Kokkos::DefaultHostExecutionSpace>
+      reductionInitValuesView_h("reductionInitValuesView_h", numTeams);
+  using rand_pool =
+      Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace>;
+  rand_pool pool(lowerBound * upperBound);
+  Kokkos::fill_random(reductionInitValuesView_h, pool, lowerBound, upperBound);
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // to verify that things work, each team stores the result of its reduce
+  // call, and then we check that these match what we expect
+  Kokkos::View<ValueType*> reduceResultsView("reduceResultsView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  PlusFunctor<ValueType> binaryPred;
+
+  // use CTAD for functor
+  auto reductionInitValuesView =
+      Kokkos::create_mirror_view_and_copy(space_t(), reductionInitValuesView_h);
+  TestFunctorA fnc(dataView, reductionInitValuesView, reduceResultsView,
+                   intraTeamSentinelView, apiId, binaryPred);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run cpp-std kernel and check
+  // -----------------------------------------------
+
+  auto reduceResultsView_h     = create_host_space_copy(reduceResultsView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+
+  for (std::size_t i = 0; i < dataView.extent(0); ++i) {
+    auto rowFrom = Kokkos::subview(dataViewBeforeOp_h, i, Kokkos::ALL());
+
+    const auto rowFromBegin = KE::cbegin(rowFrom);
+    const auto rowFromEnd   = KE::cend(rowFrom);
+    const auto initVal      = reductionInitValuesView_h(i);
+
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+
+    // libstdc++ as provided by GCC 8 does not have reduce, transform_reduce,
+    // exclusive_scan, inclusive_scan, transform_exclusive_scan,
+    // transform_inclusive_scan and for GCC 9.1, 9.2 fails to compile them for
+    // missing overload not accepting policy
+
+#if defined(_GLIBCXX_RELEASE) && (_GLIBCXX_RELEASE <= 9)
+#define reduce testing_reduce
+#else
+#define reduce std::reduce
+#endif
+
+    switch (apiId) {
+      case 0:
+      case 1: {
+        const ValueType result = reduce(rowFromBegin, rowFromEnd);
+        if constexpr (std::is_floating_point_v<ValueType>) {
+          EXPECT_FLOAT_EQ(result, reduceResultsView_h(i));
+        } else {
+          ASSERT_EQ(result, reduceResultsView_h(i));
+        }
+
+        break;
+      }
+
+      case 2:
+      case 3: {
+        const ValueType result = reduce(rowFromBegin, rowFromEnd, initVal);
+        if constexpr (std::is_floating_point_v<ValueType>) {
+          EXPECT_FLOAT_EQ(result, reduceResultsView_h(i));
+        } else {
+          ASSERT_EQ(result, reduceResultsView_h(i));
+        }
+
+        break;
+      }
+
+      case 4:
+      case 5: {
+        const ValueType result =
+            reduce(rowFromBegin, rowFromEnd, initVal, binaryPred);
+        if constexpr (std::is_floating_point_v<ValueType>) {
+          EXPECT_FLOAT_EQ(result, reduceResultsView_h(i));
+        } else {
+          ASSERT_EQ(result, reduceResultsView_h(i));
+        }
+
+        break;
+      }
+    }
+
+#undef reduce
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) {
+      for (int apiId : {0, 1, 2, 3, 4, 5}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_reduce_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamReduce
+}  // namespace stdalgos
+}  // namespace Test
+
+#endif
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemove.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemove.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fb9c70391b3d6de84b5a5a06b2f576962bafe120
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemove.cpp
@@ -0,0 +1,182 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+#include <algorithm>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamRemove {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct UnifDist;
+
+template <>
+struct UnifDist<int> {
+  using dist_type = std::uniform_int_distribution<int>;
+  std::mt19937 m_gen;
+  dist_type m_dist;
+
+  UnifDist(int b, std::size_t seedIn) : m_dist(0, b) { m_gen.seed(seedIn); }
+
+  int operator()() { return m_dist(m_gen); }
+};
+
+template <class ViewType, class DistancesViewType, class IntraTeamSentinelView,
+          class ValueType>
+struct TestFunctorA {
+  ViewType m_view;
+  ValueType m_targetValue;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+
+  TestFunctorA(const ViewType view, ValueType oldVal,
+               const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView, int apiPick)
+      : m_view(view),
+        m_targetValue(oldVal),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowView        = Kokkos::subview(m_view, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist  = 0;
+
+    if (m_apiPick == 0) {
+      auto it    = KE::remove(member, KE::begin(myRowView), KE::end(myRowView),
+                           m_targetValue);
+      resultDist = KE::distance(KE::begin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    } else if (m_apiPick == 1) {
+      auto it    = KE::remove(member, myRowView, m_targetValue);
+      resultDist = KE::distance(KE::begin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     set a random subset of each row of a rank-2 view
+     to a target value and run a team-level KE::remove
+     with one team per row to remove all those elements.
+   */
+
+  const auto targetVal = static_cast<ValueType>(531);
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // Create a view in the memory space associated with default exespace with as
+  // many rows as the number of teams and fill it with random values from an
+  // arbitrary range. Pick range so that some of the values are equal to target.
+  auto [dataView, dataView_h] = create_random_view_and_host_clone(
+      LayoutTag{}, numTeams, numCols,
+      Kokkos::pair<ValueType, ValueType>{targetVal - 1, targetVal + 1},
+      "dataView");
+
+  // note that we need to count how many elements are equal
+  // to targetVal because the dataView was origianlly filled
+  // with random values
+  std::vector<std::size_t> perRowRealCount(numTeams);
+  for (std::size_t i = 0; i < dataView_h.extent(0); ++i) {
+    std::size_t realCount = 0;
+    for (std::size_t j = 0; j < dataView_h.extent(1); ++j) {
+      if (dataView_h(i, j) == targetVal) {
+        realCount++;
+      }
+    }
+    perRowRealCount[i] = realCount;
+  }
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the std result
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, targetVal, distancesView, intraTeamSentinelView,
+                   apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // check against std
+  // -----------------------------------------------
+  auto dataViewAfterOp_h       = create_host_space_copy(dataView);
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  for (std::size_t i = 0; i < dataViewAfterOp_h.extent(0); ++i) {
+    auto myRow = Kokkos::subview(dataView_h, i, Kokkos::ALL());
+    auto stdIt = std::remove(KE::begin(myRow), KE::end(myRow), targetVal);
+    const std::size_t stdDistance = KE::distance(KE::begin(myRow), stdIt);
+    ASSERT_EQ(distancesView_h(i), stdDistance);
+    ASSERT_EQ(distancesView_h(i), numCols - perRowRealCount[i]);
+
+    for (std::size_t j = 0; j < distancesView_h(i); ++j) {
+      ASSERT_EQ(dataViewAfterOp_h(i, j), dataView_h(i, j));
+    }
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8113}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_remove_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamRemove
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..24b840154b7383717c9602bbef0d709f0f3b4453
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp
@@ -0,0 +1,222 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+#include <algorithm>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamRemoveCopy {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct UnifDist;
+
+template <>
+struct UnifDist<int> {
+  using dist_type = std::uniform_int_distribution<int>;
+  std::mt19937 m_gen;
+  dist_type m_dist;
+
+  UnifDist(int b, std::size_t seedIn) : m_dist(0, b) { m_gen.seed(seedIn); }
+
+  int operator()() { return m_dist(m_gen); }
+};
+
+template <class SourceViewType, class DestViewType, class DistancesViewType,
+          class IntraTeamSentinelView, class ValueType>
+struct TestFunctorA {
+  SourceViewType m_sourceView;
+  DestViewType m_destView;
+  ValueType m_targetValue;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+
+  TestFunctorA(const SourceViewType sourceView, const DestViewType destView,
+               ValueType targetVal, const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView, int apiPick)
+      : m_sourceView(sourceView),
+        m_destView(destView),
+        m_targetValue(targetVal),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowViewFrom =
+        Kokkos::subview(m_sourceView, myRowIndex, Kokkos::ALL());
+    auto myRowViewDest = Kokkos::subview(m_destView, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist = 0;
+
+    if (m_apiPick == 0) {
+      auto it    = KE::remove_copy(member, KE::cbegin(myRowViewFrom),
+                                KE::cend(myRowViewFrom),
+                                KE::begin(myRowViewDest), m_targetValue);
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    } else if (m_apiPick == 1) {
+      auto it =
+          KE::remove_copy(member, myRowViewFrom, myRowViewDest, m_targetValue);
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     set a random subset of each row of a rank-2 view
+     to a target value and run a team-level KE::remove_copy
+     to a destination view with one team per row to remove all those elements.
+   */
+
+  const auto targetVal = static_cast<ValueType>(531);
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // construct in memory space associated with default exespace
+  auto sourceView =
+      create_view<ValueType>(LayoutTag{}, numTeams, numCols, "dataView");
+
+  // sourceView might not deep copyable (e.g. strided layout) so to fill it
+  // we make a new view that is for sure deep copyable, modify it on the host
+  // deep copy to device and then launch copy kernel to sourceView
+  auto sourceView_dc =
+      create_deep_copyable_compatible_view_with_same_extent(sourceView);
+  auto sourceView_dc_h = create_mirror_view(Kokkos::HostSpace(), sourceView_dc);
+
+  Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace> pool(
+      45234399);
+  Kokkos::fill_random(sourceView_dc_h, pool, ValueType(0), ValueType(1177));
+
+  // for each row, randomly select columns, fill with targetVal
+  std::vector<std::size_t> perRowRealCount(numTeams);
+  const std::size_t maxColInd = numCols > 0 ? numCols - 1 : 0;
+  UnifDist<int> colCountProducer(maxColInd, 3123377);
+  UnifDist<int> colIndicesProducer(maxColInd, 455225);
+  for (std::size_t i = 0; i < sourceView_dc_h.extent(0); ++i) {
+    const std::size_t currCount = colCountProducer();
+    std::vector<std::size_t> colIndForThisRow(currCount);
+    for (std::size_t j = 0; j < currCount; ++j) {
+      const auto colInd          = colIndicesProducer();
+      sourceView_dc_h(i, colInd) = targetVal;
+      colIndForThisRow[j]        = colInd;
+    }
+
+    // note that we need to count how many elements are equal
+    // to targetVal because the sourceView was origianlly filled
+    // with random values so it could be that we have more matches
+    // than what we manually set above
+    std::size_t realCount = 0;
+    for (std::size_t j = 0; j < sourceView_dc_h.extent(1); ++j) {
+      if (sourceView_dc_h(i, j) == targetVal) {
+        realCount++;
+      }
+    }
+    perRowRealCount[i] = realCount;
+  }
+
+  // copy to sourceView_dc and then to sourceView
+  Kokkos::deep_copy(sourceView_dc, sourceView_dc_h);
+  CopyFunctorRank2 F1(sourceView_dc, sourceView);
+  Kokkos::parallel_for("copy", sourceView.extent(0) * sourceView.extent(1), F1);
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // create the destination view
+  Kokkos::View<ValueType**> destView("destView", numTeams, numCols);
+
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the std result
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(sourceView, destView, targetVal, distancesView,
+                   intraTeamSentinelView, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // check against std
+  // -----------------------------------------------
+  auto destViewAfterOp_h       = create_host_space_copy(destView);
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestView("stdDestView",
+                                                           numTeams, numCols);
+
+  for (std::size_t i = 0; i < destViewAfterOp_h.extent(0); ++i) {
+    auto rowFrom = Kokkos::subview(sourceView_dc_h, i, Kokkos::ALL());
+    auto rowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL());
+
+    auto stdIt = std::remove_copy(KE::cbegin(rowFrom), KE::cend(rowFrom),
+                                  KE::begin(rowDest), targetVal);
+    const std::size_t stdDistance = KE::distance(KE::begin(rowDest), stdIt);
+
+    EXPECT_TRUE(distancesView_h(i) == stdDistance);
+    // EXPECT_TRUE(distancesView_h(i) == numCols - perRowRealCount[i]);
+    for (std::size_t j = 0; j < distancesView_h(i); ++j) {
+      EXPECT_TRUE(destViewAfterOp_h(i, j) == stdDestView(i, j));
+    }
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8113}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_remove_copy_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamRemoveCopy
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2082fa972880c51c5794942a5d68c3aef2f136a5
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp
@@ -0,0 +1,178 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+#include <algorithm>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamRemoveCopyIf {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct GreaterThanValueFunctor {
+  ValueType m_val;
+
+  KOKKOS_INLINE_FUNCTION
+  GreaterThanValueFunctor(ValueType val) : m_val(val) {}
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator()(ValueType val) const { return (val > m_val); }
+};
+
+template <class SourceViewType, class DestViewType, class DistancesViewType,
+          class IntraTeamSentinelView, class ValueType>
+struct TestFunctorA {
+  SourceViewType m_sourceView;
+  DestViewType m_destView;
+  ValueType m_threshold;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+
+  TestFunctorA(const SourceViewType sourceView, const DestViewType destView,
+               ValueType threshold, const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView, int apiPick)
+      : m_sourceView(sourceView),
+        m_destView(destView),
+        m_threshold(threshold),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowViewFrom =
+        Kokkos::subview(m_sourceView, myRowIndex, Kokkos::ALL());
+    auto myRowViewDest = Kokkos::subview(m_destView, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist = 0;
+
+    GreaterThanValueFunctor predicate(m_threshold);
+    if (m_apiPick == 0) {
+      auto it    = KE::remove_copy_if(member, KE::cbegin(myRowViewFrom),
+                                   KE::cend(myRowViewFrom),
+                                   KE::begin(myRowViewDest), predicate);
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    } else if (m_apiPick == 1) {
+      auto it =
+          KE::remove_copy_if(member, myRowViewFrom, myRowViewDest, predicate);
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     use a rank-2 view randomly filled,
+     and run a team-level remove_copy_if where the values copied
+     are those strictly greater than a threshold.
+   */
+
+  const auto threshold = static_cast<ValueType>(531);
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  auto [sourceView, cloneOfSourceViewBeforeOp_h] =
+      create_random_view_and_host_clone(
+          LayoutTag{}, numTeams, numCols,
+          Kokkos::pair<ValueType, ValueType>{5, 523}, "sourceView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // create the destination view
+  Kokkos::View<ValueType**> destView("destView", numTeams, numCols);
+
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the std result
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(sourceView, destView, threshold, distancesView,
+                   intraTeamSentinelView, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // check against std
+  // -----------------------------------------------
+  auto destViewAfterOp_h       = create_host_space_copy(destView);
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestView("stdDestView",
+                                                           numTeams, numCols);
+  GreaterThanValueFunctor predicate(threshold);
+  for (std::size_t i = 0; i < destViewAfterOp_h.extent(0); ++i) {
+    auto rowFrom =
+        Kokkos::subview(cloneOfSourceViewBeforeOp_h, i, Kokkos::ALL());
+    auto rowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL());
+
+    auto stdIt = std::remove_copy_if(KE::cbegin(rowFrom), KE::cend(rowFrom),
+                                     KE::begin(rowDest), predicate);
+    const std::size_t stdDistance = KE::distance(KE::begin(rowDest), stdIt);
+
+    EXPECT_TRUE(distancesView_h(i) == stdDistance);
+    for (std::size_t j = 0; j < distancesView_h(i); ++j) {
+      EXPECT_TRUE(destViewAfterOp_h(i, j) == stdDestView(i, j));
+    }
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8113}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_remove_copy_if_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamRemoveCopyIf
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveIf.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3315f281da616e3ce05e2105c7a08d7a2dd83af0
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveIf.cpp
@@ -0,0 +1,166 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+#include <algorithm>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamRemoveIf {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct GreaterThanValueFunctor {
+  ValueType m_val;
+
+  KOKKOS_INLINE_FUNCTION
+  GreaterThanValueFunctor(ValueType val) : m_val(val) {}
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator()(ValueType val) const { return (val > m_val); }
+};
+
+template <class ViewType, class DistancesViewType, class IntraTeamSentinelView,
+          class ValueType>
+struct TestFunctorA {
+  ViewType m_view;
+  ValueType m_threshold;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+
+  TestFunctorA(const ViewType view, ValueType threshold,
+               const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView, int apiPick)
+      : m_view(view),
+        m_threshold(threshold),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowView        = Kokkos::subview(m_view, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist  = 0;
+
+    GreaterThanValueFunctor predicate(m_threshold);
+    if (m_apiPick == 0) {
+      auto it = KE::remove_if(member, KE::begin(myRowView), KE::end(myRowView),
+                              predicate);
+      resultDist = KE::distance(KE::begin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    } else if (m_apiPick == 1) {
+      auto it    = KE::remove_if(member, myRowView, predicate);
+      resultDist = KE::distance(KE::begin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     use a rank-2 view randomly filled with values,
+     and run a team-level remove_if for values strictly
+     greater than a threshold.
+   */
+
+  const auto threshold = static_cast<ValueType>(151);
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range
+  auto [dataView, cloneOfDataViewBeforeOp_h] =
+      create_random_view_and_host_clone(
+          LayoutTag{}, numTeams, numCols,
+          Kokkos::pair<ValueType, ValueType>{5, 523}, "dataView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the std result
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, threshold, distancesView, intraTeamSentinelView,
+                   apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // check against std
+  // -----------------------------------------------
+  GreaterThanValueFunctor predicate(threshold);
+  auto dataViewAfterOp_h       = create_host_space_copy(dataView);
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  for (std::size_t i = 0; i < dataViewAfterOp_h.extent(0); ++i) {
+    auto myRow = Kokkos::subview(cloneOfDataViewBeforeOp_h, i, Kokkos::ALL());
+    auto stdIt = std::remove_if(KE::begin(myRow), KE::end(myRow), predicate);
+    const std::size_t stdDistance = KE::distance(KE::begin(myRow), stdIt);
+    EXPECT_TRUE(distancesView_h(i) == stdDistance);
+
+    for (std::size_t j = 0; j < distancesView_h(i); ++j) {
+      EXPECT_TRUE(dataViewAfterOp_h(i, j) == cloneOfDataViewBeforeOp_h(i, j));
+    }
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8113}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_remove_if_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamRemoveIf
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplace.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplace.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fddc0f3b5bc3f7ff9b8bc1524797c4e503c4cd47
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplace.cpp
@@ -0,0 +1,135 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+#include <algorithm>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamReplace {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct UnifDist;
+
+template <>
+struct UnifDist<int> {
+  using dist_type = std::uniform_int_distribution<int>;
+  std::mt19937 m_gen;
+  dist_type m_dist;
+
+  UnifDist(int b, std::size_t seedIn) : m_dist(0, b) { m_gen.seed(seedIn); }
+
+  int operator()() { return m_dist(m_gen); }
+};
+
+template <class ViewType, class ValueType>
+struct TestFunctorA {
+  ViewType m_view;
+  ValueType m_targetValue;
+  ValueType m_newValue;
+  int m_apiPick;
+
+  TestFunctorA(const ViewType view, ValueType oldVal, ValueType newVal,
+               int apiPick)
+      : m_view(view),
+        m_targetValue(oldVal),
+        m_newValue(newVal),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowView        = Kokkos::subview(m_view, myRowIndex, Kokkos::ALL());
+
+    if (m_apiPick == 0) {
+      KE::replace(member, KE::begin(myRowView), KE::end(myRowView),
+                  m_targetValue, m_newValue);
+    } else if (m_apiPick == 1) {
+      KE::replace(member, myRowView, m_targetValue, m_newValue);
+    }
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     Randomly fill a view with some elements equal to a target value that we
+     want to replace with a new value. Do the operation via a team parfor with
+     one row per team.
+   */
+
+  const auto targetVal = static_cast<ValueType>(531);
+  const auto newVal    = static_cast<ValueType>(123);
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // Create a view in the memory space associated with default exespace with as
+  // many rows as the number of teams and fill it with random values from an
+  // arbitrary range. Pick range so that some of the values are equal to target.
+  auto [dataView, dataViewBeforeOp_h] = create_random_view_and_host_clone(
+      LayoutTag{}, numTeams, numCols,
+      Kokkos::pair<ValueType, ValueType>{targetVal - 1, targetVal + 1},
+      "dataView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, targetVal, newVal, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // conditions for test passing:
+  // - the target elements are replaced with the new value
+  // - all other elements are unchanged
+  // -----------------------------------------------
+  auto dataViewAfterOp_h = create_host_space_copy(dataView);
+  for (std::size_t i = 0; i < dataViewAfterOp_h.extent(0); ++i) {
+    for (std::size_t j = 0; j < dataViewAfterOp_h.extent(1); ++j) {
+      const auto correctVal = (dataViewBeforeOp_h(i, j) == targetVal)
+                                  ? newVal
+                                  : dataViewBeforeOp_h(i, j);
+      ASSERT_EQ(dataViewAfterOp_h(i, j), correctVal)
+          << "i, j = " << i << ", " << j;
+    }
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 11113}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_replace_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamReplace
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..70dbf10574b85a79378099f7bb85b94d161f2559
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopy.cpp
@@ -0,0 +1,204 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+#include <algorithm>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamReplaceCopy {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct UnifDist;
+
+template <>
+struct UnifDist<int> {
+  using dist_type = std::uniform_int_distribution<int>;
+  std::mt19937 m_gen;
+  dist_type m_dist;
+
+  UnifDist(int b, std::size_t seedIn) : m_dist(0, b) { m_gen.seed(seedIn); }
+
+  int operator()() { return m_dist(m_gen); }
+};
+
+template <class SourceViewType, class DestViewType, class DistancesViewType,
+          class IntraTeamSentinelView, class ValueType>
+struct TestFunctorA {
+  SourceViewType m_sourceView;
+  DestViewType m_destView;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  ValueType m_targetValue;
+  ValueType m_newValue;
+  int m_apiPick;
+
+  TestFunctorA(const SourceViewType sourceView, const DestViewType destView,
+               const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView,
+               ValueType targetVal, ValueType newVal, int apiPick)
+      : m_sourceView(sourceView),
+        m_destView(destView),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_targetValue(targetVal),
+        m_newValue(newVal),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowViewFrom =
+        Kokkos::subview(m_sourceView, myRowIndex, Kokkos::ALL());
+    auto myRowViewDest = Kokkos::subview(m_destView, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist = 0;
+
+    if (m_apiPick == 0) {
+      auto it = KE::replace_copy(
+          member, KE::begin(myRowViewFrom), KE::end(myRowViewFrom),
+          KE::begin(myRowViewDest), m_targetValue, m_newValue);
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    } else if (m_apiPick == 1) {
+      auto it    = KE::replace_copy(member, myRowViewFrom, myRowViewDest,
+                                 m_targetValue, m_newValue);
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     use a "source" and "destination" rank-2 views such that in the source,
+     for each row, a random subset of elements is filled with a target value
+     that we want to replace_copy with a new value into the destination view.
+     The operation is done via a team parfor with one row per team.
+   */
+
+  const auto targetVal = static_cast<ValueType>(531);
+  const auto newVal    = static_cast<ValueType>(123);
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // construct in memory space associated with default exespace
+  auto sourceView =
+      create_view<ValueType>(LayoutTag{}, numTeams, numCols, "sourceView");
+
+  // sourceView might not deep copyable (e.g. strided layout) so to fill it
+  // we make a new view that is for sure deep copyable, modify it on the host
+  // deep copy to device and then launch copy kernel to sourceView
+  auto sourceView_dc =
+      create_deep_copyable_compatible_view_with_same_extent(sourceView);
+  auto sourceView_dc_h = create_mirror_view(Kokkos::HostSpace(), sourceView_dc);
+
+  // for each row, randomly select columns, fill with targetVal
+  const std::size_t maxColInd = numCols > 0 ? numCols - 1 : 0;
+  UnifDist<int> colCountProducer(maxColInd, 3123377);
+  UnifDist<int> colIndicesProducer(maxColInd, 455225);
+  for (std::size_t i = 0; i < sourceView_dc_h.extent(0); ++i) {
+    const std::size_t currCount = colCountProducer();
+    for (std::size_t j = 0; j < currCount; ++j) {
+      const auto colInd          = colIndicesProducer();
+      sourceView_dc_h(i, colInd) = targetVal;
+    }
+  }
+
+  // copy to sourceView_dc and then to sourceView
+  Kokkos::deep_copy(sourceView_dc, sourceView_dc_h);
+  // use CTAD
+  CopyFunctorRank2 F1(sourceView_dc, sourceView);
+  Kokkos::parallel_for("copy", sourceView.extent(0) * sourceView.extent(1), F1);
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+  // create the destination view where we to store the replace_copy
+  Kokkos::View<ValueType**> destView("destView", numTeams, numCols);
+
+  // replace_copy returns an iterator so to verify that it is correct
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the std result
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView,
+                   targetVal, newVal, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run cpp-std kernel and check
+  // -----------------------------------------------
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestView("stdDestView",
+                                                           numTeams, numCols);
+  for (std::size_t i = 0; i < sourceView_dc_h.extent(0); ++i) {
+    auto rowFrom = Kokkos::subview(sourceView_dc_h, i, Kokkos::ALL());
+    auto rowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL());
+    auto it      = std::replace_copy(KE::cbegin(rowFrom), KE::cend(rowFrom),
+                                KE::begin(rowDest), targetVal, newVal);
+    const std::size_t stdDistance = KE::distance(KE::begin(rowDest), it);
+    ASSERT_EQ(stdDistance, distancesView_h(i));
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+
+  auto dataViewAfterOp_h = create_host_space_copy(destView);
+  expect_equal_host_views(stdDestView, dataViewAfterOp_h);
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 11113}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_replace_copy_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamReplaceCopy
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ae43a2a4269cb75f1164e2106e952cbb57c1afa8
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp
@@ -0,0 +1,183 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamReplaceCopyIf {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct GreaterThanValueFunctor {
+  ValueType m_val;
+
+  KOKKOS_INLINE_FUNCTION
+  GreaterThanValueFunctor(ValueType val) : m_val(val) {}
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator()(ValueType val) const { return (val > m_val); }
+};
+
+template <class SourceViewType, class DestViewType, class DistancesViewType,
+          class IntraTeamSentinelView, class ValueType>
+struct TestFunctorA {
+  SourceViewType m_sourceView;
+  DestViewType m_destView;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  ValueType m_threshold;
+  ValueType m_newValue;
+  int m_apiPick;
+
+  TestFunctorA(const SourceViewType sourceView, const DestViewType destView,
+               const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView,
+               ValueType threshold, ValueType newVal, int apiPick)
+      : m_sourceView(sourceView),
+        m_destView(destView),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_threshold(threshold),
+        m_newValue(newVal),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowViewFrom =
+        Kokkos::subview(m_sourceView, myRowIndex, Kokkos::ALL());
+    auto myRowViewDest = Kokkos::subview(m_destView, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist = 0;
+
+    GreaterThanValueFunctor predicate(m_threshold);
+
+    if (m_apiPick == 0) {
+      auto it = KE::replace_copy_if(
+          member, KE::begin(myRowViewFrom), KE::end(myRowViewFrom),
+          KE::begin(myRowViewDest), predicate, m_newValue);
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    } else if (m_apiPick == 1) {
+      auto it    = KE::replace_copy_if(member, myRowViewFrom, myRowViewDest,
+                                    predicate, m_newValue);
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     use a rank-2 view randomly filled with values,
+     and run a team-level replace_copy_if where the values strictly
+     greater than a threshold are replaced/copied into a new view
+     with a new value, while those <= threshold are copied but unchanged
+   */
+
+  const auto threshold = static_cast<ValueType>(151);
+  const auto newVal    = static_cast<ValueType>(1);
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range
+  auto [sourceView, cloneOfSourceViewBeforeOp_h] =
+      create_random_view_and_host_clone(
+          LayoutTag{}, numTeams, numCols,
+          Kokkos::pair<ValueType, ValueType>{5, 523}, "sourceView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+  // create the destination view
+  Kokkos::View<ValueType**> destView("destView", numTeams, numCols);
+
+  // replace_copy_if returns an iterator so to verify that it is correct
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the std result
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView,
+                   threshold, newVal, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run cpp-std kernel and check
+  // -----------------------------------------------
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestView("stdDestView",
+                                                           numTeams, numCols);
+  GreaterThanValueFunctor predicate(threshold);
+  for (std::size_t i = 0; i < sourceView.extent(0); ++i) {
+    auto rowFrom =
+        Kokkos::subview(cloneOfSourceViewBeforeOp_h, i, Kokkos::ALL());
+    auto rowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL());
+    auto it      = std::replace_copy_if(KE::cbegin(rowFrom), KE::cend(rowFrom),
+                                   KE::begin(rowDest), predicate, newVal);
+    const std::size_t stdDistance = KE::distance(KE::begin(rowDest), it);
+    ASSERT_EQ(stdDistance, distancesView_h(i));
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+
+  auto dataViewAfterOp_h = create_host_space_copy(destView);
+  expect_equal_host_views(stdDestView, dataViewAfterOp_h);
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_replace_copy_if_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamReplaceCopyIf
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceIf.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1d5d9578f948185bfd2fc39dfa21556a03b0c580
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceIf.cpp
@@ -0,0 +1,138 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamReplaceIf {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct GreaterThanValueFunctor {
+  ValueType m_val;
+
+  KOKKOS_INLINE_FUNCTION
+  GreaterThanValueFunctor(ValueType val) : m_val(val) {}
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator()(ValueType val) const { return (val > m_val); }
+};
+
+template <class ViewType, class ValueType>
+struct TestFunctorA {
+  ViewType m_view;
+  ValueType m_threshold;
+  ValueType m_newVal;
+  int m_apiPick;
+
+  TestFunctorA(const ViewType view, ValueType threshold, ValueType newVal,
+               int apiPick)
+      : m_view(view),
+        m_threshold(threshold),
+        m_newVal(newVal),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowView        = Kokkos::subview(m_view, myRowIndex, Kokkos::ALL());
+
+    GreaterThanValueFunctor predicate(m_threshold);
+    if (m_apiPick == 0) {
+      KE::replace_if(member, KE::begin(myRowView), KE::end(myRowView),
+                     predicate, m_newVal);
+    } else if (m_apiPick == 1) {
+      KE::replace_if(member, myRowView, predicate, m_newVal);
+    }
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     use a rank-2 view randomly filled with values,
+     and run a team-level replace_if where the values strictly greater
+     than a threshold are replaced with a new value.
+   */
+  const auto threshold = static_cast<ValueType>(151);
+  const auto newVal    = static_cast<ValueType>(1);
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range
+  auto [dataView, cloneOfDataViewBeforeOp_h] =
+      create_random_view_and_host_clone(
+          LayoutTag{}, numTeams, numCols,
+          Kokkos::pair<ValueType, ValueType>{5, 523}, "dataView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, threshold, newVal, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run cpp-std kernel
+  // -----------------------------------------------
+  Kokkos::View<ValueType**, Kokkos::HostSpace> stdDataView("stdDataView",
+                                                           numTeams, numCols);
+  // ensure that we use the same data to run the std algo on
+  for (std::size_t i = 0; i < dataView.extent(0); ++i) {
+    for (std::size_t j = 0; j < dataView.extent(1); ++j) {
+      stdDataView(i, j) = cloneOfDataViewBeforeOp_h(i, j);
+    }
+  }
+  GreaterThanValueFunctor predicate(threshold);
+  for (std::size_t i = 0; i < dataView.extent(0); ++i) {
+    auto thisRow = Kokkos::subview(stdDataView, i, Kokkos::ALL());
+    std::replace_if(KE::begin(thisRow), KE::end(thisRow), predicate, newVal);
+  }
+
+  // -----------------------------------------------
+  // check
+  // -----------------------------------------------
+  auto dataViewAfterOp_h = create_host_space_copy(dataView);
+  expect_equal_host_views(stdDataView, dataViewAfterOp_h);
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_replace_if_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamReplaceIf
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReverse.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReverse.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..46d27aa16b0f6afc7fa32f9f028cc4c411bf4d34
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReverse.cpp
@@ -0,0 +1,105 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamReverse {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ViewType>
+struct TestFunctorA {
+  ViewType m_view;
+  int m_api_pick;
+
+  TestFunctorA(const ViewType view, int apiPick)
+      : m_view(view), m_api_pick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowView        = Kokkos::subview(m_view, myRowIndex, Kokkos::ALL());
+
+    if (m_api_pick == 0) {
+      KE::reverse(member, KE::begin(myRowView), KE::end(myRowView));
+    } else if (m_api_pick == 1) {
+      KE::reverse(member, myRowView);
+    }
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     create a rank-2 view, randomly fill, and do a team-level
+     KE::reverse for each row where each team is
+     responsible for a single row
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range
+  auto [dataView, cloneOfDataViewBeforeOp_h] =
+      create_random_view_and_host_clone(
+          LayoutTag{}, numTeams, numCols,
+          Kokkos::pair<ValueType, ValueType>{11, 523}, "dataView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // check
+  // -----------------------------------------------
+  auto dataViewAfterOp_h = create_host_space_copy(dataView);
+  for (std::size_t i = 0; i < dataViewAfterOp_h.extent(0); ++i) {
+    for (std::size_t j = 0; j < dataViewAfterOp_h.extent(1); ++j) {
+      EXPECT_TRUE(dataViewAfterOp_h(i, j) ==
+                  cloneOfDataViewBeforeOp_h(i, numCols - j - 1));
+    }
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_reverse_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamReverse
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReverseCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReverseCopy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a06ae839e1fbbf65111ec97f5a9922f59de9dac8
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReverseCopy.cpp
@@ -0,0 +1,153 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamReverseCopy {
+
+namespace KE = Kokkos::Experimental;
+
+template <class SourceViewType, class DestViewType, class DistancesViewType,
+          class IntraTeamSentinelView>
+struct TestFunctorA {
+  SourceViewType m_sourceView;
+  DestViewType m_destView;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+
+  TestFunctorA(const SourceViewType sourceView, const DestViewType destView,
+               const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView, int apiPick)
+      : m_sourceView(sourceView),
+        m_destView(destView),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowViewFrom =
+        Kokkos::subview(m_sourceView, myRowIndex, Kokkos::ALL());
+    auto myRowViewDest = Kokkos::subview(m_destView, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist = 0;
+
+    if (m_apiPick == 0) {
+      auto it =
+          KE::reverse_copy(member, KE::begin(myRowViewFrom),
+                           KE::end(myRowViewFrom), KE::begin(myRowViewDest));
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    } else if (m_apiPick == 1) {
+      auto it    = KE::reverse_copy(member, myRowViewFrom, myRowViewDest);
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     randomly fill a source view and reverse_copy into a destination view.
+     The operation is done via a team parfor with one row per team.
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range
+  auto [sourceView, cloneOfSourceViewBeforeOp_h] =
+      create_random_view_and_host_clone(
+          LayoutTag{}, numTeams, numCols,
+          Kokkos::pair<ValueType, ValueType>{11, 523}, "sourceView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+  // create the destination view
+  Kokkos::View<ValueType**> destView("destView", numTeams, numCols);
+
+  // to verify that things are correct each team stores the distance
+  // of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the expectation
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView,
+                   apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // check
+  // -----------------------------------------------
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto destViewAfterOp_h       = create_host_space_copy(destView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  for (std::size_t i = 0; i < destViewAfterOp_h.extent(0); ++i) {
+    for (std::size_t j = 0; j < destViewAfterOp_h.extent(1); ++j) {
+      EXPECT_TRUE(destViewAfterOp_h(i, j) ==
+                  cloneOfSourceViewBeforeOp_h(i, numCols - j - 1));
+    }
+    // each team should return an iterator past the last column
+    EXPECT_TRUE(distancesView_h(i) == numCols);
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8113}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_reverse_copy_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamReverseCopy
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRotate.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRotate.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3188e029103bd49c80ac93dd4a03c2e3490a3b16
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRotate.cpp
@@ -0,0 +1,173 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+#include <algorithm>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamRotate {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ViewType, class DistancesViewType, class IntraTeamSentinelView>
+struct TestFunctorA {
+  ViewType m_view;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  std::size_t m_pivotShift;
+  int m_apiPick;
+
+  TestFunctorA(const ViewType view, const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView,
+               std::size_t pivotShift, int apiPick)
+      : m_view(view),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_pivotShift(pivotShift),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowView        = Kokkos::subview(m_view, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist  = 0;
+
+    if (m_apiPick == 0) {
+      auto pivot = KE::begin(myRowView) + m_pivotShift;
+      auto it =
+          KE::rotate(member, KE::begin(myRowView), pivot, KE::end(myRowView));
+      resultDist = KE::distance(KE::begin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    } else if (m_apiPick == 1) {
+      auto it    = KE::rotate(member, myRowView, m_pivotShift);
+      resultDist = KE::distance(KE::begin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, std::size_t pivotShift,
+            int apiId) {
+  /* description:
+     randomly fill a rank-2 view and for a given pivot,
+     do a team-level KE::rotate
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range
+  auto [dataView, cloneOfDataViewBeforeOp_h] =
+      create_random_view_and_host_clone(
+          LayoutTag{}, numTeams, numCols,
+          Kokkos::pair<ValueType, ValueType>{11, 523}, "dataView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the expectation
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, distancesView, intraTeamSentinelView, pivotShift,
+                   apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run std algo and check
+  // -----------------------------------------------
+  // here I can use cloneOfDataViewBeforeOp_h to run std algo on
+  // since that contains a valid copy of the data
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  for (std::size_t i = 0; i < cloneOfDataViewBeforeOp_h.extent(0); ++i) {
+    auto myRow = Kokkos::subview(cloneOfDataViewBeforeOp_h, i, Kokkos::ALL());
+    auto pivot = KE::begin(myRow) + pivotShift;
+
+    auto it = std::rotate(KE::begin(myRow), pivot, KE::end(myRow));
+    const std::size_t stdDistance = KE::distance(KE::begin(myRow), it);
+    ASSERT_EQ(stdDistance, distancesView_h(i));
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+
+  auto dataViewAfterOp_h = create_host_space_copy(dataView);
+  expect_equal_host_views(cloneOfDataViewBeforeOp_h, dataViewAfterOp_h);
+}
+
+template <class ValueType>
+struct UnifDist;
+
+template <>
+struct UnifDist<int> {
+  using dist_type = std::uniform_int_distribution<int>;
+  std::mt19937 m_gen;
+  dist_type m_dist;
+
+  UnifDist(int b, std::size_t seedIn) : m_dist(0, b) { m_gen.seed(seedIn); }
+  int operator()() { return m_dist(m_gen); }
+};
+
+template <class Tag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1153}) {
+      // given numTeams, numCols, randomly pick a few pivots to test
+      constexpr int numPivotsToTest = 5;
+      UnifDist<int> pivotsProducer(numCols, 3123377);
+      for (int k = 0; k < numPivotsToTest; ++k) {
+        const auto pivotIndex = pivotsProducer();
+        // test all apis
+        for (int apiId : {0, 1}) {
+          test_A<Tag, ValueType>(numTeams, numCols, pivotIndex, apiId);
+        }
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_rotate_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, int>();
+}
+
+}  // namespace TeamRotate
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRotateCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRotateCopy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e865b998f600321f775b042b981fcec6275b49bb
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRotateCopy.cpp
@@ -0,0 +1,188 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+#include <algorithm>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamRotateCopy {
+
+namespace KE = Kokkos::Experimental;
+
+template <class SourceViewType, class DestViewType, class DistancesViewType,
+          class IntraTeamSentinelView>
+struct TestFunctorA {
+  SourceViewType m_sourceView;
+  DestViewType m_destView;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  std::size_t m_pivotShift;
+  int m_apiPick;
+
+  TestFunctorA(const SourceViewType sourceView, const DestViewType destView,
+               const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView,
+               std::size_t pivotShift, int apiPick)
+      : m_sourceView(sourceView),
+        m_destView(destView),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_pivotShift(pivotShift),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowViewFrom =
+        Kokkos::subview(m_sourceView, myRowIndex, Kokkos::ALL());
+    auto myRowViewDest = Kokkos::subview(m_destView, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist = 0;
+
+    if (m_apiPick == 0) {
+      auto pivot = KE::cbegin(myRowViewFrom) + m_pivotShift;
+      auto it =
+          KE::rotate_copy(member, KE::cbegin(myRowViewFrom), pivot,
+                          KE::cend(myRowViewFrom), KE::begin(myRowViewDest));
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    } else if (m_apiPick == 1) {
+      auto it =
+          KE::rotate_copy(member, myRowViewFrom, m_pivotShift, myRowViewDest);
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, std::size_t pivotShift,
+            int apiId) {
+  /* description:
+     randomly fill a rank-2 view and for a given pivot,
+     do a team-level KE::rotateCopy
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range
+  auto [sourceView, cloneOfSourceViewBeforeOp_h] =
+      create_random_view_and_host_clone(
+          LayoutTag{}, numTeams, numCols,
+          Kokkos::pair<ValueType, ValueType>{11, 523}, "sourceView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // create the destination view
+  Kokkos::View<ValueType**> destView("destView", numTeams, numCols);
+
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the expectation
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView,
+                   pivotShift, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run std algo and check
+  // -----------------------------------------------
+  Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestView("stdDestView",
+                                                           numTeams, numCols);
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+
+  for (std::size_t i = 0; i < cloneOfSourceViewBeforeOp_h.extent(0); ++i) {
+    auto myRowFrom =
+        Kokkos::subview(cloneOfSourceViewBeforeOp_h, i, Kokkos::ALL());
+    auto myRowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL());
+
+    auto pivot = KE::cbegin(myRowFrom) + pivotShift;
+    auto it    = std::rotate_copy(KE::cbegin(myRowFrom), pivot,
+                               KE::cend(myRowFrom), KE::begin(myRowDest));
+    const std::size_t stdDistance = KE::distance(KE::begin(myRowDest), it);
+    ASSERT_EQ(stdDistance, distancesView_h(i));
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+
+  auto destViewAfterOp_h = create_host_space_copy(destView);
+  expect_equal_host_views(stdDestView, destViewAfterOp_h);
+}
+
+template <class ValueType>
+struct UnifDist;
+
+template <>
+struct UnifDist<int> {
+  using dist_type = std::uniform_int_distribution<int>;
+  std::mt19937 m_gen;
+  dist_type m_dist;
+
+  UnifDist(int b, std::size_t seedIn) : m_dist(0, b) { m_gen.seed(seedIn); }
+  int operator()() { return m_dist(m_gen); }
+};
+
+template <class Tag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1153}) {
+      // given numTeams, numCols, randomly pick a few pivots to test
+      constexpr int numPivotsToTest = 5;
+      UnifDist<int> pivotsProducer(numCols, 3123377);
+      for (int k = 0; k < numPivotsToTest; ++k) {
+        const auto pivotIndex = pivotsProducer();
+        // test all apis
+        for (int apiId : {0, 1}) {
+          test_A<Tag, ValueType>(numTeams, numCols, pivotIndex, apiId);
+        }
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_rotate_copy_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, int>();
+}
+
+}  // namespace TeamRotateCopy
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSearch.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSearch.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..039db4095df3e44c519152b95f79647af106749f
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSearch.cpp
@@ -0,0 +1,279 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamSearch {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct EqualFunctor {
+  KOKKOS_INLINE_FUNCTION
+  bool operator()(const ValueType& lhs, const ValueType& rhs) const {
+    return lhs == rhs;
+  }
+};
+
+template <class DataViewType, class SearchedSequencesViewType,
+          class DistancesViewType, class IntraTeamSentinelView,
+          class BinaryPredType>
+struct TestFunctorA {
+  DataViewType m_dataView;
+  SearchedSequencesViewType m_searchedSequencesView;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  BinaryPredType m_binaryPred;
+  int m_apiPick;
+
+  TestFunctorA(const DataViewType dataView,
+               const SearchedSequencesViewType searchedSequencesView,
+               const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView,
+               BinaryPredType binaryPred, int apiPick)
+      : m_dataView(dataView),
+        m_searchedSequencesView(searchedSequencesView),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_binaryPred(binaryPred),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowViewFrom = Kokkos::subview(m_dataView, myRowIndex, Kokkos::ALL());
+    auto myRowSearchedSeqView =
+        Kokkos::subview(m_searchedSequencesView, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist = 0;
+
+    switch (m_apiPick) {
+      case 0: {
+        const auto it = KE::search(
+            member, KE::cbegin(myRowViewFrom), KE::cend(myRowViewFrom),
+            KE::cbegin(myRowSearchedSeqView), KE::cend(myRowSearchedSeqView));
+        resultDist = KE::distance(KE::cbegin(myRowViewFrom), it);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_distancesView(myRowIndex) = resultDist;
+        });
+
+        break;
+      }
+
+      case 1: {
+        const auto it = KE::search(member, myRowViewFrom, myRowSearchedSeqView);
+        resultDist    = KE::distance(KE::begin(myRowViewFrom), it);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_distancesView(myRowIndex) = resultDist;
+        });
+
+        break;
+      }
+
+      case 2: {
+        const auto it = KE::search(
+            member, KE::cbegin(myRowViewFrom), KE::cend(myRowViewFrom),
+            KE::cbegin(myRowSearchedSeqView), KE::cend(myRowSearchedSeqView),
+            m_binaryPred);
+        resultDist = KE::distance(KE::cbegin(myRowViewFrom), it);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_distancesView(myRowIndex) = resultDist;
+        });
+
+        break;
+      }
+
+      case 3: {
+        const auto it = KE::search(member, myRowViewFrom, myRowSearchedSeqView,
+                                   m_binaryPred);
+        resultDist    = KE::distance(KE::begin(myRowViewFrom), it);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_distancesView(myRowIndex) = resultDist;
+        });
+
+        break;
+      }
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(const bool sequencesExist, std::size_t numTeams,
+            std::size_t numCols, int apiId) {
+  /* description:
+     use a rank-2 view randomly filled with values,
+     and run a team-level search
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range.
+  constexpr ValueType lowerBound = 5;
+  constexpr ValueType upperBound = 523;
+  const auto bounds              = make_bounds(lowerBound, upperBound);
+
+  auto [dataView, dataViewBeforeOp_h] = create_random_view_and_host_clone(
+      LayoutTag{}, numTeams, numCols, bounds, "dataView");
+
+  // create a view that stores a sequence to found in dataView. If
+  // sequencesExist == true it is filled base on dataView content, to allow
+  // search to actually find anything. If sequencesExist == false it is filled
+  // with random values greater than upperBound
+  const std::size_t halfCols = (numCols > 1) ? ((numCols + 1) / 2) : (1);
+  const std::size_t seqSize  = (numCols > 1) ? (std::log2(numCols)) : (1);
+
+  Kokkos::View<ValueType**> searchedSequencesView("searchedSequencesView",
+                                                  numTeams, seqSize);
+  auto searchedSequencesView_h = create_host_space_copy(searchedSequencesView);
+
+  if (sequencesExist) {
+    const std::size_t dataBegin = halfCols - seqSize;
+    for (std::size_t i = 0; i < searchedSequencesView_h.extent(0); ++i) {
+      for (std::size_t js = 0, jd = dataBegin; js < seqSize; ++js, ++jd) {
+        searchedSequencesView_h(i, js) = dataViewBeforeOp_h(i, jd);
+      }
+    }
+  } else {
+    using rand_pool =
+        Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace>;
+    rand_pool pool(lowerBound * upperBound);
+    Kokkos::fill_random(searchedSequencesView_h, pool, upperBound,
+                        upperBound * 2);
+  }
+
+  Kokkos::deep_copy(searchedSequencesView, searchedSequencesView_h);
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // search returns an iterator so to verify that it is correct each team stores
+  // the distance of the returned iterator from the beginning of the interval
+  // that team operates on and then we check that these distances match the std
+  // result
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  EqualFunctor<ValueType> binaryPred;
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, searchedSequencesView, distancesView,
+                   intraTeamSentinelView, binaryPred, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run cpp-std kernel and check
+  // -----------------------------------------------
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+
+  for (std::size_t i = 0; i < dataView.extent(0); ++i) {
+    auto rowFrom = Kokkos::subview(dataViewBeforeOp_h, i, Kokkos::ALL());
+    auto rowSearchedSeq =
+        Kokkos::subview(searchedSequencesView_h, i, Kokkos::ALL());
+
+    const auto rowFromBegin     = KE::cbegin(rowFrom);
+    const auto rowFromEnd       = KE::cend(rowFrom);
+    const auto rowSearchedBegin = KE::cbegin(rowSearchedSeq);
+    const auto rowSearchedEnd   = KE::cend(rowSearchedSeq);
+
+    const std::size_t beginEndDistance = KE::distance(rowFromBegin, rowFromEnd);
+
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+    switch (apiId) {
+      case 0:
+      case 1: {
+        const auto it = std::search(rowFromBegin, rowFromEnd, rowSearchedBegin,
+                                    rowSearchedEnd);
+        const std::size_t stdDistance = KE::distance(rowFromBegin, it);
+
+        if (sequencesExist) {
+          EXPECT_LT(distancesView_h(i), beginEndDistance);
+        } else {
+          ASSERT_EQ(distancesView_h(i), beginEndDistance);
+        }
+
+        ASSERT_EQ(stdDistance, distancesView_h(i));
+
+        break;
+      }
+
+      case 2:
+      case 3: {
+        const auto it = std::search(rowFromBegin, rowFromEnd, rowSearchedBegin,
+                                    rowSearchedEnd, binaryPred);
+        const std::size_t stdDistance = KE::distance(rowFromBegin, it);
+
+        if (sequencesExist) {
+          EXPECT_LT(distancesView_h(i), beginEndDistance);
+        } else {
+          ASSERT_EQ(distancesView_h(i), beginEndDistance);
+        }
+
+        ASSERT_EQ(stdDistance, distancesView_h(i));
+
+        break;
+      }
+    }
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios(const bool sequencesExist) {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {1, 2, 13, 101, 1444, 8153}) {
+      for (int apiId : {0, 1, 2, 3}) {
+        test_A<LayoutTag, ValueType>(sequencesExist, numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_search_team_test, sequences_exist) {
+  constexpr bool sequencesExist = true;
+
+  run_all_scenarios<DynamicTag, double>(sequencesExist);
+  run_all_scenarios<StridedTwoRowsTag, int>(sequencesExist);
+  run_all_scenarios<StridedThreeRowsTag, unsigned>(sequencesExist);
+}
+
+TEST(std_algorithms_search_team_test, sequences_do_not_exist) {
+  constexpr bool sequencesExist = false;
+
+  run_all_scenarios<DynamicTag, double>(sequencesExist);
+  run_all_scenarios<StridedTwoRowsTag, int>(sequencesExist);
+  run_all_scenarios<StridedThreeRowsTag, unsigned>(sequencesExist);
+}
+
+}  // namespace TeamSearch
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSearchN.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSearchN.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..25cd1471e0206a8aa3d0dee7492d6fb8e0712b04
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSearchN.cpp
@@ -0,0 +1,295 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamSearchN {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct EqualFunctor {
+  KOKKOS_INLINE_FUNCTION
+  bool operator()(const ValueType& lhs, const ValueType& rhs) const {
+    return lhs == rhs;
+  }
+};
+
+template <class DataViewType, class SearchedValuesViewType,
+          class DistancesViewType, class IntraTeamSentinelView,
+          class BinaryPredType>
+struct TestFunctorA {
+  DataViewType m_dataView;
+  std::size_t m_seqSize;
+  SearchedValuesViewType m_searchedValuesView;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  BinaryPredType m_binaryPred;
+  int m_apiPick;
+
+  TestFunctorA(const DataViewType dataView, std::size_t seqSize,
+               const SearchedValuesViewType searchedValuesView,
+               const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView,
+               BinaryPredType binaryPred, int apiPick)
+      : m_dataView(dataView),
+        m_seqSize(seqSize),
+        m_searchedValuesView(searchedValuesView),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_binaryPred(binaryPred),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowViewFrom = Kokkos::subview(m_dataView, myRowIndex, Kokkos::ALL());
+    auto rowFromBegin  = KE::begin(myRowViewFrom);
+    auto rowFromEnd    = KE::end(myRowViewFrom);
+    const auto searchedValue = m_searchedValuesView(myRowIndex);
+    ptrdiff_t resultDist     = 0;
+
+    switch (m_apiPick) {
+      case 0: {
+        const auto it = KE::search_n(member, rowFromBegin, rowFromEnd,
+                                     m_seqSize, searchedValue);
+        resultDist    = KE::distance(rowFromBegin, it);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_distancesView(myRowIndex) = resultDist;
+        });
+
+        break;
+      }
+
+      case 1: {
+        const auto it =
+            KE::search_n(member, myRowViewFrom, m_seqSize, searchedValue);
+        resultDist = KE::distance(rowFromBegin, it);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_distancesView(myRowIndex) = resultDist;
+        });
+
+        break;
+      }
+
+      case 2: {
+        const auto it = KE::search_n(member, rowFromBegin, rowFromEnd,
+                                     m_seqSize, searchedValue, m_binaryPred);
+        resultDist    = KE::distance(rowFromBegin, it);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_distancesView(myRowIndex) = resultDist;
+        });
+
+        break;
+      }
+
+      case 3: {
+        const auto it = KE::search_n(member, myRowViewFrom, m_seqSize,
+                                     searchedValue, m_binaryPred);
+        resultDist    = KE::distance(rowFromBegin, it);
+        Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+          m_distancesView(myRowIndex) = resultDist;
+        });
+
+        break;
+      }
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(const bool sequencesExist, std::size_t numTeams,
+            std::size_t numCols, int apiId) {
+  /* description:
+     use a rank-2 view randomly filled with values,
+     and run a team-level search_n
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range.
+  constexpr ValueType lowerBound = 5;
+  constexpr ValueType upperBound = 523;
+  const auto bounds              = make_bounds(lowerBound, upperBound);
+
+  auto [dataView, dataViewBeforeOp_h] = create_random_view_and_host_clone(
+      LayoutTag{}, numTeams, numCols, bounds, "dataView");
+
+  // If sequencesExist == true we need to inject some sequence of count test
+  // value into dataView. If sequencesExist == false we set searchedVal to a
+  // value that is not present in dataView
+
+  const std::size_t halfCols = (numCols > 1) ? ((numCols + 1) / 2) : (1);
+  const std::size_t seqSize  = (numCols > 1) ? (std::log2(numCols)) : (1);
+
+  Kokkos::View<ValueType*> searchedValuesView("searchedValuesView", numTeams);
+  auto searchedValuesView_h = create_host_space_copy(searchedValuesView);
+
+  // dataView might not deep copyable (e.g. strided layout) so to prepare it
+  // correclty, we make a new view that is for sure deep copyable, modify it
+  // on the host, deep copy to device and then launch a kernel to copy to
+  // dataView
+  auto dataView_dc =
+      create_deep_copyable_compatible_view_with_same_extent(dataView);
+  auto dataView_dc_h = create_mirror_view(Kokkos::HostSpace(), dataView_dc);
+
+  if (sequencesExist) {
+    const std::size_t dataBegin = halfCols - seqSize;
+    for (std::size_t i = 0; i < searchedValuesView.extent(0); ++i) {
+      const ValueType searchedVal = dataView_dc_h(i, dataBegin);
+      searchedValuesView_h(i)     = searchedVal;
+
+      for (std::size_t j = dataBegin + 1; j < seqSize; ++j) {
+        dataView_dc_h(i, j) = searchedVal;
+      }
+    }
+
+    // copy to dataView_dc and then to dataView
+    Kokkos::deep_copy(dataView_dc, dataView_dc_h);
+
+    CopyFunctorRank2 cpFun(dataView_dc, dataView);
+    Kokkos::parallel_for("copy", dataView.extent(0) * dataView.extent(1),
+                         cpFun);
+  } else {
+    using rand_pool =
+        Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace>;
+    rand_pool pool(lowerBound * upperBound);
+    Kokkos::fill_random(searchedValuesView_h, pool, upperBound, upperBound * 2);
+  }
+
+  Kokkos::deep_copy(searchedValuesView, searchedValuesView_h);
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // search_n returns an iterator so to verify that it is correct each team
+  // stores the distance of the returned iterator from the beginning of the
+  // interval that team operates on and then we check that these distances match
+  // the std result
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  EqualFunctor<ValueType> binaryPred;
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, seqSize, searchedValuesView, distancesView,
+                   intraTeamSentinelView, binaryPred, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run cpp-std kernel and check
+  // -----------------------------------------------
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+
+  for (std::size_t i = 0; i < dataView.extent(0); ++i) {
+    auto rowFrom = Kokkos::subview(dataView_dc_h, i, Kokkos::ALL());
+
+    const auto rowFromBegin = KE::cbegin(rowFrom);
+    const auto rowFromEnd   = KE::cend(rowFrom);
+
+    const ValueType searchedVal = searchedValuesView_h(i);
+
+    const std::size_t beginEndDist = KE::distance(rowFromBegin, rowFromEnd);
+
+    switch (apiId) {
+      case 0:
+      case 1: {
+        const auto it =
+            std::search_n(rowFromBegin, rowFromEnd, seqSize, searchedVal);
+        const std::size_t stdDistance = KE::distance(rowFromBegin, it);
+
+        if (sequencesExist) {
+          EXPECT_LT(distancesView_h(i), beginEndDist);
+        } else {
+          ASSERT_EQ(distancesView_h(i), beginEndDist);
+        }
+
+        ASSERT_EQ(stdDistance, distancesView_h(i));
+        ASSERT_TRUE(intraTeamSentinelView_h(i));
+        break;
+      }
+
+      case 2:
+      case 3: {
+        const auto it = std::search_n(rowFromBegin, rowFromEnd, seqSize,
+                                      searchedVal, binaryPred);
+        const std::size_t stdDistance = KE::distance(rowFromBegin, it);
+
+        if (sequencesExist) {
+          EXPECT_LT(distancesView_h(i), beginEndDist);
+        } else {
+          ASSERT_EQ(distancesView_h(i), beginEndDist);
+        }
+
+        ASSERT_EQ(stdDistance, distancesView_h(i));
+        ASSERT_TRUE(intraTeamSentinelView_h(i));
+
+        break;
+      }
+    }
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios(const bool sequencesExist) {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {2, 13, 101, 1444, 8153}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(sequencesExist, numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_search_n_team_test, sequences_of_equal_elements_exist) {
+  constexpr bool sequencesExist = true;
+
+  run_all_scenarios<DynamicTag, double>(sequencesExist);
+  run_all_scenarios<StridedTwoRowsTag, int>(sequencesExist);
+  run_all_scenarios<StridedThreeRowsTag, unsigned>(sequencesExist);
+}
+
+TEST(std_algorithms_search_n_team_test,
+     sequences_of_equal_elements_probably_does_not_exist) {
+  constexpr bool sequencesExist = false;
+
+  run_all_scenarios<DynamicTag, double>(sequencesExist);
+  run_all_scenarios<StridedTwoRowsTag, int>(sequencesExist);
+  run_all_scenarios<StridedThreeRowsTag, unsigned>(sequencesExist);
+}
+
+}  // namespace TeamSearchN
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamShiftLeft.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamShiftLeft.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..60edb377d0265c160dd318ecd1a557745b867b84
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamShiftLeft.cpp
@@ -0,0 +1,189 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamShiftLeft {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ViewType, class DistancesViewType, class IntraTeamSentinelView>
+struct TestFunctorA {
+  ViewType m_view;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  std::size_t m_shift;
+  int m_apiPick;
+
+  TestFunctorA(const ViewType view, const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView,
+               std::size_t shift, int apiPick)
+      : m_view(view),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_shift(shift),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowView        = Kokkos::subview(m_view, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist  = 0;
+
+    if (m_apiPick == 0) {
+      auto it = KE::shift_left(member, KE::begin(myRowView), KE::end(myRowView),
+                               m_shift);
+      resultDist = KE::distance(KE::begin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    } else if (m_apiPick == 1) {
+      auto it    = KE::shift_left(member, myRowView, m_shift);
+      resultDist = KE::distance(KE::begin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+// shift_left is only supported starting from C++20,
+// so put here a working version of the std algo copied from
+// https://github.com/llvm/llvm-project/blob/main/libcxx/include/__algorithm/shift_left.h
+template <class ForwardIterator>
+ForwardIterator my_std_shift_left(
+    ForwardIterator first, ForwardIterator last,
+    typename std::iterator_traits<ForwardIterator>::difference_type n) {
+  if (n == 0) {
+    return last;
+  }
+
+  ForwardIterator m = first;
+  for (; n > 0; --n) {
+    if (m == last) {
+      return first;
+    }
+    ++m;
+  }
+  return std::move(m, last, first);
+}
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, std::size_t shift,
+            int apiId) {
+  /* description:
+     randomly fill a rank-2 view and do a team-level KE::shift_left
+     using shift as the shift count.
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range
+  auto [dataView, cloneOfDataViewBeforeOp_h] =
+      create_random_view_and_host_clone(
+          LayoutTag{}, numTeams, numCols,
+          Kokkos::pair<ValueType, ValueType>{11, 523}, "dataView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the expectation
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, distancesView, intraTeamSentinelView, shift,
+                   apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run std algo and check
+  // -----------------------------------------------
+  // here I can use cloneOfDataViewBeforeOp_h to run std algo on
+  // since that contains a valid copy of the data
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  for (std::size_t i = 0; i < cloneOfDataViewBeforeOp_h.extent(0); ++i) {
+    auto myRow = Kokkos::subview(cloneOfDataViewBeforeOp_h, i, Kokkos::ALL());
+    auto it    = my_std_shift_left(KE::begin(myRow), KE::end(myRow), shift);
+    const std::size_t stdDistance = KE::distance(KE::begin(myRow), it);
+    ASSERT_EQ(stdDistance, distancesView_h(i));
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+
+  auto dataViewAfterOp_h = create_host_space_copy(dataView);
+  expect_equal_host_views(cloneOfDataViewBeforeOp_h, dataViewAfterOp_h);
+}
+
+template <class Tag, class ValueType>
+void run_all_scenarios() {
+  // prepare a map where, for a given set of num cols
+  // we provide a list of shifts to use for testing
+  // key = num of columns
+  // value = list of shifts
+  // Note that the cornerCase number is here since the shiftLeft algo
+  // should work even when the shift given is way larger than the range.
+  constexpr std::size_t cornerCase                        = 110111;
+  const std::map<int, std::vector<std::size_t>> scenarios = {
+      {0, {0, cornerCase}},
+      {2, {0, 1, 2, cornerCase}},
+      {6, {0, 1, 2, 5, cornerCase}},
+      {13, {0, 1, 2, 8, 11, cornerCase}},
+      {56, {0, 1, 2, 8, 11, 33, 56, cornerCase}},
+      {123, {0, 1, 11, 33, 56, 89, 112, cornerCase}},
+      {3145, {0, 1, 11, 33, 56, 89, 112, 5677, cornerCase}}};
+
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& scenario : scenarios) {
+      const std::size_t numCols = scenario.first;
+      for (int copyCount : scenario.second) {
+        for (int apiId : {0, 1}) {
+          test_A<Tag, ValueType>(numTeams, numCols, copyCount, apiId);
+        }
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_shift_left_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamShiftLeft
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamShiftRight.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamShiftRight.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..00a80c5ef070e7cdc6b653f43d6887dc775c5d0e
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamShiftRight.cpp
@@ -0,0 +1,187 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamShiftRight {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ViewType, class DistancesViewType, class IntraTeamSentinelView>
+struct TestFunctorA {
+  ViewType m_view;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  std::size_t m_shift;
+  int m_apiPick;
+
+  TestFunctorA(const ViewType view, const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView,
+               std::size_t shift, int apiPick)
+      : m_view(view),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_shift(shift),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowView        = Kokkos::subview(m_view, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist  = 0;
+
+    if (m_apiPick == 0) {
+      auto it    = KE::shift_right(member, KE::begin(myRowView),
+                                KE::end(myRowView), m_shift);
+      resultDist = KE::distance(KE::begin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    } else if (m_apiPick == 1) {
+      auto it    = KE::shift_right(member, myRowView, m_shift);
+      resultDist = KE::distance(KE::begin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+// shift_right is only supported starting from C++20,
+// so put here a working version of the std algo copied from
+// https://github.com/llvm/llvm-project/blob/main/libcxx/include/__algorithm/shift_right.h
+template <class ForwardIterator>
+ForwardIterator my_std_shift_right(
+    ForwardIterator first, ForwardIterator last,
+    typename std::iterator_traits<ForwardIterator>::difference_type n) {
+  if (n == 0) {
+    return first;
+  }
+
+  decltype(n) d = last - first;
+  if (n >= d) {
+    return last;
+  }
+  ForwardIterator m = first + (d - n);
+  return std::move_backward(first, m, last);
+}
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, std::size_t shift,
+            int apiId) {
+  /* description:
+     randomly fill a rank-2 view and do a team-level KE::shift_right
+     using shift as the shift count.
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range
+  auto [dataView, cloneOfDataViewBeforeOp_h] =
+      create_random_view_and_host_clone(
+          LayoutTag{}, numTeams, numCols,
+          Kokkos::pair<ValueType, ValueType>{11, 523}, "dataView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the expectation
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, distancesView, intraTeamSentinelView, shift,
+                   apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run std algo and check
+  // -----------------------------------------------
+  // here I can use cloneOfDataViewBeforeOp_h to run std algo on
+  // since that contains a valid copy of the data
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  for (std::size_t i = 0; i < cloneOfDataViewBeforeOp_h.extent(0); ++i) {
+    auto myRow = Kokkos::subview(cloneOfDataViewBeforeOp_h, i, Kokkos::ALL());
+    auto it    = my_std_shift_right(KE::begin(myRow), KE::end(myRow), shift);
+    const std::size_t stdDistance = KE::distance(KE::begin(myRow), it);
+    ASSERT_EQ(stdDistance, distancesView_h(i));
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+
+  auto dataViewAfterOp_h = create_host_space_copy(dataView);
+  expect_equal_host_views(cloneOfDataViewBeforeOp_h, dataViewAfterOp_h);
+}
+
+template <class Tag, class ValueType>
+void run_all_scenarios() {
+  // prepare a map where, for a given set of num cols
+  // we provide a list of shifts to use for testing
+  // key = num of columns
+  // value = list of shifts
+  // Note that the cornerCase number is here since the shift_right algo
+  // should work even when the shift given is way larger than the range.
+  constexpr std::size_t cornerCase                        = 110111;
+  const std::map<int, std::vector<std::size_t>> scenarios = {
+      {0, {0, cornerCase}},
+      {2, {0, 1, 2, cornerCase}},
+      {6, {0, 1, 2, 5, cornerCase}},
+      {13, {0, 1, 2, 8, 11, cornerCase}},
+      {56, {0, 1, 2, 8, 11, 33, 56, cornerCase}},
+      {123, {0, 1, 11, 33, 56, 89, 112, cornerCase}},
+      {3145, {0, 1, 11, 33, 56, 89, 112, 5677, cornerCase}}};
+
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& scenario : scenarios) {
+      const std::size_t numCols = scenario.first;
+      for (int copyCount : scenario.second) {
+        for (int apiId : {0, 1}) {
+          test_A<Tag, ValueType>(numTeams, numCols, copyCount, apiId);
+        }
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_shift_right_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamShiftRight
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSwapRanges.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSwapRanges.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5fc9612caa7bc76795d4ad45a05d24f00841b875
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSwapRanges.cpp
@@ -0,0 +1,151 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamSwapRanges {
+
+namespace KE = Kokkos::Experimental;
+
+template <class View1Type, class View2Type, class DistancesViewType,
+          class IntraTeamSentinelView>
+struct TestFunctorA {
+  View1Type m_view1;
+  View2Type m_view2;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+
+  TestFunctorA(const View1Type view1, const View2Type view2,
+               const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView, int apiPick)
+      : m_view1(view1),
+        m_view2(view2),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowView1       = Kokkos::subview(m_view1, myRowIndex, Kokkos::ALL());
+    auto myRowView2       = Kokkos::subview(m_view2, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist  = 0;
+
+    if (m_apiPick == 0) {
+      auto it    = KE::swap_ranges(member, KE::begin(myRowView1),
+                                KE::end(myRowView1), KE::begin(myRowView2));
+      resultDist = KE::distance(KE::begin(myRowView2), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    } else if (m_apiPick == 1) {
+      auto it    = KE::swap_ranges(member, myRowView1, myRowView2);
+      resultDist = KE::distance(KE::begin(myRowView2), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     randomly fill two views and do team level swap_ranges
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  auto [dataView1, cloneOfDataView1BeforeOp_h] =
+      create_random_view_and_host_clone(
+          LayoutTag{}, numTeams, numCols,
+          Kokkos::pair<ValueType, ValueType>{11, 523}, "dataView1");
+
+  auto [dataView2, cloneOfDataView2BeforeOp_h] =
+      create_random_view_and_host_clone(
+          LayoutTag{}, numTeams, numCols,
+          Kokkos::pair<ValueType, ValueType>{530, 1523}, "dataView2");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the expectation
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView1, dataView2, distancesView, intraTeamSentinelView,
+                   apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // check
+  // -----------------------------------------------
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto dataView1AfterOp_h      = create_host_space_copy(dataView1);
+  auto dataView2AfterOp_h      = create_host_space_copy(dataView2);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+
+  for (std::size_t i = 0; i < dataView1AfterOp_h.extent(0); ++i) {
+    for (std::size_t j = 0; j < dataView1AfterOp_h.extent(1); ++j) {
+      ASSERT_EQ(cloneOfDataView1BeforeOp_h(i, j), dataView2AfterOp_h(i, j));
+      ASSERT_EQ(cloneOfDataView2BeforeOp_h(i, j), dataView1AfterOp_h(i, j));
+    }
+    // each team should return an iterator past the last column
+    EXPECT_TRUE(distancesView_h(i) == numCols);
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 11113}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_swap_ranges_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamSwapRanges
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformBinaryOp.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformBinaryOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b3557d8afb3893b789a95521c037e69b53c977b4
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformBinaryOp.cpp
@@ -0,0 +1,185 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamTransformBinaryOp {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct AddValuesBinaryOp {
+  KOKKOS_INLINE_FUNCTION
+  ValueType operator()(const ValueType& a, const ValueType& b) const {
+    return a + b;
+  }
+};
+
+template <class SourceView1Type, class SourceView2Type, class DestViewType,
+          class DistancesViewType, class IntraTeamSentinelView>
+struct TestFunctorA {
+  SourceView1Type m_sourceView1;
+  SourceView2Type m_sourceView2;
+  DestViewType m_destView;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+
+  TestFunctorA(const SourceView1Type sourceView1,
+               const SourceView2Type sourceView2, const DestViewType destView,
+               const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView, int apiPick)
+      : m_sourceView1(sourceView1),
+        m_sourceView2(sourceView2),
+        m_destView(destView),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowView1From =
+        Kokkos::subview(m_sourceView1, myRowIndex, Kokkos::ALL());
+    auto myRowView2From =
+        Kokkos::subview(m_sourceView2, myRowIndex, Kokkos::ALL());
+    auto myRowViewDest = Kokkos::subview(m_destView, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist = 0;
+
+    using value_type = typename SourceView1Type::value_type;
+    if (m_apiPick == 0) {
+      auto it = KE::transform(
+          member, KE::cbegin(myRowView1From), KE::cend(myRowView1From),
+          KE::cbegin(myRowView2From), KE::begin(myRowViewDest),
+          AddValuesBinaryOp<value_type>());
+
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    } else if (m_apiPick == 1) {
+      auto it = KE::transform(member, myRowView1From, myRowView2From,
+                              myRowViewDest, AddValuesBinaryOp<value_type>());
+
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     team level transform with each team handling a row of
+     two rank-2 source views and applying a binary op that
+     add each pair of element from those two views
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range
+  auto [sourceView1, cloneOfSourceView1BeforeOp_h] =
+      create_random_view_and_host_clone(
+          LayoutTag{}, numTeams, numCols,
+          Kokkos::pair<ValueType, ValueType>{0, 523}, "sourceView1",
+          317539 /*random seed*/);
+  auto [sourceView2, cloneOfSourceView2BeforeOp_h] =
+      create_random_view_and_host_clone(
+          LayoutTag{}, numTeams, numCols,
+          Kokkos::pair<ValueType, ValueType>{0, 523}, "sourceView2",
+          957313 /*random seed*/);
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+  // create the destination view
+  Kokkos::View<ValueType**> destView("destView", numTeams, numCols);
+  // make a host copy of the dest view that we can check below
+  // to be all zeros
+  auto destViewBeforeOp_h = create_host_space_copy(destView);
+
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the expectation
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(sourceView1, sourceView2, destView, distancesView,
+                   intraTeamSentinelView, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // check
+  // -----------------------------------------------
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto destViewAfterOp_h       = create_host_space_copy(destView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  for (std::size_t i = 0; i < destViewAfterOp_h.extent(0); ++i) {
+    for (std::size_t j = 0; j < destViewAfterOp_h.extent(1); ++j) {
+      // elements in dest view should be the sum of source elements
+      ASSERT_DOUBLE_EQ(destViewAfterOp_h(i, j),
+                       cloneOfSourceView1BeforeOp_h(i, j) +
+                           cloneOfSourceView2BeforeOp_h(i, j));
+    }
+
+    // each team should return an iterator whose distance from the
+    // beginning of the row equals the num of columns since
+    // each team transforms all elements in each row
+    EXPECT_TRUE(distancesView_h(i) == numCols);
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 11113}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_transform_team_test, test_binary_op) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamTransformBinaryOp
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9f30812d8ef03cf40fc56d9f1041b2777f86e726
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp
@@ -0,0 +1,228 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+#if not defined KOKKOS_ENABLE_OPENMPTARGET
+
+namespace Test {
+namespace stdalgos {
+namespace TeamTransformExclusiveScan {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct PlusFunctor {
+  KOKKOS_INLINE_FUNCTION
+  ValueType operator()(const ValueType& lhs, const ValueType& rhs) const {
+    return lhs + rhs;
+  }
+};
+
+template <class ValueType>
+struct MultipliesByTwoFunctor {
+  KOKKOS_INLINE_FUNCTION
+  ValueType operator()(const ValueType& value) const { return value * 2; }
+};
+
+template <class SourceViewType, class DestViewType, class DistancesViewType,
+          class IntraTeamSentinelView, class InitValuesViewType,
+          class BinaryOpType, class UnaryOpType>
+struct TestFunctorA {
+  SourceViewType m_sourceView;
+  DestViewType m_destView;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  InitValuesViewType m_initValuesView;
+  BinaryOpType m_binaryOp;
+  UnaryOpType m_unaryOp;
+  int m_apiPick;
+
+  TestFunctorA(const SourceViewType sourceView, const DestViewType destView,
+               const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView,
+               const InitValuesViewType initValuesView, BinaryOpType binaryOp,
+               UnaryOpType unaryOp, int apiPick)
+      : m_sourceView(sourceView),
+        m_destView(destView),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_initValuesView(initValuesView),
+        m_binaryOp(binaryOp),
+        m_unaryOp(unaryOp),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto rowIndex = member.league_rank();
+
+    auto rowViewSrc    = Kokkos::subview(m_sourceView, rowIndex, Kokkos::ALL());
+    auto rowViewDest   = Kokkos::subview(m_destView, rowIndex, Kokkos::ALL());
+    const auto initVal = m_initValuesView(rowIndex);
+    ptrdiff_t resultDist = 0;
+
+    switch (m_apiPick) {
+      case 0: {
+        auto it = KE::transform_exclusive_scan(
+            member, KE::cbegin(rowViewSrc), KE::cend(rowViewSrc),
+            KE::begin(rowViewDest), initVal, m_binaryOp, m_unaryOp);
+        resultDist = KE::distance(KE::begin(rowViewDest), it);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this] { m_distancesView(rowIndex) = resultDist; });
+
+        break;
+      }
+
+      case 1: {
+        auto it = KE::transform_exclusive_scan(member, rowViewSrc, rowViewDest,
+                                               initVal, m_binaryOp, m_unaryOp);
+        resultDist = KE::distance(KE::begin(rowViewDest), it);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this] { m_distancesView(rowIndex) = resultDist; });
+
+        break;
+      }
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(rowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(rowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     use a rank-2 view randomly filled with values,
+     and run a team-level transform_exclusive_scan
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range.
+  constexpr ValueType lowerBound = 5;
+  constexpr ValueType upperBound = 523;
+  const auto bounds              = make_bounds(lowerBound, upperBound);
+
+  auto [sourceView, sourceViewBeforeOp_h] = create_random_view_and_host_clone(
+      LayoutTag{}, numTeams, numCols, bounds, "sourceView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // create the destination view
+  Kokkos::View<ValueType**> destView("destView", numTeams, numCols);
+
+  // tranform_exclusive_scan returns an iterator so to verify that it is correct
+  // each team stores the distance of the returned iterator from the beginning
+  // of the interval that team operates on and then we check that these
+  // distances match the std result
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  PlusFunctor<ValueType> binaryOp;
+  MultipliesByTwoFunctor<ValueType> unaryOp;
+
+  // Create view of reduce init values to be used by test cases
+  Kokkos::View<ValueType*, Kokkos::DefaultHostExecutionSpace> initValuesView_h(
+      "initValuesView_h", numTeams);
+  using rand_pool =
+      Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace>;
+  rand_pool pool(lowerBound * upperBound);
+  Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound);
+
+  // use CTAD for functor
+  auto initValuesView =
+      Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h);
+  TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView,
+                   initValuesView, binaryOp, unaryOp, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run cpp-std kernel and check
+  // -----------------------------------------------
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestView("stdDestView",
+                                                           numTeams, numCols);
+
+  for (std::size_t i = 0; i < sourceView.extent(0); ++i) {
+    auto rowFrom   = Kokkos::subview(sourceViewBeforeOp_h, i, Kokkos::ALL());
+    auto rowDest   = Kokkos::subview(stdDestView, i, Kokkos::ALL());
+    auto initValue = initValuesView_h(i);
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+
+// libstdc++ as provided by GCC 8 does not have transform_exclusive_scan and
+// for GCC 9.1, 9.2 fails to compile for missing overload not accepting policy
+#if defined(_GLIBCXX_RELEASE) && (_GLIBCXX_RELEASE <= 9)
+#define transform_exclusive_scan testing_transform_exclusive_scan
+#else
+#define transform_exclusive_scan std::transform_exclusive_scan
+#endif
+
+    switch (apiId) {
+      case 0:
+      case 1: {
+        auto it = transform_exclusive_scan(
+            KE::cbegin(rowFrom), KE::cend(rowFrom), KE::begin(rowDest),
+            initValue, binaryOp, unaryOp);
+        const std::size_t stdDistance = KE::distance(KE::begin(rowDest), it);
+        ASSERT_EQ(stdDistance, distancesView_h(i));
+        break;
+      }
+    }
+
+#undef transform_exclusive_scan
+  }
+
+  auto dataViewAfterOp_h = create_host_space_copy(destView);
+  expect_equal_host_views(stdDestView, dataViewAfterOp_h);
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_transform_exclusive_scan_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamTransformExclusiveScan
+}  // namespace stdalgos
+}  // namespace Test
+
+#endif
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4b3166023267927db16903e022cdf5f02dd54a3f
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp
@@ -0,0 +1,264 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+#if not defined KOKKOS_ENABLE_OPENMPTARGET
+
+namespace Test {
+namespace stdalgos {
+namespace TeamTransformInclusiveScan {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct PlusFunctor {
+  KOKKOS_INLINE_FUNCTION
+  ValueType operator()(const ValueType& lhs, const ValueType& rhs) const {
+    return lhs + rhs;
+  }
+};
+
+template <class ValueType>
+struct MultipliesByTwoFunctor {
+  KOKKOS_INLINE_FUNCTION
+  ValueType operator()(const ValueType& value) const { return value * 2; }
+};
+
+template <class SourceViewType, class DestViewType, class DistancesViewType,
+          class IntraTeamSentinelView, class InitValuesViewType,
+          class BinaryOpType, class UnaryOpType>
+struct TestFunctorA {
+  SourceViewType m_sourceView;
+  DestViewType m_destView;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  InitValuesViewType m_initValuesView;
+  BinaryOpType m_binaryOp;
+  UnaryOpType m_unaryOp;
+  int m_apiPick;
+
+  TestFunctorA(const SourceViewType sourceView, const DestViewType destView,
+               const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView,
+               const InitValuesViewType initValuesView, BinaryOpType binaryOp,
+               UnaryOpType unaryOp, int apiPick)
+      : m_sourceView(sourceView),
+        m_destView(destView),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_initValuesView(initValuesView),
+        m_binaryOp(binaryOp),
+        m_unaryOp(unaryOp),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto rowIndex = member.league_rank();
+
+    auto srcRow      = Kokkos::subview(m_sourceView, rowIndex, Kokkos::ALL());
+    const auto first = KE::cbegin(srcRow);
+    const auto last  = KE::cend(srcRow);
+    auto destRow     = Kokkos::subview(m_destView, rowIndex, Kokkos::ALL());
+    auto firstDest   = KE::begin(destRow);
+
+    const auto initVal   = m_initValuesView(rowIndex);
+    ptrdiff_t resultDist = 0;
+
+    switch (m_apiPick) {
+      case 0: {
+        auto it = KE::transform_inclusive_scan(member, first, last, firstDest,
+                                               m_binaryOp, m_unaryOp);
+        resultDist = KE::distance(firstDest, it);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this] { m_distancesView(rowIndex) = resultDist; });
+
+        break;
+      }
+
+      case 1: {
+        auto it    = KE::transform_inclusive_scan(member, srcRow, destRow,
+                                               m_binaryOp, m_unaryOp);
+        resultDist = KE::distance(firstDest, it);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this] { m_distancesView(rowIndex) = resultDist; });
+
+        break;
+      }
+
+      case 2: {
+        auto it = KE::transform_inclusive_scan(member, first, last, firstDest,
+                                               m_binaryOp, m_unaryOp, initVal);
+        resultDist = KE::distance(firstDest, it);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this] { m_distancesView(rowIndex) = resultDist; });
+
+        break;
+      }
+
+      case 3: {
+        auto it    = KE::transform_inclusive_scan(member, srcRow, destRow,
+                                               m_binaryOp, m_unaryOp, initVal);
+        resultDist = KE::distance(firstDest, it);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this] { m_distancesView(rowIndex) = resultDist; });
+
+        break;
+      }
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(rowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(rowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     use a rank-2 view randomly filled with values,
+     and run a team-level transform_inclusive_scan
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range.
+  constexpr ValueType lowerBound = 5;
+  constexpr ValueType upperBound = 523;
+  const auto bounds              = make_bounds(lowerBound, upperBound);
+
+  auto [sourceView, sourceViewBeforeOp_h] = create_random_view_and_host_clone(
+      LayoutTag{}, numTeams, numCols, bounds, "sourceView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // create the destination view
+  Kokkos::View<ValueType**> destView("destView", numTeams, numCols);
+
+  // tranform_inclusive_scan returns an iterator so to verify that it is correct
+  // each team stores the distance of the returned iterator from the beginning
+  // of the interval that team operates on and then we check that these
+  // distances match the std result
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  PlusFunctor<ValueType> binaryOp;
+  MultipliesByTwoFunctor<ValueType> unaryOp;
+
+  // Create view of reduce init values to be used by test cases
+  Kokkos::View<ValueType*, Kokkos::DefaultHostExecutionSpace> initValuesView_h(
+      "initValuesView_h", numTeams);
+  using rand_pool =
+      Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace>;
+  rand_pool pool(lowerBound * upperBound);
+  Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound);
+
+  // use CTAD for functor
+  auto initValuesView =
+      Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h);
+  TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView,
+                   initValuesView, binaryOp, unaryOp, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run cpp-std kernel and check
+  // -----------------------------------------------
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestView("stdDestView",
+                                                           numTeams, numCols);
+
+  for (std::size_t i = 0; i < sourceView.extent(0); ++i) {
+    auto srcRow    = Kokkos::subview(sourceViewBeforeOp_h, i, Kokkos::ALL());
+    auto first     = KE::cbegin(srcRow);
+    auto last      = KE::cend(srcRow);
+    auto destRow   = Kokkos::subview(stdDestView, i, Kokkos::ALL());
+    auto firstDest = KE::begin(destRow);
+    auto initValue = initValuesView_h(i);
+
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+
+// libstdc++ as provided by GCC 8 does not have transform_inclusive_scan and
+// for GCC 9.1, 9.2 fails to compile for missing overload not accepting policy
+#if defined(_GLIBCXX_RELEASE) && (_GLIBCXX_RELEASE <= 9)
+#define transform_inclusive_scan testing_transform_inclusive_scan
+#else
+#define transform_inclusive_scan std::transform_inclusive_scan
+#endif
+
+    switch (apiId) {
+      case 0:
+      case 1: {
+        const auto it =
+            transform_inclusive_scan(first, last, firstDest, binaryOp, unaryOp);
+        const std::size_t stdDistance = KE::distance(firstDest, it);
+        ASSERT_EQ(stdDistance, distancesView_h(i));
+
+        break;
+      }
+
+      case 2:
+      case 3: {
+        const auto it = transform_inclusive_scan(first, last, firstDest,
+                                                 binaryOp, unaryOp, initValue);
+        const std::size_t stdDistance = KE::distance(firstDest, it);
+        ASSERT_EQ(stdDistance, distancesView_h(i));
+
+        break;
+      }
+    }
+  }
+#undef transform_inclusive_scan
+
+  auto dataViewAfterOp_h = create_host_space_copy(destView);
+  expect_equal_host_views(stdDestView, dataViewAfterOp_h);
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) {
+      for (int apiId : {0, 1, 2, 3}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_transform_inclusive_scan_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamTransformInclusiveScan
+}  // namespace stdalgos
+}  // namespace Test
+
+#endif
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformReduce.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformReduce.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b0a3241ec4bf84685987c76e088daa67691f2f31
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformReduce.cpp
@@ -0,0 +1,323 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+#if not defined KOKKOS_ENABLE_OPENMPTARGET
+
+namespace Test {
+namespace stdalgos {
+namespace TeamTransformReduce {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct PlusFunctor {
+  KOKKOS_INLINE_FUNCTION
+  ValueType operator()(const ValueType& lhs, const ValueType& rhs) const {
+    return lhs + rhs;
+  }
+};
+
+template <class ValueType>
+struct MultipliesFunctor {
+  KOKKOS_INLINE_FUNCTION
+  ValueType operator()(const ValueType& lhs, const ValueType& rhs) const {
+    return lhs * rhs;
+  }
+};
+
+template <class ValueType>
+struct PlusOneFunctor {
+  KOKKOS_INLINE_FUNCTION
+  ValueType operator()(const ValueType& val) const { return val + 1; };
+};
+
+template <class FirstDataViewType, class SecondDataViewType,
+          class InitValuesViewType, class ResultsViewType,
+          class IntraTeamSentinelView, class BinaryJoinerType,
+          class BinaryTransformType, class UnaryTransformType>
+struct TestFunctorA {
+  FirstDataViewType m_firstDataView;
+  SecondDataViewType m_secondDataView;
+  InitValuesViewType m_initValuesView;
+  ResultsViewType m_resultsView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  BinaryJoinerType m_binaryJoiner;
+  BinaryTransformType m_binaryTransform;
+  UnaryTransformType m_unaryTransform;
+  int m_apiPick;
+
+  TestFunctorA(const FirstDataViewType firstDataView,
+               const SecondDataViewType secondDataview,
+               const InitValuesViewType initValuesView,
+               const ResultsViewType resultsView,
+               const IntraTeamSentinelView intraTeamSentinelView,
+               BinaryJoinerType binaryJoiner,
+               BinaryTransformType binaryTransform,
+               UnaryTransformType unaryTransform, int apiPick)
+      : m_firstDataView(firstDataView),
+        m_secondDataView(secondDataview),
+        m_initValuesView(initValuesView),
+        m_resultsView(resultsView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_binaryJoiner(binaryJoiner),
+        m_binaryTransform(binaryTransform),
+        m_unaryTransform(unaryTransform),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const int rowIndex = member.league_rank();
+
+    auto firstDataRow =
+        Kokkos::subview(m_firstDataView, rowIndex, Kokkos::ALL());
+    auto firstDataRowBegin = KE::cbegin(firstDataRow);
+    auto firstDataRowEnd   = KE::cend(firstDataRow);
+
+    auto secondDataRow =
+        Kokkos::subview(m_secondDataView, rowIndex, Kokkos::ALL());
+    auto secondDataRowBegin = KE::cbegin(secondDataRow);
+
+    const auto initVal = m_initValuesView(rowIndex);
+    typename InitValuesViewType::non_const_value_type result = 0;
+
+    switch (m_apiPick) {
+      case 0: {
+        result =
+            KE::transform_reduce(member, firstDataRowBegin, firstDataRowEnd,
+                                 secondDataRowBegin, initVal);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this]() { m_resultsView(rowIndex) = result; });
+        break;
+      }
+
+      case 1: {
+        result =
+            KE::transform_reduce(member, firstDataRow, secondDataRow, initVal);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this]() { m_resultsView(rowIndex) = result; });
+        break;
+      }
+
+      case 2: {
+        result = KE::transform_reduce(
+            member, firstDataRowBegin, firstDataRowEnd, secondDataRowBegin,
+            initVal, m_binaryJoiner, m_binaryTransform);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this]() { m_resultsView(rowIndex) = result; });
+        break;
+      }
+
+      case 3: {
+        result =
+            KE::transform_reduce(member, firstDataRow, secondDataRow, initVal,
+                                 m_binaryJoiner, m_binaryTransform);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this]() { m_resultsView(rowIndex) = result; });
+        break;
+      }
+
+      case 4: {
+        result =
+            KE::transform_reduce(member, firstDataRowBegin, firstDataRowEnd,
+                                 initVal, m_binaryJoiner, m_unaryTransform);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this]() { m_resultsView(rowIndex) = result; });
+        break;
+      }
+
+      case 5: {
+        result = KE::transform_reduce(member, firstDataRow, initVal,
+                                      m_binaryJoiner, m_unaryTransform);
+        Kokkos::single(Kokkos::PerTeam(member),
+                       [=, *this]() { m_resultsView(rowIndex) = result; });
+        break;
+      }
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, result, m_resultsView(rowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(rowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     use a rank-2 view randomly filled with values,
+     and run a team-level transform_reduce
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range.
+  constexpr ValueType lowerBound = 5;
+  constexpr ValueType upperBound = 523;
+  const auto bounds              = make_bounds(lowerBound, upperBound);
+
+  auto [firstDataView, firstDataViewBeforeOp_h] =
+      create_random_view_and_host_clone(LayoutTag{}, numTeams, numCols, bounds,
+                                        "firstDataView");
+  auto [secondDataView, secondDataViewBeforeOp_h] =
+      create_random_view_and_host_clone(LayoutTag{}, numTeams, numCols, bounds,
+                                        "secondDataView");
+
+  // Create view of init values to be used by test cases
+  Kokkos::View<ValueType*, Kokkos::DefaultHostExecutionSpace> initValuesView_h(
+      "initValuesView_h", numTeams);
+  using rand_pool =
+      Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace>;
+  rand_pool pool(lowerBound * upperBound);
+  Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound);
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // to verify that things work, each team stores the result of its
+  // transform_reduce call, and then we check that these match what we expect
+  Kokkos::View<ValueType*> resultsView("resultsView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  PlusFunctor<ValueType> binaryJoiner;
+  MultipliesFunctor<ValueType> binaryTransform;
+  PlusOneFunctor<ValueType> unaryTransform;
+
+  // use CTAD for functor
+  auto initValuesView =
+      Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h);
+  TestFunctorA fnc(firstDataView, secondDataView, initValuesView, resultsView,
+                   intraTeamSentinelView, binaryJoiner, binaryTransform,
+                   unaryTransform, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run cpp-std kernel and check
+  // -----------------------------------------------
+
+  auto resultsView_h           = create_host_space_copy(resultsView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+
+  for (std::size_t i = 0; i < firstDataView.extent(0); ++i) {
+    auto firstDataRow =
+        Kokkos::subview(firstDataViewBeforeOp_h, i, Kokkos::ALL());
+
+    const auto firstDataRowBegin = KE::cbegin(firstDataRow);
+    const auto firstDataRowEnd   = KE::cend(firstDataRow);
+
+    auto secondDataRow =
+        Kokkos::subview(secondDataViewBeforeOp_h, i, Kokkos::ALL());
+
+    const auto secondDataRowBegin = KE::cbegin(secondDataRow);
+
+    const auto initVal = initValuesView_h(i);
+
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+
+// libstdc++ as provided by GCC 8 does not have reduce, transform_reduce,
+// exclusive_scan, inclusive_scan, transform_exclusive_scan,
+// transform_inclusive_scan and for GCC 9.1, 9.2 fails to compile them for
+// missing overload not accepting policy
+#if defined(_GLIBCXX_RELEASE) && (_GLIBCXX_RELEASE <= 9)
+#define transform_reduce testing_transform_reduce
+#else
+#define transform_reduce std::transform_reduce
+#endif
+
+    switch (apiId) {
+      case 0:
+      case 1: {
+        const auto result = transform_reduce(firstDataRowBegin, firstDataRowEnd,
+                                             secondDataRowBegin, initVal);
+
+        if constexpr (std::is_floating_point_v<ValueType>) {
+          EXPECT_FLOAT_EQ(result, resultsView_h(i));
+        } else {
+          ASSERT_EQ(result, resultsView_h(i));
+        }
+
+        break;
+      }
+
+      case 2:
+      case 3: {
+        const ValueType result = transform_reduce(
+            firstDataRowBegin, firstDataRowEnd, secondDataRowBegin, initVal,
+            binaryJoiner, binaryTransform);
+
+        if constexpr (std::is_floating_point_v<ValueType>) {
+          EXPECT_FLOAT_EQ(result, resultsView_h(i));
+        } else {
+          ASSERT_EQ(result, resultsView_h(i));
+        }
+
+        break;
+      }
+
+      case 4:
+      case 5: {
+        const ValueType result =
+            transform_reduce(firstDataRowBegin, firstDataRowEnd, initVal,
+                             binaryJoiner, unaryTransform);
+
+        if constexpr (std::is_floating_point_v<ValueType>) {
+          EXPECT_FLOAT_EQ(result, resultsView_h(i));
+        } else {
+          ASSERT_EQ(result, resultsView_h(i));
+        }
+
+        break;
+      }
+    }
+
+#undef transform_reduce
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) {
+      for (int apiId : {0, 1, 2, 3, 4, 5}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_transform_reduce_team_test, test) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamTransformReduce
+}  // namespace stdalgos
+}  // namespace Test
+
+#endif
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformUnaryOp.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformUnaryOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0fbb040020e79824d998fbc4260dbcdb0ebef91d
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformUnaryOp.cpp
@@ -0,0 +1,176 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamTransformUnaryOp {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct PlusTwoUnaryOp {
+  KOKKOS_INLINE_FUNCTION
+  ValueType operator()(const ValueType& val) const {
+    return val + static_cast<ValueType>(2);
+  }
+};
+
+template <class SourceViewType, class DestViewType, class DistancesViewType,
+          class IntraTeamSentinelView>
+struct TestFunctorA {
+  SourceViewType m_sourceView;
+  DestViewType m_destView;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+
+  TestFunctorA(const SourceViewType sourceView, const DestViewType destView,
+               const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView, int apiPick)
+      : m_sourceView(sourceView),
+        m_destView(destView),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowViewFrom =
+        Kokkos::subview(m_sourceView, myRowIndex, Kokkos::ALL());
+    auto myRowViewDest = Kokkos::subview(m_destView, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist = 0;
+
+    using value_type = typename SourceViewType::value_type;
+    if (m_apiPick == 0) {
+      auto it = KE::transform(member, KE::cbegin(myRowViewFrom),
+                              KE::cend(myRowViewFrom), KE::begin(myRowViewDest),
+                              PlusTwoUnaryOp<value_type>());
+
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    } else if (m_apiPick == 1) {
+      auto it = KE::transform(member, myRowViewFrom, myRowViewDest,
+                              PlusTwoUnaryOp<value_type>());
+
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     team level transform with each team handling a row of
+     a rank-2 source view and applying a unary op that
+     increments each element by two
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from an arbitrary range
+  auto [sourceView, cloneOfSourceViewBeforeOp_h] =
+      create_random_view_and_host_clone(
+          LayoutTag{}, numTeams, numCols,
+          Kokkos::pair<ValueType, ValueType>{0, 523}, "sourceView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+  // create the destination view
+  Kokkos::View<ValueType**> destView("destView", numTeams, numCols);
+  // make a host copy of the dest view that we can check below
+  // to be all zeros since this should remain unchanged
+  auto destViewBeforeOp_h = create_host_space_copy(destView);
+
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the expectation
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView,
+                   apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // check
+  // -----------------------------------------------
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto sourceViewAfterOp_h     = create_host_space_copy(sourceView);
+  auto destViewAfterOp_h       = create_host_space_copy(destView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  for (std::size_t i = 0; i < destViewBeforeOp_h.extent(0); ++i) {
+    for (std::size_t j = 0; j < destViewBeforeOp_h.extent(1); ++j) {
+      // source view should not change
+      ASSERT_EQ(sourceViewAfterOp_h(i, j), cloneOfSourceViewBeforeOp_h(i, j));
+
+      // elements in dest view should be the source elements plus two
+      ASSERT_EQ(destViewAfterOp_h(i, j), cloneOfSourceViewBeforeOp_h(i, j) + 2);
+      ASSERT_EQ(destViewBeforeOp_h(i, j), ValueType(0));
+    }
+
+    // each team should return an iterator whose distance from the
+    // beginning of the row equals the num of columns since
+    // each team transforms all elements in each row
+    EXPECT_TRUE(distancesView_h(i) == numCols);
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 11113}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_transform_team_test, test_unary_op) {
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, unsigned>();
+}
+
+}  // namespace TeamTransformUnaryOp
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUnique.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUnique.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c46146e0a8f6f6dbc309f208ffdc3a881755faa6
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUnique.cpp
@@ -0,0 +1,171 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+#include <algorithm>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamUniqueDefaultPredicate {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ViewType, class DistancesViewType, class IntraTeamSentinelView>
+struct TestFunctorA {
+  ViewType m_view;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+
+  TestFunctorA(const ViewType view, const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView, int apiPick)
+      : m_view(view),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowView        = Kokkos::subview(m_view, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist  = 0;
+
+    if (m_apiPick == 0) {
+      auto it    = KE::unique(member, KE::begin(myRowView), KE::end(myRowView));
+      resultDist = KE::distance(KE::begin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    } else if (m_apiPick == 1) {
+      auto it    = KE::unique(member, myRowView);
+      resultDist = KE::distance(KE::begin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    } else if (m_apiPick == 2) {
+      using value_type = typename ViewType::value_type;
+      auto it    = KE::unique(member, KE::begin(myRowView), KE::end(myRowView),
+                           CustomEqualityComparator<value_type>{});
+      resultDist = KE::distance(KE::begin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    } else if (m_apiPick == 3) {
+      using value_type = typename ViewType::value_type;
+      auto it =
+          KE::unique(member, myRowView, CustomEqualityComparator<value_type>{});
+      resultDist = KE::distance(KE::begin(myRowView), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     team-level KE::unique on a rank-2 view where
+     data is filled randomly such that we have several subsets
+     of consecutive equal elements. Use one team per row.
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from a range that is tight enough that there is a high likelihood
+  // of having several consecutive subsets of equal elements
+  auto [dataView, cloneOfDataViewBeforeOp_h] =
+      create_random_view_and_host_clone(
+          LayoutTag{}, numTeams, numCols,
+          Kokkos::pair<ValueType, ValueType>{121, 153}, "dataView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the expectation
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(dataView, distancesView, intraTeamSentinelView, apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run std algo and check
+  // -----------------------------------------------
+  // here I can use cloneOfDataViewBeforeOp_h to run std algo on
+  // since that contains a valid copy of the data
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+
+  for (std::size_t i = 0; i < cloneOfDataViewBeforeOp_h.extent(0); ++i) {
+    auto myRow = Kokkos::subview(cloneOfDataViewBeforeOp_h, i, Kokkos::ALL());
+
+    std::size_t stdDistance = 0;
+    if (apiId <= 1) {
+      auto it     = std::unique(KE::begin(myRow), KE::end(myRow));
+      stdDistance = KE::distance(KE::begin(myRow), it);
+    } else {
+      auto it     = std::unique(KE::begin(myRow), KE::end(myRow),
+                            CustomEqualityComparator<value_type>{});
+      stdDistance = KE::distance(KE::begin(myRow), it);
+    }
+    ASSERT_EQ(stdDistance, distancesView_h(i));
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+  }
+
+  auto dataViewAfterOp_h = create_host_space_copy(dataView);
+  expect_equal_host_views(cloneOfDataViewBeforeOp_h, dataViewAfterOp_h);
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 11113}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_unique_team_test, test_default_predicate) {
+  run_all_scenarios<DynamicTag, int>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, int>();
+}
+
+}  // namespace TeamUniqueDefaultPredicate
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..87687b60a16e13ceb58ff2a3d53aacdb3e6e0ec1
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp
@@ -0,0 +1,196 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestStdAlgorithmsCommon.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace TeamUniqueCopy {
+
+namespace KE = Kokkos::Experimental;
+
+template <class ValueType>
+struct CustomPredicate {
+  KOKKOS_INLINE_FUNCTION
+  bool operator()(ValueType a, ValueType b) const { return a == b; }
+};
+
+template <class SourceViewType, class DestViewType, class DistancesViewType,
+          class IntraTeamSentinelView>
+struct TestFunctorA {
+  SourceViewType m_sourceView;
+  DestViewType m_destView;
+  DistancesViewType m_distancesView;
+  IntraTeamSentinelView m_intraTeamSentinelView;
+  int m_apiPick;
+
+  TestFunctorA(const SourceViewType sourceView, const DestViewType destView,
+               const DistancesViewType distancesView,
+               const IntraTeamSentinelView intraTeamSentinelView, int apiPick)
+      : m_sourceView(sourceView),
+        m_destView(destView),
+        m_distancesView(distancesView),
+        m_intraTeamSentinelView(intraTeamSentinelView),
+        m_apiPick(apiPick) {}
+
+  template <class MemberType>
+  KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const {
+    const auto myRowIndex = member.league_rank();
+    auto myRowViewFrom =
+        Kokkos::subview(m_sourceView, myRowIndex, Kokkos::ALL());
+    auto myRowViewDest = Kokkos::subview(m_destView, myRowIndex, Kokkos::ALL());
+    ptrdiff_t resultDist = 0;
+
+    if (m_apiPick == 0) {
+      auto it =
+          KE::unique_copy(member, KE::begin(myRowViewFrom),
+                          KE::end(myRowViewFrom), KE::begin(myRowViewDest));
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    } else if (m_apiPick == 1) {
+      auto it    = KE::unique_copy(member, myRowViewFrom, myRowViewDest);
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    } else if (m_apiPick == 2) {
+      using comparator_t =
+          CustomEqualityComparator<typename SourceViewType::value_type>;
+      auto it    = KE::unique_copy(member, KE::begin(myRowViewFrom),
+                                KE::end(myRowViewFrom),
+                                KE::begin(myRowViewDest), comparator_t());
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    } else if (m_apiPick == 3) {
+      using comparator_t =
+          CustomEqualityComparator<typename SourceViewType::value_type>;
+      auto it =
+          KE::unique_copy(member, myRowViewFrom, myRowViewDest, comparator_t());
+      resultDist = KE::distance(KE::begin(myRowViewDest), it);
+      Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+        m_distancesView(myRowIndex) = resultDist;
+      });
+    }
+
+    // store result of checking if all members have their local
+    // values matching the one stored in m_distancesView
+    member.team_barrier();
+    const bool intraTeamCheck = team_members_have_matching_result(
+        member, resultDist, m_distancesView(myRowIndex));
+    Kokkos::single(Kokkos::PerTeam(member), [=, *this]() {
+      m_intraTeamSentinelView(myRowIndex) = intraTeamCheck;
+    });
+  }
+};
+
+template <class LayoutTag, class ValueType>
+void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
+  /* description:
+     team-level KE::unique_copy on a rank-2 view where
+     data is filled randomly such that we have several subsets
+     of consecutive equal elements. Use one team per row.
+   */
+
+  // -----------------------------------------------
+  // prepare data
+  // -----------------------------------------------
+  // create a view in the memory space associated with default exespace
+  // with as many rows as the number of teams and fill it with random
+  // values from a range that is tight enough that there is a high likelihood
+  // of having several consecutive subsets of equal elements
+  auto [sourceView, cloneOfDataViewBeforeOp_h] =
+      create_random_view_and_host_clone(
+          LayoutTag{}, numTeams, numCols,
+          Kokkos::pair<ValueType, ValueType>{121, 153}, "sourceView");
+
+  // -----------------------------------------------
+  // launch kokkos kernel
+  // -----------------------------------------------
+  using space_t = Kokkos::DefaultExecutionSpace;
+  Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO());
+
+  // create the destination view
+  Kokkos::View<ValueType**> destView("destView", numTeams, numCols);
+
+  // each team stores the distance of the returned iterator from the
+  // beginning of the interval that team operates on and then we check
+  // that these distances match the expectation
+  Kokkos::View<std::size_t*> distancesView("distancesView", numTeams);
+  // sentinel to check if all members of the team compute the same result
+  Kokkos::View<bool*> intraTeamSentinelView("intraTeamSameResult", numTeams);
+
+  // use CTAD for functor
+  TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView,
+                   apiId);
+  Kokkos::parallel_for(policy, fnc);
+
+  // -----------------------------------------------
+  // run std algo and check
+  // -----------------------------------------------
+  auto distancesView_h         = create_host_space_copy(distancesView);
+  auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
+  Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestView("stdDestView",
+                                                           numTeams, numCols);
+
+  for (std::size_t i = 0; i < cloneOfDataViewBeforeOp_h.extent(0); ++i) {
+    ASSERT_TRUE(intraTeamSentinelView_h(i));
+
+    auto myRowFrom =
+        Kokkos::subview(cloneOfDataViewBeforeOp_h, i, Kokkos::ALL());
+    auto myRowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL());
+
+    std::size_t stdDistance = 0;
+    if (apiId <= 1) {
+      auto it     = std::unique_copy(KE::cbegin(myRowFrom), KE::cend(myRowFrom),
+                                 KE::begin(myRowDest));
+      stdDistance = KE::distance(KE::begin(myRowDest), it);
+    } else {
+      auto it     = std::unique_copy(KE::cbegin(myRowFrom), KE::cend(myRowFrom),
+                                 KE::begin(myRowDest),
+                                 CustomEqualityComparator<value_type>{});
+      stdDistance = KE::distance(KE::begin(myRowDest), it);
+    }
+    ASSERT_EQ(stdDistance, distancesView_h(i));
+  }
+
+  auto destViewAfterOp_h = create_host_space_copy(destView);
+  expect_equal_host_views(stdDestView, destViewAfterOp_h);
+}
+
+template <class LayoutTag, class ValueType>
+void run_all_scenarios() {
+  for (int numTeams : teamSizesToTest) {
+    for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 11113}) {
+      for (int apiId : {0, 1}) {
+        test_A<LayoutTag, ValueType>(numTeams, numCols, apiId);
+      }
+    }
+  }
+}
+
+TEST(std_algorithms_unique_copy_team_test, test) {
+  run_all_scenarios<DynamicTag, int>();
+  run_all_scenarios<StridedTwoRowsTag, int>();
+  run_all_scenarios<StridedThreeRowsTag, int>();
+}
+
+}  // namespace TeamUniqueCopy
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp
index 70c04dbafa2979938091dd8a8268715c9601d1c1..9dac3ce75ffa9fdc40347850527305fbb74abff2 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp
@@ -165,7 +165,7 @@ void verify_data(ViewType1 data_view,  // contains data
       //           << std::abs(gold_h(i) - test_view_h(i)) << std::endl;
 
       if (std::is_same<gold_view_value_type, int>::value) {
-        EXPECT_EQ(gold_h(i), test_view_h(i));
+        ASSERT_EQ(gold_h(i), test_view_h(i));
       } else {
         const auto error = std::abs(gold_h(i) - test_view_h(i));
         if (error > 1e-10) {
@@ -221,7 +221,7 @@ void run_single_scenario(const InfoType& scenario_info, ValueType init_value,
     auto r = KE::transform_exclusive_scan(
         exespace(), KE::cbegin(view_from), KE::cend(view_from),
         KE::begin(view_dest), init_value, bop, uop);
-    EXPECT_EQ(r, KE::end(view_dest));
+    ASSERT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, init_value, bop, uop);
   }
 
@@ -230,7 +230,7 @@ void run_single_scenario(const InfoType& scenario_info, ValueType init_value,
     auto r = KE::transform_exclusive_scan(
         "label", exespace(), KE::cbegin(view_from), KE::cend(view_from),
         KE::begin(view_dest), init_value, bop, uop);
-    EXPECT_EQ(r, KE::end(view_dest));
+    ASSERT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, init_value, bop, uop);
   }
 
@@ -238,7 +238,7 @@ void run_single_scenario(const InfoType& scenario_info, ValueType init_value,
     fill_zero(view_dest);
     auto r = KE::transform_exclusive_scan(exespace(), view_from, view_dest,
                                           init_value, bop, uop);
-    EXPECT_EQ(r, KE::end(view_dest));
+    ASSERT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, init_value, bop, uop);
   }
 
@@ -246,7 +246,7 @@ void run_single_scenario(const InfoType& scenario_info, ValueType init_value,
     fill_zero(view_dest);
     auto r = KE::transform_exclusive_scan("label", exespace(), view_from,
                                           view_dest, init_value, bop, uop);
-    EXPECT_EQ(r, KE::end(view_dest));
+    ASSERT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, init_value, bop, uop);
   }
 
@@ -279,6 +279,59 @@ TEST(std_algorithms_numeric_ops_test, transform_exclusive_scan) {
 }
 #endif
 
+template <class ValueType>
+struct MultiplyFunctor {
+  KOKKOS_INLINE_FUNCTION
+  ValueType operator()(const ValueType& a, const ValueType& b) const {
+    return (a * b);
+  }
+};
+
+TEST(std_algorithms_numeric_ops_test, transform_exclusive_scan_functor) {
+  int dummy       = 0;
+  using view_type = Kokkos::View<int*, exespace>;
+  view_type dummy_view("dummy_view", 0);
+  using unary_op_type =
+      Kokkos::Experimental::Impl::StdNumericScanIdentityReferenceUnaryFunctor<
+          int>;
+  using functor_type =
+      Kokkos::Experimental::Impl::TransformExclusiveScanFunctorWithValueWrapper<
+          exespace, int, int, view_type, view_type, MultiplyFunctor<int>,
+          unary_op_type>;
+  functor_type functor(dummy, dummy_view, dummy_view, {}, {});
+  using value_type = functor_type::value_type;
+
+  value_type value1;
+  functor.init(value1);
+  ASSERT_EQ(value1.val, 0);
+  ASSERT_EQ(value1.is_initial, true);
+
+  value_type value2;
+  value2.val        = 1;
+  value2.is_initial = false;
+  functor.join(value1, value2);
+  ASSERT_EQ(value1.val, 1);
+  ASSERT_EQ(value1.is_initial, false);
+
+  functor.init(value1);
+  functor.join(value2, value1);
+  ASSERT_EQ(value2.val, 1);
+  ASSERT_EQ(value2.is_initial, false);
+
+  functor.init(value2);
+  functor.join(value2, value1);
+  ASSERT_EQ(value2.val, 0);
+  ASSERT_EQ(value2.is_initial, true);
+
+  value1.val        = 3;
+  value1.is_initial = false;
+  value2.val        = 2;
+  value2.is_initial = false;
+  functor.join(value2, value1);
+  ASSERT_EQ(value2.val, 6);
+  ASSERT_EQ(value2.is_initial, false);
+}
+
 }  // namespace TransformEScan
 }  // namespace stdalgos
 }  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp
index 80ff8132519b22b56b24f8e26b96a525f6495364..a90a68ca1d7588932bbffd034b890824e69ba0ec 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp
@@ -177,7 +177,7 @@ void verify_data(ViewType1 data_view,  // contains data
       //           << std::abs(gold_h(i) - test_view_h(i)) << std::endl;
 
       if (std::is_same<gold_view_value_type, int>::value) {
-        EXPECT_EQ(gold_h(i), test_view_h(i));
+        ASSERT_EQ(gold_h(i), test_view_h(i));
       } else {
         const auto error = std::abs(gold_h(i) - test_view_h(i));
         if (error > 1e-10) {
@@ -246,7 +246,7 @@ void run_single_scenario(const InfoType& scenario_info,
     auto r = KE::transform_inclusive_scan(exespace(), KE::cbegin(view_from),
                                           KE::cend(view_from),
                                           KE::begin(view_dest), args...);
-    EXPECT_EQ(r, KE::end(view_dest));
+    ASSERT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, args...);
   }
 
@@ -255,7 +255,7 @@ void run_single_scenario(const InfoType& scenario_info,
     auto r = KE::transform_inclusive_scan(
         "label", exespace(), KE::cbegin(view_from), KE::cend(view_from),
         KE::begin(view_dest), args...);
-    EXPECT_EQ(r, KE::end(view_dest));
+    ASSERT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, args...);
   }
 
@@ -263,7 +263,7 @@ void run_single_scenario(const InfoType& scenario_info,
     fill_zero(view_dest);
     auto r =
         KE::transform_inclusive_scan(exespace(), view_from, view_dest, args...);
-    EXPECT_EQ(r, KE::end(view_dest));
+    ASSERT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, args...);
   }
 
@@ -271,7 +271,7 @@ void run_single_scenario(const InfoType& scenario_info,
     fill_zero(view_dest);
     auto r = KE::transform_inclusive_scan("label", exespace(), view_from,
                                           view_dest, args...);
-    EXPECT_EQ(r, KE::end(view_dest));
+    ASSERT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, args...);
   }
 
@@ -306,6 +306,75 @@ TEST(std_algorithms_numeric_ops_test, transform_inclusive_scan) {
 }
 #endif
 
+template <class ValueType>
+struct MultiplyFunctor {
+  KOKKOS_INLINE_FUNCTION
+  ValueType operator()(const ValueType& a, const ValueType& b) const {
+    return (a * b);
+  }
+};
+
+TEST(std_algorithms_numeric_ops_test, transform_inclusive_scan_functor) {
+  using value_type = KE::Impl::ValueWrapperForNoNeutralElement<int>;
+
+  auto test_lambda = [&](auto& functor) {
+    value_type value1;
+    functor.init(value1);
+    ASSERT_EQ(value1.val, 0);
+    ASSERT_EQ(value1.is_initial, true);
+
+    value_type value2;
+    value2.val        = 1;
+    value2.is_initial = false;
+    functor.join(value1, value2);
+    ASSERT_EQ(value1.val, 1);
+    ASSERT_EQ(value1.is_initial, false);
+
+    functor.init(value1);
+    functor.join(value2, value1);
+    ASSERT_EQ(value2.val, 1);
+    ASSERT_EQ(value2.is_initial, false);
+
+    functor.init(value2);
+    functor.join(value2, value1);
+    ASSERT_EQ(value2.val, 0);
+    ASSERT_EQ(value2.is_initial, true);
+
+    value1.val        = 3;
+    value1.is_initial = false;
+    value2.val        = 2;
+    value2.is_initial = false;
+    functor.join(value2, value1);
+    ASSERT_EQ(value2.val, 6);
+    ASSERT_EQ(value2.is_initial, false);
+  };
+
+  int dummy       = 0;
+  using view_type = Kokkos::View<int*, exespace>;
+  view_type dummy_view("dummy_view", 0);
+  using unary_op_type =
+      KE::Impl::StdNumericScanIdentityReferenceUnaryFunctor<int>;
+  {
+    using functor_type =
+        KE::Impl::ExeSpaceTransformInclusiveScanNoInitValueFunctor<
+            exespace, int, int, view_type, view_type, MultiplyFunctor<int>,
+            unary_op_type>;
+    functor_type functor(dummy_view, dummy_view, {}, {});
+
+    test_lambda(functor);
+  }
+
+  {
+    using functor_type =
+        KE::Impl::ExeSpaceTransformInclusiveScanWithInitValueFunctor<
+            exespace, int, int, view_type, view_type, MultiplyFunctor<int>,
+            unary_op_type>;
+    functor_type functor(dummy_view, dummy_view, {}, {}, dummy);
+
+    test_lambda(functor);
+  }
+}
+
 }  // namespace TransformIncScan
 }  // namespace stdalgos
 }  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformUnaryOp.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformUnaryOp.cpp
index dab81b8f1e3bb536a3635bde98b072909a1e31cc..6070c1a60d36dc44d778906e67f18d1fba247053 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformUnaryOp.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformUnaryOp.cpp
@@ -58,7 +58,7 @@ void verify_data(ViewTypeFrom view_from, ViewTypeTest view_test) {
       create_mirror_view_and_copy(Kokkos::HostSpace(), view_from_dc);
 
   for (std::size_t i = 0; i < view_test_h.extent(0); ++i) {
-    EXPECT_EQ(view_test_h(i), view_from_h(i) + value_type(1));
+    ASSERT_EQ(view_test_h(i), view_from_h(i) + value_type(1));
   }
 }
 
@@ -89,7 +89,7 @@ void run_single_scenario(const InfoType& scenario_info) {
     auto r1 = KE::transform(exespace(), KE::begin(view_from),
                             KE::end(view_from), KE::begin(view_dest), unOp);
     verify_data(view_from, view_dest);
-    EXPECT_EQ(r1, KE::end(view_dest));
+    ASSERT_EQ(r1, KE::end(view_dest));
   }
 
   {
@@ -98,7 +98,7 @@ void run_single_scenario(const InfoType& scenario_info) {
     auto r1 = KE::transform("label", exespace(), KE::begin(view_from),
                             KE::end(view_from), KE::begin(view_dest), unOp);
     verify_data(view_from, view_dest);
-    EXPECT_EQ(r1, KE::end(view_dest));
+    ASSERT_EQ(r1, KE::end(view_dest));
   }
 
   {
@@ -106,7 +106,7 @@ void run_single_scenario(const InfoType& scenario_info) {
         create_view<ValueType>(Tag{}, view_ext, "transform_uop_dest");
     auto r1 = KE::transform(exespace(), view_from, view_dest, unOp);
     verify_data(view_from, view_dest);
-    EXPECT_EQ(r1, KE::end(view_dest));
+    ASSERT_EQ(r1, KE::end(view_dest));
   }
 
   {
@@ -114,7 +114,7 @@ void run_single_scenario(const InfoType& scenario_info) {
         create_view<ValueType>(Tag{}, view_ext, "transform_uop_dest");
     auto r1 = KE::transform("label", exespace(), view_from, view_dest, unOp);
     verify_data(view_from, view_dest);
-    EXPECT_EQ(r1, KE::end(view_dest));
+    ASSERT_EQ(r1, KE::end(view_dest));
   }
 
   Kokkos::fence();
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp
index a810d31d820266294a9cdf57121d7606c9588d07..9c5ae0cf8a1e465c955f12797aa35269a53be9df 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp
@@ -157,7 +157,7 @@ void verify_data(const std::string& name, ResultIt my_result_it,
   //
   const auto std_diff = (std::size_t)(std_r - KE::begin(data_v_h));
   const auto my_diff  = (std::size_t)(my_result_it - KE::begin(view_test));
-  EXPECT_EQ(my_diff, std_diff);
+  ASSERT_EQ(my_diff, std_diff);
 
   //
   // check the data in the view
@@ -170,14 +170,14 @@ void verify_data(const std::string& name, ResultIt my_result_it,
     // 		<< " my  = " << view_test_h(i) << " "
     // 		<< " std = " << data_v_h(i)
     // 		<< '\n';
-    EXPECT_EQ(view_test_h(i), data_v_h(i));
+    ASSERT_EQ(view_test_h(i), data_v_h(i));
   }
 
   if (name == "medium-b") {
     using value_type = typename ViewType1::value_type;
-    EXPECT_EQ(my_diff, (std::size_t)2);
-    EXPECT_EQ(view_test_h(0), (value_type)22);
-    EXPECT_EQ(view_test_h(1), (value_type)44);
+    ASSERT_EQ(my_diff, (std::size_t)2);
+    ASSERT_EQ(view_test_h(0), (value_type)22);
+    ASSERT_EQ(view_test_h(1), (value_type)44);
   }
 }
 
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp
index f609d8517e642fa5b1a2cff420475468e0a1099f..3cf43ad4db8ff96dd9bbe3d66e563c822258000e 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp
@@ -174,51 +174,51 @@ void verify_data(const std::string& name, ViewTypeFrom view_from,
   }
 
   else if (name == "one-element-a") {
-    EXPECT_EQ(view_test_h(0), static_cast<value_type>(1));
+    ASSERT_EQ(view_test_h(0), static_cast<value_type>(1));
   }
 
   else if (name == "one-element-b") {
-    EXPECT_EQ(view_test_h(0), static_cast<value_type>(2));
+    ASSERT_EQ(view_test_h(0), static_cast<value_type>(2));
   }
 
   else if (name == "two-elements-a") {
-    EXPECT_EQ(view_test_h(0), static_cast<value_type>(1));
-    EXPECT_EQ(view_test_h(1), static_cast<value_type>(2));
+    ASSERT_EQ(view_test_h(0), static_cast<value_type>(1));
+    ASSERT_EQ(view_test_h(1), static_cast<value_type>(2));
   }
 
   else if (name == "two-elements-b") {
-    EXPECT_EQ(view_test_h(0), static_cast<value_type>(2));
-    EXPECT_EQ(view_test_h(1), static_cast<value_type>(-1));
+    ASSERT_EQ(view_test_h(0), static_cast<value_type>(2));
+    ASSERT_EQ(view_test_h(1), static_cast<value_type>(-1));
   }
 
   else if (name == "small-a") {
-    EXPECT_EQ(view_test_h(0), static_cast<value_type>(0));
-    EXPECT_EQ(view_test_h(1), static_cast<value_type>(1));
-    EXPECT_EQ(view_test_h(2), static_cast<value_type>(2));
-    EXPECT_EQ(view_test_h(3), static_cast<value_type>(3));
-    EXPECT_EQ(view_test_h(4), static_cast<value_type>(4));
-    EXPECT_EQ(view_test_h(5), static_cast<value_type>(5));
-    EXPECT_EQ(view_test_h(6), static_cast<value_type>(6));
-    EXPECT_EQ(view_test_h(7), static_cast<value_type>(0));
-    EXPECT_EQ(view_test_h(8), static_cast<value_type>(0));
-    EXPECT_EQ(view_test_h(9), static_cast<value_type>(0));
-    EXPECT_EQ(view_test_h(10), static_cast<value_type>(0));
+    ASSERT_EQ(view_test_h(0), static_cast<value_type>(0));
+    ASSERT_EQ(view_test_h(1), static_cast<value_type>(1));
+    ASSERT_EQ(view_test_h(2), static_cast<value_type>(2));
+    ASSERT_EQ(view_test_h(3), static_cast<value_type>(3));
+    ASSERT_EQ(view_test_h(4), static_cast<value_type>(4));
+    ASSERT_EQ(view_test_h(5), static_cast<value_type>(5));
+    ASSERT_EQ(view_test_h(6), static_cast<value_type>(6));
+    ASSERT_EQ(view_test_h(7), static_cast<value_type>(0));
+    ASSERT_EQ(view_test_h(8), static_cast<value_type>(0));
+    ASSERT_EQ(view_test_h(9), static_cast<value_type>(0));
+    ASSERT_EQ(view_test_h(10), static_cast<value_type>(0));
   }
 
   else if (name == "small-b") {
-    EXPECT_EQ(view_test_h(0), static_cast<value_type>(1));
-    EXPECT_EQ(view_test_h(1), static_cast<value_type>(2));
-    EXPECT_EQ(view_test_h(2), static_cast<value_type>(3));
-    EXPECT_EQ(view_test_h(3), static_cast<value_type>(4));
-    EXPECT_EQ(view_test_h(4), static_cast<value_type>(5));
-    EXPECT_EQ(view_test_h(5), static_cast<value_type>(6));
-    EXPECT_EQ(view_test_h(6), static_cast<value_type>(8));
-    EXPECT_EQ(view_test_h(7), static_cast<value_type>(9));
-    EXPECT_EQ(view_test_h(8), static_cast<value_type>(8));
-    EXPECT_EQ(view_test_h(9), static_cast<value_type>(0));
-    EXPECT_EQ(view_test_h(10), static_cast<value_type>(0));
-    EXPECT_EQ(view_test_h(11), static_cast<value_type>(0));
-    EXPECT_EQ(view_test_h(12), static_cast<value_type>(0));
+    ASSERT_EQ(view_test_h(0), static_cast<value_type>(1));
+    ASSERT_EQ(view_test_h(1), static_cast<value_type>(2));
+    ASSERT_EQ(view_test_h(2), static_cast<value_type>(3));
+    ASSERT_EQ(view_test_h(3), static_cast<value_type>(4));
+    ASSERT_EQ(view_test_h(4), static_cast<value_type>(5));
+    ASSERT_EQ(view_test_h(5), static_cast<value_type>(6));
+    ASSERT_EQ(view_test_h(6), static_cast<value_type>(8));
+    ASSERT_EQ(view_test_h(7), static_cast<value_type>(9));
+    ASSERT_EQ(view_test_h(8), static_cast<value_type>(8));
+    ASSERT_EQ(view_test_h(9), static_cast<value_type>(0));
+    ASSERT_EQ(view_test_h(10), static_cast<value_type>(0));
+    ASSERT_EQ(view_test_h(11), static_cast<value_type>(0));
+    ASSERT_EQ(view_test_h(12), static_cast<value_type>(0));
   }
 
   else if (name == "medium" || name == "large") {
@@ -230,7 +230,7 @@ void verify_data(const std::string& name, ViewTypeFrom view_from,
     (void)std_r;
 
     for (std::size_t i = 0; i < view_from_h.extent(0); ++i) {
-      EXPECT_EQ(view_test_h(i), tmp[i]);
+      ASSERT_EQ(view_test_h(i), tmp[i]);
     }
   }
 
@@ -273,7 +273,7 @@ void run_single_scenario(const InfoType& scenario_info, Args... args) {
         KE::unique_copy(exespace(), KE::cbegin(view_from), KE::cend(view_from),
                         KE::begin(view_dest), args...);
     verify_data(name, view_from, view_dest, args...);
-    EXPECT_EQ(rit, (KE::begin(view_dest) + n));
+    ASSERT_EQ(rit, (KE::begin(view_dest) + n));
   }
 
   {
@@ -283,7 +283,7 @@ void run_single_scenario(const InfoType& scenario_info, Args... args) {
         KE::unique_copy("label", exespace(), KE::cbegin(view_from),
                         KE::cend(view_from), KE::begin(view_dest), args...);
     verify_data(name, view_from, view_dest, args...);
-    EXPECT_EQ(rit, (KE::begin(view_dest) + n));
+    ASSERT_EQ(rit, (KE::begin(view_dest) + n));
   }
 
   {
@@ -291,7 +291,7 @@ void run_single_scenario(const InfoType& scenario_info, Args... args) {
         create_view<ValueType>(Tag{}, view_ext, "unique_copy_dest");
     auto rit = KE::unique_copy(exespace(), view_from, view_dest, args...);
     verify_data(name, view_from, view_dest, args...);
-    EXPECT_EQ(rit, (KE::begin(view_dest) + n));
+    ASSERT_EQ(rit, (KE::begin(view_dest) + n));
   }
 
   {
@@ -300,7 +300,7 @@ void run_single_scenario(const InfoType& scenario_info, Args... args) {
     auto rit =
         KE::unique_copy("label", exespace(), view_from, view_dest, args...);
     verify_data(name, view_from, view_dest, args...);
-    EXPECT_EQ(rit, (KE::begin(view_dest) + n));
+    ASSERT_EQ(rit, (KE::begin(view_dest) + n));
   }
 
   Kokkos::fence();
diff --git a/packages/kokkos/appveyor.yml b/packages/kokkos/appveyor.yml
index ceb33bf4410f44fcb4be98d9c029122a7168df5a..c0b6e9cab9f73fc180796a4d154b758b71037a8f 100644
--- a/packages/kokkos/appveyor.yml
+++ b/packages/kokkos/appveyor.yml
@@ -5,6 +5,6 @@ build_script:
 - cmd: >-
     mkdir build &&
     cd build &&
-    cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON -DKokkos_ENABLE_BENCHMARKS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc" -DKokkos_ENABLE_DEPRECATED_CODE_4=ON -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF &&
+    cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc" -DKokkos_ENABLE_DEPRECATED_CODE_4=ON -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF &&
     cmake --build . --target install &&
     ctest -C Debug --output-on-failure
diff --git a/packages/kokkos/benchmarks/CMakeLists.txt b/packages/kokkos/benchmarks/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..42279bf55db83ad5a0aeade86e18a918d88370a6
--- /dev/null
+++ b/packages/kokkos/benchmarks/CMakeLists.txt
@@ -0,0 +1 @@
+KOKKOS_ADD_BENCHMARK_DIRECTORIES(gups)
diff --git a/packages/kokkos/benchmarks/gups/CMakeLists.txt b/packages/kokkos/benchmarks/gups/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8de5b73cc67f9b3fcd0ff9d9ae1d49efdc6d789a
--- /dev/null
+++ b/packages/kokkos/benchmarks/gups/CMakeLists.txt
@@ -0,0 +1,4 @@
+KOKKOS_ADD_EXECUTABLE(
+  gups
+  SOURCES gups.cpp
+)
diff --git a/packages/kokkos/benchmarks/gups/Makefile b/packages/kokkos/benchmarks/gups/Makefile
deleted file mode 100644
index 2a90621d8ca20af96b991ff525641d06bf831ce5..0000000000000000000000000000000000000000
--- a/packages/kokkos/benchmarks/gups/Makefile
+++ /dev/null
@@ -1,51 +0,0 @@
-KOKKOS_DEVICES=Cuda
-KOKKOS_CUDA_OPTIONS=enable_lambda
-KOKKOS_ARCH = "SNB,Volta70"
-
-
-MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST))))
-
-ifndef KOKKOS_PATH
-  KOKKOS_PATH = $(MAKEFILE_PATH)../..
-endif
-
-SRC = $(wildcard $(MAKEFILE_PATH)*.cpp)
-HEADERS = $(wildcard $(MAKEFILE_PATH)*.hpp)
-
-vpath %.cpp $(sort $(dir $(SRC)))
-
-default: build
-	echo "Start Build"
-
-ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
-EXE = gups.cuda
-else
-CXX = g++
-EXE = gups.exe
-endif
-
-CXXFLAGS ?= -O3 -g
-override CXXFLAGS += -I$(MAKEFILE_PATH)
-
-DEPFLAGS = -M
-LINK = ${CXX}
-LINKFLAGS =
-
-OBJ = $(notdir $(SRC:.cpp=.o))
-LIB =
-
-include $(KOKKOS_PATH)/Makefile.kokkos
-
-build: $(EXE)
-
-$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
-	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
-
-clean: kokkos-clean
-	rm -f *.o gups.cuda gups.exe
-
-# Compilation rules
-
-%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(HEADERS)
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@)
diff --git a/packages/kokkos/benchmarks/gups/gups-kokkos.cpp b/packages/kokkos/benchmarks/gups/gups-kokkos.cpp
deleted file mode 100644
index 97c339d09d8bb052251919d15b8690c040e14924..0000000000000000000000000000000000000000
--- a/packages/kokkos/benchmarks/gups/gups-kokkos.cpp
+++ /dev/null
@@ -1,175 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#include "Kokkos_Core.hpp"
-#include <cstdio>
-#include <cstdlib>
-#include <cmath>
-
-#include <sys/time.h>
-
-#define HLINE "-------------------------------------------------------------\n"
-
-#if defined(KOKKOS_ENABLE_CUDA)
-using GUPSHostArray   = Kokkos::View<int64_t*, Kokkos::CudaSpace>::HostMirror;
-using GUPSDeviceArray = Kokkos::View<int64_t*, Kokkos::CudaSpace>;
-#else
-using GUPSHostArray   = Kokkos::View<int64_t*, Kokkos::HostSpace>::HostMirror;
-using GUPSDeviceArray = Kokkos::View<int64_t*, Kokkos::HostSpace>;
-#endif
-
-using GUPSIndex = int;
-
-double now() {
-  struct timeval now;
-  gettimeofday(&now, nullptr);
-
-  return (double)now.tv_sec + ((double)now.tv_usec * 1.0e-6);
-}
-
-void randomize_indices(GUPSHostArray& indices, GUPSDeviceArray& dev_indices,
-                       const int64_t dataCount) {
-  for (GUPSIndex i = 0; i < indices.extent(0); ++i) {
-    indices[i] = lrand48() % dataCount;
-  }
-
-  Kokkos::deep_copy(dev_indices, indices);
-}
-
-void run_gups(GUPSDeviceArray& indices, GUPSDeviceArray& data,
-              const int64_t datum, const bool performAtomics) {
-  if (performAtomics) {
-    Kokkos::parallel_for(
-        "bench-gups-atomic", indices.extent(0),
-        KOKKOS_LAMBDA(const GUPSIndex i) {
-          Kokkos::atomic_fetch_xor(&data[indices[i]], datum);
-        });
-  } else {
-    Kokkos::parallel_for(
-        "bench-gups-non-atomic", indices.extent(0),
-        KOKKOS_LAMBDA(const GUPSIndex i) { data[indices[i]] ^= datum; });
-  }
-
-  Kokkos::fence();
-}
-
-int run_benchmark(const GUPSIndex indicesCount, const GUPSIndex dataCount,
-                  const int repeats, const bool useAtomics) {
-  printf("Reports fastest timing per kernel\n");
-  printf("Creating Views...\n");
-
-  printf("Memory Sizes:\n");
-  printf("- Elements:      %15" PRIu64 " (%12.4f MB)\n",
-         static_cast<uint64_t>(dataCount),
-         1.0e-6 * ((double)dataCount * (double)sizeof(int64_t)));
-  printf("- Indices:       %15" PRIu64 " (%12.4f MB)\n",
-         static_cast<uint64_t>(indicesCount),
-         1.0e-6 * ((double)indicesCount * (double)sizeof(int64_t)));
-  printf(" - Atomics:      %15s\n", (useAtomics ? "Yes" : "No"));
-  printf("Benchmark kernels will be performed for %d iterations.\n", repeats);
-
-  printf(HLINE);
-
-  GUPSDeviceArray dev_indices("indices", indicesCount);
-  GUPSDeviceArray dev_data("data", dataCount);
-  int64_t datum = -1;
-
-  GUPSHostArray indices = Kokkos::create_mirror_view(dev_indices);
-  GUPSHostArray data    = Kokkos::create_mirror_view(dev_data);
-
-  double gupsTime = 0.0;
-
-  printf("Initializing Views...\n");
-
-#if defined(KOKKOS_HAVE_OPENMP)
-  Kokkos::parallel_for(
-      "init-data", Kokkos::RangePolicy<Kokkos::OpenMP>(0, dataCount),
-#else
-  Kokkos::parallel_for(
-      "init-data", Kokkos::RangePolicy<Kokkos::Serial>(0, dataCount),
-#endif
-      KOKKOS_LAMBDA(const int i) { data[i] = 10101010101; });
-
-#if defined(KOKKOS_HAVE_OPENMP)
-  Kokkos::parallel_for(
-      "init-indices", Kokkos::RangePolicy<Kokkos::OpenMP>(0, indicesCount),
-#else
-  Kokkos::parallel_for(
-      "init-indices", Kokkos::RangePolicy<Kokkos::Serial>(0, indicesCount),
-#endif
-      KOKKOS_LAMBDA(const int i) { indices[i] = 0; });
-
-  Kokkos::deep_copy(dev_data, data);
-  Kokkos::deep_copy(dev_indices, indices);
-  double start;
-
-  printf("Starting benchmarking...\n");
-
-  for (GUPSIndex k = 0; k < repeats; ++k) {
-    randomize_indices(indices, dev_indices, data.extent(0));
-
-    start = now();
-    run_gups(dev_indices, dev_data, datum, useAtomics);
-    gupsTime += now() - start;
-  }
-
-  Kokkos::deep_copy(indices, dev_indices);
-  Kokkos::deep_copy(data, dev_data);
-
-  printf(HLINE);
-  printf(
-      "GUP/s Random:      %18.6f\n",
-      (1.0e-9 * ((double)repeats) * (double)dev_indices.extent(0)) / gupsTime);
-  printf(HLINE);
-
-  return 0;
-}
-
-int main(int argc, char* argv[]) {
-  printf(HLINE);
-  printf("Kokkos GUPS Benchmark\n");
-  printf(HLINE);
-
-  srand48(1010101);
-
-  Kokkos::initialize(argc, argv);
-
-  int64_t indices = 8192;
-  int64_t data    = 33554432;
-  int64_t repeats = 10;
-  bool useAtomics = false;
-
-  for (int i = 1; i < argc; ++i) {
-    if (strcmp(argv[i], "--indices") == 0) {
-      indices = std::atoll(argv[i + 1]);
-      ++i;
-    } else if (strcmp(argv[i], "--data") == 0) {
-      data = std::atoll(argv[i + 1]);
-      ++i;
-    } else if (strcmp(argv[i], "--repeats") == 0) {
-      repeats = std::atoll(argv[i + 1]);
-      ++i;
-    } else if (strcmp(argv[i], "--atomics") == 0) {
-      useAtomics = true;
-    }
-  }
-
-  const int rc = run_benchmark(indices, data, repeats, useAtomics);
-
-  Kokkos::finalize();
-
-  return rc;
-}
diff --git a/packages/kokkos/benchmarks/gups/gups.cpp b/packages/kokkos/benchmarks/gups/gups.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..369052321d7b0c44099e4707006b85c7dbcc1be7
--- /dev/null
+++ b/packages/kokkos/benchmarks/gups/gups.cpp
@@ -0,0 +1,195 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+/*! \brief file gups.cpp
+
+    An implementation of something like HPCC RandomAccess.
+*/
+
+#include "Kokkos_Core.hpp"
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+#include <chrono>
+#include <numeric>
+#include <algorithm>
+#include <random>
+
+#define HLINE "-------------------------------------------------------------\n"
+
+using Index = int;
+using Datum = int64_t;
+
+using IndexView = Kokkos::View<Index*>;
+using DataView  = Kokkos::View<Datum*>;
+
+using Clock    = std::chrono::steady_clock;
+using Duration = std::chrono::duration<double>;
+
+using RandomDevice = std::random_device;
+using RNG          = std::mt19937;
+
+IndexView randomized_indices(const Index indicesCount, const Index dataCount,
+                             RNG& rng) {
+  // generate random indices 0..dataCount
+  std::uniform_int_distribution<Index> uid(0, dataCount);
+  std::vector<Index> indices(indicesCount);
+  std::generate(indices.begin(), indices.end(), [&]() { return uid(rng); });
+
+  // Copy to the default space and return
+  Kokkos::View<Index*, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>
+      unmanaged_indices(indices.data(), indices.size());
+  IndexView dev_indices("dev_indices", indicesCount);
+  Kokkos::deep_copy(dev_indices, unmanaged_indices);
+  return dev_indices;
+}
+
+IndexView permuted_indices(const Index indicesCount, const Index dataCount,
+                           RNG& rng) {
+  // create a permutation array of offsets into the data
+  std::vector<Index> perm(dataCount);
+  std::iota(perm.begin(), perm.end(), 0);
+  std::shuffle(perm.begin(), perm.end(), rng);
+
+  // indices is repeated copies of the permutation array
+  // (or the first entries of the permutation array if there
+  // are fewer indices than data elements)
+  IndexView dev_indices("dev_indices", indicesCount);
+  auto indices = Kokkos::create_mirror_view(dev_indices);
+  for (Index i = 0; i < Index(indices.extent(0)); ++i) {
+    indices(i) = perm[i % perm.size()];
+  }
+
+  // Copy to the default space and return
+
+  Kokkos::deep_copy(dev_indices, indices);
+  return dev_indices;
+}
+
+void run_gups(IndexView& indices, DataView& data, const Datum datum,
+              const bool performAtomics) {
+  if (performAtomics) {
+    Kokkos::parallel_for(
+        "bench-gups-atomic", indices.extent(0), KOKKOS_LAMBDA(const Index i) {
+          Kokkos::atomic_fetch_xor(&data[indices[i]], datum);
+        });
+  } else {
+    Kokkos::parallel_for(
+        "bench-gups-non-atomic", indices.extent(0),
+        KOKKOS_LAMBDA(const Index i) { data[indices[i]] ^= datum; });
+  }
+
+  Kokkos::fence();
+}
+
+enum class AccessPattern { random, permutation };
+
+int run_benchmark(const Index indicesCount, const Index dataCount,
+                  const int repeats, const bool useAtomics,
+                  const AccessPattern pattern) {
+  constexpr auto arbitrary_seed = 20230913;
+  RNG rng(arbitrary_seed);
+
+  printf("Reports fastest timing per kernel\n");
+  printf("Creating Views...\n");
+
+  printf("Memory Sizes:\n");
+  printf("- Elements:      %15" PRIu64 " (%12.4f MB)\n",
+         static_cast<uint64_t>(dataCount),
+         1.0e-6 * ((double)dataCount * (double)sizeof(Datum)));
+  printf("- Indices:       %15" PRIu64 " (%12.4f MB)\n",
+         static_cast<uint64_t>(indicesCount),
+         1.0e-6 * ((double)indicesCount * (double)sizeof(Index)));
+  printf(" - Atomics:      %15s\n", (useAtomics ? "Yes" : "No"));
+  printf("Benchmark kernels will be performed for %d iterations.\n", repeats);
+
+  printf(HLINE);
+
+  printf("Initializing Data...\n");
+  DataView data("data", dataCount);
+  Kokkos::parallel_for(
+      "init-data",
+      Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(0, dataCount),
+      KOKKOS_LAMBDA(const int i) { data[i] = 10101010101; });
+
+  printf("Starting benchmarking...\n");
+  double gupsTime       = 0.0;
+  constexpr Datum datum = -1;
+  for (Index k = 0; k < repeats; ++k) {
+    IndexView indices;
+    switch (pattern) {
+      case AccessPattern::random: {
+        indices = randomized_indices(indicesCount, dataCount, rng);
+        break;
+      }
+      case AccessPattern::permutation: {
+        indices = permuted_indices(indicesCount, dataCount, rng);
+        break;
+      }
+      default: {
+        throw std::runtime_error("unexpected mode");
+      }
+    }
+
+    auto start = Clock::now();
+    run_gups(indices, data, datum, useAtomics);
+    gupsTime += Duration(Clock::now() - start).count();
+  }
+
+  printf(HLINE);
+  printf("GUP/s Random:      %18.6f\n",
+         (1.0e-9 * ((double)repeats) * (double)indicesCount) / gupsTime);
+  printf(HLINE);
+
+  return 0;
+}
+
+int main(int argc, char* argv[]) {
+  printf(HLINE);
+  printf("Kokkos GUPS Benchmark\n");
+  printf(HLINE);
+
+  Kokkos::initialize(argc, argv);
+
+  int64_t indices       = 8192;
+  int64_t data          = 33554432;
+  int64_t repeats       = 10;
+  bool useAtomics       = false;
+  AccessPattern pattern = AccessPattern::random;
+
+  for (int i = 1; i < argc; ++i) {
+    if (strcmp(argv[i], "--indices") == 0) {
+      indices = std::atoll(argv[i + 1]);
+      ++i;
+    } else if (strcmp(argv[i], "--data") == 0) {
+      data = std::atoll(argv[i + 1]);
+      ++i;
+    } else if (strcmp(argv[i], "--repeats") == 0) {
+      repeats = std::atoll(argv[i + 1]);
+      ++i;
+    } else if (strcmp(argv[i], "--atomics") == 0) {
+      useAtomics = true;
+    } else if (strcmp(argv[i], "--pattern-permutation") == 0) {
+      pattern = AccessPattern::permutation;
+    }
+  }
+
+  const int rc = run_benchmark(indices, data, repeats, useAtomics, pattern);
+
+  Kokkos::finalize();
+
+  return rc;
+}
diff --git a/packages/kokkos/bin/hpcbind b/packages/kokkos/bin/hpcbind
index cb2af2c4b51ef1f25f8fd9c615b8c7f065d49457..b6db270128c1bd71dc85645beab854c391991962 100755
--- a/packages/kokkos/bin/hpcbind
+++ b/packages/kokkos/bin/hpcbind
@@ -36,8 +36,14 @@ fi
 ################################################################################
 declare -i HPCBIND_HAS_NVIDIA=0
 type nvidia-smi >/dev/null 2>&1
-HPCBIND_HAS_NVIDIA=$((!$?))
+HPCBIND_HAS_NVIDIA=$((! $?))
 
+################################################################################
+# Check if rocm-smi exist
+################################################################################
+declare -i HPCBIND_HAS_AMD=0
+type rocm-smi >/dev/null 2>&1
+HPCBIND_HAS_AMD=$((! $?))
 
 ################################################################################
 # Get visible gpu
@@ -45,11 +51,30 @@ HPCBIND_HAS_NVIDIA=$((!$?))
 declare -i NUM_GPUS=0
 HPCBIND_VISIBLE_GPUS=""
 if [[ ${HPCBIND_HAS_NVIDIA} -eq 1 ]]; then
-  NUM_GPUS=$(nvidia-smi -L | wc -l);
-  HPCBIND_HAS_NVIDIA=$((!$?))
+  nvidia-smi >/dev/null 2>&1
+  HPCBIND_HAS_NVIDIA=$((! $?))
   if [[ ${HPCBIND_HAS_NVIDIA} -eq 1 ]]; then
-    GPU_LIST="$( seq 0 $((NUM_GPUS-1)) )"
-    HPCBIND_VISIBLE_GPUS=${CUDA_VISIBLE_DEVICES:-${GPU_LIST}}
+    NUM_GPUS=$(nvidia-smi -L | wc -l);
+    HPCBIND_HAS_NVIDIA=$((! $?))
+    if [[ ${HPCBIND_HAS_NVIDIA} -eq 1 ]]; then
+      GPU_LIST="$( seq 0 $((NUM_GPUS-1)) )"
+      HPCBIND_VISIBLE_GPUS=${CUDA_VISIBLE_DEVICES:-${GPU_LIST}}
+    fi
+  fi
+fi
+
+if [[ ${HPCBIND_HAS_AMD} -eq 1 ]]; then
+  # rocm-smi doesn't have an error code if there is no hardware
+  # check for /sys/module/amdgpu/initstate instead
+  stat /sys/module/amdgpu/initstate >/dev/null 2>&1
+  HPCBIND_HAS_AMD=$((! $?))
+  if [[ ${HPCBIND_HAS_AMD} -eq 1 ]]; then
+    NUM_GPUS=$(rocm-smi -i --csv | sed '/^$/d' | tail -n +2 | wc -l);
+    HPCBIND_HAS_AMD=$((! $?))
+    if [[ ${HPCBIND_HAS_AMD} -eq 1 ]]; then
+      GPU_LIST="$( seq 0 $((NUM_GPUS-1)) )"
+      HPCBIND_VISIBLE_GPUS=${ROCR_VISIBLE_DEVICES:-${GPU_LIST}}
+    fi
   fi
 fi
 
@@ -80,7 +105,7 @@ elif [[ ! -z "${MV2_COMM_WORLD_RANK}" ]]; then
   HPCBIND_QUEUE_NAME="mvapich2"
   HPCBIND_QUEUE_RANK=${MV2_COMM_WORLD_RANK}
   HPCBIND_QUEUE_SIZE=${MV2_COMM_WORLD_SIZE}
-elif [[ ! -z "${SLURM_LOCAL_ID}" ]]; then
+elif [[ ! -z "${SLURM_LOCALID}" ]]; then
   HPCBIND_QUEUE_MAPPING=1
   HPCBIND_QUEUE_NAME="slurm"
   HPCBIND_QUEUE_RANK=${SLURM_PROCID}
@@ -101,8 +126,8 @@ fi
 function show_help {
   local cmd=$(basename "$0")
   echo "Usage: ${cmd} <options> -- command ..."
-  echo "  Set the process mask, OMP environment variables and CUDA environment"
-  echo "  variables to sane values if possible. Uses hwloc and nvidia-smi if"
+  echo "  Set the process mask, OMP environment variables and CUDA/ROCm environment"
+  echo "  variables to sane values if possible. Uses hwloc and nvidia-smi/rocm-smi if"
   echo "  available.  Will preserve the current process binding, so it is safe"
   echo "  to use with a queuing system or mpiexec."
   echo ""
@@ -116,10 +141,10 @@ function show_help {
   echo "  --distribute-partition=I"
   echo "                        Use the i'th partition (zero based)"
   echo "  --visible-gpus=<L>    Comma separated list of gpu ids"
-  echo "                        Default: CUDA_VISIBLE_DEVICES or all gpus in"
+  echo "                        Default: CUDA_VISIBLE_DEVICES/ROCR_VISIBLE_DEVICES or all gpus in"
   echo "                        sequential order"
   echo "  --ignore-queue        Ignore queue job id when choosing visible GPU and partition"
-  echo "  --no-gpu-mapping      Do not set CUDA_VISIBLE_DEVICES"
+  echo "  --no-gpu-mapping      Do not set CUDA_VISIBLE_DEVICES/ROCR_VISIBLE_DEVICES"
   echo "  --openmp=M.m          Set env variables for the given OpenMP version"
   echo "                        Default: 4.0"
   echo "  --openmp-ratio=N/D    Ratio of the cpuset to use for OpenMP"
@@ -525,13 +550,24 @@ fi
 ################################################################################
 
 if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
-  if [[ ${HPCBIND_QUEUE_MAPPING} -eq 0 ]]; then
-    declare -i GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS))
-    export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}"
-  else
-    declare -i MY_TASK_ID=$((HPCBIND_QUEUE_RANK * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION))
-    declare -i GPU_ID=$((MY_TASK_ID % NUM_GPUS))
-    export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}"
+  if [[ ${HPCBIND_HAS_NVIDIA} -eq 1 ]]; then
+    if [[ ${HPCBIND_QUEUE_MAPPING} -eq 0 ]]; then
+      declare -i GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS))
+      export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}"
+    else
+      declare -i MY_TASK_ID=$((HPCBIND_QUEUE_RANK * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION))
+      declare -i GPU_ID=$((MY_TASK_ID % NUM_GPUS))
+      export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}"
+    fi
+  elif [[ ${HPCBIND_HAS_AMD} -eq 1 ]]; then
+    if [[ ${HPCBIND_QUEUE_MAPPING} -eq 0 ]]; then
+      declare -i GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS))
+      export ROCR_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}"
+    else
+      declare -i MY_TASK_ID=$((HPCBIND_QUEUE_RANK * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION))
+      declare -i GPU_ID=$((MY_TASK_ID % NUM_GPUS))
+      export ROCR_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}"
+    fi
   fi
 fi
 
@@ -541,6 +577,7 @@ fi
 export HPCBIND_HWLOC_VERSION=${HPCBIND_HWLOC_VERSION}
 export HPCBIND_HAS_HWLOC=${HPCBIND_HAS_HWLOC}
 export HPCBIND_HAS_NVIDIA=${HPCBIND_HAS_NVIDIA}
+export HPCBIND_HAS_AMD=${HPCBIND_HAS_AMD}
 export HPCBIND_NUM_PUS=${HPCBIND_NUM_PUS}
 export HPCBIND_NUM_CORES=${HPCBIND_NUM_CORES}
 export HPCBIND_NUM_NUMAS=${HPCBIND_NUM_NUMAS}
@@ -555,8 +592,14 @@ else
   export HPCBIND_HWLOC_PARENT_CPUSET="${HPCBIND_HWLOC_PARENT_CPUSET}"
 fi
 export HPCBIND_HWLOC_PROC_BIND="${HPCBIND_PROC_BIND}"
-export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING}
-export HPCBIND_NVIDIA_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',')
+if [[ ${HPCBIND_HAS_NVIDIA} -eq 1 ]]; then
+  export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING}
+  export HPCBIND_NVIDIA_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',')
+fi
+if [[ ${HPCBIND_HAS_AMD} -eq 1 ]]; then
+  export HPCBIND_AMD_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING}
+  export HPCBIND_AMD_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',')
+fi
 export HPCBIND_OPENMP_VERSION="${HPCBIND_OPENMP_VERSION}"
 if [[ "${HPCBIND_QUEUE_NAME}" != "" ]]; then
   export HPCBIND_QUEUE_RANK=${HPCBIND_QUEUE_RANK}
@@ -580,6 +623,9 @@ if [[ ${HPCBIND_TEE} -eq 0 || ${HPCBIND_VERBOSE} -eq 0 ]]; then
   echo "${TMP_ENV}" | grep -E "^HWLOC_" >> ${HPCBIND_LOG}
   echo "[CUDA]" >> ${HPCBIND_LOG}
   echo "${TMP_ENV}" | grep -E "^CUDA_" >> ${HPCBIND_LOG}
+  echo "[ROCM]" >> ${HPCBIND_LOG}
+  echo "${TMP_ENV}" | grep -E "^ROCM_" >> ${HPCBIND_LOG}
+  echo "${TMP_ENV}" | grep -E "^ROCR_" >> ${HPCBIND_LOG}
   echo "[OPENMP]" >> ${HPCBIND_LOG}
   echo "${TMP_ENV}" | grep -E "^OMP_" >> ${HPCBIND_LOG}
   echo "[GOMP] (gcc, g++, and gfortran)" >> ${HPCBIND_LOG}
@@ -602,6 +648,9 @@ else
   echo "${TMP_ENV}" | grep -E "^HWLOC_" > >(tee -a ${HPCBIND_LOG})
   echo "[CUDA]" > >(tee -a ${HPCBIND_LOG})
   echo "${TMP_ENV}" | grep -E "^CUDA_" > >(tee -a ${HPCBIND_LOG})
+  echo "[ROCM]" > >(tee -a ${HPCBIND_LOG})
+  echo "${TMP_ENV}" | grep -E "^ROCM_" > >(tee -a ${HPCBIND_LOG})
+  echo "${TMP_ENV}" | grep -E "^ROCR_" > >(tee -a ${HPCBIND_LOG})
   echo "[OPENMP]" > >(tee -a ${HPCBIND_LOG})
   echo "${TMP_ENV}" | grep -E "^OMP_" > >(tee -a ${HPCBIND_LOG})
   echo "[GOMP] (gcc, g++, and gfortran)" > >(tee -a ${HPCBIND_LOG})
diff --git a/packages/kokkos/bin/kokkos_launch_compiler b/packages/kokkos/bin/kokkos_launch_compiler
index 37c17956a1b46830172dc56c7dc68bf6dd50014a..d1f8896f91b1d5b4b5210c21c26ca28cbcd45240 100755
--- a/packages/kokkos/bin/kokkos_launch_compiler
+++ b/packages/kokkos/bin/kokkos_launch_compiler
@@ -1,4 +1,4 @@
-#!/bin/bash -e
+#!/usr/bin/env bash
 #
 #   This script allows CMAKE_CXX_COMPILER to be a standard
 #   C++ compiler and Kokkos sets RULE_LAUNCH_COMPILE and
@@ -13,6 +13,8 @@
 #   $1 are 'ar', 'cmake', etc. during the linking phase
 #
 
+set -e
+
 # emit a message about the underlying command executed
 : ${DEBUG:=0}
 : ${KOKKOS_DEBUG_LAUNCH_COMPILER:=${DEBUG}}
diff --git a/packages/kokkos/bin/nvcc_wrapper b/packages/kokkos/bin/nvcc_wrapper
index 0c55651460ad9570fd0815c078e833e39c6d99aa..c1400872402bae59fd97a698d9f8ed243a74372a 100755
--- a/packages/kokkos/bin/nvcc_wrapper
+++ b/packages/kokkos/bin/nvcc_wrapper
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 #
 # This shell script (nvcc_wrapper) wraps both the host compiler and
 # NVCC, if you are building legacy C or C++ code with CUDA enabled.
@@ -407,7 +407,7 @@ do
   -Woverloaded-virtual)
     ;;
   #strip -Xcompiler because we add it
-  -Xcompiler)
+  -Xcompiler|--compiler-options)
     if [[ $2 != "-o" ]]; then
       if [ $first_xcompiler_arg -eq 1 ]; then
         xcompiler_args="$2"
diff --git a/packages/kokkos/cmake/Dependencies.cmake b/packages/kokkos/cmake/Dependencies.cmake
index 23b473ce2471c44c67d2fc4b004500b293dd9ef5..611c089b2e3feec2ec79228360f93c242fc055e2 100644
--- a/packages/kokkos/cmake/Dependencies.cmake
+++ b/packages/kokkos/cmake/Dependencies.cmake
@@ -1,10 +1,6 @@
 TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
-  SUBPACKAGES_DIRS_CLASSIFICATIONS_OPTREQS
-    #SubPackageName       Directory         Class    Req/Opt
-    #
-    # New Kokkos subpackages:
-    Core                  core              PS       REQUIRED
-    Containers            containers        PS       OPTIONAL
-    Algorithms            algorithms        PS       OPTIONAL
-    Simd                  simd              PT       OPTIONAL
+  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib
+  TEST_OPTIONAL_TPLS CUSPARSE
   )
+
+TRIBITS_TPL_TENTATIVELY_ENABLE(DLlib)
diff --git a/packages/kokkos/cmake/KokkosConfigCommon.cmake.in b/packages/kokkos/cmake/KokkosConfigCommon.cmake.in
index bb5ce5ff8191fc9f3d75a92d4c1aee3df4258141..8d5ef0de42f9440070e5772d31fbf9324be7e7a3 100644
--- a/packages/kokkos/cmake/KokkosConfigCommon.cmake.in
+++ b/packages/kokkos/cmake/KokkosConfigCommon.cmake.in
@@ -4,12 +4,40 @@ SET(Kokkos_TPLS @KOKKOS_ENABLED_TPLS@)
 SET(Kokkos_ARCH @KOKKOS_ENABLED_ARCH_LIST@)
 SET(Kokkos_CXX_COMPILER "@CMAKE_CXX_COMPILER@")
 SET(Kokkos_CXX_COMPILER_ID "@KOKKOS_CXX_COMPILER_ID@")
+SET(Kokkos_CXX_COMPILER_VERSION "@KOKKOS_CXX_COMPILER_VERSION@")
 SET(Kokkos_CXX_STANDARD @KOKKOS_CXX_STANDARD@)
 
-# These are needed by KokkosKernels
+# Required to be a TriBITS-compliant external package
+IF(NOT TARGET Kokkos::all_libs)
+  # CMake Error at <prefix>/lib/cmake/Kokkos/KokkosConfigCommon.cmake:10 (ADD_LIBRARY):
+  #   ADD_LIBRARY cannot create ALIAS target "Kokkos::all_libs" because target
+  #   "Kokkos::kokkos" is imported but not globally visible.
+  IF(CMAKE_VERSION VERSION_LESS "3.18")
+    SET_TARGET_PROPERTIES(Kokkos::kokkos PROPERTIES IMPORTED_GLOBAL ON)
+  ENDIF()
+  ADD_LIBRARY(Kokkos::all_libs ALIAS Kokkos::kokkos)
+ENDIF()
+
+# Export Kokkos_ENABLE_<BACKEND> for each backend that was enabled.
+# NOTE: "Devices" is a little bit of a misnomer here.  These are really
+# backends, e.g. Kokkos_ENABLE_OPENMP, Kokkos_ENABLE_CUDA, Kokkos_ENABLE_HIP,
+# or Kokkos_ENABLE_SYCL.
 FOREACH(DEV ${Kokkos_DEVICES})
   SET(Kokkos_ENABLE_${DEV} ON)
 ENDFOREACH()
+# Export relevant Kokkos_ENABLE<OPTION> variables, e.g.
+# Kokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE, Kokkos_ENABLE_DEBUG, etc.
+FOREACH(OPT ${Kokkos_OPTIONS})
+  SET(Kokkos_ENABLE_${OPT} ON)
+ENDFOREACH()
+
+IF(Kokkos_ENABLE_CUDA)
+  SET(Kokkos_CUDA_ARCHITECTURES @KOKKOS_CUDA_ARCHITECTURES@)
+ENDIF()
+
+IF(Kokkos_ENABLE_HIP)
+  SET(Kokkos_HIP_ARCHITECTURES @KOKKOS_HIP_ARCHITECTURES@)
+ENDIF()
 
 IF(NOT Kokkos_FIND_QUIETLY)
   MESSAGE(STATUS "Enabled Kokkos devices: ${Kokkos_DEVICES}")
diff --git a/packages/kokkos/cmake/KokkosCore_config.h.in b/packages/kokkos/cmake/KokkosCore_config.h.in
index cb1affa24c3ee257585e7b3de00a9409e39c9226..bec59ebd034939d3f819f2990a5c1494c517772b 100644
--- a/packages/kokkos/cmake/KokkosCore_config.h.in
+++ b/packages/kokkos/cmake/KokkosCore_config.h.in
@@ -26,6 +26,7 @@
 #cmakedefine KOKKOS_ENABLE_MEMKIND
 #cmakedefine KOKKOS_ENABLE_LIBRT
 #cmakedefine KOKKOS_ENABLE_SYCL
+#cmakedefine KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED
 
 /* General Settings */
 #cmakedefine KOKKOS_ENABLE_CXX17
@@ -34,23 +35,21 @@
 
 #cmakedefine KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
 #cmakedefine KOKKOS_ENABLE_CUDA_UVM
-#cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA
+#cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA  // deprecated
 #cmakedefine KOKKOS_ENABLE_CUDA_CONSTEXPR
 #cmakedefine KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC
 #cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
 #cmakedefine KOKKOS_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS
-#cmakedefine KOKKOS_ENABLE_HPX_ASYNC_DISPATCH
+#cmakedefine KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH
 #cmakedefine KOKKOS_ENABLE_DEBUG
 #cmakedefine KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK
 #cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
-#cmakedefine KOKKOS_ENABLE_PROFILING_LOAD_PRINT
 #cmakedefine KOKKOS_ENABLE_TUNING
 #cmakedefine KOKKOS_ENABLE_DEPRECATED_CODE_3
 #cmakedefine KOKKOS_ENABLE_DEPRECATED_CODE_4
 #cmakedefine KOKKOS_ENABLE_DEPRECATION_WARNINGS
 #cmakedefine KOKKOS_ENABLE_LARGE_MEM_TESTS
 #cmakedefine KOKKOS_ENABLE_COMPLEX_ALIGN
-#cmakedefine KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
 #cmakedefine KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION  // deprecated
 #cmakedefine KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
 #cmakedefine KOKKOS_ENABLE_IMPL_MDSPAN
@@ -62,6 +61,7 @@
 #cmakedefine KOKKOS_ENABLE_LIBDL
 #cmakedefine KOKKOS_ENABLE_LIBQUADMATH
 #cmakedefine KOKKOS_IMPL_CUDA_CLANG_WORKAROUND
+#cmakedefine KOKKOS_ENABLE_ONEDPL
 
 #cmakedefine KOKKOS_ARCH_SSE42
 #cmakedefine KOKKOS_ARCH_ARMV80
@@ -69,10 +69,10 @@
 #cmakedefine KOKKOS_ARCH_ARMV81
 #cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX2
 #cmakedefine KOKKOS_ARCH_A64FX
-#cmakedefine KOKKOS_ARCH_AMD_AVX2
 #cmakedefine KOKKOS_ARCH_AVX
 #cmakedefine KOKKOS_ARCH_AVX2
 #cmakedefine KOKKOS_ARCH_AVX512XEON
+#cmakedefine KOKKOS_ARCH_ARM_NEON
 #cmakedefine KOKKOS_ARCH_KNC
 #cmakedefine KOKKOS_ARCH_AVX512MIC
 #cmakedefine KOKKOS_ARCH_POWER7
@@ -111,10 +111,19 @@
 #cmakedefine KOKKOS_ARCH_AMD_ZEN
 #cmakedefine KOKKOS_ARCH_AMD_ZEN2
 #cmakedefine KOKKOS_ARCH_AMD_ZEN3
-#cmakedefine KOKKOS_ARCH_VEGA
-#cmakedefine KOKKOS_ARCH_VEGA906
-#cmakedefine KOKKOS_ARCH_VEGA908
-#cmakedefine KOKKOS_ARCH_VEGA90A
-#cmakedefine KOKKOS_ARCH_NAVI
-#cmakedefine KOKKOS_ARCH_NAVI1030
-#cmakedefine KOKKOS_ARCH_NAVI1100
+#cmakedefine KOKKOS_ARCH_AMD_GFX906
+#cmakedefine KOKKOS_ARCH_AMD_GFX908
+#cmakedefine KOKKOS_ARCH_AMD_GFX90A
+#cmakedefine KOKKOS_ARCH_AMD_GFX942
+#cmakedefine KOKKOS_ARCH_AMD_GFX1030
+#cmakedefine KOKKOS_ARCH_AMD_GFX1100
+#cmakedefine KOKKOS_ARCH_AMD_GPU
+#cmakedefine KOKKOS_ARCH_VEGA // deprecated
+#cmakedefine KOKKOS_ARCH_VEGA906 // deprecated
+#cmakedefine KOKKOS_ARCH_VEGA908 // deprecated
+#cmakedefine KOKKOS_ARCH_VEGA90A // deprecated
+#cmakedefine KOKKOS_ARCH_NAVI // deprecated
+#cmakedefine KOKKOS_ARCH_NAVI1030 // deprecated
+#cmakedefine KOKKOS_ARCH_NAVI1100 // deprecated
+
+#cmakedefine KOKKOS_IMPL_32BIT
diff --git a/packages/kokkos/cmake/Kokkos_Version_Info.cpp.in b/packages/kokkos/cmake/Kokkos_Version_Info.cpp.in
index e9fabe8177e67d76c53f422e36e3057a9880a0fa..3665282e7b69c13bf85378e6c15ef3daa8f25674 100644
--- a/packages/kokkos/cmake/Kokkos_Version_Info.cpp.in
+++ b/packages/kokkos/cmake/Kokkos_Version_Info.cpp.in
@@ -19,11 +19,12 @@
 namespace Kokkos {
 namespace Impl {
 
-std::string GIT_BRANCH             = "@GIT_BRANCH@";
-std::string GIT_COMMIT_HASH        = "@GIT_COMMIT_HASH@";
-std::string GIT_CLEAN_STATUS       = "@GIT_CLEAN_STATUS@";
-std::string GIT_COMMIT_DESCRIPTION = "@GIT_COMMIT_DESCRIPTION@";
-std::string GIT_COMMIT_DATE        = "@GIT_COMMIT_DATE@";
+std::string GIT_BRANCH       = R"branch(@GIT_BRANCH@)branch";
+std::string GIT_COMMIT_HASH  = "@GIT_COMMIT_HASH@";
+std::string GIT_CLEAN_STATUS = "@GIT_CLEAN_STATUS@";
+std::string GIT_COMMIT_DESCRIPTION =
+    R"message(@GIT_COMMIT_DESCRIPTION@)message";
+std::string GIT_COMMIT_DATE = "@GIT_COMMIT_DATE@";
 
 }  // namespace Impl
 
diff --git a/packages/kokkos/cmake/Kokkos_Version_Info.hpp b/packages/kokkos/cmake/Kokkos_Version_Info.hpp
index ba605a301db4a7e3858c181781a6d331ef7c4517..831247115e27d6c006ced21d197174d47f078e7b 100644
--- a/packages/kokkos/cmake/Kokkos_Version_Info.hpp
+++ b/packages/kokkos/cmake/Kokkos_Version_Info.hpp
@@ -14,8 +14,8 @@
 //
 //@HEADER
 
-#ifndef GIT_VERSION_H
-#define GIT_VERSION_H
+#ifndef KOKKOS_GIT_VERSION_INFO_H
+#define KOKKOS_GIT_VERSION_INFO_H
 
 #include <string>
 
diff --git a/packages/kokkos/cmake/Modules/FindTPLHPX.cmake b/packages/kokkos/cmake/Modules/FindTPLHPX.cmake
index 5636a9bb66b114dab18415da08065777251e9ee0..d7b54fb9c9ab79d810adbebb84f1a5bf2104c351 100644
--- a/packages/kokkos/cmake/Modules/FindTPLHPX.cmake
+++ b/packages/kokkos/cmake/Modules/FindTPLHPX.cmake
@@ -1,5 +1,5 @@
 
-FIND_PACKAGE(HPX REQUIRED 1.7.0)
+FIND_PACKAGE(HPX REQUIRED 1.8.0)
 #as of right now, HPX doesn't export correctly
 #so let's convert it to an interface target
 KOKKOS_CREATE_IMPORTED_TPL(HPX INTERFACE
diff --git a/packages/kokkos/cmake/Modules/FindTPLLIBNUMA.cmake b/packages/kokkos/cmake/Modules/FindTPLLIBNUMA.cmake
deleted file mode 100644
index 811db5851b9ee359ad996a743bf8a0ac283512f6..0000000000000000000000000000000000000000
--- a/packages/kokkos/cmake/Modules/FindTPLLIBNUMA.cmake
+++ /dev/null
@@ -1 +0,0 @@
-KOKKOS_FIND_IMPORTED(LIBNUMA HEADER numa.h   LIBRARY numa)
diff --git a/packages/kokkos/cmake/Modules/FindTPLONEDPL.cmake b/packages/kokkos/cmake/Modules/FindTPLONEDPL.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..01791cff443c83d8f3d4887fc664ebce8780fbd8
--- /dev/null
+++ b/packages/kokkos/cmake/Modules/FindTPLONEDPL.cmake
@@ -0,0 +1,46 @@
+INCLUDE(CheckIncludeFileCXX)
+CHECK_INCLUDE_FILE_CXX(oneapi/dpl/execution KOKKOS_COMPILER_HAS_ONEDPL_EXECUTION_HEADER)
+CHECK_INCLUDE_FILE_CXX(oneapi/dpl/algorithm KOKKOS_COMPILER_HAS_ONEDPL_ALGORITHM_HEADER)
+
+INCLUDE(CheckCXXSourceCompiles)
+CHECK_CXX_SOURCE_COMPILES("
+  #include <iostream>
+
+  int main()
+  {
+    #if defined(_GLIBCXX_RELEASE) && (_GLIBCXX_RELEASE == 9 || _GLIBCXX_RELEASE == 10)
+      static_assert(false);
+    #endif
+    return 0;
+  }"
+  KOKKOS_NO_TBB_CONFLICT)
+
+IF (KOKKOS_COMPILER_HAS_ONEDPL_EXECUTION_HEADER AND KOKKOS_COMPILER_HAS_ONEDPL_ALGORITHM_HEADER)
+  IF(KOKKOS_NO_TBB_CONFLICT)
+    KOKKOS_CREATE_IMPORTED_TPL(
+      ONEDPL INTERFACE
+    )
+  ELSE()
+    KOKKOS_CREATE_IMPORTED_TPL(
+      ONEDPL INTERFACE
+      # https://stackoverflow.com/questions/67923287/how-to-resolve-no-member-named-task-in-namespace-tbb-error-when-using-oned/
+      COMPILE_DEFINITIONS PSTL_USE_PARALLEL_POLICIES=0 _GLIBCXX_USE_TBB_PAR_BACKEND=0
+    )
+  ENDIF()
+ELSE()
+  FIND_PACKAGE(oneDPL REQUIRED)
+
+  IF(KOKKOS_NO_TBB_CONFLICT)
+    KOKKOS_CREATE_IMPORTED_TPL(
+      ONEDPL INTERFACE
+      LINK_LIBRARIES oneDPL
+    )
+  ELSE()
+    KOKKOS_CREATE_IMPORTED_TPL(
+      ONEDPL INTERFACE
+      LINK_LIBRARIES oneDPL
+      # https://stackoverflow.com/questions/67923287/how-to-resolve-no-member-named-task-in-namespace-tbb-error-when-using-oned/
+      COMPILE_DEFINITIONS PSTL_USE_PARALLEL_POLICIES=0 _GLIBCXX_USE_TBB_PAR_BACKEND=0
+    )
+  ENDIF()
+ENDIF()
diff --git a/packages/kokkos/cmake/Modules/FindTPLROCM.cmake b/packages/kokkos/cmake/Modules/FindTPLROCM.cmake
index aacdfcaf19cbb77dba00067bb818d9c065dbfe8b..f796737f5b29cdbffd4eadf2b23c7321d7a607fb 100644
--- a/packages/kokkos/cmake/Modules/FindTPLROCM.cmake
+++ b/packages/kokkos/cmake/Modules/FindTPLROCM.cmake
@@ -3,15 +3,27 @@ include(FindPackageHandleStandardArgs)
 FIND_LIBRARY(AMD_HIP_LIBRARY amdhip64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib)
 FIND_LIBRARY(HSA_RUNTIME_LIBRARY hsa-runtime64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib)
 
+# FIXME_HIP Starting with ROCm 5.5 it is not necessary to link againt clang_rt.
+# We keep the code as is for now because it is hard to find the version of ROCM
+# found.
 # clang_rt.builtins is necessary to use half precision. The following code to
 # find clang_rt.buitins is based on
-# https://github.com/ROCm-Developer-Tools/HIP/blob/develop/hip-lang-config.cmake.in#L99-L111
-file(GLOB_RECURSE CLANG_RT_DIR "$ENV{ROCM_PATH}/llvm/lib/clang/*/lib/*/*clang_rt.builtins*")
-FIND_LIBRARY(CLANG_RT_LIBRARY
-  NAMES
-  clang_rt.builtins
-  clang_rt.builtins-x86_64
-  PATHS "${CLANG_RT_DIR}/..")
+# https://github.com/ROCm-Developer-Tools/hipamd/blob/d1e0ee98a0f3d79f7bf43295f82d0053a69ec742/hip-config.cmake.in#L241
+# NOTE: Per the above, we still search for the clang-rt library,
+# but use the user's specified compiler to find the library to avoid use of
+# environment variables / relative paths.
+execute_process(
+  COMMAND ${CMAKE_CXX_COMPILER} -print-libgcc-file-name --rtlib=compiler-rt
+  OUTPUT_VARIABLE CLANG_RT_LIBRARY
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+  RESULT_VARIABLE CLANG_RT_CHECK)
+
+if( NOT "${CLANG_RT_CHECK}" STREQUAL "0" )
+  # if the above failed, we delete CLANG_RT_LIBRARY to make the args check
+  # below fail
+  unset(CLANG_RT_LIBRARY)
+endif()
+
 
 find_package_handle_standard_args(TPLROCM DEFAULT_MSG AMD_HIP_LIBRARY HSA_RUNTIME_LIBRARY CLANG_RT_LIBRARY)
 
diff --git a/packages/kokkos/cmake/build_env_info.cmake b/packages/kokkos/cmake/build_env_info.cmake
index 2cd169cba417e9194f8208a59cd8f590f0b0612f..0eeb6372455bd0e0ff6ece535d2724b3648e936c 100644
--- a/packages/kokkos/cmake/build_env_info.cmake
+++ b/packages/kokkos/cmake/build_env_info.cmake
@@ -110,6 +110,7 @@ FUNCTION(check_git_setup)
 
   add_library(impl_git_version ${CMAKE_BINARY_DIR}/generated/Kokkos_Version_Info.cpp)
   target_include_directories(impl_git_version PUBLIC ${CMAKE_BINARY_DIR}/generated)
+  target_compile_features(impl_git_version PRIVATE cxx_raw_string_literals)
   add_dependencies(impl_git_version AlwaysCheckGit)
 
   check_git_version()
diff --git a/packages/kokkos/cmake/fake_tribits.cmake b/packages/kokkos/cmake/fake_tribits.cmake
index 71e85e915c6160ea5dee9c9d6a32c9852a0c39ba..4c5331ec793b28b9d6e1343ae2b0d746dd785242 100644
--- a/packages/kokkos/cmake/fake_tribits.cmake
+++ b/packages/kokkos/cmake/fake_tribits.cmake
@@ -11,22 +11,7 @@ FUNCTION(ASSERT_DEFINED VARS)
   ENDFOREACH()
 ENDFUNCTION()
 
-MACRO(KOKKOS_ADD_OPTION_AND_DEFINE USER_OPTION_NAME MACRO_DEFINE_NAME DOCSTRING DEFAULT_VALUE )
-SET( ${USER_OPTION_NAME} "${DEFAULT_VALUE}" CACHE BOOL "${DOCSTRING}" )
-IF(NOT ${MACRO_DEFINE_NAME} STREQUAL "")
-  IF(${USER_OPTION_NAME})
-    GLOBAL_SET(${MACRO_DEFINE_NAME} ON)
-  ELSE()
-    GLOBAL_SET(${MACRO_DEFINE_NAME} OFF)
-  ENDIF()
-ENDIF()
-ENDMACRO()
-
-MACRO(GLOBAL_OVERWRITE VARNAME VALUE TYPE)
-  SET(${VARNAME} ${VALUE} CACHE ${TYPE} "" FORCE)
-ENDMACRO()
-
-IF (NOT KOKKOS_HAS_TRILINOS)
+IF(NOT KOKKOS_HAS_TRILINOS)
 MACRO(APPEND_GLOB VAR)
   FILE(GLOB LOCAL_TMP_VAR ${ARGN})
   LIST(APPEND ${VAR} ${LOCAL_TMP_VAR})
@@ -40,35 +25,7 @@ MACRO(PREPEND_GLOBAL_SET VARNAME)
   ASSERT_DEFINED(${VARNAME})
   GLOBAL_SET(${VARNAME} ${ARGN} ${${VARNAME}})
 ENDMACRO()
-
-MACRO(PREPEND_TARGET_SET VARNAME TARGET_NAME TYPE)
-  IF(TYPE STREQUAL "REQUIRED")
-    SET(REQUIRED TRUE)
-  ELSE()
-    SET(REQUIRED FALSE)
-  ENDIF()
-  IF(TARGET ${TARGET_NAME})
-    PREPEND_GLOBAL_SET(${VARNAME} ${TARGET_NAME})
-  ELSE()
-    IF(REQUIRED)
-      MESSAGE(FATAL_ERROR "Missing dependency ${TARGET_NAME}")
-    ENDIF()
-  ENDIF()
-ENDMACRO()
-endif()
-
-
-FUNCTION(KOKKOS_CONFIGURE_FILE  PACKAGE_NAME_CONFIG_FILE)
-  if (KOKKOS_HAS_TRILINOS)
-    TRIBITS_CONFIGURE_FILE(${PACKAGE_NAME_CONFIG_FILE})
-  else()
-    # Configure the file
-    CONFIGURE_FILE(
-      ${PACKAGE_SOURCE_DIR}/cmake/${PACKAGE_NAME_CONFIG_FILE}.in
-      ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME_CONFIG_FILE}
-      )
-  endif()
-ENDFUNCTION()
+ENDIF()
 
 MACRO(ADD_INTERFACE_LIBRARY LIB_NAME)
   FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp "")
@@ -213,15 +170,6 @@ else()
 endif()
 ENDMACRO()
 
-
-MACRO(KOKKOS_EXCLUDE_AUTOTOOLS_FILES)
-  if (KOKKOS_HAS_TRILINOS)
-    TRIBITS_EXCLUDE_AUTOTOOLS_FILES()
-  else()
-    #do nothing
-  endif()
-ENDMACRO()
-
 FUNCTION(KOKKOS_LIB_TYPE LIB RET)
 GET_TARGET_PROPERTY(PROP ${LIB} TYPE)
 IF (${PROP} STREQUAL "INTERFACE_LIBRARY")
diff --git a/packages/kokkos/cmake/kokkos_arch.cmake b/packages/kokkos/cmake/kokkos_arch.cmake
index eb7c271b156e7345de39331968ef27d243fe250d..bccf674d7633a01cc896f97d4d26f61097e930cb 100644
--- a/packages/kokkos/cmake/kokkos_arch.cmake
+++ b/packages/kokkos/cmake/kokkos_arch.cmake
@@ -89,14 +89,23 @@ KOKKOS_ARCH_OPTION(AMPERE86        GPU  "NVIDIA Ampere generation CC 8.6"  "KOKK
 KOKKOS_ARCH_OPTION(ADA89           GPU  "NVIDIA Ada generation CC 8.9"     "KOKKOS_SHOW_CUDA_ARCHS")
 KOKKOS_ARCH_OPTION(HOPPER90        GPU  "NVIDIA Hopper generation CC 9.0"  "KOKKOS_SHOW_CUDA_ARCHS")
 
-IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET)
+IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_OPENACC OR Kokkos_ENABLE_SYCL)
   SET(KOKKOS_SHOW_HIP_ARCHS ON)
 ENDIF()
 
 # AMD archs ordered in decreasing priority of autodetection
-LIST(APPEND SUPPORTED_AMD_GPUS       MI200    MI100    MI50/60  RX7900XTX  V620/W6800)
-LIST(APPEND SUPPORTED_AMD_ARCHS      VEGA90A  VEGA908  VEGA906  NAVI1100   NAVI1030)
-LIST(APPEND CORRESPONDING_AMD_FLAGS  gfx90a   gfx908   gfx906   gfx1100    gfx1030)
+LIST(APPEND SUPPORTED_AMD_GPUS       MI300)
+LIST(APPEND SUPPORTED_AMD_ARCHS      AMD_GFX942)
+LIST(APPEND CORRESPONDING_AMD_FLAGS  gfx942)
+LIST(APPEND SUPPORTED_AMD_GPUS       MI200    MI200       MI100    MI100)
+LIST(APPEND SUPPORTED_AMD_ARCHS      VEGA90A  AMD_GFX90A  VEGA908  AMD_GFX908)
+LIST(APPEND CORRESPONDING_AMD_FLAGS  gfx90a   gfx90a      gfx908   gfx908)
+LIST(APPEND SUPPORTED_AMD_GPUS       MI50/60  MI50/60)
+LIST(APPEND SUPPORTED_AMD_ARCHS      VEGA906  AMD_GFX906)
+LIST(APPEND CORRESPONDING_AMD_FLAGS  gfx906   gfx906)
+LIST(APPEND SUPPORTED_AMD_GPUS       RX7900XTX  RX7900XTX    V620/W6800  V620/W6800)
+LIST(APPEND SUPPORTED_AMD_ARCHS      NAVI1100   AMD_GFX1100  NAVI1030    AMD_GFX1030)
+LIST(APPEND CORRESPONDING_AMD_FLAGS  gfx1100    gfx1100      gfx1030     gfx1030)
 
 #FIXME CAN BE REPLACED WITH LIST_ZIP IN CMAKE 3.17
 FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS)
@@ -120,7 +129,7 @@ KOKKOS_ARCH_OPTION(INTEL_PVC       GPU  "Intel GPU Ponte Vecchio"
 
 IF(KOKKOS_ENABLE_COMPILER_WARNINGS)
   SET(COMMON_WARNINGS
-    "-Wall" "-Wunused-parameter" "-Wshadow" "-pedantic"
+    "-Wall" "-Wextra" "-Wunused-parameter" "-Wshadow" "-pedantic"
     "-Wsign-compare" "-Wtype-limits" "-Wuninitialized")
 
   # NOTE KOKKOS_ prefixed variable (all uppercase) is not set yet because TPLs are processed after ARCH
@@ -129,11 +138,6 @@ IF(KOKKOS_ENABLE_COMPILER_WARNINGS)
     LIST(REMOVE_ITEM COMMON_WARNINGS "-pedantic")
   ENDIF()
 
-  # OpenMPTarget compilers give erroneous warnings about sign comparison in loops
-  IF(KOKKOS_ENABLE_OPENMPTARGET)
-    LIST(REMOVE_ITEM COMMON_WARNINGS "-Wsign-compare")
-  ENDIF()
-
   # NVHPC compiler does not support -Wtype-limits.
   IF(KOKKOS_ENABLE_OPENACC)
     IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
@@ -167,11 +171,9 @@ ENDIF()
 #clear anything that might be in the cache
 GLOBAL_SET(KOKKOS_CUDA_OPTIONS)
 # Construct the Makefile options
-IF (KOKKOS_ENABLE_CUDA_LAMBDA)
-  IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
-    GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-expt-extended-lambda")
-    GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-Wext-lambda-captures-this")
-  ENDIF()
+IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
+  GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-extended-lambda")
+  GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-Wext-lambda-captures-this")
 ENDIF()
 
 IF (KOKKOS_ENABLE_CUDA_CONSTEXPR)
@@ -217,7 +219,10 @@ GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS)
 IF(KOKKOS_ENABLE_HIP)
   SET(AMDGPU_ARCH_FLAG "--offload-arch")
   IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC)
-    GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS -x hip)
+    IF (NOT CMAKE_CXX_STANDARD)
+      MESSAGE(FATAL_ERROR "Kokkos requires CMAKE_CXX_STANDARD to set to 17 or higher")
+    ENDIF()
+    GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS -xhip)
     IF(DEFINED ENV{ROCM_PATH})
       GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS --rocm-path=$ENV{ROCM_PATH})
     ENDIF()
@@ -230,13 +235,21 @@ IF(KOKKOS_ARCH_NATIVE)
     MESSAGE(FATAL_ERROR "MSVC doesn't support ARCH_NATIVE!")
   ENDIF()
 
+  STRING(TOUPPER "${CMAKE_SYSTEM_PROCESSOR}" KOKKOS_UC_SYSTEM_PROCESSOR)
+  IF(KOKKOS_UC_SYSTEM_PROCESSOR MATCHES "(X86)|(AMD64)")
+    SET(KOKKOS_NATIVE_FLAGS "-march=native;-mtune=native")
+  ELSE()
+    SET(KOKKOS_NATIVE_FLAGS "-mcpu=native")
+  ENDIF()
   COMPILER_SPECIFIC_FLAGS(
     COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
-    DEFAULT -march=native -mtune=native
+    NVHPC   -tp=native
+    DEFAULT ${KOKKOS_NATIVE_FLAGS}
   )
 ENDIF()
 
 IF (KOKKOS_ARCH_ARMV80)
+  SET(KOKKOS_ARCH_ARM_NEON ON)
   COMPILER_SPECIFIC_FLAGS(
     COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     Cray    NO-VALUE-SPECIFIED
@@ -247,6 +260,7 @@ IF (KOKKOS_ARCH_ARMV80)
 ENDIF()
 
 IF (KOKKOS_ARCH_ARMV81)
+  SET(KOKKOS_ARCH_ARM_NEON ON)
   COMPILER_SPECIFIC_FLAGS(
     COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     Cray    NO-VALUE-SPECIFIED
@@ -257,6 +271,7 @@ IF (KOKKOS_ARCH_ARMV81)
 ENDIF()
 
 IF (KOKKOS_ARCH_ARMV8_THUNDERX)
+  SET(KOKKOS_ARCH_ARM_NEON ON)
   SET(KOKKOS_ARCH_ARMV80 ON) #Not a cache variable
   COMPILER_SPECIFIC_FLAGS(
     COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
@@ -268,6 +283,7 @@ IF (KOKKOS_ARCH_ARMV8_THUNDERX)
 ENDIF()
 
 IF (KOKKOS_ARCH_ARMV8_THUNDERX2)
+  SET(KOKKOS_ARCH_ARM_NEON ON)
   SET(KOKKOS_ARCH_ARMV81 ON) #Not a cache variable
   COMPILER_SPECIFIC_FLAGS(
     COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
@@ -279,6 +295,7 @@ IF (KOKKOS_ARCH_ARMV8_THUNDERX2)
 ENDIF()
 
 IF (KOKKOS_ARCH_A64FX)
+  SET(KOKKOS_ARCH_ARM_NEON ON)
   COMPILER_SPECIFIC_FLAGS(
     COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     Clang   -march=armv8.2-a+sve -msve-vector-bits=512
@@ -298,7 +315,7 @@ IF (KOKKOS_ARCH_ZEN)
     DEFAULT -march=znver1 -mtune=znver1
   )
   SET(KOKKOS_ARCH_AMD_ZEN  ON)
-  SET(KOKKOS_ARCH_AMD_AVX2 ON)
+  SET(KOKKOS_ARCH_AVX2 ON)
 ENDIF()
 
 IF (KOKKOS_ARCH_ZEN2)
@@ -310,7 +327,7 @@ IF (KOKKOS_ARCH_ZEN2)
     DEFAULT -march=znver2 -mtune=znver2
   )
   SET(KOKKOS_ARCH_AMD_ZEN2 ON)
-  SET(KOKKOS_ARCH_AMD_AVX2 ON)
+  SET(KOKKOS_ARCH_AVX2 ON)
 ENDIF()
 
 IF (KOKKOS_ARCH_ZEN3)
@@ -322,7 +339,7 @@ IF (KOKKOS_ARCH_ZEN3)
     DEFAULT -march=znver3 -mtune=znver3
   )
   SET(KOKKOS_ARCH_AMD_ZEN3 ON)
-  SET(KOKKOS_ARCH_AMD_AVX2 ON)
+  SET(KOKKOS_ARCH_AVX2 ON)
 ENDIF()
 
 IF (KOKKOS_ARCH_WSM)
@@ -406,7 +423,6 @@ IF (KOKKOS_ARCH_SKL)
 ENDIF()
 
 IF (KOKKOS_ARCH_SKX)
-  #avx512-xeon
   SET(KOKKOS_ARCH_AVX512XEON ON)
   COMPILER_SPECIFIC_FLAGS(
     COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
@@ -472,6 +488,53 @@ IF (KOKKOS_ARCH_POWER9)
   )
 ENDIF()
 
+# If Kokkos_ARCH_NATIVE is enabled, we are trying to autodetect
+# the SIMD capabilities based on compiler macros.
+IF (KOKKOS_ARCH_NATIVE)
+  # Make sure to rerun the checks if compile options have changed
+  IF(NOT "${KOKKOS_COMPILE_OPTIONS}" STREQUAL "${KOKKOS_COMPILE_OPTIONS_SAVED}")
+    SET(KOKKOS_COMPILE_OPTIONS_SAVED "${KOKKOS_COMPILE_OPTIONS}" CACHE INTERNAL "")
+
+    SET(CMAKE_REQUIRED_QUIET ON)
+    SET(CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}")
+    INCLUDE(CheckCXXSymbolExists)
+
+    UNSET(KOKKOS_COMPILER_HAS_AVX512 CACHE)
+    CHECK_CXX_SYMBOL_EXISTS(__AVX512F__ "" KOKKOS_COMPILER_HAS_AVX512)
+    UNSET(KOKKOS_COMPILER_HAS_AVX2 CACHE)
+    CHECK_CXX_SYMBOL_EXISTS(__AVX2__ "" KOKKOS_COMPILER_HAS_AVX2)
+    UNSET(KOKKOS_COMPILER_HAS_ARM_NEON CACHE)
+    CHECK_CXX_SYMBOL_EXISTS(__ARM_NEON "" KOKKOS_COMPILER_HAS_ARM_NEON)
+    UNSET(KOKKOS_COMPILER_HAS_AVX CACHE)
+    CHECK_CXX_SYMBOL_EXISTS(__AVX__ "" KOKKOS_COMPILER_HAS_AVX)
+    SET(CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}")
+
+    UNSET(CMAKE_REQUIRED_QUIET)
+    UNSET(CMAKE_REQUIRED_FLAGS)
+  ENDIF()
+
+  # Only define one of these macros for now
+  # to be uniform with what we are doing for other architectures.
+  IF(KOKKOS_COMPILER_HAS_AVX512)
+    MESSAGE(STATUS "SIMD: AVX512 detected")
+    SET(KOKKOS_ARCH_AVX512XEON ON)
+  ELSEIF(KOKKOS_COMPILER_HAS_AVX2)
+    MESSAGE(STATUS "SIMD: AVX2 detected")
+    SET(KOKKOS_ARCH_AVX2 ON)
+  ELSEIF(KOKKOS_COMPILER_HAS_ARM_NEON)
+    MESSAGE(STATUS "SIMD: ARM_NEON detected")
+    SET(KOKKOS_ARCH_ARM_NEON ON)
+  ELSEIF(KOKKOS_COMPILER_HAS_AVX)
+    MESSAGE(STATUS "SIMD: AVX detected")
+    SET(KOKKOS_ARCH_AVX ON)
+  ENDIF()
+ENDIF()
+
+# FIXME_NVHPC nvc++ doesn't seem to support AVX512.
+IF (KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC)
+  SET(KOKKOS_ARCH_AVX512XEON OFF)
+ENDIF()
+
 IF (NOT KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA)
   IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE)
       COMPILER_SPECIFIC_FLAGS(
@@ -506,7 +569,7 @@ ENDIF()
 IF (KOKKOS_ENABLE_HIP)
   IF (KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE)
     COMPILER_SPECIFIC_FLAGS(
-      DEFAULT -fgpu-rdc -DDESUL_HIP_RDC
+      DEFAULT -fgpu-rdc
     )
   ELSE()
     COMPILER_SPECIFIC_FLAGS(
@@ -524,6 +587,35 @@ IF (KOKKOS_ENABLE_SYCL)
   )
 ENDIF()
 
+# Check support for device_global variables
+# FIXME_SYCL Once the feature test macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL is
+#            available, use that instead.
+IF(KOKKOS_ENABLE_SYCL AND NOT BUILD_SHARED_LIBS)
+  INCLUDE(CheckCXXSourceCompiles)
+  STRING(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}")
+  CHECK_CXX_SOURCE_COMPILES("
+    #include <sycl/sycl.hpp>
+    using namespace sycl::ext::oneapi::experimental;
+    using namespace sycl;
+
+    SYCL_EXTERNAL device_global<int, decltype(properties(device_image_scope))> Foo;
+
+    void bar(queue q) {
+      q.single_task([=] {
+      Foo = 42;
+    });
+    }
+
+    int main(){ return 0; }
+    "
+    KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED)
+
+  IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED)
+    COMPILER_SPECIFIC_FLAGS(
+      DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED
+    )
+  ENDIF()
+ENDIF()
 
 SET(CUDA_ARCH_ALREADY_SPECIFIED "")
 FUNCTION(CHECK_CUDA_ARCH ARCH FLAG)
@@ -533,13 +625,17 @@ FUNCTION(CHECK_CUDA_ARCH ARCH FLAG)
     ENDIF()
     SET(CUDA_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE)
     IF (NOT KOKKOS_ENABLE_CUDA AND NOT KOKKOS_ENABLE_OPENMPTARGET AND NOT KOKKOS_ENABLE_SYCL AND NOT KOKKOS_ENABLE_OPENACC)
-      MESSAGE(WARNING "Given CUDA arch ${ARCH}, but Kokkos_ENABLE_CUDA, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored.")
+      MESSAGE(WARNING "Given CUDA arch ${ARCH}, but Kokkos_ENABLE_CUDA, Kokkos_ENABLE_SYCL, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored.")
       UNSET(KOKKOS_ARCH_${ARCH} PARENT_SCOPE)
     ELSE()
+      IF(KOKKOS_ENABLE_CUDA)
+        STRING(REPLACE "sm_" "" CMAKE_ARCH ${FLAG})
+        SET(KOKKOS_CUDA_ARCHITECTURES ${CMAKE_ARCH})
+        SET(KOKKOS_CUDA_ARCHITECTURES ${CMAKE_ARCH} PARENT_SCOPE)
+      ENDIF()
       SET(KOKKOS_CUDA_ARCH_FLAG ${FLAG} PARENT_SCOPE)
       IF(KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE)
-        string(REPLACE "sm_" "" CMAKE_ARCH ${FLAG})
-        SET(CMAKE_CUDA_ARCHITECTURES ${CMAKE_ARCH} PARENT_SCOPE)
+        SET(CMAKE_CUDA_ARCHITECTURES ${KOKKOS_CUDA_ARCHITECTURES} PARENT_SCOPE)
       ELSE()
         IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
           STRING(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${FLAG})
@@ -587,10 +683,13 @@ FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG)
       MESSAGE(FATAL_ERROR "Multiple GPU architectures given! Already have ${AMDGPU_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again.")
     ENDIF()
     SET(AMDGPU_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE)
-    IF (NOT KOKKOS_ENABLE_HIP AND NOT KOKKOS_ENABLE_OPENMPTARGET)
-      MESSAGE(WARNING "Given AMD GPU architecture ${ARCH}, but Kokkos_ENABLE_HIP and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored.")
+    IF (NOT KOKKOS_ENABLE_HIP AND NOT KOKKOS_ENABLE_OPENMPTARGET AND NOT KOKKOS_ENABLE_OPENACC AND NOT KOKKOS_ENABLE_SYCL)
+      MESSAGE(WARNING "Given AMD GPU architecture ${ARCH}, but Kokkos_ENABLE_HIP, Kokkos_ENABLE_SYCL, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored.")
       UNSET(KOKKOS_ARCH_${ARCH} PARENT_SCOPE)
     ELSE()
+      IF(KOKKOS_ENABLE_HIP)
+        SET(KOKKOS_HIP_ARCHITECTURES ${FLAG} PARENT_SCOPE)
+      ENDIF()
       SET(KOKKOS_AMDGPU_ARCH_FLAG ${FLAG} PARENT_SCOPE)
       GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}")
       IF(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE)
@@ -646,11 +745,17 @@ ENDIF()
 IF (KOKKOS_ENABLE_OPENMPTARGET)
   SET(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG})
   IF (CLANG_CUDA_ARCH)
-    STRING(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${CLANG_CUDA_ARCH})
-    COMPILER_SPECIFIC_FLAGS(
-      Clang -Xopenmp-target -march=${CLANG_CUDA_ARCH} -fopenmp-targets=nvptx64
-      NVHPC -gpu=${NVHPC_CUDA_ARCH}
-    )
+    IF(KOKKOS_CLANG_IS_CRAY)
+      COMPILER_SPECIFIC_FLAGS(
+        Cray -fopenmp
+      )
+    ELSE()
+      STRING(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${CLANG_CUDA_ARCH})
+      COMPILER_SPECIFIC_FLAGS(
+        Clang -Xopenmp-target -march=${CLANG_CUDA_ARCH} -fopenmp-targets=nvptx64
+        NVHPC -gpu=${NVHPC_CUDA_ARCH}
+      )
+    ENDIF()
   ENDIF()
   SET(CLANG_AMDGPU_ARCH ${KOKKOS_AMDGPU_ARCH_FLAG})
   IF (CLANG_AMDGPU_ARCH)
@@ -691,9 +796,17 @@ ENDIF()
 
 IF (KOKKOS_ENABLE_OPENACC)
   IF(KOKKOS_CUDA_ARCH_FLAG)
+    SET(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG})
     STRING(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG})
     COMPILER_SPECIFIC_FLAGS(
       NVHPC -acc -gpu=${NVHPC_CUDA_ARCH}
+      Clang -Xopenmp-target=nvptx64-nvidia-cuda -march=${CLANG_CUDA_ARCH}
+            -fopenmp-targets=nvptx64-nvidia-cuda
+    )
+  ELSEIF(KOKKOS_AMDGPU_ARCH_FLAG)
+    COMPILER_SPECIFIC_FLAGS(
+      Clang -Xopenmp-target=amdgcn-amd-amdhsa -march=${KOKKOS_AMDGPU_ARCH_FLAG}
+            -fopenmp-targets=amdgcn-amd-amdhsa
     )
   ELSE()
     COMPILER_SPECIFIC_FLAGS(
@@ -706,39 +819,52 @@ IF (KOKKOS_ENABLE_SYCL)
   IF(CUDA_ARCH_ALREADY_SPECIFIED)
     IF(KOKKOS_ENABLE_UNSUPPORTED_ARCHS)
       COMPILER_SPECIFIC_FLAGS(
-        DEFAULT -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend "${CUDA_ARCH_FLAG}=${KOKKOS_CUDA_ARCH_FLAG}"
+        DEFAULT -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=${KOKKOS_CUDA_ARCH_FLAG}
       )
     ELSE()
       MESSAGE(SEND_ERROR "Setting a CUDA architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!")
     ENDIF()
+  ELSEIF(AMDGPU_ARCH_ALREADY_SPECIFIED)
+    IF(KOKKOS_ENABLE_UNSUPPORTED_ARCHS)
+      COMPILER_SPECIFIC_FLAGS(
+        DEFAULT -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${KOKKOS_AMDGPU_ARCH_FLAG}
+      )
+    ELSE()
+      MESSAGE(SEND_ERROR "Setting a AMDGPU architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!")
+    ENDIF()
   ELSEIF(KOKKOS_ARCH_INTEL_GEN)
     COMPILER_SPECIFIC_FLAGS(
       DEFAULT -fsycl-targets=spir64
     )
-  ELSEIF(KOKKOS_ARCH_INTEL_GEN9)
-    COMPILER_SPECIFIC_FLAGS(
-      DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen9"
-    )
-  ELSEIF(KOKKOS_ARCH_INTEL_GEN11)
-    COMPILER_SPECIFIC_FLAGS(
-      DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen11"
-    )
-  ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP)
-    COMPILER_SPECIFIC_FLAGS(
-      DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen12lp"
-    )
-  ELSEIF(KOKKOS_ARCH_INTEL_DG1)
-    COMPILER_SPECIFIC_FLAGS(
-      DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device dg1"
-    )
-  ELSEIF(KOKKOS_ARCH_INTEL_XEHP)
-    COMPILER_SPECIFIC_FLAGS(
-      DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device 12.50.4"
-    )
-  ELSEIF(KOKKOS_ARCH_INTEL_PVC)
-    COMPILER_SPECIFIC_FLAGS(
-      DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device 12.60.7"
+  ELSE()
+    COMPILER_SPECIFIC_OPTIONS(
+      DEFAULT -fsycl-targets=spir64_gen
     )
+    IF(KOKKOS_ARCH_INTEL_GEN9)
+      COMPILER_SPECIFIC_LINK_OPTIONS(
+        DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen9"
+      )
+    ELSEIF(KOKKOS_ARCH_INTEL_GEN11)
+      COMPILER_SPECIFIC_LINK_OPTIONS(
+        DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen11"
+      )
+    ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP)
+      COMPILER_SPECIFIC_LINK_OPTIONS(
+        DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen12lp"
+      )
+    ELSEIF(KOKKOS_ARCH_INTEL_DG1)
+      COMPILER_SPECIFIC_LINK_OPTIONS(
+        DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device dg1"
+      )
+    ELSEIF(KOKKOS_ARCH_INTEL_XEHP)
+      COMPILER_SPECIFIC_LINK_OPTIONS(
+        DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device 12.50.4"
+      )
+    ELSEIF(KOKKOS_ARCH_INTEL_PVC)
+      COMPILER_SPECIFIC_LINK_OPTIONS(
+        DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device 12.60.7"
+      )
+    ENDIF()
   ENDIF()
 ENDIF()
 
@@ -871,9 +997,45 @@ IF(KOKKOS_ENABLE_HIP AND NOT AMDGPU_ARCH_ALREADY_SPECIFIED)
   ENDIF()
 ENDIF()
 
+FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS)
+  IF (KOKKOS_ARCH_${ARCH})
+    STRING(REGEX MATCH "90A" IS_90A ${ARCH})
+    IF(IS_90A)
+      SET(KOKKOS_ARCH_AMD_GFX90A ON)
+      SET(KOKKOS_ARCH_VEGA90A ON)
+      BREAK()
+    ENDIF()
+    STRING(REGEX MATCH "908" IS_908 ${ARCH})
+    IF(IS_908)
+      SET(KOKKOS_ARCH_AMD_GFX908 ON)
+      SET(KOKKOS_ARCH_VEGA908 ON)
+      BREAK()
+    ENDIF()
+    STRING(REGEX MATCH "906" IS_906 ${ARCH})
+    IF(IS_906)
+      SET(KOKKOS_ARCH_AMD_GFX906 ON)
+      SET(KOKKOS_ARCH_VEGA906 ON)
+      BREAK()
+    ENDIF()
+    STRING(REGEX MATCH "1100" IS_1100 ${ARCH})
+    IF(IS_1100)
+      SET(KOKKOS_ARCH_AMD_GFX1100 ON)
+      SET(KOKKOS_ARCH_NAVI1100 ON)
+      BREAK()
+    ENDIF()
+    STRING(REGEX MATCH "1030" IS_1030 ${ARCH})
+    IF(IS_1030)
+      SET(KOKKOS_ARCH_AMD_GFX1030 ON)
+      SET(KOKKOS_ARCH_NAVI1030 ON)
+      BREAK()
+    ENDIF()
+  ENDIF()
+ENDFOREACH()
+
 #Regardless of version, make sure we define the general architecture name
 FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS)
   IF (KOKKOS_ARCH_${ARCH})
+    SET(KOKKOS_ARCH_AMD_GPU ON)
     STRING(REGEX MATCH "(VEGA)" IS_VEGA ${ARCH})
     IF(IS_VEGA)
       SET(KOKKOS_ARCH_VEGA ON)
diff --git a/packages/kokkos/cmake/kokkos_compiler_id.cmake b/packages/kokkos/cmake/kokkos_compiler_id.cmake
index 27a3102a6b524fbecb83c21fef477a234a32fec1..04589befc3ada08c204a8242b096501723728d01 100644
--- a/packages/kokkos/cmake/kokkos_compiler_id.cmake
+++ b/packages/kokkos/cmake/kokkos_compiler_id.cmake
@@ -42,10 +42,13 @@ IF(Kokkos_ENABLE_CUDA)
     # If launcher was found and nvcc_wrapper was not specified as
     # compiler and `CMAKE_CXX_COMPILIER_LAUNCHER` is not set, set to use launcher.
     # Will ensure CMAKE_CXX_COMPILER is replaced by nvcc_wrapper
-    IF(Kokkos_COMPILE_LAUNCHER AND NOT INTERNAL_HAVE_COMPILER_NVCC AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
+    IF(Kokkos_COMPILE_LAUNCHER AND NOT INTERNAL_HAVE_COMPILER_NVCC AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL Clang
+       AND NOT (Kokkos_ENABLE_IMPL_NVHPC_AS_DEVICE_COMPILER AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC))
       IF(CMAKE_CXX_COMPILER_LAUNCHER)
-       MESSAGE(FATAL_ERROR "Cannot use CMAKE_CXX_COMPILER_LAUNCHER if the CMAKE_CXX_COMPILER is not able to compile CUDA code, i.e. nvcc_wrapper or
-clang++!")
+        IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
+          MESSAGE(STATUS "Using nvc++ as device compiler requires Kokkos_ENABLE_IMPL_NVHPC_AS_DEVICE_COMPILER=ON!")
+        ENDIF()
+        MESSAGE(FATAL_ERROR "Cannot use CMAKE_CXX_COMPILER_LAUNCHER if the CMAKE_CXX_COMPILER is not able to compile CUDA code, i.e. nvcc_wrapper or clang++!")
       ENDIF()
       # the first argument to launcher is always the C++ compiler defined by cmake
       # if the second argument matches the C++ compiler, it forwards the rest of the
@@ -152,7 +155,7 @@ SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    Clang(CUDA)       10.0.0 or
 SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    GCC                8.2.0 or higher")
 SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    Intel             19.0.5 or higher")
 SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    IntelLLVM(CPU)  2021.1.1 or higher")
-SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    IntelLLVM(SYCL) 2022.0.0 or higher") #FIXME
+SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    IntelLLVM(SYCL) 2023.0.0 or higher")
 SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    NVCC              11.0.0 or higher")
 SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    HIPCC              5.2.0 or higher")
 SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    NVHPC/PGI           22.3 or higher")
@@ -181,7 +184,7 @@ ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND NOT Kokkos_ENABLE_SYCL)
     MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}")
   ENDIF()
 ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND Kokkos_ENABLE_SYCL)
-  IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 2022.0.0) #FIXME 2022.2.0
+  IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 2023.0.0)
     MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}")
   ENDIF()
 ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
diff --git a/packages/kokkos/cmake/kokkos_enable_devices.cmake b/packages/kokkos/cmake/kokkos_enable_devices.cmake
index d4a7744eb59b74a89059dfb421f470fca62e0b9e..9a977520a3a02e37e0d1dd2c9dac38d166aa7bfb 100644
--- a/packages/kokkos/cmake/kokkos_enable_devices.cmake
+++ b/packages/kokkos/cmake/kokkos_enable_devices.cmake
@@ -41,6 +41,16 @@ ENDIF()
 KOKKOS_DEVICE_OPTION(OPENMP ${OMP_DEFAULT} HOST "Whether to build OpenMP backend")
 
 KOKKOS_DEVICE_OPTION(OPENACC OFF DEVICE "Whether to build the OpenACC backend")
+IF (KOKKOS_ENABLE_OPENACC)
+  COMPILER_SPECIFIC_FLAGS(
+    Clang -fopenacc -fopenacc-fake-async-wait
+          -Wno-openacc-and-cxx -Wno-openmp-mapping -Wno-unknown-cuda-version
+          -Wno-pass-failed
+  )
+  COMPILER_SPECIFIC_DEFS(
+    Clang KOKKOS_WORKAROUND_OPENMPTARGET_CLANG
+  )
+ENDIF()
 
 KOKKOS_DEVICE_OPTION(OPENMPTARGET OFF DEVICE "Whether to build the OpenMP target backend")
 IF (KOKKOS_ENABLE_OPENMPTARGET)
diff --git a/packages/kokkos/cmake/kokkos_enable_options.cmake b/packages/kokkos/cmake/kokkos_enable_options.cmake
index f9f1bc5a8b5170028de88341a9b3398823618629..89e23b019bdca0a3084ce3795bf7f3bde76baf66 100644
--- a/packages/kokkos/cmake/kokkos_enable_options.cmake
+++ b/packages/kokkos/cmake/kokkos_enable_options.cmake
@@ -11,7 +11,7 @@
 FUNCTION(KOKKOS_ENABLE_OPTION SUFFIX DEFAULT DOCSTRING)
   KOKKOS_OPTION(ENABLE_${SUFFIX} ${DEFAULT} BOOL ${DOCSTRING})
   STRING(TOUPPER ${SUFFIX} UC_NAME)
-  IF (KOKKOS_ENABLE_${UC_NAME})
+  IF (KOKKOS_ENABLE_${UC_NAME} AND NOT "Kokkos_ENABLE_${UC_NAME}" IN_LIST Kokkos_OPTIONS_NOT_TO_EXPORT)
     LIST(APPEND KOKKOS_ENABLED_OPTIONS ${UC_NAME})
     #I hate that CMake makes me do this
     SET(KOKKOS_ENABLED_OPTIONS ${KOKKOS_ENABLED_OPTIONS} PARENT_SCOPE)
@@ -26,19 +26,32 @@ KOKKOS_CFG_DEPENDS(OPTIONS COMPILER_ID)
 # Put a check in just in case people are using this option
 KOKKOS_DEPRECATED_LIST(OPTIONS ENABLE)
 
-# Set the Default for Desul Atomics usage.
-set(_DESUL_ATOMICS_DEFAULT ON)
-
 KOKKOS_ENABLE_OPTION(CUDA_RELOCATABLE_DEVICE_CODE  OFF "Whether to enable relocatable device code (RDC) for CUDA")
 KOKKOS_ENABLE_OPTION(CUDA_UVM             OFF "Whether to use unified memory (UM) for CUDA by default")
 KOKKOS_ENABLE_OPTION(CUDA_LDG_INTRINSIC   OFF "Whether to use CUDA LDG intrinsics")
-# As of 08/12/2021 CudaMallocAsync causes issues if UCX is used as MPI communication layer.
-KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC      OFF  "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)")
+# In contrast to other CUDA-dependent, options CUDA_LAMBDA is ON by default.
+# That is problematic when CUDA is not enabled because this not only yields a
+# bogus warning, but also exports the Kokkos_ENABLE_CUDA_LAMBDA variable and
+# sets it to ON. This if-clause is a crutch that delays the refactoring of the
+# way we declare all options until after we get rid of TriBITS.
+IF (Trilinos_ENABLE_Kokkos AND TPL_ENABLE_CUDA)
+   SET(CUDA_LAMBDA_DEFAULT ON)
+ELSEIF (KOKKOS_ENABLE_CUDA)
+   SET(CUDA_LAMBDA_DEFAULT ON)
+ELSE()
+   SET(CUDA_LAMBDA_DEFAULT OFF)
+ENDIF()
+KOKKOS_ENABLE_OPTION(CUDA_LAMBDA ${CUDA_LAMBDA_DEFAULT} "Whether to allow lambda expressions on the device with NVCC **DEPRECATED**")
+
+# May be used to disable our use of CudaMallocAsync.  It had caused issues in
+# the past when UCX was used as MPI communication layer.  We expect it is
+# resolved but we keep the option around a bit longer to be safe.
+KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC ON  "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)")
+KOKKOS_ENABLE_OPTION(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler")
 KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_3    OFF "Whether code deprecated in major release 3 is available" )
 KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_4    ON "Whether code deprecated in major release 4 is available" )
 KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings" )
 KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE  OFF "Whether to enable relocatable device code (RDC) for HIP")
-KOKKOS_ENABLE_OPTION(HPX_ASYNC_DISPATCH   OFF "Whether HPX supports asynchronous dispatch")
 KOKKOS_ENABLE_OPTION(TESTS         OFF  "Whether to build the unit tests")
 KOKKOS_ENABLE_OPTION(BENCHMARKS    OFF  "Whether to build the benchmarks")
 KOKKOS_ENABLE_OPTION(EXAMPLES      OFF  "Whether to build the examples")
@@ -54,32 +67,21 @@ UNSET(_UPPERCASE_CMAKE_BUILD_TYPE)
 KOKKOS_ENABLE_OPTION(LARGE_MEM_TESTS      OFF "Whether to perform extra large memory tests")
 KOKKOS_ENABLE_OPTION(DEBUG_BOUNDS_CHECK   OFF "Whether to use bounds checking - will increase runtime")
 KOKKOS_ENABLE_OPTION(COMPILER_WARNINGS    OFF "Whether to print all compiler warnings")
-KOKKOS_ENABLE_OPTION(PROFILING_LOAD_PRINT OFF "Whether to print information about which profiling tools got loaded")
 KOKKOS_ENABLE_OPTION(TUNING               OFF "Whether to create bindings for tuning tools")
 KOKKOS_ENABLE_OPTION(AGGRESSIVE_VECTORIZATION OFF "Whether to aggressively vectorize loops")
-KOKKOS_ENABLE_OPTION(LAUNCH_COMPILER      ON  "Whether to potentially use the launch compiler")
 KOKKOS_ENABLE_OPTION(COMPILE_AS_CMAKE_LANGUAGE OFF "Whether to use native cmake language support")
 KOKKOS_ENABLE_OPTION(HIP_MULTIPLE_KERNEL_INSTANTIATIONS OFF "Whether multiple kernels are instantiated at compile time - improve performance but increase compile time")
 
 # This option will go away eventually, but allows fallback to old implementation when needed.
-KOKKOS_ENABLE_OPTION(IMPL_DESUL_ATOMICS   ON  "Whether to use desul based atomics - option only during beta")
 KOKKOS_ENABLE_OPTION(DESUL_ATOMICS_EXTERNAL OFF "Whether to use an external desul installation")
 
 KOKKOS_ENABLE_OPTION(IMPL_MDSPAN OFF "Whether to enable experimental mdspan support")
 KOKKOS_ENABLE_OPTION(MDSPAN_EXTERNAL OFF BOOL "Whether to use an external version of mdspan")
-KOKKOS_ENABLE_OPTION(IMPL_SKIP_COMPILER_MDSPAN OFF BOOL "Whether to use an internal version of mdspan even if the compiler supports mdspan")
+KOKKOS_ENABLE_OPTION(IMPL_SKIP_COMPILER_MDSPAN ON BOOL "Whether to use an internal version of mdspan even if the compiler supports mdspan")
 mark_as_advanced(Kokkos_ENABLE_IMPL_MDSPAN)
 mark_as_advanced(Kokkos_ENABLE_MDSPAN_EXTERNAL)
 mark_as_advanced(Kokkos_ENABLE_IMPL_SKIP_COMPILER_MDSPAN)
 
-IF (Trilinos_ENABLE_Kokkos AND TPL_ENABLE_CUDA)
-  SET(CUDA_LAMBDA_DEFAULT ON)
-ELSEIF (KOKKOS_ENABLE_CUDA)
-  SET(CUDA_LAMBDA_DEFAULT ON)
-ELSE()
-  SET(CUDA_LAMBDA_DEFAULT OFF)
-ENDIF()
-KOKKOS_ENABLE_OPTION(CUDA_LAMBDA ${CUDA_LAMBDA_DEFAULT} "Whether to activate experimental lambda features")
 IF (Trilinos_ENABLE_Kokkos)
   SET(COMPLEX_ALIGN_DEFAULT OFF)
 ELSE()
@@ -104,6 +106,13 @@ ELSE()
 ENDIF()
 KOKKOS_ENABLE_OPTION(CUDA_CONSTEXPR ${CUDA_CONSTEXPR_DEFAULT} "Whether to activate experimental relaxed constexpr functions")
 
+IF (KOKKOS_ENABLE_HPX)
+  SET(HPX_ASYNC_DISPATCH_DEFAULT ON)
+ELSE()
+  SET(HPX_ASYNC_DISPATCH_DEFAULT OFF)
+ENDIF()
+KOKKOS_ENABLE_OPTION(IMPL_HPX_ASYNC_DISPATCH ${HPX_ASYNC_DISPATCH_DEFAULT} "Whether HPX supports asynchronous dispatch")
+
 Kokkos_ENABLE_OPTION(UNSUPPORTED_ARCHS OFF "Whether to allow architectures in backends Kokkos doesn't optimize for")
 
 FUNCTION(check_device_specific_options)
@@ -123,7 +132,7 @@ ENDFUNCTION()
 
 CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE CUDA OPTIONS CUDA_UVM CUDA_RELOCATABLE_DEVICE_CODE CUDA_LAMBDA CUDA_CONSTEXPR CUDA_LDG_INTRINSIC)
 CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HIP OPTIONS HIP_RELOCATABLE_DEVICE_CODE)
-CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HPX OPTIONS HPX_ASYNC_DISPATCH)
+CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HPX OPTIONS IMPL_HPX_ASYNC_DISPATCH)
 
 # Needed due to change from deprecated name to new header define name
 IF (KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION)
@@ -166,3 +175,17 @@ IF(Kokkos_ENABLE_CUDA_LDG_INTRINSIC)
     MESSAGE(FATAL_ERROR "Kokkos_ENABLE_CUDA_LDG_INTRINSIC has been removed. LDG intrinsics are always enabled.")
   ENDIF()
 ENDIF()
+IF(Kokkos_ENABLE_CUDA AND NOT Kokkos_ENABLE_CUDA_LAMBDA)
+  IF(KOKKOS_ENABLE_DEPRECATED_CODE_4)
+    MESSAGE(DEPRECATION "Setting Kokkos_ENABLE_CUDA_LAMBDA is deprecated. Lambda expressions in device code are always enabled. Forcing -DKokkos_ENABLE_CUDA_LAMBDA=ON")
+    set(Kokkos_ENABLE_CUDA_LAMBDA ON CACHE BOOL "Kokkos turned Cuda lambda support ON!" FORCE)
+    set(KOKKOS_ENABLE_CUDA_LAMBDA ON)
+  ELSE()
+    MESSAGE(FATAL_ERROR "Kokkos_ENABLE_CUDA_LAMBDA has been removed. Lambda expressions in device code always enabled.")
+  ENDIF()
+ENDIF()
+
+
+IF(DEFINED Kokkos_ENABLE_IMPL_DESUL_ATOMICS)
+  MESSAGE(WARNING "Kokkos_ENABLE_IMPL_DESUL_ATOMICS option has been removed. Desul atomics cannot be disabled.")
+ENDIF()
diff --git a/packages/kokkos/cmake/kokkos_functions.cmake b/packages/kokkos/cmake/kokkos_functions.cmake
index 55b1ebbf818ea0a972c069324c1f08d29010a51b..9dab1ca00ea4aced372c28711c07161c63e58e32 100644
--- a/packages/kokkos/cmake/kokkos_functions.cmake
+++ b/packages/kokkos/cmake/kokkos_functions.cmake
@@ -6,7 +6,12 @@
 # upper-case version for use within
 
 set(Kokkos_OPTIONS_NOT_TO_EXPORT
-  Kokkos_ENABLE_TESTS  Kokkos_ENABLE_EXAMPLES)
+  Kokkos_ENABLE_BENCHMARKS
+  Kokkos_ENABLE_EXAMPLES
+  Kokkos_ENABLE_TESTS
+  Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS
+  Kokkos_ENABLE_COMPILER_WARNINGS
+)
 
 #
 #
diff --git a/packages/kokkos/cmake/kokkos_install.cmake b/packages/kokkos/cmake/kokkos_install.cmake
index fb658239d8d82c568eff465f8d8839151896ff3b..f818dfa24485b7e22f46d364a15fe07ede71cb71 100644
--- a/packages/kokkos/cmake/kokkos_install.cmake
+++ b/packages/kokkos/cmake/kokkos_install.cmake
@@ -28,6 +28,15 @@ IF (NOT KOKKOS_HAS_TRILINOS AND NOT Kokkos_INSTALL_TESTING)
     "${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake"
     DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Kokkos)
   install(EXPORT KokkosTargets NAMESPACE Kokkos:: DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Kokkos)
+  export(EXPORT KokkosTargets NAMESPACE Kokkos:: FILE ${Kokkos_BINARY_DIR}/KokkosTargets.cmake)
+
+  # Required to be a TriBITS-compliant external package
+  file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos)
+  file(COPY ${Kokkos_BINARY_DIR}/KokkosConfig.cmake
+            ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake
+            ${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake
+            DESTINATION ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos)
+  export(EXPORT KokkosTargets NAMESPACE Kokkos:: FILE ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos/KokkosTargets.cmake)
 ELSE()
   CONFIGURE_FILE(cmake/KokkosConfigCommon.cmake.in ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake @ONLY)
   file(READ ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake KOKKOS_CONFIG_COMMON)
diff --git a/packages/kokkos/cmake/kokkos_test_cxx_std.cmake b/packages/kokkos/cmake/kokkos_test_cxx_std.cmake
index 5f8e15cd67373688d39338e630b82e74223ece77..7ad49fdd2d9df05a2a5bbdf62eda5d5a38a622dd 100644
--- a/packages/kokkos/cmake/kokkos_test_cxx_std.cmake
+++ b/packages/kokkos/cmake/kokkos_test_cxx_std.cmake
@@ -120,8 +120,12 @@ IF(KOKKOS_ENABLE_CUDA)
     ELSEIF(CMAKE_CXX_EXTENSIONS)
       MESSAGE(FATAL_ERROR "Compiling CUDA code with clang doesn't support C++ extensions.  Set -DCMAKE_CXX_EXTENSIONS=OFF")
     ENDIF()
-  ELSEIF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
-    MESSAGE(FATAL_ERROR "Invalid compiler for CUDA.  The compiler must be nvcc_wrapper or Clang or NVC++ or use kokkos_launch_compiler, but compiler ID was ${KOKKOS_CXX_COMPILER_ID}")
+  ELSEIF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND NOT (Kokkos_ENABLE_IMPL_NVHPC_AS_DEVICE_COMPILER AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC))
+    IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
+      MESSAGE(FATAL_ERROR "Invalid compiler for CUDA. To allow nvc++ as Cuda compiler, Kokkos_ENABLE_IMPL_NVHPC_AS_DEVICE_COMPILER=ON must be set!")
+    ELSE()
+      MESSAGE(FATAL_ERROR "Invalid compiler for CUDA. The compiler must be nvcc_wrapper or Clang or NVC++ or use kokkos_launch_compiler, but compiler ID was ${KOKKOS_CXX_COMPILER_ID}")
+    ENDIF()
   ENDIF()
 ENDIF()
 
diff --git a/packages/kokkos/cmake/kokkos_tpls.cmake b/packages/kokkos/cmake/kokkos_tpls.cmake
index ba66ee4d38dcfa5cd3f7e3b12c2bc7acb2de5eb4..f124596a84e012a5dca9c94798c05b573d8bf032 100644
--- a/packages/kokkos/cmake/kokkos_tpls.cmake
+++ b/packages/kokkos/cmake/kokkos_tpls.cmake
@@ -31,8 +31,7 @@ FUNCTION(KOKKOS_TPL_OPTION PKG DEFAULT)
   ENDIF()
 ENDFUNCTION()
 
-KOKKOS_TPL_OPTION(HWLOC   Off)
-KOKKOS_TPL_OPTION(LIBNUMA Off)
+KOKKOS_TPL_OPTION(HWLOC   Off TRIBITS HWLOC)
 KOKKOS_TPL_OPTION(MEMKIND Off)
 IF(KOKKOS_ENABLE_MEMKIND)
   SET(KOKKOS_ENABLE_HBWSPACE ON)
@@ -46,6 +45,12 @@ ELSE()
   SET(ROCM_DEFAULT OFF)
 ENDIF()
 KOKKOS_TPL_OPTION(ROCM    ${ROCM_DEFAULT})
+IF(KOKKOS_ENABLE_SYCL AND NOT KOKKOS_HAS_TRILINOS)
+  SET(ONEDPL_DEFAULT ON)
+ELSE()
+  SET(ONEDPL_DEFAULT OFF)
+ENDIF()
+KOKKOS_TPL_OPTION(ONEDPL  ${ONEDPL_DEFAULT})
 
 IF (WIN32)
   SET(LIBDL_DEFAULT Off)
@@ -76,7 +81,6 @@ IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE)
   KOKKOS_IMPORT_TPL(CUDA INTERFACE)
 ENDIF()
 KOKKOS_IMPORT_TPL(HWLOC)
-KOKKOS_IMPORT_TPL(LIBNUMA)
 KOKKOS_IMPORT_TPL(LIBRT)
 KOKKOS_IMPORT_TPL(LIBDL)
 KOKKOS_IMPORT_TPL(MEMKIND)
@@ -85,10 +89,11 @@ IF (NOT WIN32)
 ENDIF()
 IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE)
   KOKKOS_IMPORT_TPL(ROCM INTERFACE)
+  KOKKOS_IMPORT_TPL(ONEDPL INTERFACE)
 ENDIF()
 KOKKOS_IMPORT_TPL(LIBQUADMATH)
 
-IF (Kokkos_ENABLE_IMPL_DESUL_ATOMICS AND Kokkos_ENABLE_DESUL_ATOMICS_EXTERNAL)
+IF (Kokkos_ENABLE_DESUL_ATOMICS_EXTERNAL)
   find_package(desul REQUIRED COMPONENTS atomics)
   KOKKOS_EXPORT_CMAKE_TPL(desul REQUIRED COMPONENTS atomics)
 ENDIF()
diff --git a/packages/kokkos/cmake/kokkos_tribits.cmake b/packages/kokkos/cmake/kokkos_tribits.cmake
index 0f39551423fb1e731de7efbc5499fe4803493c7f..b30ca70ab954bb51dc0cedf95f8b408f31057fe2 100644
--- a/packages/kokkos/cmake/kokkos_tribits.cmake
+++ b/packages/kokkos/cmake/kokkos_tribits.cmake
@@ -44,54 +44,14 @@ IF (KOKKOS_HAS_TRILINOS)
   ENDIF()
 ENDIF()
 
-MACRO(KOKKOS_SUBPACKAGE NAME)
-  if (KOKKOS_HAS_TRILINOS)
-    TRIBITS_SUBPACKAGE(${NAME})
-  else()
-    SET(PACKAGE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
-    SET(PARENT_PACKAGE_NAME ${PACKAGE_NAME})
-    SET(PACKAGE_NAME ${PACKAGE_NAME}${NAME})
-    STRING(TOUPPER ${PACKAGE_NAME} PACKAGE_NAME_UC)
-    SET(${PACKAGE_NAME}_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
-    #ADD_INTERFACE_LIBRARY(PACKAGE_${PACKAGE_NAME})
-    #GLOBAL_SET(${PACKAGE_NAME}_LIBS "")
-  endif()
-ENDMACRO()
-
-MACRO(KOKKOS_SUBPACKAGE_POSTPROCESS)
-  if (KOKKOS_HAS_TRILINOS)
-    TRIBITS_SUBPACKAGE_POSTPROCESS()
-  endif()
-ENDMACRO()
-
-MACRO(KOKKOS_PACKAGE_DECL)
-
-  if (KOKKOS_HAS_TRILINOS)
-    TRIBITS_PACKAGE_DECL(Kokkos)
-  else()
-    SET(PACKAGE_NAME Kokkos)
-    SET(${PACKAGE_NAME}_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
-    STRING(TOUPPER ${PACKAGE_NAME} PACKAGE_NAME_UC)
-  endif()
-
-  #SET(TRIBITS_DEPS_DIR "${CMAKE_SOURCE_DIR}/cmake/deps")
-  #FILE(GLOB TPLS_FILES "${TRIBITS_DEPS_DIR}/*.cmake")
-  #FOREACH(TPL_FILE ${TPLS_FILES})
-  #  TRIBITS_PROCESS_TPL_DEP_FILE(${TPL_FILE})
-  #ENDFOREACH()
-
-ENDMACRO()
-
-
 MACRO(KOKKOS_PROCESS_SUBPACKAGES)
-  if (KOKKOS_HAS_TRILINOS)
-    TRIBITS_PROCESS_SUBPACKAGES()
-  else()
-    ADD_SUBDIRECTORY(core)
-    ADD_SUBDIRECTORY(containers)
-    ADD_SUBDIRECTORY(algorithms)
-    ADD_SUBDIRECTORY(simd)
+  ADD_SUBDIRECTORY(core)
+  ADD_SUBDIRECTORY(containers)
+  ADD_SUBDIRECTORY(algorithms)
+  ADD_SUBDIRECTORY(simd)
+  if (NOT KOKKOS_HAS_TRILINOS)
     ADD_SUBDIRECTORY(example)
+    ADD_SUBDIRECTORY(benchmarks)
   endif()
 ENDMACRO()
 
@@ -572,3 +532,11 @@ MACRO(KOKKOS_ADD_EXAMPLE_DIRECTORIES)
     ENDIF()
   endif()
 ENDMACRO()
+
+MACRO(KOKKOS_ADD_BENCHMARK_DIRECTORIES)
+  IF(KOKKOS_ENABLE_BENCHMARKS)
+    FOREACH(BENCHMARK_DIR ${ARGN})
+      ADD_SUBDIRECTORY(${BENCHMARK_DIR})
+    ENDFOREACH()
+  ENDIF()
+ENDMACRO()
diff --git a/packages/kokkos/containers/CMakeLists.txt b/packages/kokkos/containers/CMakeLists.txt
index b450c2720993752460aa07035fed38415440e739..0857d7007b44b7f5280a8ca5b44f4eec09191951 100644
--- a/packages/kokkos/containers/CMakeLists.txt
+++ b/packages/kokkos/containers/CMakeLists.txt
@@ -1,7 +1,3 @@
-
-
-KOKKOS_SUBPACKAGE(Containers)
-
 IF (NOT Kokkos_INSTALL_TESTING)
   ADD_SUBDIRECTORY(src)
 ENDIF()
@@ -11,5 +7,3 @@ IF(NOT KOKKOS_ENABLE_OPENACC)
 KOKKOS_ADD_TEST_DIRECTORIES(unit_tests)
 KOKKOS_ADD_TEST_DIRECTORIES(performance_tests)
 ENDIF()
-
-KOKKOS_SUBPACKAGE_POSTPROCESS()
diff --git a/packages/kokkos/containers/cmake/Dependencies.cmake b/packages/kokkos/containers/cmake/Dependencies.cmake
deleted file mode 100644
index 1d71d8af341181f689a6a8bf63036b67584cb138..0000000000000000000000000000000000000000
--- a/packages/kokkos/containers/cmake/Dependencies.cmake
+++ /dev/null
@@ -1,5 +0,0 @@
-TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
-  LIB_REQUIRED_PACKAGES KokkosCore
-  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
-  TEST_OPTIONAL_TPLS CUSPARSE
-  )
diff --git a/packages/kokkos/containers/cmake/KokkosContainers_config.h.in b/packages/kokkos/containers/cmake/KokkosContainers_config.h.in
deleted file mode 100644
index d91fdda1e353eddb2088ff86327e142676c9a6c9..0000000000000000000000000000000000000000
--- a/packages/kokkos/containers/cmake/KokkosContainers_config.h.in
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef KOKKOS_CONTAINERS_CONFIG_H
-#define KOKKOS_CONTAINERS_CONFIG_H
-
-#endif
diff --git a/packages/kokkos/containers/performance_tests/CMakeLists.txt b/packages/kokkos/containers/performance_tests/CMakeLists.txt
index 4f1eeacdad6183bc0f2b27614ddf79ff202426e3..e325e45e85dc0c8fd95fa19cc93e6733e7c37339 100644
--- a/packages/kokkos/containers/performance_tests/CMakeLists.txt
+++ b/packages/kokkos/containers/performance_tests/CMakeLists.txt
@@ -16,7 +16,7 @@ foreach(Tag Threads;OpenMP;Cuda;HPX;HIP)
     )
 
     KOKKOS_ADD_EXECUTABLE_AND_TEST(
-      PerformanceTest_${Tag}
+      ContainersPerformanceTest_${Tag}
       SOURCES ${SOURCES}
     )
   endif()
diff --git a/packages/kokkos/containers/performance_tests/TestCuda.cpp b/packages/kokkos/containers/performance_tests/TestCuda.cpp
index e98595a41e4d5d3c30bc7ee3ee3efed6aa1c70a4..71b1c1d4ee8e57699ef4d78ab93b9d3ec2459dd5 100644
--- a/packages/kokkos/containers/performance_tests/TestCuda.cpp
+++ b/packages/kokkos/containers/performance_tests/TestCuda.cpp
@@ -44,10 +44,6 @@ TEST(TEST_CATEGORY, dynrankview_perf) {
 }
 
 TEST(TEST_CATEGORY, global_2_local) {
-#if defined(KOKKOS_ENABLE_CUDA) && \
-    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC
-  GTEST_SKIP() << "errors reported for all sizes";
-#endif
   std::cout << "Cuda" << std::endl;
   std::cout << "size, create, generate, fill, find" << std::endl;
   for (unsigned i = Performance::begin_id_size; i <= Performance::end_id_size;
diff --git a/packages/kokkos/containers/src/CMakeLists.txt b/packages/kokkos/containers/src/CMakeLists.txt
index cdbc6527fda54fc2a53dc2832afd67c3af347713..b7d85ebf11d77b30750d81a3084c9e5f41f0617b 100644
--- a/packages/kokkos/containers/src/CMakeLists.txt
+++ b/packages/kokkos/containers/src/CMakeLists.txt
@@ -1,6 +1,3 @@
-
-KOKKOS_CONFIGURE_FILE(${PACKAGE_NAME}_config.h)
-
 #need these here for now
 KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
 KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
diff --git a/packages/kokkos/containers/src/Kokkos_Bitset.hpp b/packages/kokkos/containers/src/Kokkos_Bitset.hpp
index 35f691ecf4e0db1e4d77dfeb364ce4726a0ab124..cd5ca4ea512365584c38142adb25afdc20556bd9 100644
--- a/packages/kokkos/containers/src/Kokkos_Bitset.hpp
+++ b/packages/kokkos/containers/src/Kokkos_Bitset.hpp
@@ -28,6 +28,24 @@
 
 namespace Kokkos {
 
+namespace Impl {
+//! Either append to the label if the property already exists, or set it.
+template <typename... P>
+auto with_updated_label(const ViewCtorProp<P...>& view_ctor_prop,
+                        const std::string& label) {
+  using vcp_t = ViewCtorProp<P...>;
+  //! If the label property is already set, append. Otherwise, set label.
+  if constexpr (vcp_t::has_label) {
+    vcp_t new_ctor_props(view_ctor_prop);
+    static_cast<ViewCtorProp<void, std::string>&>(new_ctor_props)
+        .value.append(label);
+    return new_ctor_props;
+  } else {
+    return Impl::with_properties_if_unset(view_ctor_prop, label);
+  }
+}
+}  // namespace Impl
+
 template <typename Device = Kokkos::DefaultExecutionSpace>
 class Bitset;
 
@@ -70,13 +88,32 @@ class Bitset {
     block_shift = Kokkos::Impl::integral_power_of_two(block_size)
   };
 
+  //! Type of @ref m_blocks.
+  using block_view_type = View<unsigned*, Device, MemoryTraits<RandomAccess>>;
+
  public:
   /// constructor
   /// arg_size := number of bit in set
-  Bitset(unsigned arg_size = 0u)
-      : m_size(arg_size),
-        m_last_block_mask(0u),
-        m_blocks("Bitset", ((m_size + block_mask) >> block_shift)) {
+  Bitset(unsigned arg_size = 0u) : Bitset(Kokkos::view_alloc(), arg_size) {}
+
+  template <class... P>
+  Bitset(const Impl::ViewCtorProp<P...>& arg_prop, unsigned arg_size)
+      : m_size(arg_size), m_last_block_mask(0u) {
+    //! Ensure that allocation properties are consistent.
+    using alloc_prop_t = std::decay_t<decltype(arg_prop)>;
+    static_assert(alloc_prop_t::initialize,
+                  "Allocation property 'initialize' should be true.");
+    static_assert(
+        !alloc_prop_t::has_pointer,
+        "Allocation properties should not contain the 'pointer' property.");
+
+    //! Update 'label' property and allocate.
+    const auto prop_copy = Kokkos::Impl::with_updated_label(
+        Impl::with_properties_if_unset(arg_prop, std::string("Bitset")),
+        " - blocks");
+    m_blocks =
+        block_view_type(prop_copy, ((m_size + block_mask) >> block_shift));
+
     for (int i = 0, end = static_cast<int>(m_size & block_mask); i < end; ++i) {
       m_last_block_mask |= 1u << i;
     }
@@ -105,7 +142,7 @@ class Bitset {
   /// number of bits which are set to 1
   /// can only be called from the host
   unsigned count() const {
-    Impl::BitsetCount<Bitset<Device> > f(*this);
+    Impl::BitsetCount<Bitset<Device>> f(*this);
     return f.apply();
   }
 
@@ -275,7 +312,7 @@ class Bitset {
  private:
   unsigned m_size;
   unsigned m_last_block_mask;
-  View<unsigned*, Device, MemoryTraits<RandomAccess> > m_blocks;
+  block_view_type m_blocks;
 
  private:
   template <typename DDevice>
@@ -302,6 +339,7 @@ class ConstBitset {
  public:
   using execution_space = typename Device::execution_space;
   using size_type       = unsigned int;
+  using block_view_type = typename Bitset<Device>::block_view_type::const_type;
 
  private:
   enum { block_size = static_cast<unsigned>(sizeof(unsigned) * CHAR_BIT) };
@@ -340,7 +378,7 @@ class ConstBitset {
   unsigned size() const { return m_size; }
 
   unsigned count() const {
-    Impl::BitsetCount<ConstBitset<Device> > f(*this);
+    Impl::BitsetCount<ConstBitset<Device>> f(*this);
     return f.apply();
   }
 
@@ -356,7 +394,7 @@ class ConstBitset {
 
  private:
   unsigned m_size;
-  View<const unsigned*, Device, MemoryTraits<RandomAccess> > m_blocks;
+  block_view_type m_blocks;
 
  private:
   template <typename DDevice>
diff --git a/packages/kokkos/containers/src/Kokkos_DualView.hpp b/packages/kokkos/containers/src/Kokkos_DualView.hpp
index 07256b446426f9a0024df1755a8dd42045eaf65e..84bced2cc4472c10fcbebb0b11ce6e7ad72d24d1 100644
--- a/packages/kokkos/containers/src/Kokkos_DualView.hpp
+++ b/packages/kokkos/containers/src/Kokkos_DualView.hpp
@@ -86,22 +86,57 @@ inline const Kokkos::Cuda& get_cuda_space(const NonCudaExecSpace&) {
 #endif  // KOKKOS_ENABLE_CUDA
 
 }  // namespace Impl
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
 template <class DataType, class Arg1Type = void, class Arg2Type = void,
           class Arg3Type = void>
+class DualView;
+#else
+template <class DataType, class... Properties>
+class DualView;
+#endif
+
+template <class>
+struct is_dual_view : public std::false_type {};
+
+template <class DT, class... DP>
+struct is_dual_view<DualView<DT, DP...>> : public std::true_type {};
+
+template <class DT, class... DP>
+struct is_dual_view<const DualView<DT, DP...>> : public std::true_type {};
+
+template <class T>
+inline constexpr bool is_dual_view_v = is_dual_view<T>::value;
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
+template <class DataType, class Arg1Type, class Arg2Type, class Arg3Type>
 class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
   template <class, class, class, class>
+#else
+template <class DataType, class... Properties>
+class DualView : public ViewTraits<DataType, Properties...> {
+  template <class, class...>
+#endif
   friend class DualView;
 
  public:
   //! \name Typedefs for device types and various Kokkos::View specializations.
   //@{
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
   using traits = ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type>;
+#else
+  using traits      = ViewTraits<DataType, Properties...>;
+#endif
 
   //! The Kokkos Host Device type;
   using host_mirror_space = typename traits::host_mirror_space;
 
   //! The type of a Kokkos::View on the device.
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
   using t_dev = View<typename traits::data_type, Arg1Type, Arg2Type, Arg3Type>;
+#else
+  using t_dev       = View<typename traits::data_type, Properties...>;
+#endif
 
   /// \typedef t_host
   /// \brief The type of a Kokkos::View host mirror of \c t_dev.
@@ -109,8 +144,12 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
 
   //! The type of a const View on the device.
   //! The type of a Kokkos::View on the device.
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
   using t_dev_const =
       View<typename traits::const_data_type, Arg1Type, Arg2Type, Arg3Type>;
+#else
+  using t_dev_const = View<typename traits::const_data_type, Properties...>;
+#endif
 
   /// \typedef t_host_const
   /// \brief The type of a const View host mirror of \c t_dev_const.
@@ -239,22 +278,32 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
       : modified_flags(t_modified_flags("DualView::modified_flags")),
         d_view(arg_prop, n0, n1, n2, n3, n4, n5, n6, n7) {
     // without UVM, host View mirrors
-    if (Kokkos::Impl::has_type<Impl::WithoutInitializing_t, P...>::value)
+    if constexpr (Kokkos::Impl::has_type<Impl::WithoutInitializing_t,
+                                         P...>::value)
       h_view = Kokkos::create_mirror_view(Kokkos::WithoutInitializing, d_view);
     else
       h_view = Kokkos::create_mirror_view(d_view);
   }
 
   //! Copy constructor (shallow copy)
-  template <class SS, class LS, class DS, class MS>
-  DualView(const DualView<SS, LS, DS, MS>& src)
+  template <typename DT, typename... DP>
+  DualView(const DualView<DT, DP...>& src)
       : modified_flags(src.modified_flags),
         d_view(src.d_view),
         h_view(src.h_view) {}
 
+  //! Copy assignment operator (shallow copy assignment)
+  template <typename DT, typename... DP>
+  DualView& operator=(const DualView<DT, DP...>& src) {
+    modified_flags = src.modified_flags;
+    d_view         = src.d_view;
+    h_view         = src.h_view;
+    return *this;
+  }
+
   //! Subview constructor
-  template <class SD, class S1, class S2, class S3, class Arg0, class... Args>
-  DualView(const DualView<SD, S1, S2, S3>& src, const Arg0& arg0, Args... args)
+  template <class DT, class... DP, class Arg0, class... Args>
+  DualView(const DualView<DT, DP...>& src, const Arg0& arg0, Args... args)
       : modified_flags(src.modified_flags),
         d_view(Kokkos::subview(src.d_view, arg0, args...)),
         h_view(Kokkos::subview(src.h_view, arg0, args...)) {}
@@ -576,8 +625,8 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
         impl_report_host_sync();
       }
     }
-    if (std::is_same<typename t_host::memory_space,
-                     typename t_dev::memory_space>::value) {
+    if constexpr (std::is_same<typename t_host::memory_space,
+                               typename t_dev::memory_space>::value) {
       typename t_dev::execution_space().fence(
           "Kokkos::DualView<>::sync: fence after syncing DualView");
       typename t_host::execution_space().fence(
@@ -1141,23 +1190,24 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
 namespace Kokkos {
 namespace Impl {
 
-template <class D, class A1, class A2, class A3, class... Args>
-struct DualViewSubview {
-  using dst_traits = typename Kokkos::Impl::ViewMapping<
-      void, Kokkos::ViewTraits<D, A1, A2, A3>, Args...>::traits_type;
+template <class V>
+struct V2DV;
 
-  using type = Kokkos::DualView<
-      typename dst_traits::data_type, typename dst_traits::array_layout,
-      typename dst_traits::device_type, typename dst_traits::memory_traits>;
+template <class D, class... P>
+struct V2DV<View<D, P...>> {
+  using type = DualView<D, P...>;
 };
-
 } /* namespace Impl */
 
-template <class D, class A1, class A2, class A3, class... Args>
-typename Impl::DualViewSubview<D, A1, A2, A3, Args...>::type subview(
-    const DualView<D, A1, A2, A3>& src, Args... args) {
-  return typename Impl::DualViewSubview<D, A1, A2, A3, Args...>::type(src,
-                                                                      args...);
+template <class DataType, class... Properties, class... Args>
+auto subview(const DualView<DataType, Properties...>& src, Args&&... args) {
+  // leverage Kokkos::View facilities to deduce the properties of the subview
+  using deduce_subview_type =
+      decltype(subview(std::declval<View<DataType, Properties...>>(),
+                       std::forward<Args>(args)...));
+  // map it back to dual view
+  return typename Impl::V2DV<deduce_subview_type>::type(
+      src, std::forward<Args>(args)...);
 }
 
 } /* namespace Kokkos */
@@ -1171,11 +1221,8 @@ namespace Kokkos {
 // Partial specialization of Kokkos::deep_copy() for DualView objects.
 //
 
-template <class DT, class DL, class DD, class DM, class ST, class SL, class SD,
-          class SM>
-void deep_copy(
-    DualView<DT, DL, DD, DM> dst,  // trust me, this must not be a reference
-    const DualView<ST, SL, SD, SM>& src) {
+template <class DT, class... DP, class ST, class... SP>
+void deep_copy(DualView<DT, DP...>& dst, const DualView<ST, SP...>& src) {
   if (src.need_sync_device()) {
     deep_copy(dst.h_view, src.h_view);
     dst.modify_host();
@@ -1185,12 +1232,9 @@ void deep_copy(
   }
 }
 
-template <class ExecutionSpace, class DT, class DL, class DD, class DM,
-          class ST, class SL, class SD, class SM>
-void deep_copy(
-    const ExecutionSpace& exec,
-    DualView<DT, DL, DD, DM> dst,  // trust me, this must not be a reference
-    const DualView<ST, SL, SD, SM>& src) {
+template <class ExecutionSpace, class DT, class... DP, class ST, class... SP>
+void deep_copy(const ExecutionSpace& exec, DualView<DT, DP...>& dst,
+               const DualView<ST, SP...>& src) {
   if (src.need_sync_device()) {
     deep_copy(exec, dst.h_view, src.h_view);
     dst.modify_host();
diff --git a/packages/kokkos/containers/src/Kokkos_DynRankView.hpp b/packages/kokkos/containers/src/Kokkos_DynRankView.hpp
index ce433b0bfc088a0cea29cd15ff3cbde0c9af5acb..52aa86d8ee4351f14f711ab8da11fe42902b175b 100644
--- a/packages/kokkos/containers/src/Kokkos_DynRankView.hpp
+++ b/packages/kokkos/containers/src/Kokkos_DynRankView.hpp
@@ -221,7 +221,7 @@ KOKKOS_INLINE_FUNCTION bool dyn_rank_view_verify_operator_bounds(
     return (size_t(i) < map.extent(R)) &&
            dyn_rank_view_verify_operator_bounds<R + 1>(rank, map, args...);
   } else if (i != 0) {
-    KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+    Kokkos::printf(
         "DynRankView Debug Bounds Checking Error: at rank %u\n  Extra "
         "arguments beyond the rank must be zero \n",
         R);
@@ -346,7 +346,7 @@ class ViewMapping<
     dst.m_map.m_impl_handle = Kokkos::Impl::ViewDataHandle<DstTraits>::assign(
         src.m_map.m_impl_handle, src.m_track.m_tracker);
     dst.m_track.assign(src.m_track.m_tracker, DstTraits::is_managed);
-    dst.m_rank = src.Rank;
+    dst.m_rank = Kokkos::View<ST, SP...>::rank();
   }
 };
 
@@ -375,6 +375,9 @@ template <class D, class... P>
 struct is_dyn_rank_view<Kokkos::DynRankView<D, P...>> : public std::true_type {
 };
 
+template <class T>
+inline constexpr bool is_dyn_rank_view_v = is_dyn_rank_view<T>::value;
+
 template <typename DataType, class... Properties>
 class DynRankView : public ViewTraits<DataType, Properties...> {
   static_assert(!std::is_array<DataType>::value &&
@@ -1025,7 +1028,7 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
   // Copy/Assign View to DynRankView
   template <class RT, class... RP>
   KOKKOS_INLINE_FUNCTION DynRankView(const View<RT, RP...>& rhs)
-      : m_track(), m_map(), m_rank(rhs.Rank) {
+      : m_track(), m_map(), m_rank(View<RT, RP...>::rank()) {
     using SrcTraits = typename View<RT, RP...>::traits;
     using Mapping =
         Kokkos::Impl::ViewMapping<traits, SrcTraits,
@@ -1089,37 +1092,12 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
           "execution space");
     }
 
-//------------------------------------------------------------
-#if defined(KOKKOS_ENABLE_CUDA)
-    // If allocating in CudaUVMSpace must fence before and after
-    // the allocation to protect against possible concurrent access
-    // on the CPU and the GPU.
-    // Fence using the trait's executon space (which will be Kokkos::Cuda)
-    // to avoid incomplete type errors from usng Kokkos::Cuda directly.
-    if (std::is_same<Kokkos::CudaUVMSpace,
-                     typename traits::device_type::memory_space>::value) {
-      typename traits::device_type::memory_space::execution_space().fence(
-          "Kokkos::DynRankView<>::DynRankView: fence before UVM allocation");
-    }
-#endif
-    //------------------------------------------------------------
-
     Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared(
         prop_copy,
         Impl::DynRankDimTraits<typename traits::specialize>::
             template createLayout<traits, P...>(arg_prop, arg_layout),
         Impl::ViewCtorProp<P...>::has_execution_space);
 
-//------------------------------------------------------------
-#if defined(KOKKOS_ENABLE_CUDA)
-    if (std::is_same<Kokkos::CudaUVMSpace,
-                     typename traits::device_type::memory_space>::value) {
-      typename traits::device_type::memory_space::execution_space().fence(
-          "Kokkos::DynRankView<>::DynRankView: fence after UVM allocation");
-    }
-#endif
-    //------------------------------------------------------------
-
     // Setup and initialization complete, start tracking
     m_track.assign_allocated_record_to_uninitialized(record);
   }
@@ -1573,7 +1551,7 @@ struct DynRankViewFill {
 };
 
 template <class OutputView>
-struct DynRankViewFill<OutputView, std::enable_if_t<OutputView::Rank == 0>> {
+struct DynRankViewFill<OutputView, std::enable_if_t<OutputView::rank == 0>> {
   DynRankViewFill(const OutputView& dst,
                   const typename OutputView::const_value_type& src) {
     Kokkos::Impl::DeepCopy<typename OutputView::memory_space,
@@ -2324,9 +2302,10 @@ inline void impl_resize(const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
   if constexpr (alloc_prop_input::has_execution_space)
     Kokkos::Impl::DynRankViewRemap<drview_type, drview_type>(
         Impl::get_property<Impl::ExecutionSpaceTag>(prop_copy), v_resized, v);
-  else
+  else {
     Kokkos::Impl::DynRankViewRemap<drview_type, drview_type>(v_resized, v);
-
+    Kokkos::fence("Kokkos::resize(DynRankView)");
+  }
   v = v_resized;
 }
 
diff --git a/packages/kokkos/containers/src/Kokkos_DynamicView.hpp b/packages/kokkos/containers/src/Kokkos_DynamicView.hpp
index f8636b6212fdcf32d525d12ae1e1d0e3521441a8..12885edbae9238c0c412d6c59c9ad8f9a8dcd50d 100644
--- a/packages/kokkos/containers/src/Kokkos_DynamicView.hpp
+++ b/packages/kokkos/containers/src/Kokkos_DynamicView.hpp
@@ -553,6 +553,9 @@ template <class D, class... P>
 struct is_dynamic_view<Kokkos::Experimental::DynamicView<D, P...>>
     : public std::true_type {};
 
+template <class T>
+inline constexpr bool is_dynamic_view_v = is_dynamic_view<T>::value;
+
 }  // namespace Kokkos
 
 namespace Kokkos {
diff --git a/packages/kokkos/containers/src/Kokkos_OffsetView.hpp b/packages/kokkos/containers/src/Kokkos_OffsetView.hpp
index 35b28999c1ddb077162535cdbd7a8f92f30ace75..92aead28784fd0acb1499767e124f743b894b1a4 100644
--- a/packages/kokkos/containers/src/Kokkos_OffsetView.hpp
+++ b/packages/kokkos/containers/src/Kokkos_OffsetView.hpp
@@ -43,6 +43,9 @@ struct is_offset_view<OffsetView<D, P...>> : public std::true_type {};
 template <class D, class... P>
 struct is_offset_view<const OffsetView<D, P...>> : public std::true_type {};
 
+template <class T>
+inline constexpr bool is_offset_view_v = is_offset_view<T>::value;
+
 #define KOKKOS_INVALID_OFFSET int64_t(0x7FFFFFFFFFFFFFFFLL)
 #define KOKKOS_INVALID_INDEX_RANGE \
   { KOKKOS_INVALID_OFFSET, KOKKOS_INVALID_OFFSET }
@@ -827,7 +830,7 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
                   "Incompatible OffsetView copy construction");
     Mapping::assign(m_map, aview.impl_map(), m_track);
 
-    for (int i = 0; i < aview.Rank; ++i) {
+    for (size_t i = 0; i < View<RT, RP...>::rank(); ++i) {
       m_begins[i] = 0;
     }
   }
@@ -938,10 +941,10 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
           ")"
           "\n";
 
-    // If there are no errors so far, then rank == Rank
+    // If there are no errors so far, then arg_rank == Rank
     // Otherwise, check as much as possible
-    size_t rank = begins.size() < ends.size() ? begins.size() : ends.size();
-    for (size_t i = 0; i != rank; ++i) {
+    size_t arg_rank = begins.size() < ends.size() ? begins.size() : ends.size();
+    for (size_t i = 0; i != arg_rank; ++i) {
       subtraction_failure sf = check_subtraction(at(ends, i), at(begins, i));
       if (sf != subtraction_failure::none) {
         message +=
@@ -1191,35 +1194,10 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
           "execution space");
     }
 
-    //------------------------------------------------------------
-#if defined(KOKKOS_ENABLE_CUDA)
-    // If allocating in CudaUVMSpace must fence before and after
-    // the allocation to protect against possible concurrent access
-    // on the CPU and the GPU.
-    // Fence using the trait's executon space (which will be Kokkos::Cuda)
-    // to avoid incomplete type errors from usng Kokkos::Cuda directly.
-    if (std::is_same<Kokkos::CudaUVMSpace,
-                     typename traits::device_type::memory_space>::value) {
-      typename traits::device_type::memory_space::execution_space().fence(
-          "Kokkos::OffsetView::OffsetView(): fence before UVM allocation");
-    }
-#endif
-    //------------------------------------------------------------
-
     Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared(
         prop_copy, arg_layout,
         Kokkos::Impl::ViewCtorProp<P...>::has_execution_space);
 
-    //------------------------------------------------------------
-#if defined(KOKKOS_ENABLE_CUDA)
-    if (std::is_same<Kokkos::CudaUVMSpace,
-                     typename traits::device_type::memory_space>::value) {
-      typename traits::device_type::memory_space::execution_space().fence(
-          "Kokkos::OffsetView::OffsetView(): fence after UVM allocation");
-    }
-#endif
-    //------------------------------------------------------------
-
     // Setup and initialization complete, start tracking
     m_track.assign_allocated_record_to_uninitialized(record);
 
@@ -1251,8 +1229,7 @@ shift_input(const T arg, const int64_t offset) {
 }
 
 KOKKOS_INLINE_FUNCTION
-Kokkos::Impl::ALL_t shift_input(const Kokkos::Impl::ALL_t arg,
-                                const int64_t /*offset*/) {
+Kokkos::ALL_t shift_input(const Kokkos::ALL_t arg, const int64_t /*offset*/) {
   return arg;
 }
 
@@ -1302,7 +1279,7 @@ KOKKOS_INLINE_FUNCTION
       Kokkos::Impl::ViewMapping<void /* deduce subview type from source view
                                         traits */
                                 ,
-                                ViewTraits<D, P...>, T>::type::Rank;
+                                ViewTraits<D, P...>, T>::type::rank;
 
   auto theSubview = Kokkos::subview(theView, shiftedArg);
 
@@ -1341,7 +1318,7 @@ KOKKOS_INLINE_FUNCTION
       Kokkos::Impl::ViewMapping<void /* deduce subview type from source view
                                         traits */
                                 ,
-                                ViewTraits<D, P...>, T0, T1>::type::Rank;
+                                ViewTraits<D, P...>, T0, T1>::type::rank;
 
   Kokkos::Array<int64_t, rank> subviewBegins;
   size_t counter = 0;
@@ -1382,7 +1359,7 @@ KOKKOS_INLINE_FUNCTION
       Kokkos::Impl::ViewMapping<void /* deduce subview type from source view
                                         traits */
                                 ,
-                                ViewTraits<D, P...>, T0, T1, T2>::type::Rank;
+                                ViewTraits<D, P...>, T0, T1, T2>::type::rank;
 
   Kokkos::Array<int64_t, rank> subviewBegins;
 
@@ -1427,7 +1404,7 @@ KOKKOS_INLINE_FUNCTION
   constexpr size_t rank = Kokkos::Impl::ViewMapping<
       void /* deduce subview type from source view traits */
       ,
-      ViewTraits<D, P...>, T0, T1, T2, T3>::type::Rank;
+      ViewTraits<D, P...>, T0, T1, T2, T3>::type::rank;
   Kokkos::Array<int64_t, rank> subviewBegins;
 
   size_t counter = 0;
@@ -1474,7 +1451,7 @@ KOKKOS_INLINE_FUNCTION
   constexpr size_t rank = Kokkos::Impl::ViewMapping<
       void /* deduce subview type from source view traits */
       ,
-      ViewTraits<D, P...>, T0, T1, T2, T3, T4>::type::Rank;
+      ViewTraits<D, P...>, T0, T1, T2, T3, T4>::type::rank;
   Kokkos::Array<int64_t, rank> subviewBegins;
 
   size_t counter = 0;
@@ -1526,7 +1503,7 @@ KOKKOS_INLINE_FUNCTION
   constexpr size_t rank = Kokkos::Impl::ViewMapping<
       void /* deduce subview type from source view traits */
       ,
-      ViewTraits<D, P...>, T0, T1, T2, T3, T4, T5>::type::Rank;
+      ViewTraits<D, P...>, T0, T1, T2, T3, T4, T5>::type::rank;
 
   Kokkos::Array<int64_t, rank> subviewBegins;
 
@@ -1581,7 +1558,7 @@ KOKKOS_INLINE_FUNCTION
   constexpr size_t rank = Kokkos::Impl::ViewMapping<
       void /* deduce subview type from source view traits */
       ,
-      ViewTraits<D, P...>, T0, T1, T2, T3, T4, T5, T6>::type::Rank;
+      ViewTraits<D, P...>, T0, T1, T2, T3, T4, T5, T6>::type::rank;
 
   Kokkos::Array<int64_t, rank> subviewBegins;
 
@@ -1640,7 +1617,7 @@ KOKKOS_INLINE_FUNCTION
   constexpr size_t rank = Kokkos::Impl::ViewMapping<
       void /* deduce subview type from source view traits */
       ,
-      ViewTraits<D, P...>, T0, T1, T2, T3, T4, T5, T6, T7>::type::Rank;
+      ViewTraits<D, P...>, T0, T1, T2, T3, T4, T5, T6, T7>::type::rank;
 
   Kokkos::Array<int64_t, rank> subviewBegins;
 
diff --git a/packages/kokkos/containers/src/Kokkos_ScatterView.hpp b/packages/kokkos/containers/src/Kokkos_ScatterView.hpp
index 3b30996ad635a886142a12f75dd8ce8285191296..9d04cf6acd0edc12b28c7c759512c8f2f3c0b526 100644
--- a/packages/kokkos/containers/src/Kokkos_ScatterView.hpp
+++ b/packages/kokkos/containers/src/Kokkos_ScatterView.hpp
@@ -532,7 +532,7 @@ void args_to_array(size_t* array, int pos, T dim0, Dims... dims) {
    subview where the index specified is the largest-stride one. */
 template <typename Layout, int rank, typename V, typename... Args>
 struct Slice {
-  using next       = Slice<Layout, rank - 1, V, Kokkos::Impl::ALL_t, Args...>;
+  using next       = Slice<Layout, rank - 1, V, Kokkos::ALL_t, Args...>;
   using value_type = typename next::value_type;
 
   static value_type get(V const& src, const size_t i, Args... args) {
@@ -663,6 +663,18 @@ template <typename DataType,
                   typename DeviceType::execution_space, Duplication>::type>
 class ScatterView;
 
+template <class>
+struct is_scatter_view : public std::false_type {};
+
+template <class D, class... P>
+struct is_scatter_view<ScatterView<D, P...>> : public std::true_type {};
+
+template <class D, class... P>
+struct is_scatter_view<const ScatterView<D, P...>> : public std::true_type {};
+
+template <class T>
+inline constexpr bool is_scatter_view_v = is_scatter_view<T>::value;
+
 template <typename DataType, typename Op, typename DeviceType, typename Layout,
           typename Duplication, typename Contribution,
           typename OverrideContribution>
@@ -721,10 +733,11 @@ class ScatterView<DataType, Layout, DeviceType, Op, ScatterNonDuplicated,
       : internal_view(other_view.internal_view) {}
 
   template <typename OtherDataType, typename OtherDeviceType>
-  KOKKOS_FUNCTION void operator=(
+  KOKKOS_FUNCTION ScatterView& operator=(
       const ScatterView<OtherDataType, Layout, OtherDeviceType, Op,
                         ScatterNonDuplicated, Contribution>& other_view) {
     internal_view = other_view.internal_view;
+    return *this;
   }
 
   template <typename OverrideContribution = Contribution>
@@ -942,11 +955,12 @@ class ScatterView<DataType, Kokkos::LayoutRight, DeviceType, Op,
         internal_view(other_view.internal_view) {}
 
   template <typename OtherDataType, typename OtherDeviceType>
-  KOKKOS_FUNCTION void operator=(
+  KOKKOS_FUNCTION ScatterView& operator=(
       const ScatterView<OtherDataType, Kokkos::LayoutRight, OtherDeviceType, Op,
                         ScatterDuplicated, Contribution>& other_view) {
     unique_token  = other_view.unique_token;
     internal_view = other_view.internal_view;
+    return *this;
   }
 
   template <typename RT, typename... RP>
@@ -1019,7 +1033,7 @@ class ScatterView<DataType, Kokkos::LayoutRight, DeviceType, Op,
                                              internal_view_type>::value_type
   subview() const {
     return Kokkos::Impl::Experimental::Slice<
-        Kokkos::LayoutRight, internal_view_type::Rank,
+        Kokkos::LayoutRight, internal_view_type::rank,
         internal_view_type>::get(internal_view, 0);
   }
 
@@ -1278,11 +1292,12 @@ class ScatterView<DataType, Kokkos::LayoutLeft, DeviceType, Op,
         internal_view(other_view.internal_view) {}
 
   template <typename OtherDataType, typename OtherDeviceType>
-  KOKKOS_FUNCTION void operator=(
+  KOKKOS_FUNCTION ScatterView& operator=(
       const ScatterView<OtherDataType, Kokkos::LayoutLeft, OtherDeviceType, Op,
                         ScatterDuplicated, Contribution>& other_view) {
     unique_token  = other_view.unique_token;
     internal_view = other_view.internal_view;
+    return *this;
   }
 
   template <typename OverrideContribution = Contribution>
diff --git a/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp b/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp
index 058b6626c40cb13a8c6352a86a7914b9aa785ed2..e001c062de3a38ec8e9c918276352b3475832909 100644
--- a/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp
+++ b/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp
@@ -34,8 +34,7 @@
 
 #include <impl/Kokkos_Traits.hpp>
 #include <impl/Kokkos_UnorderedMap_impl.hpp>
-
-#include <iostream>
+#include <impl/Kokkos_ViewCtor.hpp>
 
 #include <cstdint>
 
@@ -119,6 +118,36 @@ class UnorderedMapInsertResult {
   uint32_t m_status;
 };
 
+/// \class UnorderedMapInsertOpTypes
+///
+/// \brief Operations applied to the values array upon subsequent insertions.
+///
+/// The default behavior when a k,v pair already exists in the UnorderedMap is
+/// to perform no operation. Alternatively, the caller may select to
+/// instantiate the UnorderedMap with the AtomicAdd insert operator such that
+/// duplicate keys accumulate values into the given values array entry.
+/// \tparam ValueTypeView The UnorderedMap value array type.
+/// \tparam ValuesIdxType The index type for lookups in the value array.
+///
+/// Supported operations:
+///   NoOp:      the first key inserted stores the associated value.
+///   AtomicAdd: duplicate key insertions sum values together.
+template <class ValueTypeView, class ValuesIdxType>
+struct UnorderedMapInsertOpTypes {
+  using value_type = typename ValueTypeView::non_const_value_type;
+  struct NoOp {
+    KOKKOS_FUNCTION
+    void op(ValueTypeView, ValuesIdxType, const value_type) const {}
+  };
+  struct AtomicAdd {
+    KOKKOS_FUNCTION
+    void op(ValueTypeView values, ValuesIdxType values_idx,
+            const value_type v) const {
+      Kokkos::atomic_add(values.data() + values_idx, v);
+    }
+  };
+};
+
 /// \class UnorderedMap
 /// \brief Thread-safe, performance-portable lookup table.
 ///
@@ -186,7 +215,6 @@ class UnorderedMap {
  public:
   //! \name Public types and constants
   //@{
-
   // key_types
   using declared_key_type = Key;
   using key_type          = std::remove_const_t<declared_key_type>;
@@ -232,7 +260,6 @@ class UnorderedMap {
       UnorderedMap<Key, Value, host_mirror_space, Hasher, EqualTo>;
 
   using histogram_type = Impl::UnorderedMapHistogram<const_map_type>;
-
   //@}
 
  private:
@@ -263,37 +290,89 @@ class UnorderedMap {
  public:
   //! \name Public member functions
   //@{
+  using default_op_type =
+      typename UnorderedMapInsertOpTypes<value_type_view, uint32_t>::NoOp;
 
   /// \brief Constructor
   ///
   /// \param capacity_hint [in] Initial guess of how many unique keys will be
-  /// inserted into the map \param hash [in] Hasher function for \c Key
-  /// instances.  The
-  ///   default value usually suffices.
+  ///                           inserted into the map.
+  /// \param hash          [in] Hasher function for \c Key instances.  The
+  ///                           default value usually suffices.
+  /// \param equal_to      [in] The operator used for determining if two
+  ///                           keys are equal.
   UnorderedMap(size_type capacity_hint = 0, hasher_type hasher = hasher_type(),
                equal_to_type equal_to = equal_to_type())
-      : m_bounded_insert(true),
-        m_hasher(hasher),
-        m_equal_to(equal_to),
-        m_size(),
-        m_available_indexes(calculate_capacity(capacity_hint)),
-        m_hash_lists(view_alloc(WithoutInitializing, "UnorderedMap hash list"),
-                     Impl::find_hash_size(capacity())),
-        m_next_index(view_alloc(WithoutInitializing, "UnorderedMap next index"),
-                     capacity() + 1)  // +1 so that the *_at functions can
-                                      // always return a valid reference
-        ,
-        m_keys("UnorderedMap keys", capacity()),
-        m_values("UnorderedMap values", (is_set ? 0 : capacity())),
-        m_scalars("UnorderedMap scalars") {
+      : UnorderedMap(Kokkos::view_alloc(), capacity_hint, hasher, equal_to) {}
+
+  template <class... P>
+  UnorderedMap(const Impl::ViewCtorProp<P...> &arg_prop,
+               size_type capacity_hint = 0, hasher_type hasher = hasher_type(),
+               equal_to_type equal_to = equal_to_type())
+      : m_bounded_insert(true), m_hasher(hasher), m_equal_to(equal_to) {
     if (!is_insertable_map) {
       Kokkos::Impl::throw_runtime_exception(
           "Cannot construct a non-insertable (i.e. const key_type) "
           "unordered_map");
     }
 
-    Kokkos::deep_copy(m_hash_lists, invalid_index);
-    Kokkos::deep_copy(m_next_index, invalid_index);
+    //! Ensure that allocation properties are consistent.
+    using alloc_prop_t = std::decay_t<decltype(arg_prop)>;
+    static_assert(alloc_prop_t::initialize,
+                  "Allocation property 'initialize' should be true.");
+    static_assert(
+        !alloc_prop_t::has_pointer,
+        "Allocation properties should not contain the 'pointer' property.");
+
+    /// Update allocation properties with 'label' and 'without initializing'
+    /// properties.
+    const auto prop_copy =
+        Impl::with_properties_if_unset(arg_prop, std::string("UnorderedMap"));
+    const auto prop_copy_noinit =
+        Impl::with_properties_if_unset(prop_copy, Kokkos::WithoutInitializing);
+
+    //! Initialize member views.
+    m_size = shared_size_t(Kokkos::view_alloc(
+        Kokkos::DefaultHostExecutionSpace{},
+        Impl::get_property<Impl::LabelTag>(prop_copy) + " - size"));
+
+    m_available_indexes =
+        bitset_type(Kokkos::Impl::with_updated_label(prop_copy, " - bitset"),
+                    calculate_capacity(capacity_hint));
+
+    m_hash_lists = size_type_view(
+        Kokkos::Impl::with_updated_label(prop_copy_noinit, " - hash list"),
+        Impl::find_hash_size(capacity()));
+
+    m_next_index = size_type_view(
+        Kokkos::Impl::with_updated_label(prop_copy_noinit, " - next index"),
+        capacity() + 1);  // +1 so that the *_at functions can always return a
+                          // valid reference
+
+    m_keys = key_type_view(
+        Kokkos::Impl::with_updated_label(prop_copy, " - keys"), capacity());
+
+    m_values = value_type_view(
+        Kokkos::Impl::with_updated_label(prop_copy, " - values"),
+        is_set ? 0 : capacity());
+
+    m_scalars =
+        scalars_view(Kokkos::Impl::with_updated_label(prop_copy, " - scalars"));
+
+    /**
+     * Deep copies should also be done using the space instance if given.
+     * Instead of the if/else we could use the
+     * @c get_property_or_default, but giving even the default execution space
+     * instance will change the behavior of @c deep_copy.
+     */
+    if constexpr (alloc_prop_t::has_execution_space) {
+      const auto &space = Impl::get_property<Impl::ExecutionSpaceTag>(arg_prop);
+      Kokkos::deep_copy(space, m_hash_lists, invalid_index);
+      Kokkos::deep_copy(space, m_next_index, invalid_index);
+    } else {
+      Kokkos::deep_copy(m_hash_lists, invalid_index);
+      Kokkos::deep_copy(m_next_index, invalid_index);
+    }
   }
 
   void reset_failed_insert_flag() { reset_flag(failed_insert_idx); }
@@ -315,7 +394,7 @@ class UnorderedMap {
       Kokkos::deep_copy(m_keys, tmp);
     }
     Kokkos::deep_copy(m_scalars, 0);
-    m_size = 0;
+    m_size() = 0;
   }
 
   KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const {
@@ -369,10 +448,10 @@ class UnorderedMap {
   size_type size() const {
     if (capacity() == 0u) return 0u;
     if (modified()) {
-      m_size = m_available_indexes.count();
+      m_size() = m_available_indexes.count();
       reset_flag(modified_idx);
     }
-    return m_size;
+    return m_size();
   }
 
   /// \brief The current number of failed insert() calls.
@@ -442,9 +521,18 @@ class UnorderedMap {
   /// \param v [in] The corresponding value to attempt to insert.  If
   ///   using this class as a set (with Value = void), then you need not
   ///   provide this value.
-  KOKKOS_INLINE_FUNCTION
-  insert_result insert(key_type const &k,
-                       impl_value_type const &v = impl_value_type()) const {
+  /// \param insert_op [in] The operator used for combining values if a
+  ///                       key already exists. See
+  ///                       Kokkos::UnorderedMapInsertOpTypes for more ops.
+  template <typename InsertOpType = default_op_type>
+  KOKKOS_INLINE_FUNCTION insert_result
+  insert(key_type const &k, impl_value_type const &v = impl_value_type(),
+         [[maybe_unused]] InsertOpType arg_insert_op = InsertOpType()) const {
+    if constexpr (is_set) {
+      static_assert(std::is_same_v<InsertOpType, default_op_type>,
+                    "Insert Operations are not supported on sets.");
+    }
+
     insert_result result;
 
     if (!is_insertable_map || capacity() == 0u ||
@@ -527,11 +615,14 @@ class UnorderedMap {
           // Previously claimed an unused entry that was not inserted.
           // Release this unused entry immediately.
           if (!m_available_indexes.reset(new_index)) {
-            KOKKOS_IMPL_DO_NOT_USE_PRINTF("Unable to free existing\n");
+            Kokkos::printf("Unable to free existing\n");
           }
         }
 
         result.set_existing(curr, free_existing);
+        if constexpr (!is_set) {
+          arg_insert_op.op(m_values, curr, v);
+        }
         not_done = false;
       }
       //------------------------------------------------------------
@@ -725,7 +816,7 @@ class UnorderedMap {
       tmp.m_bounded_insert    = src.m_bounded_insert;
       tmp.m_hasher            = src.m_hasher;
       tmp.m_equal_to          = src.m_equal_to;
-      tmp.m_size              = src.size();
+      tmp.m_size()            = src.m_size();
       tmp.m_available_indexes = bitset_type(src.capacity());
       tmp.m_hash_lists        = size_type_view(
           view_alloc(WithoutInitializing, "UnorderedMap hash list"),
@@ -818,7 +909,8 @@ class UnorderedMap {
   bool m_bounded_insert;
   hasher_type m_hasher;
   equal_to_type m_equal_to;
-  mutable size_type m_size;
+  using shared_size_t = View<size_type, Kokkos::DefaultHostExecutionSpace>;
+  shared_size_t m_size;
   bitset_type m_available_indexes;
   size_type_view m_hash_lists;
   size_type_view m_next_index;
diff --git a/packages/kokkos/containers/src/Kokkos_Vector.hpp b/packages/kokkos/containers/src/Kokkos_Vector.hpp
index 29bb15f5eab84ea32522889417d34806b525edce..88109fb0ba56ac4afe35e837b146ce152342792d 100644
--- a/packages/kokkos/containers/src/Kokkos_Vector.hpp
+++ b/packages/kokkos/containers/src/Kokkos_Vector.hpp
@@ -21,6 +21,22 @@
 #define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_VECTOR
 #endif
 
+#include <Kokkos_Macros.hpp>
+
+#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4)
+#if defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS)
+namespace {
+[[deprecated("Deprecated <Kokkos_Vector.hpp> header is included")]] int
+emit_warning_kokkos_vector_deprecated() {
+  return 0;
+}
+static auto do_not_include = emit_warning_kokkos_vector_deprecated();
+}  // namespace
+#endif
+#else
+#error "Deprecated <Kokkos_Vector.hpp> header is included"
+#endif
+
 #include <Kokkos_Core_fwd.hpp>
 #include <Kokkos_DualView.hpp>
 
@@ -31,8 +47,10 @@
  */
 namespace Kokkos {
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
 template <class Scalar, class Arg1Type = void>
-class vector : public DualView<Scalar*, LayoutLeft, Arg1Type> {
+class KOKKOS_DEPRECATED vector
+    : public DualView<Scalar*, LayoutLeft, Arg1Type> {
  public:
   using value_type      = Scalar;
   using pointer         = Scalar*;
@@ -196,10 +214,16 @@ class vector : public DualView<Scalar*, LayoutLeft, Arg1Type> {
 
   iterator begin() const { return DV::h_view.data(); }
 
+  const_iterator cbegin() const { return DV::h_view.data(); }
+
   iterator end() const {
     return _size > 0 ? DV::h_view.data() + _size : DV::h_view.data();
   }
 
+  const_iterator cend() const {
+    return _size > 0 ? DV::h_view.data() + _size : DV::h_view.data();
+  }
+
   reference front() { return DV::h_view(0); }
 
   reference back() { return DV::h_view(_size - 1); }
@@ -306,6 +330,7 @@ class vector : public DualView<Scalar*, LayoutLeft, Arg1Type> {
     void operator()(const int& i) const { _data(i) = _val; }
   };
 };
+#endif
 
 }  // namespace Kokkos
 #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_VECTOR
diff --git a/packages/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp b/packages/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp
index c598e4b67b26596cbec7dc261860f8fdc7231825..b71037afeaacfce5c9738c8345d93a58909f8dbd 100644
--- a/packages/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp
+++ b/packages/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp
@@ -23,7 +23,6 @@
 
 #include <cstdio>
 #include <climits>
-#include <iostream>
 #include <iomanip>
 
 namespace Kokkos {
diff --git a/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp b/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp
index 2c431d1579276606a97fd92bf2688a73a70dba7b..8f8cd9523b726379c62a4e2cd7f5d096988f8b13 100644
--- a/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp
+++ b/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp
@@ -22,7 +22,6 @@
 
 #include <cstdio>
 #include <climits>
-#include <iostream>
 #include <iomanip>
 
 namespace Kokkos {
@@ -226,8 +225,8 @@ struct UnorderedMapPrint {
     uint32_t list = m_map.m_hash_lists(i);
     for (size_type curr = list, ii = 0; curr != invalid_index;
          curr = m_map.m_next_index[curr], ++ii) {
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d[%d]: %d->%d\n", list, ii,
-                                    m_map.key_at(curr), m_map.value_at(curr));
+      Kokkos::printf("%d[%d]: %d->%d\n", list, ii, m_map.key_at(curr),
+                     m_map.value_at(curr));
     }
   }
 };
diff --git a/packages/kokkos/containers/unit_tests/CMakeLists.txt b/packages/kokkos/containers/unit_tests/CMakeLists.txt
index 261d9dcd4215d712ef7b6fca3b0ad08c9ecb0052..e69e46bb6a85c04bbfa4eda0cbd78ee96ee2fe51 100644
--- a/packages/kokkos/containers/unit_tests/CMakeLists.txt
+++ b/packages/kokkos/containers/unit_tests/CMakeLists.txt
@@ -28,6 +28,9 @@ foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL)
         Vector
         ViewCtorPropEmbeddedDim
         )
+      if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4 AND Name STREQUAL "Vector")
+        continue() # skip Kokkos::vector test if deprecated code 4 is not enabled
+      endif()
       # Write to a temporary intermediate file and call configure_file to avoid
       # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs.
       set(file ${dir}/Test${Tag}_${Name}.cpp)
@@ -43,15 +46,22 @@ foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL)
      LIST(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_DynViewAPI_generic.cpp)
     endif()
 
-    KOKKOS_ADD_EXECUTABLE_AND_TEST(UnitTest_${Tag} SOURCES ${UnitTestSources})
+    # FIXME_NVHPC: NVC++-S-0000-Internal compiler error. extractor: bad opc       0
+    if(KOKKOS_ENABLE_CUDA AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
+     LIST(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_WithoutInitializing.cpp)
+    endif()
+
+    KOKKOS_ADD_EXECUTABLE_AND_TEST(ContainersUnitTest_${Tag} SOURCES ${UnitTestSources})
   endif()
 endforeach()
 
 SET(COMPILE_ONLY_SOURCES
   TestCreateMirror.cpp
+  TestDualViewParameterPack.cpp
+  TestIsViewTrait.cpp
 )
 KOKKOS_ADD_EXECUTABLE(
-  TestCompileOnly
+  ContainersTestCompileOnly
   SOURCES
   TestCompileMain.cpp
   ${COMPILE_ONLY_SOURCES}
diff --git a/packages/kokkos/containers/unit_tests/Makefile b/packages/kokkos/containers/unit_tests/Makefile
index a6235983d56554f58a624585f9f124d6eccb529f..2e35832cc8916e998b075ee2f85a7e79a36d7ccf 100644
--- a/packages/kokkos/containers/unit_tests/Makefile
+++ b/packages/kokkos/containers/unit_tests/Makefile
@@ -31,7 +31,7 @@ KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/containers/unit_tests -I${KO
 TEST_TARGETS =
 TARGETS =
 
-TESTS = Bitset DualView DynamicView DynViewAPI_generic DynViewAPI_rank12345 DynViewAPI_rank67 ErrorReporter OffsetView ScatterView StaticCrsGraph UnorderedMap Vector ViewCtorPropEmbeddedDim
+TESTS = Bitset DualView DynamicView DynViewAPI_generic DynViewAPI_rank12345 DynViewAPI_rank67 ErrorReporter OffsetView ScatterView StaticCrsGraph UnorderedMap ViewCtorPropEmbeddedDim
 tmp := $(foreach device, $(KOKKOS_DEVICELIST), \
   tmp2 := $(foreach test, $(TESTS), \
     $(if $(filter Test$(device)_$(test).cpp, $(shell ls Test$(device)_$(test).cpp 2>/dev/null)),,\
@@ -54,7 +54,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
 	OBJ_CUDA += TestCuda_ScatterView.o
 	OBJ_CUDA += TestCuda_StaticCrsGraph.o
 	OBJ_CUDA += TestCuda_UnorderedMap.o
-	OBJ_CUDA += TestCuda_Vector.o
 	OBJ_CUDA += TestCuda_ViewCtorPropEmbeddedDim.o
 	TARGETS += KokkosContainers_UnitTest_Cuda
 	TEST_TARGETS += test-cuda
@@ -73,7 +72,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1)
 	OBJ_THREADS += TestThreads_ScatterView.o
 	OBJ_THREADS += TestThreads_StaticCrsGraph.o
 	OBJ_THREADS += TestThreads_UnorderedMap.o
-	OBJ_THREADS += TestThreads_Vector.o
 	OBJ_THREADS += TestThreads_ViewCtorPropEmbeddedDim.o
 	TARGETS += KokkosContainers_UnitTest_Threads
 	TEST_TARGETS += test-threads
@@ -92,7 +90,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
 	OBJ_OPENMP += TestOpenMP_ScatterView.o
 	OBJ_OPENMP += TestOpenMP_StaticCrsGraph.o
 	OBJ_OPENMP += TestOpenMP_UnorderedMap.o
-	OBJ_OPENMP += TestOpenMP_Vector.o
 	OBJ_OPENMP += TestOpenMP_ViewCtorPropEmbeddedDim.o
 	TARGETS += KokkosContainers_UnitTest_OpenMP
 	TEST_TARGETS += test-openmp
@@ -111,7 +108,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
 	OBJ_HPX += TestHPX_ScatterView.o
 	OBJ_HPX += TestHPX_StaticCrsGraph.o
 	OBJ_HPX += TestHPX_UnorderedMap.o
-	OBJ_HPX += TestHPX_Vector.o
 	OBJ_HPX += TestHPX_ViewCtorPropEmbeddedDim.o
 	TARGETS += KokkosContainers_UnitTest_HPX
 	TEST_TARGETS += test-hpx
@@ -130,7 +126,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
 	OBJ_SERIAL += TestSerial_ScatterView.o
 	OBJ_SERIAL += TestSerial_StaticCrsGraph.o
 	OBJ_SERIAL += TestSerial_UnorderedMap.o
-	OBJ_SERIAL += TestSerial_Vector.o
 	OBJ_SERIAL += TestSerial_ViewCtorPropEmbeddedDim.o
 	TARGETS += KokkosContainers_UnitTest_Serial
 	TEST_TARGETS += test-serial
diff --git a/packages/kokkos/containers/unit_tests/TestBitset.hpp b/packages/kokkos/containers/unit_tests/TestBitset.hpp
index d97bdf9679f18ffad68903a4bc419788f93a444c..3ad0d2bf573431868464f80fe1af413c4b9da8db 100644
--- a/packages/kokkos/containers/unit_tests/TestBitset.hpp
+++ b/packages/kokkos/containers/unit_tests/TestBitset.hpp
@@ -190,19 +190,8 @@ void test_bitset() {
     {
       Impl::TestBitsetTest<const_bitset_type> f(bitset);
       uint32_t count = f.testit();
-#if defined(KOKKOS_ENABLE_CUDA) && \
-    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC
-      if constexpr (!std::is_same_v<typename Device::execution_space,
-                                    Kokkos::Cuda>) {
-        EXPECT_EQ(bitset.size(), count);
-        EXPECT_EQ(count, bitset.count());
-      } else {
-        (void)count;
-      }
-#else
       EXPECT_EQ(bitset.size(), count);
       EXPECT_EQ(count, bitset.count());
-#endif
     }
 
     // std::cout << "  Check reset() " << std::endl;
diff --git a/packages/kokkos/containers/unit_tests/TestDualView.hpp b/packages/kokkos/containers/unit_tests/TestDualView.hpp
index 6e2e56c5cf7864ba9897bbb3f29bf0f2a8e326d7..a15e5fa299726e80d7d6654e3ea28e801cd5f4c5 100644
--- a/packages/kokkos/containers/unit_tests/TestDualView.hpp
+++ b/packages/kokkos/containers/unit_tests/TestDualView.hpp
@@ -60,6 +60,47 @@ struct test_dualview_alloc {
   }
 };
 
+template <typename Scalar, class Device>
+struct test_dualview_copy_construction_and_assignment {
+  using scalar_type     = Scalar;
+  using execution_space = Device;
+
+  void operator()() {
+    constexpr unsigned int n = 10;
+    constexpr unsigned int m = 5;
+
+    using SrcViewType = Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device>;
+    using DstViewType =
+        Kokkos::DualView<const Scalar * [m], Kokkos::LayoutLeft, Device>;
+
+    SrcViewType a("A", n, m);
+
+    // Copy construction
+    DstViewType b(a);
+
+    // Copy assignment
+    DstViewType c = a;
+
+    // Check equality (shallow) of the host and device views
+    ASSERT_EQ(a.view_host(), b.view_host());
+    ASSERT_EQ(a.view_device(), b.view_device());
+
+    ASSERT_EQ(a.view_host(), c.view_host());
+    ASSERT_EQ(a.view_device(), c.view_device());
+
+    // We can't test shallow equality of modified_flags because it's protected.
+    // So we test it indirectly through sync state behavior.
+    if (!std::decay_t<SrcViewType>::impl_dualview_is_single_device::value) {
+      a.clear_sync_state();
+      a.modify_host();
+      ASSERT_TRUE(a.need_sync_device());
+      ASSERT_TRUE(b.need_sync_device());
+      ASSERT_TRUE(c.need_sync_device());
+      a.clear_sync_state();
+    }
+  }
+};
+
 template <typename Scalar, class Device>
 struct test_dualview_combinations {
   using self_type = test_dualview_combinations<Scalar, Device>;
@@ -379,6 +420,11 @@ void test_dualview_alloc(unsigned int size) {
   ASSERT_TRUE(test.result);
 }
 
+template <typename Scalar, typename Device>
+void test_dualview_copy_construction_and_assignment() {
+  Impl::test_dualview_copy_construction_and_assignment<Scalar, Device>()();
+}
+
 template <typename Scalar, typename Device>
 void test_dualview_deep_copy() {
   Impl::test_dual_view_deep_copy<Scalar, Device>();
@@ -404,6 +450,10 @@ TEST(TEST_CATEGORY, dualview_alloc) {
   test_dualview_alloc<int, TEST_EXECSPACE>(10);
 }
 
+TEST(TEST_CATEGORY, test_dualview_copy_construction_and_assignment) {
+  test_dualview_copy_construction_and_assignment<int, TEST_EXECSPACE>();
+}
+
 TEST(TEST_CATEGORY, dualview_combinations_without_init) {
   test_dualview_combinations<int, TEST_EXECSPACE>(10, false);
 }
diff --git a/packages/kokkos/containers/unit_tests/TestDualViewParameterPack.cpp b/packages/kokkos/containers/unit_tests/TestDualViewParameterPack.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..861eba2b3e42c0a8ce91dbd0dd91bafca3c48b12
--- /dev/null
+++ b/packages/kokkos/containers/unit_tests/TestDualViewParameterPack.cpp
@@ -0,0 +1,43 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_DualView.hpp>
+
+namespace {
+
+template <class DataType, class Arg1Type = void, class Arg2Type = void,
+          class Arg3Type = void>
+void not_supported_anymore(
+    Kokkos::DualView<DataType, Arg1Type, Arg2Type, Arg2Type> x) {
+  static_assert(Kokkos::is_dual_view_v<decltype(x)>);
+}
+
+template <class DataType, class... Properties>
+void prefer_instead(Kokkos::DualView<DataType, Properties...> x) {
+  static_assert(Kokkos::is_dual_view_v<decltype(x)>);
+}
+
+using KDV = Kokkos::DualView<int*>;
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
+static_assert(
+    std::is_void_v<decltype(not_supported_anymore(std::declval<KDV>()))>);
+#endif
+
+static_assert(std::is_void_v<decltype(prefer_instead(std::declval<KDV>()))>);
+
+}  // namespace
diff --git a/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp b/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp
index c7a5b13dbbecd3f07a83269c446e61fc63aa8259..4ecb6cf25cc5bb1cf15746874821da1bd8ba4611 100644
--- a/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp
+++ b/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp
@@ -1200,19 +1200,19 @@ class TestDynViewAPI {
 
     View7 vtest1("vtest1", 2, 2, 2, 2, 2, 2, 2);
     dView0 dfromv1(vtest1);
-    ASSERT_EQ(dfromv1.rank(), vtest1.Rank);
+    ASSERT_EQ(dfromv1.rank(), vtest1.rank);
     ASSERT_EQ(dfromv1.extent(0), vtest1.extent(0));
     ASSERT_EQ(dfromv1.extent(1), vtest1.extent(1));
     ASSERT_EQ(dfromv1.use_count(), vtest1.use_count());
 
     dView0 dfromv2(vcast);
-    ASSERT_EQ(dfromv2.rank(), vcast.Rank);
+    ASSERT_EQ(dfromv2.rank(), vcast.rank);
     ASSERT_EQ(dfromv2.extent(0), vcast.extent(0));
     ASSERT_EQ(dfromv2.extent(1), vcast.extent(1));
     ASSERT_EQ(dfromv2.use_count(), vcast.use_count());
 
     dView0 dfromv3 = vcast1;
-    ASSERT_EQ(dfromv3.rank(), vcast1.Rank);
+    ASSERT_EQ(dfromv3.rank(), vcast1.rank);
     ASSERT_EQ(dfromv3.extent(0), vcast1.extent(0));
     ASSERT_EQ(dfromv3.extent(1), vcast1.extent(1));
     ASSERT_EQ(dfromv3.use_count(), vcast1.use_count());
diff --git a/packages/kokkos/containers/unit_tests/TestDynViewAPI_rank12345.hpp b/packages/kokkos/containers/unit_tests/TestDynViewAPI_rank12345.hpp
index d6019e823960852ae681b0b7d984503ea78ae8fe..9cefad190a54dea282a76cbf7c947b080dc188be 100644
--- a/packages/kokkos/containers/unit_tests/TestDynViewAPI_rank12345.hpp
+++ b/packages/kokkos/containers/unit_tests/TestDynViewAPI_rank12345.hpp
@@ -20,4 +20,56 @@ namespace Test {
 TEST(TEST_CATEGORY, dyn_rank_view_api_operator_rank12345) {
   TestDynViewAPI<double, TEST_EXECSPACE>::run_operator_test_rank12345();
 }
+
+template <typename SharedMemorySpace>
+void test_dyn_rank_view_resize() {
+  int n = 1000;
+  Kokkos::DynRankView<double, SharedMemorySpace> device_view("device view", n);
+  // Make sure we don't deallocate memory in Kokkos::resize
+  auto device_view_copy = device_view;
+
+  Kokkos::resize(device_view, 2 * n);
+
+  // Loop in reverse to increase likelihood of missing fence detection assuming
+  // that resize copies values in order.
+  for (int i = 2 * n - 1; i >= 0; --i) device_view(i) = i + 1;
+
+  Kokkos::fence();
+
+  // Check that Kokkos::resize completed before setting the values on the host
+  // manually (possibly because of missing fences).
+  for (int i = 0; i < 2 * n; ++i) ASSERT_EQ(device_view(i), i + 1);
+}
+
+template <typename SharedMemorySpace>
+void test_dyn_rank_view_realloc() {
+  int n = 1000;
+  Kokkos::DynRankView<double, SharedMemorySpace> device_view("device view", n);
+  // Make sure we don't deallocate memory in Kokkos::realloc
+  auto device_view_copy = device_view;
+
+  Kokkos::realloc(device_view, 2 * n);
+
+  // Loop in reverse to increase likelihood of missing fence detection assuming
+  // that realloc sets values in order.
+  for (int i = 2 * n - 1; i >= 0; --i) device_view(i) = i + 1;
+
+  Kokkos::fence();
+
+  // Check that Kokkos::realloc completed before setting the values on the host
+  // manually (possibly because of missing fences).
+  for (int i = 0; i < 2 * n; ++i) ASSERT_EQ(device_view(i), i + 1);
+}
+
+#ifdef KOKKOS_HAS_SHARED_SPACE
+TEST(TEST_CATEGORY, dyn_rank_view_check_fence_resize_realloc) {
+  if constexpr (std::is_same_v<TEST_EXECSPACE, Kokkos::DefaultExecutionSpace>) {
+    test_dyn_rank_view_resize<Kokkos::SharedSpace>();
+    test_dyn_rank_view_realloc<Kokkos::SharedSpace>();
+  } else {
+    GTEST_SKIP() << "skipping since not default execution space";
+  }
+}
+#endif
+
 }  // namespace Test
diff --git a/packages/kokkos/containers/unit_tests/TestDynamicView.hpp b/packages/kokkos/containers/unit_tests/TestDynamicView.hpp
index cd1511276f46785b04a33ae08c77214ac806fff8..c8f8fed3b8b36a3108f3bcb3df075ab7ab79d6d9 100644
--- a/packages/kokkos/containers/unit_tests/TestDynamicView.hpp
+++ b/packages/kokkos/containers/unit_tests/TestDynamicView.hpp
@@ -274,19 +274,13 @@ struct TestDynamicView {
       // swapped in the deep_copy implementation.
       // Once that's fixed, both deep_copy's will fail at runtime because the
       // destination execution space cannot access the source memory space.
-      try {
-        Kokkos::deep_copy(host_view, device_dynamic_view);
-      } catch (std::runtime_error const& error) {
-        std::string msg = error.what();
-        std::cerr << "Copy from on-device DynamicView to on-host View failed:\n"
-                  << msg << std::endl;
-      }
-      try {
-        Kokkos::deep_copy(device_dynamic_view, host_view);
-      } catch (std::runtime_error const& error) {
-        std::string msg = error.what();
-        std::cerr << "Copy from on-host View to on-device DynamicView failed:\n"
-                  << msg << std::endl;
+      // Check if the memory spaces are different before testing the deep_copy.
+      if (!Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                      memory_space>::accessible) {
+        ASSERT_THROW(Kokkos::deep_copy(host_view, device_dynamic_view),
+                     std::runtime_error);
+        ASSERT_THROW(Kokkos::deep_copy(device_dynamic_view, host_view),
+                     std::runtime_error);
       }
     }
   }
diff --git a/packages/kokkos/containers/unit_tests/TestIsViewTrait.cpp b/packages/kokkos/containers/unit_tests/TestIsViewTrait.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..994ba7ca0cbcef94d72fdcf45df5f101556a7523
--- /dev/null
+++ b/packages/kokkos/containers/unit_tests/TestIsViewTrait.cpp
@@ -0,0 +1,68 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_DualView.hpp>
+#include <Kokkos_DynRankView.hpp>
+#include <Kokkos_DynamicView.hpp>
+#include <Kokkos_OffsetView.hpp>
+#include <Kokkos_ScatterView.hpp>
+
+namespace {
+
+using view_t          = Kokkos::View<int*>;
+using dual_view_t     = Kokkos::DualView<int*>;
+using dyn_rank_view_t = Kokkos::DynRankView<int*>;
+using dynamic_view_t  = Kokkos::Experimental::DynamicView<int*>;
+using offset_view_t   = Kokkos::Experimental::OffsetView<int*>;
+using scatter_view_t  = Kokkos::Experimental::ScatterView<int*>;
+
+static_assert(Kokkos::is_dual_view_v<dual_view_t>);
+static_assert(!Kokkos::is_dyn_rank_view_v<dual_view_t>);
+static_assert(!Kokkos::is_dynamic_view_v<dual_view_t>);
+static_assert(!Kokkos::Experimental::is_offset_view_v<dual_view_t>);
+static_assert(!Kokkos::Experimental::is_scatter_view_v<dual_view_t>);
+static_assert(!Kokkos::is_view_v<dual_view_t>);
+
+static_assert(!Kokkos::is_dual_view_v<dyn_rank_view_t>);
+static_assert(Kokkos::is_dyn_rank_view_v<dyn_rank_view_t>);
+static_assert(!Kokkos::is_dynamic_view_v<dyn_rank_view_t>);
+static_assert(!Kokkos::Experimental::is_offset_view_v<dyn_rank_view_t>);
+static_assert(!Kokkos::Experimental::is_scatter_view_v<dyn_rank_view_t>);
+static_assert(!Kokkos::is_view_v<dyn_rank_view_t>);
+
+static_assert(!Kokkos::is_dual_view_v<dynamic_view_t>);
+static_assert(!Kokkos::is_dyn_rank_view_v<dynamic_view_t>);
+static_assert(Kokkos::is_dynamic_view_v<dynamic_view_t>);
+static_assert(!Kokkos::Experimental::is_offset_view_v<dynamic_view_t>);
+static_assert(!Kokkos::Experimental::is_scatter_view_v<dynamic_view_t>);
+static_assert(!Kokkos::is_view_v<dynamic_view_t>);
+
+static_assert(!Kokkos::is_dual_view_v<offset_view_t>);
+static_assert(!Kokkos::is_dyn_rank_view_v<offset_view_t>);
+static_assert(!Kokkos::is_dynamic_view_v<offset_view_t>);
+static_assert(Kokkos::Experimental::is_offset_view_v<offset_view_t>);
+static_assert(!Kokkos::Experimental::is_scatter_view_v<offset_view_t>);
+static_assert(!Kokkos::is_view_v<offset_view_t>);
+
+static_assert(!Kokkos::is_dual_view_v<scatter_view_t>);
+static_assert(!Kokkos::is_dyn_rank_view_v<scatter_view_t>);
+static_assert(!Kokkos::is_dynamic_view_v<scatter_view_t>);
+static_assert(!Kokkos::Experimental::is_offset_view_v<scatter_view_t>);
+static_assert(Kokkos::Experimental::is_scatter_view_v<scatter_view_t>);
+static_assert(!Kokkos::is_view_v<scatter_view_t>);
+
+}  // namespace
diff --git a/packages/kokkos/containers/unit_tests/TestScatterView.hpp b/packages/kokkos/containers/unit_tests/TestScatterView.hpp
index 347e914ea50b0c809085762b93817e5e1c0e27c0..733f43122ce90f0dcb568fe1140559641bc40777 100644
--- a/packages/kokkos/containers/unit_tests/TestScatterView.hpp
+++ b/packages/kokkos/containers/unit_tests/TestScatterView.hpp
@@ -726,9 +726,9 @@ void test_scatter_view(int64_t n) {
   }
 #endif
   // with hundreds of threads we were running out of memory.
-  // limit (n) so that duplication doesn't exceed 4GB
+  // limit (n) so that duplication doesn't exceed 1GB
   constexpr std::size_t maximum_allowed_total_bytes =
-      4ull * 1024ull * 1024ull * 1024ull;
+      1ull * 1024ull * 1024ull * 1024ull;
   std::size_t const maximum_allowed_copy_bytes =
       maximum_allowed_total_bytes /
       std::size_t(execution_space().concurrency());
@@ -770,9 +770,18 @@ TEST(TEST_CATEGORY, scatterview) {
   int big_n = 100 * 1000;
 #else
 
-#ifdef KOKKOS_ENABLE_SERIAL
+#if defined(KOKKOS_ENABLE_SERIAL) || defined(KOKKOS_ENABLE_OPENMP)
+#if defined(KOKKOS_ENABLE_SERIAL)
   bool is_serial = std::is_same<TEST_EXECSPACE, Kokkos::Serial>::value;
-  int big_n      = is_serial ? 100 * 1000 : 10000 * 1000;
+#else
+  bool is_serial = false;
+#endif
+#if defined(KOKKOS_ENABLE_OPENMP)
+  bool is_openmp = std::is_same<TEST_EXECSPACE, Kokkos::OpenMP>::value;
+#else
+  bool is_openmp = false;
+#endif
+  int big_n      = is_serial || is_openmp ? 100 * 1000 : 10000 * 1000;
 #else
   int big_n = 10000 * 1000;
 #endif
diff --git a/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp b/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp
index 989779b53a0320f39520ecd099f9a5c9e3a5416e..f63f1c6afe37faf507ab6e2abd43007801274071 100644
--- a/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp
+++ b/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp
@@ -25,18 +25,31 @@ namespace Test {
 
 namespace Impl {
 
-template <typename MapType, bool Near = false>
+template <typename MapType,
+          typename InsertOp = typename MapType::default_op_type,
+          bool Near = false, bool CheckValues = false>
 struct TestInsert {
   using map_type        = MapType;
   using execution_space = typename map_type::execution_space;
   using value_type      = uint32_t;
 
+  struct ExpectedValues {
+    unsigned map_idx;
+    typename map_type::value_type v;
+  };
+  using expected_values_type = Kokkos::View<ExpectedValues *, execution_space>;
+  expected_values_type expected_values;
+
   map_type map;
   uint32_t inserts;
   uint32_t collisions;
+  InsertOp insert_op;
 
   TestInsert(map_type arg_map, uint32_t arg_inserts, uint32_t arg_collisions)
-      : map(arg_map), inserts(arg_inserts), collisions(arg_collisions) {}
+      : map(arg_map), inserts(arg_inserts), collisions(arg_collisions) {
+    auto len = map.capacity() > arg_inserts ? map.capacity() : arg_inserts;
+    expected_values = expected_values_type("ExpectedValues", len);
+  }
 
   void testit(bool rehash_on_fail = true) {
     execution_space().fence();
@@ -54,7 +67,29 @@ struct TestInsert {
       }
     } while (rehash_on_fail && failed_count > 0u);
 
+    // Trigger the m_size mutable bug.
+    typename map_type::HostMirror map_h;
+    execution_space().fence();
+    Kokkos::deep_copy(map_h, map);
     execution_space().fence();
+    ASSERT_EQ(map_h.size(), map.size());
+
+    if (!rehash_on_fail && CheckValues) {
+      typename expected_values_type::HostMirror expected_values_h =
+          create_mirror_view(expected_values);
+      Kokkos::deep_copy(expected_values_h, expected_values);
+      for (unsigned i = 0; i < map_h.size(); i++) {
+        auto map_idx = expected_values_h(i).map_idx;
+        if (map_idx != static_cast<unsigned>(~0)) {
+          ASSERT_EQ(expected_values_h(map_idx).v, map_h.value_at(map_idx));
+        }
+      }
+    }
+
+    const unsigned int old_size = map_h.size();
+    map_h.clear();
+    ASSERT_EQ(map.size(), old_size);
+    ASSERT_EQ(map_h.size(), 0u);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -65,10 +100,47 @@ struct TestInsert {
     failed_count += count;
   }
 
+  template <typename UmapOpType = InsertOp>
+  KOKKOS_FORCEINLINE_FUNCTION bool is_op_noop() const {
+    using vt             = typename map_type::value_type;
+    using Device         = typename map_type::device_type;
+    using UmapOpTypeArg1 = Kokkos::View<
+        std::remove_const_t<std::conditional_t<std::is_void_v<vt>, int, vt>> *,
+        Device>;
+    return std::is_base_of_v<
+        InsertOp, typename Kokkos::UnorderedMapInsertOpTypes<UmapOpTypeArg1,
+                                                             uint32_t>::NoOp>;
+  }
+
+  template <typename UmapOpType = InsertOp>
+  KOKKOS_FORCEINLINE_FUNCTION bool is_op_atomic_add() const {
+    using vt             = typename map_type::value_type;
+    using Device         = typename map_type::device_type;
+    using UmapOpTypeArg1 = Kokkos::View<
+        std::remove_const_t<std::conditional_t<std::is_void_v<vt>, int, vt>> *,
+        Device>;
+    return std::is_base_of_v<UmapOpType,
+                             typename Kokkos::UnorderedMapInsertOpTypes<
+                                 UmapOpTypeArg1, uint32_t>::AtomicAdd>;
+  }
+
   KOKKOS_INLINE_FUNCTION
   void operator()(uint32_t i, value_type &failed_count) const {
     const uint32_t key = Near ? i / collisions : i % (inserts / collisions);
-    if (map.insert(key, i).failed()) ++failed_count;
+    auto ret           = map.insert(key, i, insert_op);
+    if (ret.failed()) {
+      ++failed_count;
+      expected_values(i).map_idx = static_cast<unsigned>(~0);
+    } else if (CheckValues) {
+      auto map_idx                     = map.find(key);
+      expected_values(map_idx).map_idx = map_idx;
+      auto ptr                         = expected_values.data();
+      if (is_op_atomic_add()) {
+        Kokkos::atomic_add(&((ptr + map_idx)[0].v), i);
+      } else if (ret.success() && is_op_noop()) {
+        Kokkos::atomic_store(&((ptr + map_idx)[0].v), i);
+      }
+    }
   }
 };
 
@@ -149,26 +221,26 @@ struct TestFind {
 // MSVC reports a syntax error for this test.
 // WORKAROUND MSVC
 #ifndef _WIN32
-template <typename Device>
+template <typename Device, class map_type, class const_map_type,
+          class insert_op_type, bool check_values = false>
 void test_insert(uint32_t num_nodes, uint32_t num_inserts,
                  uint32_t num_duplicates, bool near) {
-  using map_type = Kokkos::UnorderedMap<uint32_t, uint32_t, Device>;
-  using const_map_type =
-      Kokkos::UnorderedMap<const uint32_t, const uint32_t, Device>;
-
   const uint32_t expected_inserts =
       (num_inserts + num_duplicates - 1u) / num_duplicates;
+  typename map_type::size_type arg_capacity_hint = 0;
+  typename map_type::hasher_type arg_hasher;
+  typename map_type::equal_to_type arg_equal_to;
 
-  map_type map;
+  map_type map(arg_capacity_hint, arg_hasher, arg_equal_to);
   map.rehash(num_nodes, false);
 
   if (near) {
-    Impl::TestInsert<map_type, true> test_insert(map, num_inserts,
-                                                 num_duplicates);
+    Impl::TestInsert<map_type, insert_op_type, true, check_values> test_insert(
+        map, num_inserts, num_duplicates);
     test_insert.testit();
   } else {
-    Impl::TestInsert<map_type, false> test_insert(map, num_inserts,
-                                                  num_duplicates);
+    Impl::TestInsert<map_type, insert_op_type, false, check_values> test_insert(
+        map, num_inserts, num_duplicates);
     test_insert.testit();
   }
 
@@ -186,8 +258,7 @@ void test_insert(uint32_t num_nodes, uint32_t num_inserts,
 
     {
       uint32_t find_errors = 0;
-      Impl::TestFind<const_map_type> test_find(map, num_inserts,
-                                               num_duplicates);
+      Impl::TestFind<map_type> test_find(map, num_inserts, num_duplicates);
       test_find.testit(find_errors);
       EXPECT_EQ(0u, find_errors);
     }
@@ -199,6 +270,64 @@ void test_insert(uint32_t num_nodes, uint32_t num_inserts,
     map.end_erase();
     EXPECT_EQ(0u, map.size());
   }
+
+  // Check the values from the insert operation
+  {
+    Impl::TestInsert<map_type, insert_op_type, true> test_insert(
+        map, num_inserts, num_duplicates);
+    test_insert.testit(false);
+  }
+}
+
+template <typename Device>
+void test_inserts(uint32_t num_nodes, uint32_t num_inserts,
+                  uint32_t num_duplicates, bool near) {
+  using key_type        = uint32_t;
+  using value_type      = uint32_t;
+  using value_view_type = Kokkos::View<value_type *, Device>;
+  using size_type       = uint32_t;
+  using hasher_type     = typename Kokkos::pod_hash<key_type>;
+  using equal_to_type   = typename Kokkos::pod_equal_to<key_type>;
+
+  using map_op_type =
+      Kokkos::UnorderedMapInsertOpTypes<value_view_type, size_type>;
+  using noop_type = typename map_op_type::NoOp;
+
+  using map_type = Kokkos::UnorderedMap<key_type, value_type, Device,
+                                        hasher_type, equal_to_type>;
+  using const_map_type =
+      Kokkos::UnorderedMap<const key_type, const value_type, Device,
+                           hasher_type, equal_to_type>;
+
+  test_insert<Device, map_type, const_map_type, noop_type>(
+      num_nodes, num_inserts, num_duplicates, near);
+}
+
+template <typename Device>
+void test_all_insert_ops(uint32_t num_nodes, uint32_t num_inserts,
+                         uint32_t num_duplicates, bool near) {
+  using key_type        = uint32_t;
+  using value_type      = uint32_t;
+  using value_view_type = Kokkos::View<value_type *, Device>;
+  using size_type       = uint32_t;
+  using hasher_type     = typename Kokkos::pod_hash<key_type>;
+  using equal_to_type   = typename Kokkos::pod_equal_to<key_type>;
+
+  using map_op_type =
+      Kokkos::UnorderedMapInsertOpTypes<value_view_type, size_type>;
+  using noop_type       = typename map_op_type::NoOp;
+  using atomic_add_type = typename map_op_type::AtomicAdd;
+
+  using map_type = Kokkos::UnorderedMap<key_type, value_type, Device,
+                                        hasher_type, equal_to_type>;
+  using const_map_type =
+      Kokkos::UnorderedMap<const key_type, const value_type, Device,
+                           hasher_type, equal_to_type>;
+
+  test_insert<Device, map_type, const_map_type, noop_type, true>(
+      num_nodes, num_inserts, num_duplicates, near);
+  test_insert<Device, map_type, const_map_type, atomic_add_type, true>(
+      num_nodes, num_inserts, num_duplicates, near);
 }
 #endif
 
@@ -267,15 +396,13 @@ void test_deep_copy(uint32_t num_nodes) {
 
 #if !defined(_WIN32)
 TEST(TEST_CATEGORY, UnorderedMap_insert) {
-#if defined(KOKKOS_ENABLE_CUDA) && \
-    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC
-  if constexpr (std::is_same_v<TEST_EXECSPACE, Kokkos::Cuda>) {
-    GTEST_SKIP() << "unit test is hanging from index 0";
-  }
-#endif
   for (int i = 0; i < 500; ++i) {
-    test_insert<TEST_EXECSPACE>(100000, 90000, 100, true);
-    test_insert<TEST_EXECSPACE>(100000, 90000, 100, false);
+    test_inserts<TEST_EXECSPACE>(100000, 90000, 100, true);
+    test_inserts<TEST_EXECSPACE>(100000, 90000, 100, false);
+  }
+  for (int i = 0; i < 5; ++i) {
+    test_all_insert_ops<TEST_EXECSPACE>(1000, 900, 10, true);
+    test_all_insert_ops<TEST_EXECSPACE>(1000, 900, 10, false);
   }
 }
 #endif
@@ -285,12 +412,6 @@ TEST(TEST_CATEGORY, UnorderedMap_failed_insert) {
 }
 
 TEST(TEST_CATEGORY, UnorderedMap_deep_copy) {
-#if defined(KOKKOS_ENABLE_CUDA) && \
-    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC
-  if constexpr (std::is_same_v<TEST_EXECSPACE, Kokkos::Cuda>) {
-    GTEST_SKIP() << "unit test is hanging from index 0";
-  }
-#endif
   for (int i = 0; i < 2; ++i) test_deep_copy<TEST_EXECSPACE>(10000);
 }
 
@@ -308,17 +429,57 @@ TEST(TEST_CATEGORY, UnorderedMap_valid_empty) {
   ASSERT_TRUE(n.is_allocated());
 }
 
+/**
+ * This helper is needed because NVCC does not like extended lambdas
+ * in private member functions.
+ * Google Test bodies are private member functions. So it is incompatible.
+ * See also https://github.com/google/googletest/issues/4104.
+ */
+template <typename map_type>
+struct UnorderedMapInsert {
+  //! Type of range-for policy and its index type.
+  using range_policy_t =
+      Kokkos::RangePolicy<typename map_type::execution_space,
+                          Kokkos::IndexType<unsigned short int>>;
+  using index_t = typename range_policy_t::index_type;
+
+  const map_type m_map;
+
+  //! Ensure shared ownership of @ref m_map.
+  UnorderedMapInsert(map_type map) : m_map(std::move(map)) {}
+
+  //! Insert a single value.
+  template <typename T>
+  void insert_single(const T &arg) const {
+    Kokkos::parallel_for(
+        Kokkos::RangePolicy<typename map_type::execution_space>(0, 1),
+        // NOLINTNEXTLINE(kokkos-implicit-this-capture)
+        KOKKOS_CLASS_LAMBDA(const index_t) { m_map.insert(arg); });
+  }
+
+  //! Insert multiple values.
+  template <typename... Args>
+  void insert(Args &&... args) const {
+    static_assert(sizeof...(Args) > 1, "Prefer the single value version");
+    constexpr size_t size = sizeof...(Args);
+    Kokkos::Array<typename map_type::key_type, size> values{
+        std::forward<Args>(args)...};
+    Kokkos::parallel_for(
+        Kokkos::RangePolicy<typename map_type::execution_space>(0, size),
+        // NOLINTNEXTLINE(kokkos-implicit-this-capture)
+        KOKKOS_CLASS_LAMBDA(const index_t i) { m_map.insert(values[i]); });
+  }
+};
+
 TEST(TEST_CATEGORY, UnorderedMap_clear_zero_size) {
-  using Map =
-      Kokkos::UnorderedMap<int, void, Kokkos::DefaultHostExecutionSpace>;
+  using map_type = Kokkos::UnorderedMap<int, void, TEST_EXECSPACE>;
+
+  map_type m(11);
 
-  Map m(11);
   ASSERT_EQ(0u, m.size());
 
-  m.insert(2);
-  m.insert(3);
-  m.insert(5);
-  m.insert(7);
+  UnorderedMapInsert<map_type>(m).insert(2, 3, 5, 7);
+
   ASSERT_EQ(4u, m.size());
   m.rehash(0);
   ASSERT_EQ(128u, m.capacity());
@@ -328,6 +489,79 @@ TEST(TEST_CATEGORY, UnorderedMap_clear_zero_size) {
   ASSERT_EQ(0u, m.size());
 }
 
+TEST(TEST_CATEGORY, UnorderedMap_consistent_size) {
+  using map_type = Kokkos::UnorderedMap<int, void, TEST_EXECSPACE>;
+
+  map_type m(11);
+  UnorderedMapInsert<map_type> inserter(m);
+
+  inserter.insert_single(7);
+
+  ASSERT_EQ(1u, m.size());
+
+  {
+    auto m_copy = m;
+    UnorderedMapInsert<decltype(m_copy)> inserter_copy(m_copy);
+    inserter_copy.insert_single(2);
+    // This line triggers modified flags to be cleared in both m and m2
+    const auto sz = m_copy.size();
+    ASSERT_EQ(2u, sz);
+  }
+
+  ASSERT_EQ(2u, m.size());
+}
+
+struct TestMapCopy {
+  using map_type = Kokkos::UnorderedMap<int, void, TEST_EXECSPACE>;
+  map_type m_map;
+
+  KOKKOS_FUNCTION
+  void test_insert_to_map_copy(map_type const &input_map, const int i) const {
+    auto map = input_map;
+    map.insert(i);
+  }
+
+  KOKKOS_FUNCTION
+  void operator()(const int i) const { test_insert_to_map_copy(m_map, i); }
+};
+
+TEST(TEST_CATEGORY, UnorderedMap_shallow_copyable_on_device) {
+  TestMapCopy test_map_copy;
+
+  Kokkos::parallel_for(Kokkos::RangePolicy<TEST_EXECSPACE>(0, 1),
+                       test_map_copy);
+  ASSERT_EQ(1u, test_map_copy.m_map.size());
+}
+
+#if !defined(KOKKOS_ENABLE_CUDA) || \
+    (defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_CUDA_LAMBDA))
+void test_unordered_map_device_capture() {
+  TestMapCopy::map_type map;
+
+  Kokkos::parallel_for(
+      Kokkos::RangePolicy<TEST_EXECSPACE>(0, 1),
+      KOKKOS_LAMBDA(int const i) { map.insert(i); });
+
+  ASSERT_EQ(1u, map.size());
+}
+
+TEST(TEST_CATEGORY, UnorderedMap_lambda_capturable) {
+  test_unordered_map_device_capture();
+}
+#endif
+
+/**
+ * @test This test ensures that an @ref UnorderedMap can be built
+ *       with an execution space instance (using @ref view_alloc).
+ */
+TEST(TEST_CATEGORY, UnorderedMap_constructor_view_alloc) {
+  using map_type = Kokkos::UnorderedMap<size_t, void, TEST_EXECSPACE>;
+  map_type map(Kokkos::view_alloc(TEST_EXECSPACE{}, "test umap"), 150);
+  ASSERT_EQ(map.size(), 0u);
+  ASSERT_GE(map.capacity(), 150u);
+  ASSERT_TRUE(map.is_allocated());
+}
+
 }  // namespace Test
 
 #endif  // KOKKOS_TEST_UNORDERED_MAP_HPP
diff --git a/packages/kokkos/containers/unit_tests/TestVector.hpp b/packages/kokkos/containers/unit_tests/TestVector.hpp
index fa59607484663a80d2b76bb2bef572dc89297f4d..a7d341b789d6d6c61f3037b8bca492fca9860752 100644
--- a/packages/kokkos/containers/unit_tests/TestVector.hpp
+++ b/packages/kokkos/containers/unit_tests/TestVector.hpp
@@ -49,61 +49,27 @@ struct test_vector_insert {
 
     it = a.begin();
     it += 17;
-// Looks like some std::vector implementations do not have the restriction
-// right on the overload taking three iterators, and thus the following call
-// will hit that overload and then fail to compile.
-#if defined(KOKKOS_COMPILER_INTEL)
-// And at least GCC 4.8.4 doesn't implement vector insert correct for C++11
-// Return type is void ...
-#if (__GNUC__ < 5)
-    a.insert(it, typename Vector::size_type(n + 5), scalar_type(5));
-    it_return = a.begin() + 17;
-#else
-    it_return = a.insert(it, typename Vector::size_type(n + 5), scalar_type(5));
-#endif
-#else
-#if (__GNUC__ < 5)
-    a.insert(it, n + 5, scalar_type(5));
-    it_return = a.begin() + 17;
-#else
     it_return = a.insert(it, n + 5, scalar_type(5));
-#endif
-#endif
 
     ASSERT_EQ(a.size(), n + 1 + n + 5);
-    ASSERT_EQ(std::distance(it_return, a.begin() + 17), 0u);
+    ASSERT_EQ(std::distance(it_return, a.begin() + 17), 0);
 
     Vector b;
 
-// Looks like some std::vector implementations do not have the restriction
-// right on the overload taking three iterators, and thus the following call
-// will hit that overload and then fail to compile.
-#if defined(KOKKOS_COMPILER_INTEL)
-    b.insert(b.begin(), typename Vector::size_type(7), 9);
-#else
     b.insert(b.begin(), 7, 9);
-#endif
     ASSERT_EQ(b.size(), 7u);
     ASSERT_EQ(b[0], scalar_type(9));
 
     it = a.begin();
     it += 27 + n;
-#if (__GNUC__ < 5)
-    a.insert(it, b.begin(), b.end());
-    it_return = a.begin() + (27 + n);
-#else
     it_return = a.insert(it, b.begin(), b.end());
-#endif
+
     ASSERT_EQ(a.size(), n + 1 + n + 5 + 7);
-    ASSERT_EQ(std::distance(it_return, a.begin() + 27 + n), 0u);
+    ASSERT_EQ(std::distance(it_return, a.begin() + 27 + n), 0);
 
     // Testing insert at end via all three function interfaces
     a.insert(a.end(), 11);
-#if defined(KOKKOS_COMPILER_INTEL)
-    a.insert(a.end(), typename Vector::size_type(2), 12);
-#else
     a.insert(a.end(), 2, 12);
-#endif
     a.insert(a.end(), b.begin(), b.end());
   }
 
diff --git a/packages/kokkos/containers/unit_tests/TestWithoutInitializing.hpp b/packages/kokkos/containers/unit_tests/TestWithoutInitializing.hpp
index d3bb05195cc5f7900ea29f35eec60e4b461670f8..7201cd402a95a5db073ba1a2908f0f9c25520f25 100644
--- a/packages/kokkos/containers/unit_tests/TestWithoutInitializing.hpp
+++ b/packages/kokkos/containers/unit_tests/TestWithoutInitializing.hpp
@@ -24,6 +24,19 @@
 
 #include <../../core/unit_test/tools/include/ToolTestingUtilities.hpp>
 
+/// Some tests are skipped for @c CudaUVM memory space.
+/// @todo To be revised according to the future of @c KOKKOS_ENABLE_CUDA_UVM.
+///@{
+#ifdef KOKKOS_ENABLE_CUDA
+#define GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE                            \
+  if constexpr (std::is_same_v<typename TEST_EXECSPACE::memory_space, \
+                               Kokkos::CudaUVMSpace>)                 \
+    GTEST_SKIP() << "skipping since CudaUVMSpace requires additional fences";
+#else
+#define GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE
+#endif
+///@}
+
 TEST(TEST_CATEGORY, resize_realloc_no_init_dualview) {
   using namespace Kokkos::Test::Tools;
   listen_tool_events(Config::DisableAll(), Config::EnableKernels());
@@ -125,11 +138,7 @@ TEST(TEST_CATEGORY, resize_exec_space_dualview) {
 }
 
 TEST(TEST_CATEGORY, realloc_exec_space_dualview) {
-#ifdef KOKKOS_ENABLE_CUDA
-  if (std::is_same<typename TEST_EXECSPACE::memory_space,
-                   Kokkos::CudaUVMSpace>::value)
-    GTEST_SKIP() << "skipping since CudaUVMSpace requires additional fences";
-#endif
+  GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE
 
   using namespace Kokkos::Test::Tools;
   listen_tool_events(Config::DisableAll(), Config::EnableFences());
@@ -221,11 +230,8 @@ TEST(TEST_CATEGORY, resize_exec_space_dynrankview) {
 }
 
 TEST(TEST_CATEGORY, realloc_exec_space_dynrankview) {
-#ifdef KOKKOS_ENABLE_CUDA
-  if (std::is_same<typename TEST_EXECSPACE::memory_space,
-                   Kokkos::CudaUVMSpace>::value)
-    GTEST_SKIP() << "skipping since CudaUVMSpace requires additional fences";
-#endif
+  GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE
+
 // FIXME_THREADS The Threads backend fences every parallel_for
 #ifdef KOKKOS_ENABLE_THREADS
   if (std::is_same<TEST_EXECSPACE, Kokkos::Threads>::value)
@@ -363,16 +369,20 @@ TEST(TEST_CATEGORY, resize_exec_space_scatterview) {
 }
 
 TEST(TEST_CATEGORY, realloc_exec_space_scatterview) {
-#ifdef KOKKOS_ENABLE_CUDA
-  if (std::is_same<typename TEST_EXECSPACE::memory_space,
-                   Kokkos::CudaUVMSpace>::value)
-    GTEST_SKIP() << "skipping since CudaUVMSpace requires additional fences";
-#endif
+  GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE
+
 // FIXME_THREADS The Threads backend fences every parallel_for
 #ifdef KOKKOS_ENABLE_THREADS
   if (std::is_same<typename TEST_EXECSPACE, Kokkos::Threads>::value)
     GTEST_SKIP() << "skipping since the Threads backend isn't asynchronous";
 #endif
+#if defined(KOKKOS_ENABLE_HPX) && \
+    !defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH)
+  if (std::is_same<Kokkos::DefaultExecutionSpace,
+                   Kokkos::Experimental::HPX>::value)
+    GTEST_SKIP() << "skipping since the HPX backend always fences with async "
+                    "dispatch disabled";
+#endif
 
   using namespace Kokkos::Test::Tools;
   listen_tool_events(Config::DisableAll(), Config::EnableFences());
@@ -470,11 +480,8 @@ TEST(TEST_CATEGORY, create_mirror_no_init_dynrankview_viewctor) {
 }
 
 TEST(TEST_CATEGORY, create_mirror_view_and_copy_dynrankview) {
-#ifdef KOKKOS_ENABLE_CUDA
-  if (std::is_same<typename TEST_EXECSPACE::memory_space,
-                   Kokkos::CudaUVMSpace>::value)
-    return;
-#endif
+  GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE
+
   using namespace Kokkos::Test::Tools;
   listen_tool_events(Config::DisableAll(), Config::EnableKernels(),
                      Config::EnableFences());
@@ -577,11 +584,8 @@ TEST(TEST_CATEGORY, create_mirror_no_init_offsetview_view_ctor) {
 }
 
 TEST(TEST_CATEGORY, create_mirror_view_and_copy_offsetview) {
-#ifdef KOKKOS_ENABLE_CUDA
-  if (std::is_same<typename TEST_EXECSPACE::memory_space,
-                   Kokkos::CudaUVMSpace>::value)
-    return;
-#endif
+  GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE
+
   using namespace Kokkos::Test::Tools;
   listen_tool_events(Config::DisableAll(), Config::EnableKernels(),
                      Config::EnableFences());
@@ -652,11 +656,8 @@ TEST(TEST_CATEGORY, create_mirror_no_init_dynamicview) {
 }
 
 TEST(TEST_CATEGORY, create_mirror_view_and_copy_dynamicview) {
-#ifdef KOKKOS_ENABLE_CUDA
-  if (std::is_same<typename TEST_EXECSPACE::memory_space,
-                   Kokkos::CudaUVMSpace>::value)
-    return;
-#endif
+  GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE
+
   using namespace Kokkos::Test::Tools;
   listen_tool_events(Config::DisableAll(), Config::EnableKernels(),
                      Config::EnableFences());
diff --git a/packages/kokkos/core/CMakeLists.txt b/packages/kokkos/core/CMakeLists.txt
index 5d9fde56d202685f05a5e5db49608489986dcbc3..0917928001a92749dacf4f863df4367e2e3a06ea 100644
--- a/packages/kokkos/core/CMakeLists.txt
+++ b/packages/kokkos/core/CMakeLists.txt
@@ -1,15 +1,22 @@
-
-
-KOKKOS_SUBPACKAGE(Core)
-
 IF (NOT Kokkos_INSTALL_TESTING)
   ADD_SUBDIRECTORY(src)
 ENDIF()
 
-KOKKOS_ADD_TEST_DIRECTORIES(unit_test)
-IF (NOT KOKKOS_HAS_TRILINOS)
-  # We are using the githash etc in here, which does not work correct in Trilinos
-  KOKKOS_ADD_TEST_DIRECTORIES(perf_test)
-ENDIF()
+FUNCTION(KOKKOS_ADD_BENCHMARK_DIRECTORY DIR_NAME)
+  IF(NOT Kokkos_ENABLE_BENCHMARKS)
+    RETURN()
+  ENDIF()
+
+  IF(KOKKOS_HAS_TRILINOS)
+    message(
+      STATUS
+      "Benchmarks are not supported when building as part of Trilinos"
+    )
+    RETURN()
+  ENDIF()
 
-KOKKOS_SUBPACKAGE_POSTPROCESS()
+  ADD_SUBDIRECTORY(${DIR_NAME})
+ENDFUNCTION()
+
+KOKKOS_ADD_TEST_DIRECTORIES(unit_test)
+KOKKOS_ADD_BENCHMARK_DIRECTORY(perf_test)
diff --git a/packages/kokkos/core/cmake/Dependencies.cmake b/packages/kokkos/core/cmake/Dependencies.cmake
deleted file mode 100644
index 611c089b2e3feec2ec79228360f93c242fc055e2..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/cmake/Dependencies.cmake
+++ /dev/null
@@ -1,6 +0,0 @@
-TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
-  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib
-  TEST_OPTIONAL_TPLS CUSPARSE
-  )
-
-TRIBITS_TPL_TENTATIVELY_ENABLE(DLlib)
diff --git a/packages/kokkos/core/perf_test/BenchmarkMain.cpp b/packages/kokkos/core/perf_test/BenchmarkMain.cpp
index bba2bca36d89277da1690c97e56d5bd75e7de301..2232019c191f62cdd2a71375332ef07e92614837 100644
--- a/packages/kokkos/core/perf_test/BenchmarkMain.cpp
+++ b/packages/kokkos/core/perf_test/BenchmarkMain.cpp
@@ -16,15 +16,20 @@
 
 #include <benchmark/benchmark.h>
 
-#include <Benchmark_Context.hpp>
+#include "Benchmark_Context.hpp"
 #include <Kokkos_Core.hpp>
 
+#include "PerfTest_Category.hpp"
+
 int main(int argc, char** argv) {
   Kokkos::initialize(argc, argv);
   benchmark::Initialize(&argc, argv);
   benchmark::SetDefaultTimeUnit(benchmark::kSecond);
   KokkosBenchmark::add_benchmark_context(true);
 
+  (void)Test::command_line_num_args(argc);
+  (void)Test::command_line_arg(0, argv);
+
   benchmark::RunSpecifiedBenchmarks();
 
   benchmark::Shutdown();
diff --git a/packages/kokkos/core/perf_test/Benchmark_Context.cpp b/packages/kokkos/core/perf_test/Benchmark_Context.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9aa63cc34f3215f9523191a2dba71b5517b27d6e
--- /dev/null
+++ b/packages/kokkos/core/perf_test/Benchmark_Context.cpp
@@ -0,0 +1,81 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include "Benchmark_Context.hpp"
+
+namespace KokkosBenchmark {
+
+/**
+ * \brief Remove unwanted spaces and colon signs from input string. In case of
+ * invalid input it will return an empty string.
+ */
+std::string remove_unwanted_characters(const std::string& str) {
+  auto from = str.find_first_not_of(" :");
+  auto to   = str.find_last_not_of(" :");
+
+  if (from == std::string::npos || to == std::string::npos) {
+    return "";
+  }
+
+  // return extracted part of string without unwanted spaces and colon signs
+  return str.substr(from, to + 1);
+}
+
+/**
+ * \brief Extract all key:value pairs from kokkos configuration and add it to
+ * the benchmark context
+ */
+void add_kokkos_configuration(bool verbose) {
+  std::ostringstream msg;
+  Kokkos::print_configuration(msg, verbose);
+
+  // Iterate over lines returned from kokkos and extract key:value pairs
+  std::stringstream ss{msg.str()};
+  for (std::string line; std::getline(ss, line, '\n');) {
+    auto found = line.find_first_of(':');
+    if (found != std::string::npos) {
+      auto val = remove_unwanted_characters(line.substr(found + 1));
+      // Ignore line without value, for example a category name
+      if (!val.empty()) {
+        benchmark::AddCustomContext(
+            remove_unwanted_characters(line.substr(0, found)), val);
+      }
+    }
+  }
+}
+
+void add_git_info() {
+  if (!Kokkos::Impl::GIT_BRANCH.empty()) {
+    benchmark::AddCustomContext("GIT_BRANCH", Kokkos::Impl::GIT_BRANCH);
+    benchmark::AddCustomContext("GIT_COMMIT_HASH",
+                                Kokkos::Impl::GIT_COMMIT_HASH);
+    benchmark::AddCustomContext("GIT_CLEAN_STATUS",
+                                Kokkos::Impl::GIT_CLEAN_STATUS);
+    benchmark::AddCustomContext("GIT_COMMIT_DESCRIPTION",
+                                Kokkos::Impl::GIT_COMMIT_DESCRIPTION);
+    benchmark::AddCustomContext("GIT_COMMIT_DATE",
+                                Kokkos::Impl::GIT_COMMIT_DATE);
+  }
+}
+
+void add_benchmark_context(bool verbose) {
+  // Add Kokkos configuration to benchmark context data
+  add_kokkos_configuration(verbose);
+  // Add git information to benchmark context data
+  add_git_info();
+}
+
+}  // namespace KokkosBenchmark
diff --git a/packages/kokkos/core/perf_test/Benchmark_Context.hpp b/packages/kokkos/core/perf_test/Benchmark_Context.hpp
index ba6eff6646c85e3beefab1bb2a3ba8ae15ed017f..e823b3a8ad0c1942a635244fbb8b57e057d50a14 100644
--- a/packages/kokkos/core/perf_test/Benchmark_Context.hpp
+++ b/packages/kokkos/core/perf_test/Benchmark_Context.hpp
@@ -26,62 +26,34 @@
 
 namespace KokkosBenchmark {
 
-/// \brief Remove unwanted spaces and colon signs from input string. In case of
-/// invalid input it will return an empty string.
-std::string remove_unwanted_characters(std::string str) {
-  auto from = str.find_first_not_of(" :");
-  auto to   = str.find_last_not_of(" :");
-
-  if (from == std::string::npos || to == std::string::npos) {
-    return "";
-  }
-
-  // return extracted part of string without unwanted spaces and colon signs
-  return str.substr(from, to + 1);
-}
-
-/// \brief Extract all key:value pairs from kokkos configuration and add it to
-/// the benchmark context
-void add_kokkos_configuration(bool verbose) {
-  std::ostringstream msg;
-  Kokkos::print_configuration(msg, verbose);
-
-  // Iterate over lines returned from kokkos and extract key:value pairs
-  std::stringstream ss{msg.str()};
-  for (std::string line; std::getline(ss, line, '\n');) {
-    auto found = line.find_first_of(':');
-    if (found != std::string::npos) {
-      auto val = remove_unwanted_characters(line.substr(found + 1));
-      // Ignore line without value, for example a category name
-      if (!val.empty()) {
-        benchmark::AddCustomContext(
-            remove_unwanted_characters(line.substr(0, found)), val);
-      }
-    }
-  }
-}
-
-/// \brief Add all data related to git to benchmark context
-void add_git_info() {
-  if (!Kokkos::Impl::GIT_BRANCH.empty()) {
-    benchmark::AddCustomContext("GIT_BRANCH", Kokkos::Impl::GIT_BRANCH);
-    benchmark::AddCustomContext("GIT_COMMIT_HASH",
-                                Kokkos::Impl::GIT_COMMIT_HASH);
-    benchmark::AddCustomContext("GIT_CLEAN_STATUS",
-                                Kokkos::Impl::GIT_CLEAN_STATUS);
-    benchmark::AddCustomContext("GIT_COMMIT_DESCRIPTION",
-                                Kokkos::Impl::GIT_COMMIT_DESCRIPTION);
-    benchmark::AddCustomContext("GIT_COMMIT_DATE",
-                                Kokkos::Impl::GIT_COMMIT_DATE);
-  }
+/**
+ * \brief Gather all context information and add it to benchmark context data
+ */
+void add_benchmark_context(bool verbose = false);
+
+/**
+ * \brief Mark the label as a figure of merit.
+ */
+inline std::string benchmark_fom(const std::string& label) {
+  return "FOM: " + label;
 }
 
-/// \brief Gather all context information and add it to benchmark context data
-void add_benchmark_context(bool verbose = false) {
-  // Add Kokkos configuration to benchmark context data
-  add_kokkos_configuration(verbose);
-  // Add git information to benchmark context data
-  add_git_info();
+/**
+ * \brief Report throughput and amount of data processed for simple View
+ * operations
+ */
+template <class ViewType>
+void report_results(benchmark::State& state, ViewType view, int data_ratio,
+                    double time) {
+  // data processed in megabytes
+  const double data_processed = data_ratio * view.size() *
+                                sizeof(typename ViewType::value_type) /
+                                1'000'000;
+
+  state.SetIterationTime(time);
+  state.counters["MB"] = benchmark::Counter(data_processed);
+  state.counters[KokkosBenchmark::benchmark_fom("GB/s")] = benchmark::Counter(
+      data_processed / 1'000, benchmark::Counter::kIsIterationInvariantRate);
 }
 
 }  // namespace KokkosBenchmark
diff --git a/packages/kokkos/core/perf_test/CMakeLists.txt b/packages/kokkos/core/perf_test/CMakeLists.txt
index 814102975f5a5d7bd39b013a7de4380dc25fb08d..7f3916da31272e796a5cc083ead1138f7deaa62a 100644
--- a/packages/kokkos/core/perf_test/CMakeLists.txt
+++ b/packages/kokkos/core/perf_test/CMakeLists.txt
@@ -1,108 +1,34 @@
-
-#INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
-#INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
-
-
-# warning: PerfTest_CustomReduction.cpp uses
-# ../../algorithms/src/Kokkos_Random.hpp
-# we'll just allow it to be included, but note
-# that in TriBITS KokkosAlgorithms can be disabled...
-#INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/../../algorithms/src")
-
 # FIXME_OPENMPTARGET - the NVIDIA HPC compiler nvc++ in the OpenMPTarget backend does not pass the perf_tests.
 # FIXME_OPENACC - temporarily disabled due to unimplemented features
 IF ((KOKKOS_ENABLE_OPENMPTARGET OR KOKKOS_ENABLE_OPENACC) AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
   RETURN()
 ENDIF()
-
-
-SET(SOURCES
-  PerfTestMain.cpp
-  PerfTestGramSchmidt.cpp
-  PerfTestHexGrad.cpp
-  PerfTest_CustomReduction.cpp
-  PerfTest_ExecSpacePartitioning.cpp
-  PerfTest_ViewAllocate.cpp
-  PerfTest_ViewFill_123.cpp
-  PerfTest_ViewFill_45.cpp
-  PerfTest_ViewFill_6.cpp
-  PerfTest_ViewFill_7.cpp
-  PerfTest_ViewFill_8.cpp
-  PerfTest_ViewResize_123.cpp
-  PerfTest_ViewResize_45.cpp
-  PerfTest_ViewResize_6.cpp
-  PerfTest_ViewResize_7.cpp
-  PerfTest_ViewResize_8.cpp
-  )
-
-IF(Kokkos_ENABLE_OPENMPTARGET)
-# FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction
-  LIST(REMOVE_ITEM SOURCES
-    PerfTestGramSchmidt.cpp
-    PerfTest_CustomReduction.cpp
-    PerfTest_ExecSpacePartitioning.cpp
-  )
-ENDIF()
-
-IF(KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_HIP OR KOKKOS_ENABLE_SYCL)
-  KOKKOS_ADD_EXECUTABLE (
-    PerformanceTest_SharedSpace
-    SOURCES test_sharedSpace.cpp
-  )
-ENDIF()
-
-# Per #374, we always want to build this test, but we only want to run
-# it as a PERFORMANCE test.  That's why we separate building the test
-# from running the test.
-
-#leave these as basic includes for now
-#I don't need anything transitive
-KOKKOS_INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/../../algorithms/src")
-KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
-KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
-
-# This test currently times out for MSVC
-IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC")
-  KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    PerfTestExec
-    SOURCES ${SOURCES}
-    CATEGORIES PERFORMANCE
-  )
+IF (KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
+  RETURN()
 ENDIF()
 
-KOKKOS_ADD_EXECUTABLE_AND_TEST(
-  PerformanceTest_Atomic
-  SOURCES test_atomic.cpp
-  CATEGORIES PERFORMANCE
-)
+# all PerformanceTest_* executables are part of regular tests
+# TODO: finish converting these into benchmarks (in progress)
+IF(KOKKOS_ENABLE_TESTS)
+  IF(KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_HIP OR KOKKOS_ENABLE_SYCL)
+    KOKKOS_ADD_EXECUTABLE (
+      PerformanceTest_SharedSpace
+      SOURCES test_sharedSpace.cpp
+    )
+  ENDIF()
 
-IF(NOT KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_CUDA_LAMBDA)
-  KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    PerformanceTest_Atomic_MinMax
-    SOURCES test_atomic_minmax_simple.cpp
-    CATEGORIES PERFORMANCE
-  )
-ENDIF()
+  KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
 
-# FIXME_NVHPC
-IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
-KOKKOS_ADD_EXECUTABLE_AND_TEST(
-  PerformanceTest_Mempool
-  SOURCES test_mempool.cpp
-  CATEGORIES PERFORMANCE
-)
-ENDIF()
-
-IF(NOT Kokkos_ENABLE_OPENMPTARGET)
-# FIXME OPENMPTARGET needs tasking
-  KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    PerformanceTest_TaskDag
-    SOURCES test_taskdag.cpp
-    CATEGORIES PERFORMANCE
-  )
+  IF(NOT Kokkos_ENABLE_OPENMPTARGET)
+  # FIXME OPENMPTARGET needs tasking
+    KOKKOS_ADD_EXECUTABLE_AND_TEST(
+      PerformanceTest_TaskDag
+      SOURCES test_taskdag.cpp
+      CATEGORIES PERFORMANCE
+    )
+  ENDIF()
 ENDIF()
 
-
 IF(NOT Kokkos_ENABLE_BENCHMARKS)
   RETURN()
 ENDIF()
@@ -111,7 +37,8 @@ IF (KOKKOS_HAS_TRILINOS)
   message(FATAL_ERROR "Benchmarks are not supported when building as part of Trilinos")
 ENDIF()
 
-find_package(benchmark QUIET)
+# Find or download google/benchmark library
+find_package(benchmark QUIET 1.5.6)
 IF(benchmark_FOUND)
   MESSAGE(STATUS "Using google benchmark found in ${benchmark_DIR}")
 ELSE()
@@ -119,17 +46,16 @@ ELSE()
   include(FetchContent)
   SET(BENCHMARK_ENABLE_TESTING OFF)
 
-  list(APPEND CMAKE_MESSAGE_INDENT "    ")
+  list(APPEND CMAKE_MESSAGE_INDENT "[benchmark] ")
   FetchContent_Declare(
     googlebenchmark
+    DOWNLOAD_EXTRACT_TIMESTAMP FALSE
     URL https://github.com/google/benchmark/archive/refs/tags/v1.6.2.tar.gz
     URL_HASH MD5=14d14849e075af116143a161bc3b927b
   )
   FetchContent_MakeAvailable(googlebenchmark)
   list(POP_BACK CMAKE_MESSAGE_INDENT)
 
-  include_directories(${benchmark_SOURCE_DIR}/include)
-
   # Suppress clang-tidy diagnostics on code that we do not have control over
   IF(CMAKE_CXX_CLANG_TIDY)
     SET_TARGET_PROPERTIES(benchmark PROPERTIES CXX_CLANG_TIDY "")
@@ -157,6 +83,10 @@ FUNCTION(KOKKOS_ADD_BENCHMARK NAME)
   ENDIF()
 
   SET(BENCHMARK_NAME ${PACKAGE_NAME}_${NAME})
+  LIST(APPEND BENCHMARK_SOURCES
+    BenchmarkMain.cpp
+    Benchmark_Context.cpp
+  )
 
   ADD_EXECUTABLE(
     ${BENCHMARK_NAME}
@@ -166,6 +96,11 @@ FUNCTION(KOKKOS_ADD_BENCHMARK NAME)
     ${BENCHMARK_NAME}
     PRIVATE benchmark::benchmark Kokkos::kokkos impl_git_version
   )
+  TARGET_INCLUDE_DIRECTORIES(
+    ${BENCHMARK_NAME}
+    SYSTEM PRIVATE ${benchmark_SOURCE_DIR}/include
+  )
+
   FOREACH(SOURCE_FILE ${BENCHMARK_SOURCES})
     SET_SOURCE_FILES_PROPERTIES(
       ${SOURCE_FILE}
@@ -188,7 +123,12 @@ ENDFUNCTION()
 
 SET(
   BENCHMARK_SOURCES
-  BenchmarkMain.cpp
+  PerfTestGramSchmidt.cpp
+  PerfTest_CustomReduction.cpp
+  PerfTest_ExecSpacePartitioning.cpp
+  PerfTestHexGrad.cpp
+  PerfTest_MallocFree.cpp
+  PerfTest_ViewAllocate.cpp
   PerfTest_ViewCopy_a123.cpp
   PerfTest_ViewCopy_b123.cpp
   PerfTest_ViewCopy_c123.cpp
@@ -210,9 +150,50 @@ SET(
   PerfTest_ViewCopy_c8.cpp
   PerfTest_ViewCopy_d8.cpp
   PerfTest_ViewCopy_Raw.cpp
+  PerfTest_ViewFill_123.cpp
+  PerfTest_ViewFill_45.cpp
+  PerfTest_ViewFill_6.cpp
+  PerfTest_ViewFill_7.cpp
+  PerfTest_ViewFill_8.cpp
+  PerfTest_ViewFill_Raw.cpp
+  PerfTest_ViewResize_123.cpp
+  PerfTest_ViewResize_45.cpp
+  PerfTest_ViewResize_6.cpp
+  PerfTest_ViewResize_7.cpp
+  PerfTest_ViewResize_8.cpp
+  PerfTest_ViewResize_Raw.cpp
 )
 
+IF(Kokkos_ENABLE_OPENMPTARGET)
+# FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction
+  LIST(REMOVE_ITEM BENCHMARK_SOURCES
+    PerfTestGramSchmidt.cpp
+    PerfTest_CustomReduction.cpp
+    PerfTest_ExecSpacePartitioning.cpp
+  )
+ENDIF()
+
 KOKKOS_ADD_BENCHMARK(
   PerformanceTest_Benchmark
   SOURCES ${BENCHMARK_SOURCES}
 )
+
+IF(NOT KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_CUDA_LAMBDA)
+  KOKKOS_ADD_BENCHMARK(
+    Benchmark_Atomic_MinMax
+    SOURCES test_atomic_minmax_simple.cpp
+  )
+ENDIF()
+
+# FIXME_NVHPC
+IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
+  KOKKOS_ADD_BENCHMARK(
+    PerformanceTest_Mempool
+    SOURCES test_mempool.cpp
+  )
+ENDIF()
+
+KOKKOS_ADD_BENCHMARK(
+  PerformanceTest_Atomic
+  SOURCES test_atomic.cpp
+)
diff --git a/packages/kokkos/core/perf_test/Makefile b/packages/kokkos/core/perf_test/Makefile
index 396387c622f112d81cadcc75dc59bd1403a5cc28..5e1e0f6541c1d7913adc96acd1977f036fb0ce2b 100644
--- a/packages/kokkos/core/perf_test/Makefile
+++ b/packages/kokkos/core/perf_test/Makefile
@@ -14,7 +14,7 @@ else
   CXX = g++
 endif
 
-CXXFLAGS = -O3 
+CXXFLAGS = -O3
 #CXXFLAGS += -DGENERIC_REDUCER
 LINK ?= $(CXX)
 LDFLAGS ?=
@@ -29,43 +29,12 @@ TARGETS =
 
 #
 
-OBJ_PERF = PerfTestMain.o gtest-all.o
-OBJ_PERF += PerfTest_ExecSpacePartitioning.o
-OBJ_PERF += PerfTestGramSchmidt.o
-OBJ_PERF += PerfTestHexGrad.o
-OBJ_PERF += PerfTest_CustomReduction.o
-OBJ_PERF += PerfTest_ViewAllocate.o
-OBJ_PERF += PerfTest_ViewFill_123.o PerfTest_ViewFill_45.o PerfTest_ViewFill_6.o PerfTest_ViewFill_7.o PerfTest_ViewFill_8.o
-OBJ_PERF += PerfTest_ViewResize_123.o PerfTest_ViewResize_45.o PerfTest_ViewResize_6.o PerfTest_ViewResize_7.o PerfTest_ViewResize_8.o
-TARGETS += KokkosCore_PerformanceTest
-TEST_TARGETS += test-performance
-
-#
-
-OBJ_ATOMICS = test_atomic.o 
-TARGETS += KokkosCore_PerformanceTest_Atomics
-TEST_TARGETS += test-atomic
-
-#
-
-OBJ_MEMPOOL = test_mempool.o 
-TARGETS += KokkosCore_PerformanceTest_Mempool
-TEST_TARGETS += test-mempool
-
-#
-
-OBJ_TASKDAG = test_taskdag.o 
+OBJ_TASKDAG = test_taskdag.o
 TARGETS += KokkosCore_PerformanceTest_TaskDAG
 TEST_TARGETS += test-taskdag
 
 #
 
-OBJ_ATOMICS_MINMAX = test_atomic_minmax_simple.o
-TARGETS += KokkosCore_PerformanceTest_Atomics_MinMax
-TEST_TARGETS += test-atomic-minmax
-
-#
-
 KokkosCore_PerformanceTest: $(OBJ_PERF) $(KOKKOS_LINK_DEPENDS)
 	$(LINK) $(EXTRA_PATH) $(OBJ_PERF) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_PerformanceTest
 
diff --git a/packages/kokkos/core/perf_test/PerfTestBlasKernels.hpp b/packages/kokkos/core/perf_test/PerfTestBlasKernels.hpp
index 1d7073fe5a5f334e5833b0206a89f81992bee4d6..5e6e52f11530e43e14e1da221644ce0c6830b3eb 100644
--- a/packages/kokkos/core/perf_test/PerfTestBlasKernels.hpp
+++ b/packages/kokkos/core/perf_test/PerfTestBlasKernels.hpp
@@ -25,8 +25,8 @@ template <class Type>
 struct Dot {
   using execution_space = typename Type::execution_space;
 
-  static_assert(static_cast<unsigned>(Type::Rank) == static_cast<unsigned>(1),
-                "Dot static_assert Fail: Rank != 1");
+  static_assert(static_cast<unsigned>(Type::rank) == static_cast<unsigned>(1),
+                "Dot static_assert Fail: rank != 1");
 
   using value_type = double;
 
@@ -56,8 +56,8 @@ template <class Type>
 struct DotSingle {
   using execution_space = typename Type::execution_space;
 
-  static_assert(static_cast<unsigned>(Type::Rank) == static_cast<unsigned>(1),
-                "DotSingle static_assert Fail: Rank != 1");
+  static_assert(static_cast<unsigned>(Type::rank) == static_cast<unsigned>(1),
+                "DotSingle static_assert Fail: rank != 1");
 
   using value_type = double;
 
@@ -88,13 +88,13 @@ template <class ScalarType, class VectorType>
 struct Scale {
   using execution_space = typename VectorType::execution_space;
 
-  static_assert(static_cast<unsigned>(ScalarType::Rank) ==
+  static_assert(static_cast<unsigned>(ScalarType::rank) ==
                     static_cast<unsigned>(0),
-                "Scale static_assert Fail: ScalarType::Rank != 0");
+                "Scale static_assert Fail: ScalarType::rank != 0");
 
-  static_assert(static_cast<unsigned>(VectorType::Rank) ==
+  static_assert(static_cast<unsigned>(VectorType::rank) ==
                     static_cast<unsigned>(1),
-                "Scale static_assert Fail: VectorType::Rank != 1");
+                "Scale static_assert Fail: VectorType::rank != 1");
 
 #if 1
   typename ScalarType::const_type alpha;
@@ -115,17 +115,17 @@ template <class ScalarType, class ConstVectorType, class VectorType>
 struct AXPBY {
   using execution_space = typename VectorType::execution_space;
 
-  static_assert(static_cast<unsigned>(ScalarType::Rank) ==
+  static_assert(static_cast<unsigned>(ScalarType::rank) ==
                     static_cast<unsigned>(0),
-                "AXPBY static_assert Fail: ScalarType::Rank != 0");
+                "AXPBY static_assert Fail: ScalarType::rank != 0");
 
-  static_assert(static_cast<unsigned>(ConstVectorType::Rank) ==
+  static_assert(static_cast<unsigned>(ConstVectorType::rank) ==
                     static_cast<unsigned>(1),
-                "AXPBY static_assert Fail: ConstVectorType::Rank != 1");
+                "AXPBY static_assert Fail: ConstVectorType::rank != 1");
 
-  static_assert(static_cast<unsigned>(VectorType::Rank) ==
+  static_assert(static_cast<unsigned>(VectorType::rank) ==
                     static_cast<unsigned>(1),
-                "AXPBY static_assert Fail: VectorType::Rank != 1");
+                "AXPBY static_assert Fail: VectorType::rank != 1");
 
 #if 1
   typename ScalarType::const_type alpha, beta;
diff --git a/packages/kokkos/core/perf_test/PerfTestGramSchmidt.cpp b/packages/kokkos/core/perf_test/PerfTestGramSchmidt.cpp
index c8f8487ffc9f6f6cad816f97136034eb18f840ee..ddfa73d4ba0d38d5ebcae8a1893186a9421f25b5 100644
--- a/packages/kokkos/core/perf_test/PerfTestGramSchmidt.cpp
+++ b/packages/kokkos/core/perf_test/PerfTestGramSchmidt.cpp
@@ -15,11 +15,11 @@
 //@HEADER
 
 #include <Kokkos_Core.hpp>
-#include <gtest/gtest.h>
-#include <PerfTest_Category.hpp>
+#include <benchmark/benchmark.h>
+#include "PerfTest_Category.hpp"
 
 #include <cmath>
-#include <PerfTestBlasKernels.hpp>
+#include "PerfTestBlasKernels.hpp"
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -137,87 +137,61 @@ struct ModifiedGramSchmidt {
 
   //--------------------------------------------------------------------------
 
-  static double test(const size_type length, const size_type count,
-                     const size_t iter = 1) {
+  static double test(const size_type length, const size_type count) {
     multivector_type Q_("Q", length, count);
     multivector_type R_("R", count, count);
 
     typename multivector_type::HostMirror A = Kokkos::create_mirror(Q_);
 
     // Create and fill A on the host
-
     for (size_type j = 0; j < count; ++j) {
       for (size_type i = 0; i < length; ++i) {
         A(i, j) = (i + 1) * (j + 1);
       }
     }
 
-    double dt_min = 0;
-
-    for (size_t i = 0; i < iter; ++i) {
-      Kokkos::deep_copy(Q_, A);
-
-      // A = Q * R
+    Kokkos::deep_copy(Q_, A);
 
-      const double dt = factorization(Q_, R_);
+    // A = Q * R
+    const double dt = factorization(Q_, R_);
 
-      if (0 == i)
-        dt_min = dt;
-      else
-        dt_min = dt < dt_min ? dt : dt_min;
-    }
-
-    return dt_min;
+    return dt;
   }
 };
 
-template <class DeviceType>
-void run_test_gramschmidt(int exp_beg, int exp_end, int num_trials,
-                          const char deviceTypeName[]) {
-  std::string label_gramschmidt;
-  label_gramschmidt.append("\"GramSchmidt< double , ");
-  label_gramschmidt.append(deviceTypeName);
-  label_gramschmidt.append(" >\"");
-
-  for (int i = exp_beg; i < exp_end; ++i) {
-    double min_seconds = 0.0;
-    double max_seconds = 0.0;
-    double avg_seconds = 0.0;
-
-    const int parallel_work_length = 1 << i;
-
-    for (int j = 0; j < num_trials; ++j) {
-      const double seconds = ModifiedGramSchmidt<double, DeviceType>::test(
-          parallel_work_length, 32);
-
-      if (0 == j) {
-        min_seconds = seconds;
-        max_seconds = seconds;
-      } else {
-        if (seconds < min_seconds) min_seconds = seconds;
-        if (seconds > max_seconds) max_seconds = seconds;
-      }
-      avg_seconds += seconds;
-    }
-    avg_seconds /= num_trials;
+template <class Scalar>
+static void GramSchmidt(benchmark::State& state) {
+  const int parallel_work_length = state.range(0);
+
+  for (auto _ : state) {
+    const double seconds =
+        ModifiedGramSchmidt<Scalar, Kokkos::DefaultExecutionSpace>::test(
+            parallel_work_length, 32);
 
-    std::cout << label_gramschmidt << " , " << parallel_work_length << " , "
-              << min_seconds << " , " << (min_seconds / parallel_work_length)
-              << ", " << avg_seconds << std::endl;
+    state.SetIterationTime(seconds);
+    state.counters["Count"] = benchmark::Counter(parallel_work_length);
+    state.counters["Time normalized"] =
+        benchmark::Counter(seconds / parallel_work_length);
   }
 }
 
-TEST(default_exec, gramschmidt) {
-  int exp_beg    = 10;
-  int exp_end    = 20;
-  int num_trials = 5;
-
-  if (command_line_num_args() > 1) exp_beg = std::stoi(command_line_arg(1));
-  if (command_line_num_args() > 2) exp_end = std::stoi(command_line_arg(2));
-  if (command_line_num_args() > 3) num_trials = std::stoi(command_line_arg(3));
-
-  EXPECT_NO_THROW(run_test_gramschmidt<Kokkos::DefaultExecutionSpace>(
-      exp_beg, exp_end, num_trials, Kokkos::DefaultExecutionSpace::name()));
-}
+// FIXME_SYCL SYCL+Cuda reports "an illegal memory access was encountered"
+#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU)
+BENCHMARK(GramSchmidt<double>)
+    ->ArgName("Count")
+    ->ArgsProduct({
+        benchmark::CreateRange(1 << 10, 1 << 18, 2),
+    })
+    ->UseManualTime()
+    ->Iterations(5);
+#else
+BENCHMARK(GramSchmidt<double>)
+    ->ArgName("Count")
+    ->ArgsProduct({
+        benchmark::CreateRange(1 << 10, 1 << 19, 2),
+    })
+    ->UseManualTime()
+    ->Iterations(5);
+#endif
 
 }  // namespace Test
diff --git a/packages/kokkos/core/perf_test/PerfTestHexGrad.cpp b/packages/kokkos/core/perf_test/PerfTestHexGrad.cpp
index ef92de7ce1095a18be29ed58352193e01dd0db9c..98cb246c71e1faed8191791af2111c42acd0b300 100644
--- a/packages/kokkos/core/perf_test/PerfTestHexGrad.cpp
+++ b/packages/kokkos/core/perf_test/PerfTestHexGrad.cpp
@@ -15,8 +15,9 @@
 //@HEADER
 
 #include <Kokkos_Core.hpp>
-#include <gtest/gtest.h>
-#include <PerfTest_Category.hpp>
+#include <benchmark/benchmark.h>
+#include "Benchmark_Context.hpp"
+#include "PerfTest_Category.hpp"
 
 namespace Test {
 
@@ -195,78 +196,43 @@ struct HexGrad {
 
   //--------------------------------------------------------------------------
 
-  static double test(const int count, const int iter = 1) {
+  static double test(const int count) {
     elem_coord_type coord("coord", count);
     elem_grad_type grad("grad", count);
 
     // Execute the parallel kernels on the arrays:
-
-    double dt_min = 0;
-
     Kokkos::parallel_for(count, Init(coord));
     execution_space().fence();
 
-    for (int i = 0; i < iter; ++i) {
-      Kokkos::Timer timer;
-      Kokkos::parallel_for(count, HexGrad<execution_space>(coord, grad));
-      execution_space().fence();
-      const double dt = timer.seconds();
-      if (0 == i)
-        dt_min = dt;
-      else
-        dt_min = dt < dt_min ? dt : dt_min;
-    }
-
-    return dt_min;
+    Kokkos::Timer timer;
+    Kokkos::parallel_for(count, HexGrad<execution_space>(coord, grad));
+    execution_space().fence();
+    return timer.seconds();
   }
 };
 
-template <class DeviceType>
-void run_test_hexgrad(int exp_beg, int exp_end, int num_trials,
-                      const char deviceTypeName[]) {
-  std::string label_hexgrad;
-  label_hexgrad.append("\"HexGrad< double , ");
-  label_hexgrad.append(deviceTypeName);
-  label_hexgrad.append(" >\"");
-
-  for (int i = exp_beg; i < exp_end; ++i) {
-    double min_seconds = 0.0;
-    double max_seconds = 0.0;
-    double avg_seconds = 0.0;
-
-    const int parallel_work_length = 1 << i;
-
-    for (int j = 0; j < num_trials; ++j) {
-      const double seconds = HexGrad<DeviceType>::test(parallel_work_length);
-
-      if (0 == j) {
-        min_seconds = seconds;
-        max_seconds = seconds;
-      } else {
-        if (seconds < min_seconds) min_seconds = seconds;
-        if (seconds > max_seconds) max_seconds = seconds;
-      }
-      avg_seconds += seconds;
-    }
-    avg_seconds /= num_trials;
+template <class CoordScalarType>
+static void HexGrad_Benchmark(benchmark::State& state) {
+  const auto parallel_work_length = state.range(0);
+
+  for (auto _ : state) {
+    const auto time =
+        HexGrad<Kokkos::DefaultExecutionSpace, CoordScalarType>::test(
+            parallel_work_length);
 
-    std::cout << label_hexgrad << " , " << parallel_work_length << " , "
-              << min_seconds << " , " << (min_seconds / parallel_work_length)
-              << avg_seconds << std::endl;
+    state.SetIterationTime(time);
+    state.counters["Count"] = benchmark::Counter(parallel_work_length);
+    state.counters["Time normalized"] =
+        benchmark::Counter(time / parallel_work_length);
   }
 }
 
-TEST(default_exec, hexgrad) {
-  int exp_beg    = 10;
-  int exp_end    = 20;
-  int num_trials = 5;
-
-  if (command_line_num_args() > 1) exp_beg = std::stoi(command_line_arg(1));
-  if (command_line_num_args() > 2) exp_end = std::stoi(command_line_arg(2));
-  if (command_line_num_args() > 3) num_trials = std::stoi(command_line_arg(3));
-
-  EXPECT_NO_THROW(run_test_hexgrad<Kokkos::DefaultExecutionSpace>(
-      exp_beg, exp_end, num_trials, Kokkos::DefaultExecutionSpace::name()));
-}
+BENCHMARK(HexGrad_Benchmark<double>)
+    ->ArgName("count")
+    ->ArgsProduct({
+        benchmark::CreateRange(1 << 10, 1 << 19, 2),
+    })
+    ->UseManualTime()
+    ->Iterations(5);
 
 }  // namespace Test
diff --git a/packages/kokkos/core/perf_test/PerfTestMain.cpp b/packages/kokkos/core/perf_test/PerfTestMain.cpp
index 2729432adc32d727d305606bf024b94dd4ad235d..7315f26e5c16d045a48c8dbe53376dcb8abff2d6 100644
--- a/packages/kokkos/core/perf_test/PerfTestMain.cpp
+++ b/packages/kokkos/core/perf_test/PerfTestMain.cpp
@@ -18,24 +18,7 @@
 #include <cstdlib>
 
 #include <Kokkos_Core.hpp>
-
-namespace Test {
-int command_line_num_args(int n = 0) {
-  static int n_args = 0;
-  if (n > 0) n_args = n;
-  return n_args;
-}
-
-const char* command_line_arg(int k, char** input_args = nullptr) {
-  static char** args;
-  if (input_args != nullptr) args = input_args;
-  if (command_line_num_args() > k)
-    return args[k];
-  else
-    return nullptr;
-}
-
-}  // namespace Test
+#include <PerfTest_Category.hpp>
 
 int main(int argc, char* argv[]) {
   ::testing::InitGoogleTest(&argc, argv);
diff --git a/packages/kokkos/core/perf_test/PerfTest_Category.hpp b/packages/kokkos/core/perf_test/PerfTest_Category.hpp
index 126c9611479b453df799a17b9592e7675f312086..60f76ea8f561a870a698fccee31a6b546328082b 100644
--- a/packages/kokkos/core/perf_test/PerfTest_Category.hpp
+++ b/packages/kokkos/core/perf_test/PerfTest_Category.hpp
@@ -17,12 +17,22 @@
 #ifndef KOKKOS_TEST_PERFTEST_CAT_HPP
 #define KOKKOS_TEST_PERFTEST_CAT_HPP
 
-#include <gtest/gtest.h>
-
 namespace Test {
 
-extern int command_line_num_args(int n = 0);
-extern const char* command_line_arg(int k, char** input_args = nullptr);
+inline int command_line_num_args(int n = 0) {
+  static int n_args = 0;
+  if (n > 0) n_args = n;
+  return n_args;
+}
+
+inline const char* command_line_arg(int k, char** input_args = nullptr) {
+  static char** args;
+  if (input_args != nullptr) args = input_args;
+  if (command_line_num_args() > k)
+    return args[k];
+  else
+    return nullptr;
+}
 
 }  // namespace Test
 
diff --git a/packages/kokkos/core/perf_test/PerfTest_CustomReduction.cpp b/packages/kokkos/core/perf_test/PerfTest_CustomReduction.cpp
index 049301f9a712cac5fced36fed114d3135e0b5a59..2110f38a916f813b8a4a96f6af5899a6c59910c9 100644
--- a/packages/kokkos/core/perf_test/PerfTest_CustomReduction.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_CustomReduction.cpp
@@ -15,14 +15,16 @@
 //@HEADER
 
 #include <Kokkos_Core.hpp>
-#include <gtest/gtest.h>
-#include <PerfTest_Category.hpp>
+#include <benchmark/benchmark.h>
+#include "Benchmark_Context.hpp"
+#include "PerfTest_Category.hpp"
 #include <Kokkos_Random.hpp>
+#include <utility>
 
 #ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
 namespace Test {
 template <class Scalar>
-void custom_reduction_test(int N, int R, int num_trials) {
+std::pair<double, Scalar> custom_reduction_test(int N, int R) {
   Kokkos::Random_XorShift64_Pool<> rand_pool(183291);
   Kokkos::View<Scalar*> a("A", N);
   Kokkos::fill_random(a, rand_pool, 1.0);
@@ -62,49 +64,70 @@ void custom_reduction_test(int N, int R, int num_trials) {
 
   // Timing
   Kokkos::Timer timer;
-  for (int r = 0; r < num_trials; r++) {
-    Kokkos::parallel_reduce(
-        Kokkos::TeamPolicy<>(N / 1024, team_size),
-        KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team,
-                      Scalar& lmax) {
-          Scalar team_max = Scalar(0);
-          for (int rr = 0; rr < R; rr++) {
-            int i = team.league_rank();
-            Kokkos::parallel_reduce(
-                Kokkos::TeamThreadRange(team, 32),
-                [&](const int& j, Scalar& thread_max) {
-                  Scalar t_max = Scalar(0);
-                  Kokkos::parallel_reduce(
-                      Kokkos::ThreadVectorRange(team, 32),
-                      [&](const int& k, Scalar& max_) {
-                        const Scalar val = a((i * 32 + j) * 32 + k);
-                        if (val > max_) max_ = val;
-                        if ((k == 11) && (j == 17) && (i == 2)) max_ = 11.5;
-                      },
-                      Kokkos::Max<Scalar>(t_max));
-                  if (t_max > thread_max) thread_max = t_max;
-                },
-                Kokkos::Max<Scalar>(team_max));
-          }
-          if (team_max > lmax) lmax = team_max;
-        },
-        Kokkos::Max<Scalar>(max));
-  }
-  double time = timer.seconds();
-  printf("%e %e %e\n", time,
-         1.0 * N * R * num_trials * sizeof(Scalar) / time / 1024 / 1024 / 1024,
-         max);
+  Kokkos::parallel_reduce(
+      Kokkos::TeamPolicy<>(N / 1024, team_size),
+      KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team,
+                    Scalar& lmax) {
+        Scalar team_max = Scalar(0);
+        for (int rr = 0; rr < R; rr++) {
+          int i = team.league_rank();
+          Kokkos::parallel_reduce(
+              Kokkos::TeamThreadRange(team, 32),
+              [&](const int& j, Scalar& thread_max) {
+                Scalar t_max = Scalar(0);
+                Kokkos::parallel_reduce(
+                    Kokkos::ThreadVectorRange(team, 32),
+                    [&](const int& k, Scalar& max_) {
+                      const Scalar val = a((i * 32 + j) * 32 + k);
+                      if (val > max_) max_ = val;
+                      if ((k == 11) && (j == 17) && (i == 2)) max_ = 11.5;
+                    },
+                    Kokkos::Max<Scalar>(t_max));
+                if (t_max > thread_max) thread_max = t_max;
+              },
+              Kokkos::Max<Scalar>(team_max));
+        }
+        if (team_max > lmax) lmax = team_max;
+      },
+      Kokkos::Max<Scalar>(max));
+
+  return std::make_pair(timer.seconds(), max);
 }
 
-TEST(default_exec, custom_reduction) {
-  int N          = 100000;
-  int R          = 1000;
-  int num_trials = 1;
+int get_N(benchmark::State& state) {
+  return (Test::command_line_num_args() > 1)
+             ? std::stoi(Test::command_line_arg(1))
+             : state.range(0);
+}
 
-  if (command_line_num_args() > 1) N = std::stoi(command_line_arg(1));
-  if (command_line_num_args() > 2) R = std::stoi(command_line_arg(2));
-  if (command_line_num_args() > 3) num_trials = std::stoi(command_line_arg(3));
-  custom_reduction_test<double>(N, R, num_trials);
+int get_R(benchmark::State& state) {
+  return (Test::command_line_num_args() > 2)
+             ? std::stoi(Test::command_line_arg(2))
+             : state.range(1);
 }
+
+template <class Scalar>
+static void CustomReduction(benchmark::State& state) {
+  int N = get_N(state);
+  int R = get_R(state);
+
+  for (auto _ : state) {
+    auto results = custom_reduction_test<double>(N, R);
+    // data processed in gigabytes
+    const double data_processed =
+        N * R * sizeof(Scalar) / results.first / 1'000'000'000;
+
+    state.SetIterationTime(results.first);
+    state.counters[KokkosBenchmark::benchmark_fom("GB/s")] = benchmark::Counter(
+        data_processed, benchmark::Counter::kIsIterationInvariantRate);
+    state.counters["Max"] = benchmark::Counter(results.second);
+  }
+}
+
+BENCHMARK(CustomReduction<double>)
+    ->ArgNames({"N", "R"})
+    ->Args({100'000, 1'000})
+    ->UseManualTime();
+
 }  // namespace Test
 #endif
diff --git a/packages/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp b/packages/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp
index 48419c7ec681be818183e66eb89a78a8337dd877..d2a3d0b823a214de6f73d44e5280f1183fe1c5ae 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp
@@ -15,8 +15,8 @@
 //@HEADER
 
 #include <Kokkos_Core.hpp>
-#include <gtest/gtest.h>
-#include <PerfTest_Category.hpp>
+#include <benchmark/benchmark.h>
+#include "PerfTest_Category.hpp"
 
 namespace Test {
 
@@ -54,7 +54,7 @@ bool is_overlapping<Kokkos::HIP>(const Kokkos::HIP&) {
 }
 #endif
 
-#ifdef KOKKOS_ENABLE_SYCL
+#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU)
 template <>
 bool is_overlapping<Kokkos::Experimental::SYCL>(
     const Kokkos::Experimental::SYCL&) {
@@ -154,10 +154,10 @@ struct FunctorTeamReduce {
   }
 };
 
-TEST(default_exec, overlap_range_policy) {
-  int N = 2000;
-  int M = 10000;
-  int R = 10;
+static void OverlapRangePolicy(benchmark::State& state) {
+  int N = state.range(0);
+  int M = state.range(1);
+  int R = state.range(2);
 
   TEST_EXECSPACE space;
   std::vector<TEST_EXECSPACE> execution_space_instances =
@@ -165,160 +165,172 @@ TEST(default_exec, overlap_range_policy) {
   TEST_EXECSPACE space1 = execution_space_instances[0];
   TEST_EXECSPACE space2 = execution_space_instances[1];
 
-  Kokkos::View<double**, TEST_EXECSPACE> a("A", N, M);
-  FunctorRange f(M, R, a);
-  FunctorRangeReduce fr(M, R, a);
-  Kokkos::parallel_for("default_exec::overlap_range_policy::kernel0",
-                       Kokkos::RangePolicy<TEST_EXECSPACE>(0, N),
-                       FunctorRange(M, R, a));
-
-  Kokkos::parallel_for(
-      "default_exec::overlap_range_policy::kernel1",
-      Kokkos::Experimental::require(
-          Kokkos::RangePolicy<TEST_EXECSPACE>(space1, 0, N),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      f);
-  Kokkos::parallel_for(
-      "default_exec::overlap_range_policy::kernel2",
-      Kokkos::Experimental::require(
-          Kokkos::RangePolicy<TEST_EXECSPACE>(space2, 0, N),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      f);
-  Kokkos::fence();
-
-  Kokkos::Timer timer;
-  Kokkos::parallel_for(
-      "default_exec::overlap_range_policy::kernel3",
-      Kokkos::Experimental::require(
-          Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      f);
-  Kokkos::parallel_for(
-      "default_exec::overlap_range_policy::kernel4",
-      Kokkos::Experimental::require(
-          Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      f);
-  Kokkos::fence();
-
-  timer.reset();
-  Kokkos::parallel_for(
-      "default_exec::overlap_range_policy::kernel5",
-      Kokkos::Experimental::require(
-          Kokkos::RangePolicy<TEST_EXECSPACE>(space1, 0, N),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      FunctorRange(M, R, a));
-  Kokkos::parallel_for(
-      "default_exec::overlap_range_policy::kernel6",
-      Kokkos::Experimental::require(
-          Kokkos::RangePolicy<TEST_EXECSPACE>(space2, 0, N),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      FunctorRange(M, R, a));
-  Kokkos::fence();
-  double time_overlap = timer.seconds();
-
-  timer.reset();
-  Kokkos::parallel_for(
-      "default_exec::overlap_range_policy::kernel7",
-      Kokkos::Experimental::require(
-          Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      f);
-  Kokkos::parallel_for(
-      "default_exec::overlap_range_policy::kernel8",
-      Kokkos::Experimental::require(
-          Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      f);
-  Kokkos::fence();
-  double time_end = timer.seconds();
-
-  if (is_overlapping(space)) {
-    ASSERT_GT(time_end, 1.5 * time_overlap);
-  }
-  printf("Time RangePolicy: NonOverlap: %lf Time Overlap: %lf\n", time_end,
-         time_overlap);
-
-  Kokkos::View<double, TEST_EXECSPACE> result("result");
-  Kokkos::View<double, TEST_EXECSPACE> result1("result1");
-  Kokkos::View<double, TEST_EXECSPACE> result2("result2");
-  Kokkos::View<double, Kokkos::HostSpace> h_result("h_result");
-  Kokkos::View<double, Kokkos::HostSpace> h_result1("h_result1");
-  Kokkos::View<double, Kokkos::HostSpace> h_result2("h_result2");
-
-  timer.reset();
-  Kokkos::parallel_reduce(
-      "default_exec::overlap_range_policy::kernel_reduce",
-      Kokkos::Experimental::require(
-          Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      fr, result);
-  Kokkos::fence();
-  double time_fenced = timer.seconds();
-  Kokkos::deep_copy(h_result, result);
-
-  timer.reset();
-  Kokkos::parallel_reduce(
-      "default_exec::overlap_range_policy::kernel_reduce",
-      Kokkos::Experimental::require(
-          Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      fr, result);
-  double time_not_fenced = timer.seconds();
-  Kokkos::fence();
-  if (is_overlapping(space)) {
-    ASSERT_GT(time_fenced, 2.0 * time_not_fenced);
-  }
+  for (auto _ : state) {
+    Kokkos::View<double**, TEST_EXECSPACE> a("A", N, M);
+    FunctorRange f(M, R, a);
+    FunctorRangeReduce fr(M, R, a);
+    Kokkos::parallel_for("default_exec::overlap_range_policy::kernel0",
+                         Kokkos::RangePolicy<TEST_EXECSPACE>(0, N),
+                         FunctorRange(M, R, a));
+
+    Kokkos::parallel_for(
+        "default_exec::overlap_range_policy::kernel1",
+        Kokkos::Experimental::require(
+            Kokkos::RangePolicy<TEST_EXECSPACE>(space1, 0, N),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        f);
+    Kokkos::parallel_for(
+        "default_exec::overlap_range_policy::kernel2",
+        Kokkos::Experimental::require(
+            Kokkos::RangePolicy<TEST_EXECSPACE>(space2, 0, N),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        f);
+    Kokkos::fence();
+
+    Kokkos::Timer timer;
+    Kokkos::parallel_for(
+        "default_exec::overlap_range_policy::kernel3",
+        Kokkos::Experimental::require(
+            Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        f);
+    Kokkos::parallel_for(
+        "default_exec::overlap_range_policy::kernel4",
+        Kokkos::Experimental::require(
+            Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        f);
+    Kokkos::fence();
+
+    timer.reset();
+    Kokkos::parallel_for(
+        "default_exec::overlap_range_policy::kernel5",
+        Kokkos::Experimental::require(
+            Kokkos::RangePolicy<TEST_EXECSPACE>(space1, 0, N),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        FunctorRange(M, R, a));
+    Kokkos::parallel_for(
+        "default_exec::overlap_range_policy::kernel6",
+        Kokkos::Experimental::require(
+            Kokkos::RangePolicy<TEST_EXECSPACE>(space2, 0, N),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        FunctorRange(M, R, a));
+    Kokkos::fence();
+    double time_overlap = timer.seconds();
+
+    timer.reset();
+    Kokkos::parallel_for(
+        "default_exec::overlap_range_policy::kernel7",
+        Kokkos::Experimental::require(
+            Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        f);
+    Kokkos::parallel_for(
+        "default_exec::overlap_range_policy::kernel8",
+        Kokkos::Experimental::require(
+            Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        f);
+    Kokkos::fence();
+    double time_end = timer.seconds();
+
+    if (is_overlapping(space)) {
+      KOKKOS_ASSERT(time_end > 1.5 * time_overlap);
+    }
+    state.counters["Time NonOverlap"] = benchmark::Counter(time_end);
+    state.counters["Time Overlap"]    = benchmark::Counter(time_overlap);
+
+    Kokkos::View<double, TEST_EXECSPACE> result("result");
+    Kokkos::View<double, TEST_EXECSPACE> result1("result1");
+    Kokkos::View<double, TEST_EXECSPACE> result2("result2");
+    Kokkos::View<double, Kokkos::HostSpace> h_result("h_result");
+    Kokkos::View<double, Kokkos::HostSpace> h_result1("h_result1");
+    Kokkos::View<double, Kokkos::HostSpace> h_result2("h_result2");
+
+    timer.reset();
+    Kokkos::parallel_reduce(
+        "default_exec::overlap_range_policy::kernel_reduce",
+        Kokkos::Experimental::require(
+            Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        fr, result);
+    Kokkos::fence();
+    double time_fenced = timer.seconds();
+    Kokkos::deep_copy(h_result, result);
+
+    timer.reset();
+    Kokkos::parallel_reduce(
+        "default_exec::overlap_range_policy::kernel_reduce",
+        Kokkos::Experimental::require(
+            Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        fr, result);
+    double time_not_fenced = timer.seconds();
+    Kokkos::fence();
+    if (is_overlapping(space)) {
+      KOKKOS_ASSERT(time_fenced > 2.0 * time_not_fenced);
+    }
 
-  timer.reset();
-  Kokkos::parallel_reduce(
-      "default_exec::overlap_range_policy::kernel_reduce",
-      Kokkos::Experimental::require(
-          Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      fr, result);
-  Kokkos::parallel_reduce(
-      "default_exec::overlap_range_policy::kernel_reduce",
-      Kokkos::Experimental::require(
-          Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      fr, result);
-  Kokkos::fence();
-  double time_no_overlapped_reduce = timer.seconds();
-
-  timer.reset();
-  Kokkos::parallel_reduce(
-      "default_exec::overlap_range_policy::kernel_reduce",
-      Kokkos::Experimental::require(
-          Kokkos::RangePolicy<TEST_EXECSPACE>(space1, 0, N),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      fr, result1);
-  Kokkos::parallel_reduce(
-      "default_exec::overlap_range_policy::kernel_reduce",
-      Kokkos::Experimental::require(
-          Kokkos::RangePolicy<TEST_EXECSPACE>(space2, 0, N),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      fr, result2);
-  Kokkos::fence();
-  double time_overlapped_reduce = timer.seconds();
-
-  Kokkos::deep_copy(h_result2, result2);
-  Kokkos::deep_copy(h_result1, result1);
-
-  ASSERT_EQ(h_result1(), h_result());
-  ASSERT_EQ(h_result2(), h_result());
-
-  if (is_overlapping(space)) {
-    ASSERT_LT(time_overlapped_reduce, 1.5 * time_no_overlapped_reduce);
+    state.counters["Time fenced"]     = benchmark::Counter(time_fenced);
+    state.counters["Time not fenced"] = benchmark::Counter(time_not_fenced);
+
+    timer.reset();
+    Kokkos::parallel_reduce(
+        "default_exec::overlap_range_policy::kernel_reduce",
+        Kokkos::Experimental::require(
+            Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        fr, result);
+    Kokkos::parallel_reduce(
+        "default_exec::overlap_range_policy::kernel_reduce",
+        Kokkos::Experimental::require(
+            Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        fr, result);
+    Kokkos::fence();
+    double time_no_overlapped_reduce = timer.seconds();
+
+    timer.reset();
+    Kokkos::parallel_reduce(
+        "default_exec::overlap_range_policy::kernel_reduce",
+        Kokkos::Experimental::require(
+            Kokkos::RangePolicy<TEST_EXECSPACE>(space1, 0, N),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        fr, result1);
+    Kokkos::parallel_reduce(
+        "default_exec::overlap_range_policy::kernel_reduce",
+        Kokkos::Experimental::require(
+            Kokkos::RangePolicy<TEST_EXECSPACE>(space2, 0, N),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        fr, result2);
+    Kokkos::fence();
+    double time_overlapped_reduce = timer.seconds();
+
+    Kokkos::deep_copy(h_result2, result2);
+    Kokkos::deep_copy(h_result1, result1);
+
+    KOKKOS_ASSERT(h_result1() == h_result());
+    KOKKOS_ASSERT(h_result2() == h_result());
+
+    if (is_overlapping(space)) {
+      KOKKOS_ASSERT(time_overlapped_reduce < 1.5 * time_no_overlapped_reduce);
+    }
+
+    state.counters["Time Reduce: NonOverlap"] =
+        benchmark::Counter(time_no_overlapped_reduce);
+    state.counters["Time Reduce: Overlap"] =
+        benchmark::Counter(time_overlapped_reduce);
   }
-  printf("Time RangePolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n",
-         time_no_overlapped_reduce, time_overlapped_reduce);
 }
 
-TEST(default_exec, overlap_mdrange_policy) {
-  int N = 200;
-  int M = 10000;
-  int R = 10;
+BENCHMARK(OverlapRangePolicy)
+    ->ArgNames({"N", "M", "R"})
+    ->Args({2'000, 10'000, 10});
+
+static void OverlapMDRangePolicy(benchmark::State& state) {
+  int N = state.range(0);
+  int M = state.range(1);
+  int R = state.range(2);
 
   TEST_EXECSPACE space;
   std::vector<TEST_EXECSPACE> execution_space_instances =
@@ -326,178 +338,191 @@ TEST(default_exec, overlap_mdrange_policy) {
   TEST_EXECSPACE space1 = execution_space_instances[0];
   TEST_EXECSPACE space2 = execution_space_instances[1];
 
-  Kokkos::View<double**, TEST_EXECSPACE> a("A", N, M);
-  FunctorMDRange f(M, R, a);
-  FunctorMDRangeReduce fr(M, R, a);
-  Kokkos::parallel_for(
-      "default_exec::overlap_range_policy::kernel0",
-      Kokkos::Experimental::require(
-          Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>({0, 0},
-                                                                 {N, R}),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      FunctorMDRange(M, R, a));
-
-  Kokkos::parallel_for(
-      "default_exec::overlap_range_policy::kernel1",
-      Kokkos::Experimental::require(
-          Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space1, {0, 0},
-                                                                 {N, R}),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      f);
-  Kokkos::parallel_for(
-      "default_exec::overlap_range_policy::kernel2",
-      Kokkos::Experimental::require(
-          Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space2, {0, 0},
-                                                                 {N, R}),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      f);
-  Kokkos::fence();
-
-  Kokkos::Timer timer;
-  Kokkos::parallel_for(
-      "default_exec::overlap_range_policy::kernel3",
-      Kokkos::Experimental::require(
-          Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space, {0, 0},
-                                                                 {N, R}),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      f);
-  Kokkos::parallel_for(
-      "default_exec::overlap_range_policy::kernel4",
-      Kokkos::Experimental::require(
-          Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space, {0, 0},
-                                                                 {N, R}),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      f);
-  Kokkos::fence();
-
-  timer.reset();
-  Kokkos::parallel_for(
-      "default_exec::overlap_range_policy::kernel5",
-      Kokkos::Experimental::require(
-          Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space1, {0, 0},
-                                                                 {N, R}),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      FunctorMDRange(M, R, a));
-  Kokkos::parallel_for(
-      "default_exec::overlap_range_policy::kernel6",
-      Kokkos::Experimental::require(
-          Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space2, {0, 0},
-                                                                 {N, R}),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      FunctorMDRange(M, R, a));
-  Kokkos::fence();
-  double time_overlap = timer.seconds();
-
-  timer.reset();
-  Kokkos::parallel_for(
-      "default_exec::overlap_range_policy::kernel7",
-      Kokkos::Experimental::require(
-          Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space, {0, 0},
-                                                                 {N, R}),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      f);
-  Kokkos::parallel_for(
-      "default_exec::overlap_range_policy::kernel8",
-      Kokkos::Experimental::require(
-          Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space, {0, 0},
-                                                                 {N, R}),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      f);
-  Kokkos::fence();
-  double time_end = timer.seconds();
-
-  if (is_overlapping(space)) {
-    ASSERT_GT(time_end, 1.5 * time_overlap);
-  }
-  printf("Time MDRangePolicy: NonOverlap: %lf Time Overlap: %lf\n", time_end,
-         time_overlap);
-
-  Kokkos::View<double, TEST_EXECSPACE> result("result");
-  Kokkos::View<double, TEST_EXECSPACE> result1("result1");
-  Kokkos::View<double, TEST_EXECSPACE> result2("result2");
-  Kokkos::View<double, Kokkos::HostSpace> h_result("h_result");
-  Kokkos::View<double, Kokkos::HostSpace> h_result1("h_result1");
-  Kokkos::View<double, Kokkos::HostSpace> h_result2("h_result2");
-
-  timer.reset();
-  Kokkos::parallel_reduce(
-      "default_exec::overlap_mdrange_policy::kernel_reduce",
-      Kokkos::Experimental::require(
-          Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space, {0, 0},
-                                                                 {N, R}),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      fr, result);
-  Kokkos::fence();
-  double time_fenced = timer.seconds();
-  Kokkos::deep_copy(h_result, result);
-
-  timer.reset();
-  Kokkos::parallel_reduce(
-      "default_exec::overlap_mdrange_policy::kernel_reduce",
-      Kokkos::Experimental::require(
-          Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space, {0, 0},
-                                                                 {N, R}),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      fr, result);
-  double time_not_fenced = timer.seconds();
-  Kokkos::fence();
-  if (is_overlapping(space)) {
-    ASSERT_GT(time_fenced, 2.0 * time_not_fenced);
-  }
+  for (auto _ : state) {
+    Kokkos::View<double**, TEST_EXECSPACE> a("A", N, M);
+    FunctorMDRange f(M, R, a);
+    FunctorMDRangeReduce fr(M, R, a);
+    Kokkos::parallel_for(
+        "default_exec::overlap_range_policy::kernel0",
+        Kokkos::Experimental::require(
+            Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>({0, 0},
+                                                                   {N, R}),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        FunctorMDRange(M, R, a));
+
+    Kokkos::parallel_for(
+        "default_exec::overlap_range_policy::kernel1",
+        Kokkos::Experimental::require(
+            Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(
+                space1, {0, 0}, {N, R}),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        f);
+    Kokkos::parallel_for(
+        "default_exec::overlap_range_policy::kernel2",
+        Kokkos::Experimental::require(
+            Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(
+                space2, {0, 0}, {N, R}),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        f);
+    Kokkos::fence();
+
+    Kokkos::Timer timer;
+    Kokkos::parallel_for(
+        "default_exec::overlap_range_policy::kernel3",
+        Kokkos::Experimental::require(
+            Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(
+                space, {0, 0}, {N, R}),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        f);
+    Kokkos::parallel_for(
+        "default_exec::overlap_range_policy::kernel4",
+        Kokkos::Experimental::require(
+            Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(
+                space, {0, 0}, {N, R}),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        f);
+    Kokkos::fence();
+
+    timer.reset();
+    Kokkos::parallel_for(
+        "default_exec::overlap_range_policy::kernel5",
+        Kokkos::Experimental::require(
+            Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(
+                space1, {0, 0}, {N, R}),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        FunctorMDRange(M, R, a));
+    Kokkos::parallel_for(
+        "default_exec::overlap_range_policy::kernel6",
+        Kokkos::Experimental::require(
+            Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(
+                space2, {0, 0}, {N, R}),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        FunctorMDRange(M, R, a));
+    Kokkos::fence();
+    double time_overlap = timer.seconds();
+
+    timer.reset();
+    Kokkos::parallel_for(
+        "default_exec::overlap_range_policy::kernel7",
+        Kokkos::Experimental::require(
+            Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(
+                space, {0, 0}, {N, R}),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        f);
+    Kokkos::parallel_for(
+        "default_exec::overlap_range_policy::kernel8",
+        Kokkos::Experimental::require(
+            Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(
+                space, {0, 0}, {N, R}),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        f);
+    Kokkos::fence();
+    double time_end = timer.seconds();
+
+    if (is_overlapping(space)) {
+      KOKKOS_ASSERT(time_end > 1.5 * time_overlap);
+    }
 
-  timer.reset();
-  Kokkos::parallel_reduce(
-      "default_exec::overlap_mdrange_policy::kernel_reduce",
-      Kokkos::Experimental::require(
-          Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space, {0, 0},
-                                                                 {N, R}),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      fr, result);
-  Kokkos::parallel_reduce(
-      "default_exec::overlap_mdrange_policy::kernel_reduce",
-      Kokkos::Experimental::require(
-          Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space, {0, 0},
-                                                                 {N, R}),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      fr, result);
-  Kokkos::fence();
-  double time_no_overlapped_reduce = timer.seconds();
-
-  timer.reset();
-  Kokkos::parallel_reduce(
-      "default_exec::overlap_mdrange_policy::kernel_reduce",
-      Kokkos::Experimental::require(
-          Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space1, {0, 0},
-                                                                 {N, R}),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      fr, result1);
-  Kokkos::parallel_reduce(
-      "default_exec::overlap_mdrange_policy::kernel_reduce",
-      Kokkos::Experimental::require(
-          Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space2, {0, 0},
-                                                                 {N, R}),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      fr, result2);
-  Kokkos::fence();
-  double time_overlapped_reduce = timer.seconds();
-
-  Kokkos::deep_copy(h_result2, result2);
-  Kokkos::deep_copy(h_result1, result1);
-
-  ASSERT_EQ(h_result1(), h_result());
-  ASSERT_EQ(h_result2(), h_result());
-
-  if (is_overlapping(space)) {
-    ASSERT_LT(time_overlapped_reduce, 1.5 * time_no_overlapped_reduce);
+    state.counters["Time NonOverlap"] = benchmark::Counter(time_end);
+    state.counters["Time Overlap"]    = benchmark::Counter(time_overlap);
+
+    Kokkos::View<double, TEST_EXECSPACE> result("result");
+    Kokkos::View<double, TEST_EXECSPACE> result1("result1");
+    Kokkos::View<double, TEST_EXECSPACE> result2("result2");
+    Kokkos::View<double, Kokkos::HostSpace> h_result("h_result");
+    Kokkos::View<double, Kokkos::HostSpace> h_result1("h_result1");
+    Kokkos::View<double, Kokkos::HostSpace> h_result2("h_result2");
+
+    timer.reset();
+    Kokkos::parallel_reduce(
+        "default_exec::overlap_mdrange_policy::kernel_reduce",
+        Kokkos::Experimental::require(
+            Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(
+                space, {0, 0}, {N, R}),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        fr, result);
+    Kokkos::fence();
+    double time_fenced = timer.seconds();
+    Kokkos::deep_copy(h_result, result);
+
+    timer.reset();
+    Kokkos::parallel_reduce(
+        "default_exec::overlap_mdrange_policy::kernel_reduce",
+        Kokkos::Experimental::require(
+            Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(
+                space, {0, 0}, {N, R}),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        fr, result);
+    double time_not_fenced = timer.seconds();
+    Kokkos::fence();
+    if (is_overlapping(space)) {
+      KOKKOS_ASSERT(time_fenced > 2.0 * time_not_fenced);
+    }
+
+    state.counters["Time fenced"]     = benchmark::Counter(time_fenced);
+    state.counters["Time not fenced"] = benchmark::Counter(time_not_fenced);
+
+    timer.reset();
+    Kokkos::parallel_reduce(
+        "default_exec::overlap_mdrange_policy::kernel_reduce",
+        Kokkos::Experimental::require(
+            Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(
+                space, {0, 0}, {N, R}),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        fr, result);
+    Kokkos::parallel_reduce(
+        "default_exec::overlap_mdrange_policy::kernel_reduce",
+        Kokkos::Experimental::require(
+            Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(
+                space, {0, 0}, {N, R}),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        fr, result);
+    Kokkos::fence();
+    double time_no_overlapped_reduce = timer.seconds();
+
+    timer.reset();
+    Kokkos::parallel_reduce(
+        "default_exec::overlap_mdrange_policy::kernel_reduce",
+        Kokkos::Experimental::require(
+            Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(
+                space1, {0, 0}, {N, R}),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        fr, result1);
+    Kokkos::parallel_reduce(
+        "default_exec::overlap_mdrange_policy::kernel_reduce",
+        Kokkos::Experimental::require(
+            Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(
+                space2, {0, 0}, {N, R}),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        fr, result2);
+    Kokkos::fence();
+    double time_overlapped_reduce = timer.seconds();
+
+    Kokkos::deep_copy(h_result2, result2);
+    Kokkos::deep_copy(h_result1, result1);
+
+    KOKKOS_ASSERT(h_result1() == h_result());
+    KOKKOS_ASSERT(h_result2() == h_result());
+
+    if (is_overlapping(space)) {
+      KOKKOS_ASSERT(time_overlapped_reduce < 1.5 * time_no_overlapped_reduce);
+    }
+
+    state.counters["Time Reduce: NonOverlap"] =
+        benchmark::Counter(time_no_overlapped_reduce);
+    state.counters["Time Reduce: Time Overlap"] =
+        benchmark::Counter(time_overlapped_reduce);
   }
-  printf("Time MDRangePolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n",
-         time_no_overlapped_reduce, time_overlapped_reduce);
 }
 
-TEST(default_exec, overlap_team_policy) {
-  int N = 20;
-  int M = 1000000;
-  int R = 10;
+BENCHMARK(OverlapMDRangePolicy)
+    ->ArgNames({"N", "M", "R"})
+    ->Args({200, 10'000, 10});
+
+static void OverlapTeamPolicy(benchmark::State& state) {
+  int N = state.range(0);
+  int M = state.range(1);
+  int R = state.range(2);
 
   TEST_EXECSPACE space;
   std::vector<TEST_EXECSPACE> execution_space_instances =
@@ -505,155 +530,170 @@ TEST(default_exec, overlap_team_policy) {
   TEST_EXECSPACE space1 = execution_space_instances[0];
   TEST_EXECSPACE space2 = execution_space_instances[1];
 
-  Kokkos::View<double**, Kokkos::LayoutRight, TEST_EXECSPACE> a("A", N, M);
-  FunctorTeam f(M, R, a);
-  FunctorTeamReduce fr(M, R, a);
-  Kokkos::parallel_for(
-      "default_exec::overlap_range_policy::kernel0",
-      Kokkos::Experimental::require(
-          Kokkos::TeamPolicy<TEST_EXECSPACE>(N, Kokkos::AUTO),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      FunctorTeam(M, R, a));
-
-  Kokkos::parallel_for(
-      "default_exec::overlap_range_policy::kernel1",
-      Kokkos::Experimental::require(
-          Kokkos::TeamPolicy<TEST_EXECSPACE>(space1, N, Kokkos::AUTO),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      f);
-  Kokkos::parallel_for(
-      "default_exec::overlap_range_policy::kernel2",
-      Kokkos::Experimental::require(
-          Kokkos::TeamPolicy<TEST_EXECSPACE>(space2, N, Kokkos::AUTO),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      f);
-  Kokkos::fence();
-
-  Kokkos::Timer timer;
-  Kokkos::parallel_for(
-      "default_exec::overlap_range_policy::kernel3",
-      Kokkos::Experimental::require(
-          Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      f);
-  Kokkos::parallel_for(
-      "default_exec::overlap_range_policy::kernel4",
-      Kokkos::Experimental::require(
-          Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      f);
-  Kokkos::fence();
-
-  timer.reset();
-  Kokkos::parallel_for(
-      "default_exec::overlap_range_policy::kernel5",
-      Kokkos::Experimental::require(
-          Kokkos::TeamPolicy<TEST_EXECSPACE>(space1, N, Kokkos::AUTO),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      FunctorTeam(M, R, a));
-  Kokkos::parallel_for(
-      "default_exec::overlap_range_policy::kernel6",
-      Kokkos::Experimental::require(
-          Kokkos::TeamPolicy<TEST_EXECSPACE>(space2, N, Kokkos::AUTO),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      FunctorTeam(M, R, a));
-  Kokkos::fence();
-  double time_overlap = timer.seconds();
-
-  timer.reset();
-  Kokkos::parallel_for(
-      "default_exec::overlap_range_policy::kernel7",
-      Kokkos::Experimental::require(
-          Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      f);
-  Kokkos::parallel_for(
-      "default_exec::overlap_range_policy::kernel8",
-      Kokkos::Experimental::require(
-          Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      f);
-  Kokkos::fence();
-  double time_end = timer.seconds();
-
-  if (is_overlapping(space)) {
-    ASSERT_GT(time_end, 1.5 * time_overlap);
-  }
-  printf("Time TeamPolicy: NonOverlap: %lf Time Overlap: %lf\n", time_end,
-         time_overlap);
-
-  Kokkos::View<double, TEST_EXECSPACE> result("result");
-  Kokkos::View<double, TEST_EXECSPACE> result1("result1");
-  Kokkos::View<double, TEST_EXECSPACE> result2("result2");
-  Kokkos::View<double, Kokkos::HostSpace> h_result("h_result");
-  Kokkos::View<double, Kokkos::HostSpace> h_result1("h_result1");
-  Kokkos::View<double, Kokkos::HostSpace> h_result2("h_result2");
-
-  timer.reset();
-  Kokkos::parallel_reduce(
-      "default_exec::overlap_team_policy::kernel_reduce",
-      Kokkos::Experimental::require(
-          Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      fr, result);
-  Kokkos::fence();
-  double time_fenced = timer.seconds();
-  Kokkos::deep_copy(h_result, result);
-
-  timer.reset();
-  Kokkos::parallel_reduce(
-      "default_exec::overlap_team_policy::kernel_reduce",
-      Kokkos::Experimental::require(
-          Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      fr, result);
-  double time_not_fenced = timer.seconds();
-  Kokkos::fence();
-  if (is_overlapping(space)) {
-    ASSERT_GT(time_fenced, 2.0 * time_not_fenced);
-  }
-  timer.reset();
-  Kokkos::parallel_reduce(
-      "default_exec::overlap_team_policy::kernel_reduce",
-      Kokkos::Experimental::require(
-          Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      fr, result);
-  Kokkos::parallel_reduce(
-      "default_exec::overlap_team_policy::kernel_reduce",
-      Kokkos::Experimental::require(
-          Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      fr, result);
-  Kokkos::fence();
-  double time_no_overlapped_reduce = timer.seconds();
-
-  timer.reset();
-  Kokkos::parallel_reduce(
-      "default_exec::overlap_team_policy::kernel_reduce",
-      Kokkos::Experimental::require(
-          Kokkos::TeamPolicy<TEST_EXECSPACE>(space1, N, Kokkos::AUTO),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      fr, result1);
-  Kokkos::parallel_reduce(
-      "default_exec::overlap_team_policy::kernel_reduce",
-      Kokkos::Experimental::require(
-          Kokkos::TeamPolicy<TEST_EXECSPACE>(space2, N, Kokkos::AUTO),
-          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-      fr, result2);
-  Kokkos::fence();
-  double time_overlapped_reduce = timer.seconds();
-
-  Kokkos::deep_copy(h_result2, result2);
-  Kokkos::deep_copy(h_result1, result1);
-
-  ASSERT_EQ(h_result1(), h_result());
-  ASSERT_EQ(h_result2(), h_result());
-
-  if (is_overlapping(space)) {
-    ASSERT_LT(time_overlapped_reduce, 1.5 * time_no_overlapped_reduce);
+  for (auto _ : state) {
+    Kokkos::View<double**, Kokkos::LayoutRight, TEST_EXECSPACE> a("A", N, M);
+    FunctorTeam f(M, R, a);
+    FunctorTeamReduce fr(M, R, a);
+    Kokkos::parallel_for(
+        "default_exec::overlap_range_policy::kernel0",
+        Kokkos::Experimental::require(
+            Kokkos::TeamPolicy<TEST_EXECSPACE>(N, Kokkos::AUTO),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        FunctorTeam(M, R, a));
+
+    Kokkos::parallel_for(
+        "default_exec::overlap_range_policy::kernel1",
+        Kokkos::Experimental::require(
+            Kokkos::TeamPolicy<TEST_EXECSPACE>(space1, N, Kokkos::AUTO),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        f);
+    Kokkos::parallel_for(
+        "default_exec::overlap_range_policy::kernel2",
+        Kokkos::Experimental::require(
+            Kokkos::TeamPolicy<TEST_EXECSPACE>(space2, N, Kokkos::AUTO),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        f);
+    Kokkos::fence();
+
+    Kokkos::Timer timer;
+    Kokkos::parallel_for(
+        "default_exec::overlap_range_policy::kernel3",
+        Kokkos::Experimental::require(
+            Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        f);
+    Kokkos::parallel_for(
+        "default_exec::overlap_range_policy::kernel4",
+        Kokkos::Experimental::require(
+            Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        f);
+    Kokkos::fence();
+
+    timer.reset();
+    Kokkos::parallel_for(
+        "default_exec::overlap_range_policy::kernel5",
+        Kokkos::Experimental::require(
+            Kokkos::TeamPolicy<TEST_EXECSPACE>(space1, N, Kokkos::AUTO),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        FunctorTeam(M, R, a));
+    Kokkos::parallel_for(
+        "default_exec::overlap_range_policy::kernel6",
+        Kokkos::Experimental::require(
+            Kokkos::TeamPolicy<TEST_EXECSPACE>(space2, N, Kokkos::AUTO),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        FunctorTeam(M, R, a));
+    Kokkos::fence();
+    double time_overlap = timer.seconds();
+
+    timer.reset();
+    Kokkos::parallel_for(
+        "default_exec::overlap_range_policy::kernel7",
+        Kokkos::Experimental::require(
+            Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        f);
+    Kokkos::parallel_for(
+        "default_exec::overlap_range_policy::kernel8",
+        Kokkos::Experimental::require(
+            Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        f);
+    Kokkos::fence();
+    double time_end = timer.seconds();
+
+    if (is_overlapping(space)) {
+      KOKKOS_ASSERT(time_end > 1.5 * time_overlap);
+    }
+
+    state.counters["Time NonOverlap"] = benchmark::Counter(time_end);
+    state.counters["Time Overlap"]    = benchmark::Counter(time_overlap);
+
+    Kokkos::View<double, TEST_EXECSPACE> result("result");
+    Kokkos::View<double, TEST_EXECSPACE> result1("result1");
+    Kokkos::View<double, TEST_EXECSPACE> result2("result2");
+    Kokkos::View<double, Kokkos::HostSpace> h_result("h_result");
+    Kokkos::View<double, Kokkos::HostSpace> h_result1("h_result1");
+    Kokkos::View<double, Kokkos::HostSpace> h_result2("h_result2");
+
+    timer.reset();
+    Kokkos::parallel_reduce(
+        "default_exec::overlap_team_policy::kernel_reduce",
+        Kokkos::Experimental::require(
+            Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        fr, result);
+    Kokkos::fence();
+    double time_fenced = timer.seconds();
+    Kokkos::deep_copy(h_result, result);
+
+    timer.reset();
+    Kokkos::parallel_reduce(
+        "default_exec::overlap_team_policy::kernel_reduce",
+        Kokkos::Experimental::require(
+            Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        fr, result);
+    double time_not_fenced = timer.seconds();
+    Kokkos::fence();
+    if (is_overlapping(space)) {
+      KOKKOS_ASSERT(time_fenced > 2.0 * time_not_fenced);
+    }
+
+    state.counters["Time fenced"]     = benchmark::Counter(time_fenced);
+    state.counters["Time not fenced"] = benchmark::Counter(time_not_fenced);
+
+    timer.reset();
+    Kokkos::parallel_reduce(
+        "default_exec::overlap_team_policy::kernel_reduce",
+        Kokkos::Experimental::require(
+            Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        fr, result);
+    Kokkos::parallel_reduce(
+        "default_exec::overlap_team_policy::kernel_reduce",
+        Kokkos::Experimental::require(
+            Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        fr, result);
+    Kokkos::fence();
+    double time_no_overlapped_reduce = timer.seconds();
+
+    timer.reset();
+    Kokkos::parallel_reduce(
+        "default_exec::overlap_team_policy::kernel_reduce",
+        Kokkos::Experimental::require(
+            Kokkos::TeamPolicy<TEST_EXECSPACE>(space1, N, Kokkos::AUTO),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        fr, result1);
+    Kokkos::parallel_reduce(
+        "default_exec::overlap_team_policy::kernel_reduce",
+        Kokkos::Experimental::require(
+            Kokkos::TeamPolicy<TEST_EXECSPACE>(space2, N, Kokkos::AUTO),
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+        fr, result2);
+    Kokkos::fence();
+    double time_overlapped_reduce = timer.seconds();
+
+    Kokkos::deep_copy(h_result2, result2);
+    Kokkos::deep_copy(h_result1, result1);
+
+    KOKKOS_ASSERT(h_result1() == h_result());
+    KOKKOS_ASSERT(h_result2() == h_result());
+
+    if (is_overlapping(space)) {
+      KOKKOS_ASSERT(time_overlapped_reduce < 1.5 * time_no_overlapped_reduce);
+    }
+
+    state.counters["Time Reduce: NonOverlap"] =
+        benchmark::Counter(time_no_overlapped_reduce);
+    state.counters["Time Reduce: Time Overlap"] =
+        benchmark::Counter(time_overlapped_reduce);
   }
-  printf("Time TeamPolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n",
-         time_no_overlapped_reduce, time_overlapped_reduce);
 }
+
+BENCHMARK(OverlapTeamPolicy)
+    ->ArgNames({"N", "M", "R"})
+    ->Args({20, 1'000'000, 10});
+
 }  // namespace Test
diff --git a/packages/kokkos/core/perf_test/PerfTest_MallocFree.cpp b/packages/kokkos/core/perf_test/PerfTest_MallocFree.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..80736d99152174b4744887eb9153194c0b171b2a
--- /dev/null
+++ b/packages/kokkos/core/perf_test/PerfTest_MallocFree.cpp
@@ -0,0 +1,100 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <Kokkos_Core.hpp>
+#include <benchmark/benchmark.h>
+#include "Benchmark_Context.hpp"
+
+namespace Benchmark {
+
+// when the time will be recorded
+enum class When { after_malloc, after_touch, after_free };
+
+static void Impl(benchmark::State& state, const bool touch, const When when) {
+  const size_t N = state.range(0);
+  for (auto _ : state) {
+    Kokkos::Timer timer;
+    char* a_ptr = static_cast<char*>(Kokkos::kokkos_malloc("A", N));
+    if (When::after_malloc == when) {
+      state.SetIterationTime(timer.seconds());
+    }
+    if (touch) {
+      constexpr size_t STRIDE = 1024;  // stride for touching the allocation.
+      // this is intended to be a safe value that would touch every "page", but
+      // not saturate the memory bandwidth
+      Kokkos::parallel_for(
+          N / STRIDE,
+          KOKKOS_LAMBDA(const size_t& i) { a_ptr[i * STRIDE] = i * STRIDE; });
+      Kokkos::fence();
+    }
+    if (When::after_touch == when) {
+      state.SetIterationTime(timer.seconds());
+    }
+    Kokkos::kokkos_free(a_ptr);
+    if (When::after_free == when) {
+      state.SetIterationTime(timer.seconds());
+    }
+  }
+
+  state.counters[KokkosBenchmark::benchmark_fom("rate")] =
+      benchmark::Counter(state.iterations(), benchmark::Counter::kIsRate);
+}
+
+static void Malloc(benchmark::State& state) {
+  Impl(state, false, When::after_malloc);
+}
+
+static void MallocFree(benchmark::State& state) {
+  Impl(state, false, When::after_free);
+}
+
+static void MallocTouch(benchmark::State& state) {
+  Impl(state, true, When::after_touch);
+}
+
+static void MallocTouchFree(benchmark::State& state) {
+  Impl(state, true, When::after_free);
+}
+
+BENCHMARK(Malloc)
+    ->ArgName("N")
+    ->RangeMultiplier(16)
+    ->Range(1, int64_t(1) << 32)
+    ->UseManualTime()
+    ->Unit(benchmark::kMicrosecond);
+
+BENCHMARK(MallocFree)
+    ->ArgName("N")
+    ->RangeMultiplier(16)
+    ->Range(1, int64_t(1) << 32)
+    ->UseManualTime()
+    ->Unit(benchmark::kMicrosecond);
+
+BENCHMARK(MallocTouch)
+    ->ArgName("N")
+    ->RangeMultiplier(16)
+    ->Range(1, int64_t(1) << 32)
+    ->UseManualTime()
+    ->Unit(benchmark::kMicrosecond);
+
+BENCHMARK(MallocTouchFree)
+    ->ArgName("N")
+    ->RangeMultiplier(16)
+    ->Range(1, int64_t(1) << 32)
+    ->UseManualTime()
+    ->Unit(benchmark::kMicrosecond);
+
+}  // namespace Benchmark
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewAllocate.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewAllocate.cpp
index c1d98958476429b6a37c5568c449d189904019eb..163e1d7d0488c22f0eb7570fbd81c01809e65e07 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewAllocate.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewAllocate.cpp
@@ -15,119 +15,184 @@
 //@HEADER
 
 #include <Kokkos_Core.hpp>
-#include <gtest/gtest.h>
-#include <cstdio>
-#include <PerfTest_Category.hpp>
+#include <benchmark/benchmark.h>
+#include "Benchmark_Context.hpp"
 
 namespace Test {
 
+static constexpr int N = 10;
+
 template <class Layout>
-void run_allocateview_tests(int N, int R) {
-  const int N1 = N;
-  const int N2 = N * N;
-  const int N3 = N2 * N;
-  const int N4 = N2 * N2;
-  const int N8 = N4 * N4;
-
-  double time1, time2, time3, time4, time5, time6, time7, time8,
-      time_raw = 100000.0;
-  {
-    Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::View<double*, Layout> a("A1", N8);
-    }
-    time1 = timer.seconds() / R;
-  }
-  {
+static void ViewAllocate_Rank1(benchmark::State& state) {
+  const int N8 = std::pow(state.range(0), 8);
+
+  for (auto _ : state) {
     Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::View<double**, Layout> a("A2", N4, N4);
-    }
-    time2 = timer.seconds() / R;
+    Kokkos::View<double*, Layout> a("A1", N8);
+    KokkosBenchmark::report_results(state, a, 1, timer.seconds());
   }
-  {
+}
+
+template <class Layout>
+static void ViewAllocate_Rank2(benchmark::State& state) {
+  const int N4 = std::pow(state.range(0), 4);
+
+  for (auto _ : state) {
     Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::View<double***, Layout> a("A3", N3, N3, N2);
-    }
-    time3 = timer.seconds() / R;
+    Kokkos::View<double**, Layout> a("A2", N4, N4);
+    KokkosBenchmark::report_results(state, a, 1, timer.seconds());
   }
-  {
+}
+
+template <class Layout>
+static void ViewAllocate_Rank3(benchmark::State& state) {
+  const int N2 = std::pow(state.range(0), 2);
+  const int N3 = std::pow(state.range(0), 3);
+
+  for (auto _ : state) {
     Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::View<double****, Layout> a("A4", N2, N2, N2, N2);
-    }
-    time4 = timer.seconds() / R;
+    Kokkos::View<double***, Layout> a("A3", N3, N3, N2);
+    KokkosBenchmark::report_results(state, a, 1, timer.seconds());
   }
-  {
+}
+
+template <class Layout>
+static void ViewAllocate_Rank4(benchmark::State& state) {
+  const int N2 = std::pow(state.range(0), 2);
+
+  for (auto _ : state) {
     Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::View<double*****, Layout> a("A5", N2, N2, N1, N1, N2);
-    }
-    time5 = timer.seconds() / R;
+    Kokkos::View<double****, Layout> a("A4", N2, N2, N2, N2);
+    KokkosBenchmark::report_results(state, a, 1, timer.seconds());
   }
-  {
+}
+
+template <class Layout>
+static void ViewAllocate_Rank5(benchmark::State& state) {
+  const int N1 = state.range(0);
+  const int N2 = N1 * N1;
+
+  for (auto _ : state) {
     Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::View<double******, Layout> a("A6", N2, N1, N1, N1, N1, N2);
-    }
-    time6 = timer.seconds() / R;
+    Kokkos::View<double*****, Layout> a("A5", N2, N2, N1, N1, N2);
+    KokkosBenchmark::report_results(state, a, 1, timer.seconds());
   }
-  {
+}
+
+template <class Layout>
+static void ViewAllocate_Rank6(benchmark::State& state) {
+  const int N1 = state.range(0);
+  const int N2 = N1 * N1;
+
+  for (auto _ : state) {
     Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::View<double*******, Layout> a("A7", N2, N1, N1, N1, N1, N1, N1);
-    }
-    time7 = timer.seconds() / R;
+    Kokkos::View<double******, Layout> a("A6", N2, N1, N1, N1, N1, N2);
+    KokkosBenchmark::report_results(state, a, 1, timer.seconds());
   }
-  {
+}
+
+template <class Layout>
+static void ViewAllocate_Rank7(benchmark::State& state) {
+  const int N1 = state.range(0);
+  const int N2 = N1 * N1;
+
+  for (auto _ : state) {
     Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::View<double********, Layout> a("A8", N1, N1, N1, N1, N1, N1, N1,
-                                             N1);
-    }
-    time8 = timer.seconds() / R;
+    Kokkos::View<double*******, Layout> a("A7", N2, N1, N1, N1, N1, N1, N1);
+    KokkosBenchmark::report_results(state, a, 1, timer.seconds());
   }
-#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
-  {
+}
+
+template <class Layout>
+static void ViewAllocate_Rank8(benchmark::State& state) {
+  const int N1 = state.range(0);
+
+  for (auto _ : state) {
     Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      double* a_ptr =
-          static_cast<double*>(Kokkos::kokkos_malloc("A", sizeof(double) * N8));
-      Kokkos::parallel_for(
-          N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 0.0; });
-      Kokkos::fence();
-      Kokkos::kokkos_free(a_ptr);
-    }
-    time_raw = timer.seconds() / R;
+    Kokkos::View<double********, Layout> a("A8", N1, N1, N1, N1, N1, N1, N1,
+                                           N1);
+    KokkosBenchmark::report_results(state, a, 1, timer.seconds());
   }
-#endif
-  double size = 1.0 * N8 * 8 / 1024 / 1024;
-  printf("   Raw:   %lf s   %lf MB   %lf GB/s\n", time_raw, size,
-         size / 1024 / time_raw);
-  printf("   Rank1: %lf s   %lf MB   %lf GB/s\n", time1, size,
-         size / 1024 / time1);
-  printf("   Rank2: %lf s   %lf MB   %lf GB/s\n", time2, size,
-         size / 1024 / time2);
-  printf("   Rank3: %lf s   %lf MB   %lf GB/s\n", time3, size,
-         size / 1024 / time3);
-  printf("   Rank4: %lf s   %lf MB   %lf GB/s\n", time4, size,
-         size / 1024 / time4);
-  printf("   Rank5: %lf s   %lf MB   %lf GB/s\n", time5, size,
-         size / 1024 / time5);
-  printf("   Rank6: %lf s   %lf MB   %lf GB/s\n", time6, size,
-         size / 1024 / time6);
-  printf("   Rank7: %lf s   %lf MB   %lf GB/s\n", time7, size,
-         size / 1024 / time7);
-  printf("   Rank8: %lf s   %lf MB   %lf GB/s\n", time8, size,
-         size / 1024 / time8);
 }
 
-TEST(default_exec, ViewCreate) {
-  printf("Create View Performance for LayoutLeft:\n");
-  run_allocateview_tests<Kokkos::LayoutLeft>(10, 1);
-  printf("Create View Performance for LayoutRight:\n");
-  run_allocateview_tests<Kokkos::LayoutRight>(10, 1);
-}
+BENCHMARK(ViewAllocate_Rank1<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
+BENCHMARK(ViewAllocate_Rank1<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
+BENCHMARK(ViewAllocate_Rank2<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
+BENCHMARK(ViewAllocate_Rank2<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
+BENCHMARK(ViewAllocate_Rank3<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
+BENCHMARK(ViewAllocate_Rank3<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
+BENCHMARK(ViewAllocate_Rank4<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
+BENCHMARK(ViewAllocate_Rank4<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
+BENCHMARK(ViewAllocate_Rank5<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
+BENCHMARK(ViewAllocate_Rank5<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
+BENCHMARK(ViewAllocate_Rank6<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
+BENCHMARK(ViewAllocate_Rank6<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
+BENCHMARK(ViewAllocate_Rank7<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
+BENCHMARK(ViewAllocate_Rank7<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
+BENCHMARK(ViewAllocate_Rank8<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
+BENCHMARK(ViewAllocate_Rank8<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
 
 }  // namespace Test
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy.hpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy.hpp
index b0216ca6fc4ce2436c2e5246ae1acce7ca45b6fa..b7b1e1ad48ed47b99f0c96330fa6d54f6e41f896 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewCopy.hpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy.hpp
@@ -18,32 +18,15 @@
 #define KOKKOS_CORE_PERFTEST_BENCHMARK_VIEW_COPY_HPP
 
 #include <Kokkos_Core.hpp>
+
 #include <benchmark/benchmark.h>
+
+#include "Benchmark_Context.hpp"
 #include <cmath>
 
 namespace Test {
 
-/**
- * \brief Mark the label as a figure of merit.
- */
-inline std::string benchmark_fom(const std::string& label) {
-  return "FOM: " + label;
-}
-
-inline void report_results(benchmark::State& state, std::size_t num_elems,
-                           double time) {
-  state.SetIterationTime(time);
-
-  // data size in megabytes
-  const auto size = 1.0 * num_elems * sizeof(double) / 1000 / 1000;
-  // data processed in gigabytes
-  const auto data_processed = 2 * size / 1000;
-
-  state.counters["MB"] =
-      benchmark::Counter(size, benchmark::Counter::kDefaults);
-  state.counters[benchmark_fom("GB/s")] = benchmark::Counter(
-      data_processed, benchmark::Counter::kIsIterationInvariantRate);
-}
+static constexpr int DATA_RATIO = 2;
 
 template <class ViewTypeA, class ViewTypeB>
 void deepcopy_view(ViewTypeA& a, ViewTypeB& b, benchmark::State& state) {
@@ -51,7 +34,7 @@ void deepcopy_view(ViewTypeA& a, ViewTypeB& b, benchmark::State& state) {
     Kokkos::fence();
     Kokkos::Timer timer;
     Kokkos::deep_copy(a, b);
-    report_results(state, a.size(), timer.seconds());
+    KokkosBenchmark::report_results(state, a, DATA_RATIO, timer.seconds());
   }
 }
 
@@ -158,8 +141,7 @@ static void ViewDeepCopy_Raw(benchmark::State& state) {
     Kokkos::parallel_for(
         N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = b_ptr[i]; });
     Kokkos::fence();
-
-    report_results(state, a.size(), timer.seconds());
+    KokkosBenchmark::report_results(state, a, DATA_RATIO, timer.seconds());
   }
 }
 
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_Raw.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_Raw.cpp
index 976f800487878f2ee40a94cf7982577de390e66c..67a8d7e555451e7426b00b2a2f3a3dfe8d9eb03e 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_Raw.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_Raw.cpp
@@ -14,7 +14,7 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewCopy.hpp>
+#include "PerfTest_ViewCopy.hpp"
 
 namespace Test {
 
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a123.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a123.cpp
index 3fc1c2480cf2c95e1c68993967c9eb86025b2237..db33d1180964917450d2e25c03a6c2e195427a71 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a123.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a123.cpp
@@ -14,7 +14,7 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewCopy.hpp>
+#include "PerfTest_ViewCopy.hpp"
 
 namespace Test {
 
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a45.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a45.cpp
index 542f5534be7ef0a01a419bf8d8b31a9fdc3684aa..32006025660184fd1244bc09c4b79bc9927e9ffc 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a45.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a45.cpp
@@ -14,7 +14,7 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewCopy.hpp>
+#include "PerfTest_ViewCopy.hpp"
 
 namespace Test {
 
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a6.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a6.cpp
index 782628072db34bd0aa3cc44c5eec430e8e4b99ca..0855299aad32db058ff0c4eae42a7bf702fccea9 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a6.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a6.cpp
@@ -14,7 +14,7 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewCopy.hpp>
+#include "PerfTest_ViewCopy.hpp"
 
 namespace Test {
 
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a7.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a7.cpp
index 000c8b401c97f4b56ec8a9dfe442a20afe08844b..36577ef2eff76f955bb99daaffd44ec1fd984a74 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a7.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a7.cpp
@@ -14,7 +14,7 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewCopy.hpp>
+#include "PerfTest_ViewCopy.hpp"
 
 namespace Test {
 
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a8.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a8.cpp
index f7d7c6040a61ad242797d4f64df66eed31787a50..c449d684f1c5b2115f4b3c08aa8bce1d0bcd781d 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a8.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a8.cpp
@@ -14,7 +14,7 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewCopy.hpp>
+#include "PerfTest_ViewCopy.hpp"
 
 namespace Test {
 
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b123.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b123.cpp
index 7820e8997314824ab870dcef926e611c89182854..8675f427d7493031ed202d96d54993d4ad000f04 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b123.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b123.cpp
@@ -14,7 +14,7 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewCopy.hpp>
+#include "PerfTest_ViewCopy.hpp"
 
 namespace Test {
 
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b45.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b45.cpp
index 58f58314b093b8ce0788705d3d352ccd0947a866..93522fcf0d495b87680efae00c3dd45262bc0193 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b45.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b45.cpp
@@ -14,7 +14,7 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewCopy.hpp>
+#include "PerfTest_ViewCopy.hpp"
 
 namespace Test {
 
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b6.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b6.cpp
index 920bf1222f140d0a716d9a845e7af5e9bebb5b98..be95c7cab3ead770fe7052199ad48ed20c8803a0 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b6.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b6.cpp
@@ -14,7 +14,7 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewCopy.hpp>
+#include "PerfTest_ViewCopy.hpp"
 
 namespace Test {
 
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b7.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b7.cpp
index 11b7a0d0d1caa04966896c165a551efed7d59835..f8eee75ce71dd3a8afe889beb1687d9a16db2153 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b7.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b7.cpp
@@ -14,7 +14,7 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewCopy.hpp>
+#include "PerfTest_ViewCopy.hpp"
 
 namespace Test {
 
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b8.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b8.cpp
index 56a3d8d4c3221d8d84213dbd1b2ed839c29c8168..01dda2a33f5fa9ea31c47199b9e0547f9edc6378 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b8.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b8.cpp
@@ -14,7 +14,7 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewCopy.hpp>
+#include "PerfTest_ViewCopy.hpp"
 
 namespace Test {
 
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c123.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c123.cpp
index 375f7c8985b2f560553a06ae762178a4a10b8908..25e87474746271ad86a2835867b60774966ded1a 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c123.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c123.cpp
@@ -14,7 +14,7 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewCopy.hpp>
+#include "PerfTest_ViewCopy.hpp"
 
 namespace Test {
 
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c45.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c45.cpp
index 3625631617d86429b67e771c9773ea581e7b2b3c..b1f4a7b577322525b045c4ebf7c9871918eed74a 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c45.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c45.cpp
@@ -14,7 +14,7 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewCopy.hpp>
+#include "PerfTest_ViewCopy.hpp"
 
 namespace Test {
 
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c6.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c6.cpp
index bcb70b6764196e8ba7fc445156b390879fdfe121..8120664792a86a6c3bacdf1b0eaf5970ae1c0a3c 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c6.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c6.cpp
@@ -14,7 +14,7 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewCopy.hpp>
+#include "PerfTest_ViewCopy.hpp"
 
 namespace Test {
 
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c7.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c7.cpp
index 055d0e344c8bbbc7aae154f2b0288dafcc20884a..cee9f5bd014b4fd978a229e435721f8e8bbd6891 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c7.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c7.cpp
@@ -14,7 +14,7 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewCopy.hpp>
+#include "PerfTest_ViewCopy.hpp"
 
 namespace Test {
 
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c8.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c8.cpp
index 1e5342ef5217ee53231e9bbdb2f2a9f7af8ddea7..6f204a42221545dcf523949a619b116db9245728 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c8.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c8.cpp
@@ -14,7 +14,7 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewCopy.hpp>
+#include "PerfTest_ViewCopy.hpp"
 
 namespace Test {
 
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d123.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d123.cpp
index d61e01f9f6a9ea0d7f1ed1cdde2d94b6a7c097f8..6d72bea4908c13a258b16c7f57572694909e6f33 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d123.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d123.cpp
@@ -14,7 +14,7 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewCopy.hpp>
+#include "PerfTest_ViewCopy.hpp"
 
 namespace Test {
 
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d45.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d45.cpp
index 385d5b48ae96c39a4c6656bf34ca62a57bd69476..1a407cd648cfeebaa87da39e02523a06344f6485 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d45.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d45.cpp
@@ -14,7 +14,7 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewCopy.hpp>
+#include "PerfTest_ViewCopy.hpp"
 
 namespace Test {
 
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d6.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d6.cpp
index 0ae16012d6a65a1128757facef98df8eaeeffd96..27b1a816fc8215bfed72d8c265cea52857ccd4cf 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d6.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d6.cpp
@@ -14,7 +14,7 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewCopy.hpp>
+#include "PerfTest_ViewCopy.hpp"
 
 namespace Test {
 
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d7.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d7.cpp
index 4ebbb6359b59cc1ea475a206c7ff855d9c2a5276..17d4bf2077305622d22d7af0e2b9f99b0393921e 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d7.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d7.cpp
@@ -14,7 +14,7 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewCopy.hpp>
+#include "PerfTest_ViewCopy.hpp"
 
 namespace Test {
 
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d8.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d8.cpp
index 3a888b6155737819f263e0fae7faee5cf9dc9c84..7bd02632287310723babd8a815d72604152e9bf1 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d8.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d8.cpp
@@ -14,7 +14,7 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewCopy.hpp>
+#include "PerfTest_ViewCopy.hpp"
 
 namespace Test {
 
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewFill.hpp b/packages/kokkos/core/perf_test/PerfTest_ViewFill.hpp
index 7f76ed0ff70f41faf44f6c3999a9ec68ffbe7ae1..cc5eed85f7f6556b76311e1844fa97ac1958bebd 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewFill.hpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewFill.hpp
@@ -14,202 +14,115 @@
 //
 //@HEADER
 
-#include <Kokkos_Core.hpp>
-#include <gtest/gtest.h>
-#include <cstdio>
-#include <PerfTest_Category.hpp>
+#include "Benchmark_Context.hpp"
+
+#include <cmath>
 
 namespace Test {
 
+static constexpr int N = 10;
+
 template <class ViewType>
-double fill_view(ViewType& a, typename ViewType::const_value_type& val,
-                 int repeat) {
-  Kokkos::Timer timer;
-  for (int i = 0; i < repeat; i++) {
+void fill_view(ViewType& a, typename ViewType::const_value_type& val,
+               benchmark::State& state) {
+  for (auto _ : state) {
+    Kokkos::fence();
+    Kokkos::Timer timer;
     Kokkos::deep_copy(a, val);
+    KokkosBenchmark::report_results(state, a, 1, timer.seconds());
   }
-  Kokkos::fence();
-  return timer.seconds();
 }
 
 template <class Layout>
-void run_fillview_tests123(int N, int R) {
-  const int N1 = N;
+static void ViewFill_Rank1(benchmark::State& state) {
+  const int N1 = state.range(0);
   const int N2 = N1 * N1;
-  const int N3 = N2 * N1;
   const int N4 = N2 * N2;
   const int N8 = N4 * N4;
 
-  double time1, time2, time3, time_raw = 100000.0;
-  {
-    Kokkos::View<double*, Layout> a("A1", N8);
-    time1 = fill_view(a, 1.1, R) / R;
-  }
-  {
-    Kokkos::View<double**, Layout> a("A2", N4, N4);
-    time2 = fill_view(a, 1.1, R) / R;
-  }
-  {
-    Kokkos::View<double***, Layout> a("A3", N3, N3, N2);
-    time3 = fill_view(a, 1.1, R) / R;
-  }
-#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
-  {
-    Kokkos::View<double*, Layout> a("A1", N8);
-    double* a_ptr = a.data();
-    Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::parallel_for(
-          N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 1.1; });
-    }
-    Kokkos::fence();
-    time_raw = timer.seconds() / R;
-  }
-#endif
-  double size = 1.0 * N8 * 8 / 1024 / 1024;
-  printf("   Raw:   %lf s   %lf MB   %lf GB/s\n", time_raw, size,
-         size / 1024 / time_raw);
-  printf("   Rank1: %lf s   %lf MB   %lf GB/s\n", time1, size,
-         size / 1024 / time1);
-  printf("   Rank2: %lf s   %lf MB   %lf GB/s\n", time2, size,
-         size / 1024 / time2);
-  printf("   Rank3: %lf s   %lf MB   %lf GB/s\n", time3, size,
-         size / 1024 / time3);
+  Kokkos::View<double*, Layout> a("A1", N8);
+  fill_view(a, 1.1, state);
 }
 
 template <class Layout>
-void run_fillview_tests45(int N, int R) {
-  const int N1 = N;
+static void ViewFill_Rank2(benchmark::State& state) {
+  const int N1 = state.range(0);
   const int N2 = N1 * N1;
   const int N4 = N2 * N2;
-  const int N8 = N4 * N4;
 
-  double time4, time5, time_raw = 100000.0;
-  {
-    Kokkos::View<double****, Layout> a("A4", N2, N2, N2, N2);
-    time4 = fill_view(a, 1.1, R) / R;
-  }
-  {
-    Kokkos::View<double*****, Layout> a("A5", N2, N2, N1, N1, N2);
-    time5 = fill_view(a, 1.1, R) / R;
-  }
-#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
-  {
-    Kokkos::View<double*, Layout> a("A1", N8);
-    double* a_ptr = a.data();
-    Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::parallel_for(
-          N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 1.1; });
-    }
-    Kokkos::fence();
-    time_raw = timer.seconds() / R;
-  }
-#endif
-  double size = 1.0 * N8 * 8 / 1024 / 1024;
-  printf("   Raw:   %lf s   %lf MB   %lf GB/s\n", time_raw, size,
-         size / 1024 / time_raw);
-  printf("   Rank4: %lf s   %lf MB   %lf GB/s\n", time4, size,
-         size / 1024 / time4);
-  printf("   Rank5: %lf s   %lf MB   %lf GB/s\n", time5, size,
-         size / 1024 / time5);
+  Kokkos::View<double**, Layout> a("A2", N4, N4);
+  fill_view(a, 1.1, state);
 }
 
 template <class Layout>
-void run_fillview_tests6(int N, int R) {
-  const int N1 = N;
+static void ViewFill_Rank3(benchmark::State& state) {
+  const int N1 = state.range(0);
   const int N2 = N1 * N1;
-  const int N4 = N2 * N2;
-  const int N8 = N4 * N4;
+  const int N3 = N2 * N1;
 
-  double time6, time_raw = 100000.0;
-  {
-    Kokkos::View<double******, Layout> a("A6", N2, N1, N1, N1, N1, N2);
-    time6 = fill_view(a, 1.1, R) / R;
-  }
-#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
-  {
-    Kokkos::View<double*, Layout> a("A1", N8);
-    double* a_ptr = a.data();
-    Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::parallel_for(
-          N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 1.1; });
-    }
-    Kokkos::fence();
-    time_raw = timer.seconds() / R;
-  }
-#endif
-  double size = 1.0 * N8 * 8 / 1024 / 1024;
-  printf("   Raw:   %lf s   %lf MB   %lf GB/s\n", time_raw, size,
-         size / 1024 / time_raw);
-  printf("   Rank6: %lf s   %lf MB   %lf GB/s\n", time6, size,
-         size / 1024 / time6);
+  Kokkos::View<double***, Layout> a("A3", N3, N3, N2);
+  fill_view(a, 1.1, state);
 }
 
 template <class Layout>
-void run_fillview_tests7(int N, int R) {
-  const int N1 = N;
+static void ViewFill_Rank4(benchmark::State& state) {
+  const int N1 = state.range(0);
   const int N2 = N1 * N1;
-  const int N4 = N2 * N2;
-  const int N8 = N4 * N4;
 
-  double time7, time_raw = 100000.0;
-  {
-    Kokkos::View<double*******, Layout> a("A7", N2, N1, N1, N1, N1, N1, N1);
-    time7 = fill_view(a, 1.1, R) / R;
-  }
-#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
-  {
-    Kokkos::View<double*, Layout> a("A1", N8);
-    double* a_ptr = a.data();
-    Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::parallel_for(
-          N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 1.1; });
-    }
-    Kokkos::fence();
-    time_raw = timer.seconds() / R;
-  }
-#endif
-  double size = 1.0 * N8 * 8 / 1024 / 1024;
-  printf("   Raw:   %lf s   %lf MB   %lf GB/s\n", time_raw, size,
-         size / 1024 / time_raw);
-  printf("   Rank7: %lf s   %lf MB   %lf GB/s\n", time7, size,
-         size / 1024 / time7);
+  Kokkos::View<double****, Layout> a("A4", N2, N2, N2, N2);
+  fill_view(a, 1.1, state);
 }
 
 template <class Layout>
-void run_fillview_tests8(int N, int R) {
-  const int N1 = N;
+static void ViewFill_Rank5(benchmark::State& state) {
+  const int N1 = state.range(0);
   const int N2 = N1 * N1;
-  const int N4 = N2 * N2;
-  const int N8 = N4 * N4;
 
-  double time8, time_raw = 100000.0;
-  {
-    Kokkos::View<double********, Layout> a("A8", N1, N1, N1, N1, N1, N1, N1,
-                                           N1);
-    time8 = fill_view(a, 1.1, R) / R;
-  }
-#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
-  {
-    Kokkos::View<double*, Layout> a("A1", N8);
-    double* a_ptr = a.data();
+  Kokkos::View<double*****, Layout> a("A5", N2, N2, N1, N1, N2);
+  fill_view(a, 1.1, state);
+}
+
+template <class Layout>
+static void ViewFill_Rank6(benchmark::State& state) {
+  const int N1 = state.range(0);
+  const int N2 = N1 * N1;
+
+  Kokkos::View<double******, Layout> a("A6", N2, N1, N1, N1, N1, N2);
+  fill_view(a, 1.1, state);
+}
+
+template <class Layout>
+static void ViewFill_Rank7(benchmark::State& state) {
+  const int N1 = state.range(0);
+  const int N2 = N1 * N1;
+
+  Kokkos::View<double*******, Layout> a("A7", N2, N1, N1, N1, N1, N1, N1);
+  fill_view(a, 1.1, state);
+}
+
+template <class Layout>
+static void ViewFill_Rank8(benchmark::State& state) {
+  const int N1 = state.range(0);
+
+  Kokkos::View<double********, Layout> a("A8", N1, N1, N1, N1, N1, N1, N1, N1);
+  fill_view(a, 1.1, state);
+}
+
+template <class Layout>
+static void ViewFill_Raw(benchmark::State& state) {
+  const int N8 = std::pow(state.range(0), 8);
+
+  Kokkos::View<double*, Layout> a("A1", N8);
+  double* a_ptr = a.data();
+
+  for (auto _ : state) {
     Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::parallel_for(
-          N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 1.1; });
-    }
+    Kokkos::parallel_for(
+        N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 1.1; });
     Kokkos::fence();
-    time_raw = timer.seconds() / R;
+
+    KokkosBenchmark::report_results(state, a, 1, timer.seconds());
   }
-#endif
-  double size = 1.0 * N8 * 8 / 1024 / 1024;
-  printf("   Raw:   %lf s   %lf MB   %lf GB/s\n", time_raw, size,
-         size / 1024 / time_raw);
-  printf("   Rank8: %lf s   %lf MB   %lf GB/s\n", time8, size,
-         size / 1024 / time8);
 }
 
 }  // namespace Test
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewFill_123.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewFill_123.cpp
index 1e050e2311fdd2b2e046be663ee77210ec7384b3..b95b5279a165649cfa4cd92320e6bea53862466f 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewFill_123.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewFill_123.cpp
@@ -14,13 +14,38 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewFill.hpp>
+#include "PerfTest_ViewFill.hpp"
 
 namespace Test {
-TEST(default_exec, ViewFill_Rank123) {
-  printf("ViewFill Performance for LayoutLeft:\n");
-  run_fillview_tests123<Kokkos::LayoutLeft>(10, 1);
-  printf("ViewFill Performance for LayoutRight:\n");
-  run_fillview_tests123<Kokkos::LayoutRight>(10, 1);
-}
+
+BENCHMARK(ViewFill_Rank1<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
+BENCHMARK(ViewFill_Rank1<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
+BENCHMARK(ViewFill_Rank2<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
+BENCHMARK(ViewFill_Rank2<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
+BENCHMARK(ViewFill_Rank3<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
+BENCHMARK(ViewFill_Rank3<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
 }  // namespace Test
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewFill_45.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewFill_45.cpp
index 2bf93b2048bee4b15edbd932fd3945f13bd81f21..6a5acfb0d6e58d161dc21c64a9b02070694bf02b 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewFill_45.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewFill_45.cpp
@@ -14,13 +14,28 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewFill.hpp>
+#include "PerfTest_ViewFill.hpp"
 
 namespace Test {
-TEST(default_exec, ViewFill_Rank45) {
-  printf("ViewFill Performance for LayoutLeft:\n");
-  run_fillview_tests45<Kokkos::LayoutLeft>(10, 1);
-  printf("ViewFill Performance for LayoutRight:\n");
-  run_fillview_tests45<Kokkos::LayoutRight>(10, 1);
-}
+
+BENCHMARK(ViewFill_Rank4<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
+BENCHMARK(ViewFill_Rank4<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
+BENCHMARK(ViewFill_Rank5<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
+BENCHMARK(ViewFill_Rank5<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
 }  // namespace Test
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewFill_6.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewFill_6.cpp
index 588a1e2293cac76cacfde24db1bc0e9f7570af68..dca20c70dfb52547c143b5be5bef1ce87f45696f 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewFill_6.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewFill_6.cpp
@@ -14,13 +14,18 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewFill.hpp>
+#include "PerfTest_ViewFill.hpp"
 
 namespace Test {
-TEST(default_exec, ViewFill_Rank6) {
-  printf("ViewFill Performance for LayoutLeft:\n");
-  run_fillview_tests6<Kokkos::LayoutLeft>(10, 1);
-  printf("ViewFill Performance for LayoutRight:\n");
-  run_fillview_tests6<Kokkos::LayoutRight>(10, 1);
-}
+
+BENCHMARK(ViewFill_Rank6<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
+BENCHMARK(ViewFill_Rank6<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
 }  // namespace Test
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewFill_7.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewFill_7.cpp
index fffeb951c94c345871613272f58614d5113bc67d..6fa8a418c6a31897b8fc5e56f5b8d0d1734b6ad3 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewFill_7.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewFill_7.cpp
@@ -14,13 +14,18 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewFill.hpp>
+#include "PerfTest_ViewFill.hpp"
 
 namespace Test {
-TEST(default_exec, ViewFill_Rank7) {
-  printf("ViewFill Performance for LayoutLeft:\n");
-  run_fillview_tests7<Kokkos::LayoutLeft>(10, 1);
-  printf("ViewFill Performance for LayoutRight:\n");
-  run_fillview_tests7<Kokkos::LayoutRight>(10, 1);
-}
+
+BENCHMARK(ViewFill_Rank7<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
+BENCHMARK(ViewFill_Rank7<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
 }  // namespace Test
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewFill_8.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewFill_8.cpp
index b2188af1a3a54917dee7865592693b38a170de86..954b097d83e9a52624baad5dffba329a7f73cc78 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewFill_8.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewFill_8.cpp
@@ -14,13 +14,18 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewFill.hpp>
+#include "PerfTest_ViewFill.hpp"
 
 namespace Test {
-TEST(default_exec, ViewFill_Rank8) {
-  printf("ViewFill Performance for LayoutLeft:\n");
-  run_fillview_tests8<Kokkos::LayoutLeft>(10, 1);
-  printf("ViewFill Performance for LayoutRight:\n");
-  run_fillview_tests8<Kokkos::LayoutRight>(10, 1);
-}
+
+BENCHMARK(ViewFill_Rank8<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
+BENCHMARK(ViewFill_Rank8<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
+
 }  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestRandomCommon.hpp b/packages/kokkos/core/perf_test/PerfTest_ViewFill_Raw.cpp
similarity index 65%
rename from packages/kokkos/algorithms/unit_tests/TestRandomCommon.hpp
rename to packages/kokkos/core/perf_test/PerfTest_ViewFill_Raw.cpp
index c53d66f4d02cc2b3cbaf2ee3056615022eaa2446..c11074d9154fd71ff9d7bfc5678d6398d6e2aee5 100644
--- a/packages/kokkos/algorithms/unit_tests/TestRandomCommon.hpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewFill_Raw.cpp
@@ -14,19 +14,20 @@
 //
 //@HEADER
 
-#ifndef KOKKOS_ALGORITHMS_UNITTESTS_TESTRANDOM_COMMON_HPP
-#define KOKKOS_ALGORITHMS_UNITTESTS_TESTRANDOM_COMMON_HPP
-
-#include <TestRandom.hpp>
+#include "PerfTest_ViewFill.hpp"
 
 namespace Test {
 
-TEST(TEST_CATEGORY, Random_XorShift64) {
-  test_random_xorshift64<TEST_EXECSPACE>();
-}
-TEST(TEST_CATEGORY, Random_XorShift1024_0) {
-  test_random_xorshift1024<TEST_EXECSPACE>();
-}
-}  // namespace Test
+#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
+BENCHMARK(ViewFill_Raw<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
 
+BENCHMARK(ViewFill_Raw<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime();
 #endif
+
+}  // namespace Test
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewResize.hpp b/packages/kokkos/core/perf_test/PerfTest_ViewResize.hpp
index dfcd3f1347f74dd022af61d952d47a552aa0e8fb..de6981e17afd37975a2a7a51e26652be2b4e1df1 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewResize.hpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewResize.hpp
@@ -15,346 +15,291 @@
 //@HEADER
 
 #include <Kokkos_Core.hpp>
-#include <gtest/gtest.h>
-#include <cstdio>
-#include <PerfTest_Category.hpp>
+#include <benchmark/benchmark.h>
+#include <cmath>
+#include "Benchmark_Context.hpp"
 
 namespace Test {
 
+static constexpr int R = 10;
+static constexpr int N = 10;
+
 template <class Layout>
-void run_resizeview_tests123(int N, int R) {
-  const int N1 = N;
-  const int N2 = N1 * N1;
-  const int N3 = N2 * N1;
-  const int N4 = N2 * N2;
-  const int N8 = N4 * N4;
-
-  double time1, time2, time3, time_raw = 100000.0;
-  double time1_noinit, time2_noinit, time3_noinit;
-  {
-    Kokkos::View<double*, Layout> a("A1", N8);
-    Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::View<double*, Layout> a_(a);
-      Kokkos::resize(a_, int(N8 * 1.1));
-    }
-    time1 = timer.seconds() / R;
-  }
-  {
-    Kokkos::View<double**, Layout> a("A2", N4, N4);
-    Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::View<double**, Layout> a_(a);
-      Kokkos::resize(a_, int(N4 * 1.1), N4);
-    }
-    time2 = timer.seconds() / R;
-  }
-  {
-    Kokkos::View<double***, Layout> a("A3", N3, N3, N2);
-    Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::View<double***, Layout> a_(a);
-      Kokkos::resize(a_, int(N3 * 1.1), N3, N2);
-    }
-    time3 = timer.seconds() / R;
-  }
-  {
-    Kokkos::View<double*, Layout> a("A1", N8);
+static void ViewResize_Rank1(benchmark::State& state) {
+  const int N8 = std::pow(state.range(0), 8);
+  Kokkos::View<double*, Layout> a("A1", N8);
+  Kokkos::View<double*, Layout> a_(a);
+
+  for (auto _ : state) {
+    Kokkos::fence();
     Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::View<double*, Layout> a_(a);
-      Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N8 * 1.1));
-    }
-    time1_noinit = timer.seconds() / R;
+    Kokkos::resize(a_, int(N8 * 1.1));
+    Kokkos::fence();
+    KokkosBenchmark::report_results(state, a, 2, timer.seconds());
   }
-  {
-    Kokkos::View<double**, Layout> a("A2", N4, N4);
+}
+
+template <class Layout>
+static void ViewResize_Rank2(benchmark::State& state) {
+  const int N4 = std::pow(state.range(0), 4);
+  Kokkos::View<double**, Layout> a("A2", N4, N4);
+  Kokkos::View<double**, Layout> a_(a);
+
+  for (auto _ : state) {
+    Kokkos::fence();
     Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::View<double**, Layout> a_(a);
-      Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N4 * 1.1), N4);
-    }
-    time2_noinit = timer.seconds() / R;
+    Kokkos::resize(a_, int(N4 * 1.1), N4);
+    Kokkos::fence();
+    KokkosBenchmark::report_results(state, a, 2, timer.seconds());
   }
-  {
-    Kokkos::View<double***, Layout> a("A3", N3, N3, N2);
+}
+
+template <class Layout>
+static void ViewResize_Rank3(benchmark::State& state) {
+  const int N2 = std::pow(state.range(0), 2);
+  const int N3 = std::pow(state.range(0), 3);
+  Kokkos::View<double***, Layout> a("A3", N3, N3, N2);
+  Kokkos::View<double***, Layout> a_(a);
+
+  for (auto _ : state) {
+    Kokkos::fence();
     Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::View<double***, Layout> a_(a);
-      Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N3 * 1.1), N3, N2);
-    }
-    time3_noinit = timer.seconds() / R;
+    Kokkos::resize(a_, int(N3 * 1.1), N3, N2);
+    Kokkos::fence();
+    KokkosBenchmark::report_results(state, a, 2, timer.seconds());
   }
-#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
-  {
-    Kokkos::View<double*, Layout> a("A1", N8);
-    double* a_ptr = a.data();
+}
+
+template <class Layout>
+static void ViewResize_Rank4(benchmark::State& state) {
+  const int N2 = std::pow(state.range(0), 2);
+  Kokkos::View<double****, Layout> a("A4", N2, N2, N2, N2);
+  Kokkos::View<double****, Layout> a_(a);
+
+  for (auto _ : state) {
+    Kokkos::fence();
     Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::View<double*, Layout> a1(
-          Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1));
-      double* a1_ptr = a1.data();
-      Kokkos::parallel_for(
-          N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; });
-      Kokkos::fence();
-    }
-    Kokkos::fence();
-    time_raw = timer.seconds() / R;
+    Kokkos::resize(a_, int(N2 * 1.1), N2, N2, N2);
+    Kokkos::fence();
+    KokkosBenchmark::report_results(state, a, 2, timer.seconds());
   }
-#endif
-  double size = 1.0 * N8 * 8 / 1024 / 1024;
-  printf("   Raw:   %lf s   %lf MB   %lf GB/s\n", time_raw, size,
-         2.0 * size / 1024 / time_raw);
-  printf("   Rank1: %lf s   %lf MB   %lf GB/s\n", time1, size,
-         2.0 * size / 1024 / time1);
-  printf("   Rank2: %lf s   %lf MB   %lf GB/s\n", time2, size,
-         2.0 * size / 1024 / time2);
-  printf("   Rank3: %lf s   %lf MB   %lf GB/s\n", time3, size,
-         2.0 * size / 1024 / time3);
-  printf("   Rank1 (WithoutInitializing): %lf s   %lf MB   %lf GB/s\n",
-         time1_noinit, size, 2.0 * size / 1024 / time1_noinit);
-  printf("   Rank2 (WithoutInitializing): %lf s   %lf MB   %lf GB/s\n",
-         time2_noinit, size, 2.0 * size / 1024 / time2_noinit);
-  printf("   Rank3 (WithoutInitializing): %lf s   %lf MB   %lf GB/s\n",
-         time3_noinit, size, 2.0 * size / 1024 / time3_noinit);
 }
 
 template <class Layout>
-void run_resizeview_tests45(int N, int R) {
-  const int N1 = N;
+static void ViewResize_Rank5(benchmark::State& state) {
+  const int N1 = state.range(0);
   const int N2 = N1 * N1;
-  const int N4 = N2 * N2;
-  const int N8 = N4 * N4;
 
-  double time4, time5, time_raw = 100000.0;
-  double time4_noinit, time5_noinit;
-  {
-    Kokkos::View<double****, Layout> a("A4", N2, N2, N2, N2);
+  Kokkos::View<double*****, Layout> a("A5", N2, N2, N1, N1, N2);
+  Kokkos::View<double*****, Layout> a_(a);
+
+  for (auto _ : state) {
+    Kokkos::fence();
     Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::View<double****, Layout> a_(a);
-      Kokkos::resize(a_, int(N2 * 1.1), N2, N2, N2);
-    }
-    time4 = timer.seconds() / R;
+    Kokkos::resize(a_, int(N2 * 1.1), N2, N1, N1, N2);
+    Kokkos::fence();
+    KokkosBenchmark::report_results(state, a, 2, timer.seconds());
   }
-  {
-    Kokkos::View<double*****, Layout> a("A5", N2, N2, N1, N1, N2);
+}
+
+template <class Layout>
+static void ViewResize_Rank6(benchmark::State& state) {
+  const int N1 = state.range(0);
+  const int N2 = N1 * N1;
+
+  Kokkos::View<double******, Layout> a("A6", N2, N1, N1, N1, N1, N2);
+  Kokkos::View<double******, Layout> a_(a);
+
+  for (auto _ : state) {
+    Kokkos::fence();
     Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::View<double*****, Layout> a_(a);
-      Kokkos::resize(a_, int(N2 * 1.1), N2, N1, N1, N2);
-    }
-    time5 = timer.seconds() / R;
+    Kokkos::resize(a_, int(N2 * 1.1), N1, N1, N1, N1, N2);
+    Kokkos::fence();
+    KokkosBenchmark::report_results(state, a, 2, timer.seconds());
   }
-  {
-    Kokkos::View<double****, Layout> a("A4", N2, N2, N2, N2);
+}
+
+template <class Layout>
+static void ViewResize_Rank7(benchmark::State& state) {
+  const int N1 = state.range(0);
+  const int N2 = N1 * N1;
+
+  Kokkos::View<double*******, Layout> a("A7", N2, N1, N1, N1, N1, N1, N1);
+  Kokkos::View<double*******, Layout> a_(a);
+
+  for (auto _ : state) {
+    Kokkos::fence();
     Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::View<double****, Layout> a_(a);
-      Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N2, N2,
-                     N2);
-    }
-    time4_noinit = timer.seconds() / R;
+    Kokkos::resize(a_, int(N2 * 1.1), N1, N1, N1, N1, N1, N1);
+    Kokkos::fence();
+    KokkosBenchmark::report_results(state, a, 2, timer.seconds());
   }
-  {
-    Kokkos::View<double*****, Layout> a("A5", N2, N2, N1, N1, N2);
+}
+
+template <class Layout>
+static void ViewResize_Rank8(benchmark::State& state) {
+  const int N1 = state.range(0);
+
+  Kokkos::View<double********, Layout> a("A8", N1, N1, N1, N1, N1, N1, N1, N1);
+  Kokkos::View<double********, Layout> a_(a);
+
+  for (auto _ : state) {
+    Kokkos::fence();
     Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::View<double*****, Layout> a_(a);
-      Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N2, N1, N1,
-                     N2);
-    }
-    time5_noinit = timer.seconds() / R;
+    Kokkos::resize(a_, int(N1 * 1.1), N1, N1, N1, N1, N1, N1, N1);
+    Kokkos::fence();
+    KokkosBenchmark::report_results(state, a, 2, timer.seconds());
   }
-#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
-  {
-    Kokkos::View<double*, Layout> a("A1", N8);
-    double* a_ptr = a.data();
+}
+
+template <class Layout>
+static void ViewResize_NoInit_Rank1(benchmark::State& state) {
+  const int N8 = std::pow(state.range(0), 8);
+  Kokkos::View<double*, Layout> a("A1", N8);
+  Kokkos::View<double*, Layout> a_(a);
+
+  for (auto _ : state) {
+    Kokkos::fence();
     Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::View<double*, Layout> a1(
-          Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1));
-      double* a1_ptr = a1.data();
-      Kokkos::parallel_for(
-          N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; });
-      Kokkos::fence();
-    }
-    Kokkos::fence();
-    time_raw = timer.seconds() / R;
+    Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N8 * 1.1));
+    Kokkos::fence();
+    KokkosBenchmark::report_results(state, a, 2, timer.seconds());
   }
-#endif
-  double size = 1.0 * N8 * 8 / 1024 / 1024;
-  printf("   Raw:   %lf s   %lf MB   %lf GB/s\n", time_raw, size,
-         2.0 * size / 1024 / time_raw);
-  printf("   Rank4: %lf s   %lf MB   %lf GB/s\n", time4, size,
-         2.0 * size / 1024 / time4);
-  printf("   Rank5: %lf s   %lf MB   %lf GB/s\n", time5, size,
-         2.0 * size / 1024 / time5);
-  printf("   Rank4 (WithoutInitializing): %lf s   %lf MB   %lf GB/s\n",
-         time4_noinit, size, 2.0 * size / 1024 / time4_noinit);
-  printf("   Rank5 (WithoutInitializing): %lf s   %lf MB   %lf GB/s\n",
-         time5_noinit, size, 2.0 * size / 1024 / time5_noinit);
 }
 
 template <class Layout>
-void run_resizeview_tests6(int N, int R) {
-  const int N1 = N;
-  const int N2 = N1 * N1;
-  const int N4 = N2 * N2;
-  const int N8 = N4 * N4;
+static void ViewResize_NoInit_Rank2(benchmark::State& state) {
+  const int N4 = std::pow(state.range(0), 4);
+  Kokkos::View<double**, Layout> a("A2", N4, N4);
+  Kokkos::View<double**, Layout> a_(a);
 
-  double time6, time6_noinit, time_raw = 100000.0;
-  {
-    Kokkos::View<double******, Layout> a("A6", N2, N1, N1, N1, N1, N2);
+  for (auto _ : state) {
+    Kokkos::fence();
     Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::View<double******, Layout> a_(a);
-      Kokkos::resize(a_, int(N2 * 1.1), N1, N1, N1, N1, N2);
-    }
-    time6 = timer.seconds() / R;
+    Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N4 * 1.1), N4);
+    Kokkos::fence();
+    KokkosBenchmark::report_results(state, a, 2, timer.seconds());
   }
-  {
-    Kokkos::View<double******, Layout> a("A6", N2, N1, N1, N1, N1, N2);
+}
+
+template <class Layout>
+static void ViewResize_NoInit_Rank3(benchmark::State& state) {
+  const int N2 = std::pow(state.range(0), 2);
+  const int N3 = std::pow(state.range(0), 3);
+  Kokkos::View<double***, Layout> a("A3", N3, N3, N2);
+  Kokkos::View<double***, Layout> a_(a);
+
+  for (auto _ : state) {
+    Kokkos::fence();
     Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::View<double******, Layout> a_(a);
-      Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N1, N1, N1,
-                     N1, N2);
-    }
-    time6_noinit = timer.seconds() / R;
+    Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N3 * 1.1), N3, N2);
+    Kokkos::fence();
+    KokkosBenchmark::report_results(state, a, 2, timer.seconds());
   }
-#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
-  {
-    Kokkos::View<double*, Layout> a("A1", N8);
-    double* a_ptr = a.data();
+}
+
+template <class Layout>
+static void ViewResize_NoInit_Rank4(benchmark::State& state) {
+  const int N2 = std::pow(state.range(0), 2);
+  Kokkos::View<double****, Layout> a("A4", N2, N2, N2, N2);
+  Kokkos::View<double****, Layout> a_(a);
+
+  for (auto _ : state) {
+    Kokkos::fence();
     Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::View<double*, Layout> a1(
-          Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1));
-      double* a1_ptr = a1.data();
-      Kokkos::parallel_for(
-          N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; });
-      Kokkos::fence();
-    }
-    Kokkos::fence();
-    time_raw = timer.seconds() / R;
+    Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N2, N2, N2);
+    Kokkos::fence();
+    KokkosBenchmark::report_results(state, a, 2, timer.seconds());
   }
-#endif
-  double size = 1.0 * N8 * 8 / 1024 / 1024;
-  printf("   Raw:   %lf s   %lf MB   %lf GB/s\n", time_raw, size,
-         2.0 * size / 1024 / time_raw);
-  printf("   Rank6: %lf s   %lf MB   %lf GB/s\n", time6, size,
-         2.0 * size / 1024 / time6);
-  printf("   Rank6 (WithoutInitializing): %lf s   %lf MB   %lf GB/s\n",
-         time6_noinit, size, 2.0 * size / 1024 / time6_noinit);
 }
 
 template <class Layout>
-void run_resizeview_tests7(int N, int R) {
-  const int N1 = N;
+static void ViewResize_NoInit_Rank5(benchmark::State& state) {
+  const int N1 = state.range(0);
   const int N2 = N1 * N1;
-  const int N4 = N2 * N2;
-  const int N8 = N4 * N4;
 
-  double time7, time7_noinit, time_raw = 100000.0;
-  {
-    Kokkos::View<double*******, Layout> a("A7", N2, N1, N1, N1, N1, N1, N1);
-    Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::View<double*******, Layout> a_(a);
-      Kokkos::resize(a_, int(N2 * 1.1), N1, N1, N1, N1, N1, N1);
-    }
-    time7 = timer.seconds() / R;
-  }
-  {
-    Kokkos::View<double*******, Layout> a("A7", N2, N1, N1, N1, N1, N1, N1);
+  Kokkos::View<double*****, Layout> a("A5", N2, N2, N1, N1, N2);
+  Kokkos::View<double*****, Layout> a_(a);
+
+  for (auto _ : state) {
+    Kokkos::fence();
     Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::View<double*******, Layout> a_(a);
-      Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N1, N1, N1,
-                     N1, N1, N1);
-    }
-    time7_noinit = timer.seconds() / R;
+    Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N2, N1, N1,
+                   N2);
+    Kokkos::fence();
+    KokkosBenchmark::report_results(state, a, 2, timer.seconds());
   }
-#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
-  {
-    Kokkos::View<double*, Layout> a("A1", N8);
-    double* a_ptr = a.data();
+}
+
+template <class Layout>
+static void ViewResize_NoInit_Rank6(benchmark::State& state) {
+  const int N1 = state.range(0);
+  const int N2 = N1 * N1;
+
+  Kokkos::View<double******, Layout> a("A6", N2, N1, N1, N1, N1, N2);
+  Kokkos::View<double******, Layout> a_(a);
+
+  for (auto _ : state) {
+    Kokkos::fence();
     Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::View<double*, Layout> a1(
-          Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1));
-      double* a1_ptr = a1.data();
-      Kokkos::parallel_for(
-          N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; });
-      Kokkos::fence();
-    }
-    Kokkos::fence();
-    time_raw = timer.seconds() / R;
+    Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N1, N1, N1,
+                   N1, N2);
+    Kokkos::fence();
+    KokkosBenchmark::report_results(state, a, 2, timer.seconds());
   }
-#endif
-  double size = 1.0 * N8 * 8 / 1024 / 1024;
-  printf("   Raw:   %lf s   %lf MB   %lf GB/s\n", time_raw, size,
-         2.0 * size / 1024 / time_raw);
-  printf("   Rank7: %lf s   %lf MB   %lf GB/s\n", time7, size,
-         2.0 * size / 1024 / time7);
-  printf("   Rank7 (WithoutInitializing): %lf s   %lf MB   %lf GB/s\n",
-         time7_noinit, size, 2.0 * size / 1024 / time7_noinit);
 }
 
 template <class Layout>
-void run_resizeview_tests8(int N, int R) {
-  const int N1 = N;
+static void ViewResize_NoInit_Rank7(benchmark::State& state) {
+  const int N1 = state.range(0);
   const int N2 = N1 * N1;
-  const int N4 = N2 * N2;
-  const int N8 = N4 * N4;
 
-  double time8, time8_noinit, time_raw = 100000.0;
-  {
-    Kokkos::View<double********, Layout> a("A8", N1, N1, N1, N1, N1, N1, N1,
-                                           N1);
+  Kokkos::View<double*******, Layout> a("A7", N2, N1, N1, N1, N1, N1, N1);
+  Kokkos::View<double*******, Layout> a_(a);
+
+  for (auto _ : state) {
+    Kokkos::fence();
     Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::View<double********, Layout> a_(a);
-      Kokkos::resize(a_, int(N1 * 1.1), N1, N1, N1, N1, N1, N1, N1);
-    }
-    time8 = timer.seconds() / R;
+    Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N1, N1, N1,
+                   N1, N1, N1);
+    Kokkos::fence();
+    KokkosBenchmark::report_results(state, a, 2, timer.seconds());
   }
-  {
-    Kokkos::View<double********, Layout> a("A8", N1, N1, N1, N1, N1, N1, N1,
-                                           N1);
+}
+
+template <class Layout>
+static void ViewResize_NoInit_Rank8(benchmark::State& state) {
+  const int N1 = state.range(0);
+
+  Kokkos::View<double********, Layout> a("A8", N1, N1, N1, N1, N1, N1, N1, N1);
+  Kokkos::View<double********, Layout> a_(a);
+
+  for (auto _ : state) {
+    Kokkos::fence();
     Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::View<double********, Layout> a_(a);
-      Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N1 * 1.1), N1, N1, N1,
-                     N1, N1, N1, N1);
-    }
-    time8_noinit = timer.seconds() / R;
+    Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N1 * 1.1), N1, N1, N1,
+                   N1, N1, N1, N1);
+    Kokkos::fence();
+    KokkosBenchmark::report_results(state, a, 2, timer.seconds());
   }
-#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
-  {
-    Kokkos::View<double*, Layout> a("A1", N8);
-    double* a_ptr = a.data();
+}
+
+template <class Layout>
+static void ViewResize_NoInit_Raw(benchmark::State& state) {
+  const int N8 = std::pow(state.range(0), 8);
+  Kokkos::View<double*, Layout> a("A1", N8);
+  double* a_ptr = a.data();
+
+  for (auto _ : state) {
     Kokkos::Timer timer;
-    for (int r = 0; r < R; r++) {
-      Kokkos::View<double*, Layout> a1(
-          Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1));
-      double* a1_ptr = a1.data();
-      Kokkos::parallel_for(
-          N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; });
-      Kokkos::fence();
-    }
-    Kokkos::fence();
-    time_raw = timer.seconds() / R;
+    Kokkos::View<double*, Layout> a1(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1));
+    double* a1_ptr = a1.data();
+    Kokkos::parallel_for(
+        N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; });
+    Kokkos::fence();
+    KokkosBenchmark::report_results(state, a, 2, timer.seconds());
   }
-#endif
-  double size = 1.0 * N8 * 8 / 1024 / 1024;
-  printf("   Raw:   %lf s   %lf MB   %lf GB/s\n", time_raw, size,
-         2.0 * size / 1024 / time_raw);
-  printf("   Rank8: %lf s   %lf MB   %lf GB/s\n", time8, size,
-         2.0 * size / 1024 / time8);
-  printf("   Rank8 (WithoutInitializing): %lf s   %lf MB   %lf GB/s\n",
-         time8_noinit, size, 2.0 * size / 1024 / time8_noinit);
 }
 
 }  // namespace Test
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewResize_123.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewResize_123.cpp
index ed2e58192c9bc498dbd988c5a4bba3c82d5cc2ba..0b3141eead0059d7bacf13c6af9791df5b703d93 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewResize_123.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewResize_123.cpp
@@ -14,15 +14,80 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewResize.hpp>
+#include "PerfTest_ViewResize.hpp"
 
 namespace Test {
 
-TEST(default_exec, ViewResize_Rank123) {
-  printf("Resize View Performance for LayoutLeft:\n");
-  run_resizeview_tests123<Kokkos::LayoutLeft>(10, 1);
-  printf("Resize View Performance for LayoutRight:\n");
-  run_resizeview_tests123<Kokkos::LayoutRight>(10, 1);
-}
+BENCHMARK(ViewResize_Rank1<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime()
+    ->Iterations(R);
+
+BENCHMARK(ViewResize_Rank1<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime()
+    ->Iterations(R);
+
+BENCHMARK(ViewResize_Rank2<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime()
+    ->Iterations(R);
+
+BENCHMARK(ViewResize_Rank2<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime()
+    ->Iterations(R);
+
+BENCHMARK(ViewResize_Rank3<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime()
+    ->Iterations(R);
+
+BENCHMARK(ViewResize_Rank3<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime()
+    ->Iterations(R);
+
+BENCHMARK(ViewResize_NoInit_Rank1<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime()
+    ->Iterations(R);
+
+BENCHMARK(ViewResize_NoInit_Rank1<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime()
+    ->Iterations(R);
+
+BENCHMARK(ViewResize_NoInit_Rank2<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime()
+    ->Iterations(R);
+
+BENCHMARK(ViewResize_NoInit_Rank2<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime()
+    ->Iterations(R);
+
+BENCHMARK(ViewResize_NoInit_Rank3<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime()
+    ->Iterations(R);
+
+BENCHMARK(ViewResize_NoInit_Rank3<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime()
+    ->Iterations(R);
 
 }  // namespace Test
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewResize_45.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewResize_45.cpp
index 69028fab08c593962e251b01196ed1bf8efc5416..f5eec387cbdf717f2cdefe40219300a30a83c907 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewResize_45.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewResize_45.cpp
@@ -14,15 +14,56 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewResize.hpp>
+#include "PerfTest_ViewResize.hpp"
 
 namespace Test {
 
-TEST(default_exec, ViewResize_Rank_45) {
-  printf("Resize View Performance for LayoutLeft:\n");
-  run_resizeview_tests45<Kokkos::LayoutLeft>(10, 1);
-  printf("Resize View Performance for LayoutRight:\n");
-  run_resizeview_tests45<Kokkos::LayoutRight>(10, 1);
-}
+BENCHMARK(ViewResize_Rank4<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime()
+    ->Iterations(R);
+
+BENCHMARK(ViewResize_Rank4<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime()
+    ->Iterations(R);
+
+BENCHMARK(ViewResize_Rank5<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime()
+    ->Iterations(R);
+
+BENCHMARK(ViewResize_Rank5<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime()
+    ->Iterations(R);
+
+BENCHMARK(ViewResize_NoInit_Rank4<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime()
+    ->Iterations(R);
+
+BENCHMARK(ViewResize_NoInit_Rank4<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime()
+    ->Iterations(R);
+
+BENCHMARK(ViewResize_NoInit_Rank5<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime()
+    ->Iterations(R);
+
+BENCHMARK(ViewResize_NoInit_Rank5<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime()
+    ->Iterations(R);
 
 }  // namespace Test
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewResize_6.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewResize_6.cpp
index 486b44a0c1b1b2fe0a8903aa7d0ac64ebf174839..6b639d3a672279767f9bb949abf27f8be80b7bf4 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewResize_6.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewResize_6.cpp
@@ -14,15 +14,32 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewResize.hpp>
+#include "PerfTest_ViewResize.hpp"
 
 namespace Test {
 
-TEST(default_exec, ViewResize_Rank6) {
-  printf("Resize View Performance for LayoutLeft:\n");
-  run_resizeview_tests6<Kokkos::LayoutLeft>(10, 1);
-  printf("Resize View Performance for LayoutRight:\n");
-  run_resizeview_tests6<Kokkos::LayoutRight>(10, 1);
-}
+BENCHMARK(ViewResize_Rank6<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime()
+    ->Iterations(R);
+
+BENCHMARK(ViewResize_Rank6<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime()
+    ->Iterations(R);
+
+BENCHMARK(ViewResize_NoInit_Rank6<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime()
+    ->Iterations(R);
+
+BENCHMARK(ViewResize_NoInit_Rank6<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime()
+    ->Iterations(R);
 
 }  // namespace Test
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewResize_7.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewResize_7.cpp
index 84c2a79ad68b993420e7a94207ca2cd95b573814..8ebf80e3ffea1071681b6edc39b138ed1546091c 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewResize_7.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewResize_7.cpp
@@ -14,15 +14,32 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewResize.hpp>
+#include "PerfTest_ViewResize.hpp"
 
 namespace Test {
 
-TEST(default_exec, ViewResize_Rank7) {
-  printf("Resize View Performance for LayoutLeft:\n");
-  run_resizeview_tests7<Kokkos::LayoutLeft>(10, 1);
-  printf("Resize View Performance for LayoutRight:\n");
-  run_resizeview_tests7<Kokkos::LayoutRight>(10, 1);
-}
+BENCHMARK(ViewResize_Rank7<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime()
+    ->Iterations(R);
+
+BENCHMARK(ViewResize_Rank7<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime()
+    ->Iterations(R);
+
+BENCHMARK(ViewResize_NoInit_Rank7<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime()
+    ->Iterations(R);
+
+BENCHMARK(ViewResize_NoInit_Rank7<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime()
+    ->Iterations(R);
 
 }  // namespace Test
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewResize_8.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewResize_8.cpp
index 25910fb5754b7e9835cebcacadf2186848c2045b..5e741e800b1e339a3da4f4c3b2a31a929b2ca095 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewResize_8.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewResize_8.cpp
@@ -14,23 +14,39 @@
 //
 //@HEADER
 
-#include <PerfTest_ViewResize.hpp>
+#include "PerfTest_ViewResize.hpp"
 
 namespace Test {
 
-TEST(default_exec, ViewResize_Rank8) {
 // FIXME_SYCL Avoid running out of resources on the CUDA GPU used in the CI
 #ifdef KOKKOS_ENABLE_SYCL
-  printf("Resize View Performance for LayoutLeft:\n");
-  run_resizeview_tests8<Kokkos::LayoutLeft>(9, 1);
-  printf("Resize View Performance for LayoutRight:\n");
-  run_resizeview_tests8<Kokkos::LayoutRight>(9, 1);
+static constexpr int N_8 = N - 1;
 #else
-  printf("Resize View Performance for LayoutLeft:\n");
-  run_resizeview_tests8<Kokkos::LayoutLeft>(10, 1);
-  printf("Resize View Performance for LayoutRight:\n");
-  run_resizeview_tests8<Kokkos::LayoutRight>(10, 1);
+static constexpr int N_8 = N;
 #endif
-}
+
+BENCHMARK(ViewResize_Rank8<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N_8)
+    ->UseManualTime()
+    ->Iterations(R);
+
+BENCHMARK(ViewResize_Rank8<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N_8)
+    ->UseManualTime()
+    ->Iterations(R);
+
+BENCHMARK(ViewResize_NoInit_Rank8<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N_8)
+    ->UseManualTime()
+    ->Iterations(R);
+
+BENCHMARK(ViewResize_NoInit_Rank8<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N_8)
+    ->UseManualTime()
+    ->Iterations(R);
 
 }  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestOpenMP_Sort3D.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewResize_Raw.cpp
similarity index 59%
rename from packages/kokkos/algorithms/unit_tests/TestOpenMP_Sort3D.cpp
rename to packages/kokkos/core/perf_test/PerfTest_ViewResize_Raw.cpp
index cd6e8e8cbf337eac3158eaa14f80dc5cd5e08a8a..2d1bcbb3cab5edcda25fffe46b19d69f4c1f7d73 100644
--- a/packages/kokkos/algorithms/unit_tests/TestOpenMP_Sort3D.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewResize_Raw.cpp
@@ -14,24 +14,22 @@
 //
 //@HEADER
 
-#include <Kokkos_Macros.hpp>
-#ifdef KOKKOS_ENABLE_OPENMP
-
-#include <gtest/gtest.h>
-#include <Kokkos_Core.hpp>
-
-//----------------------------------------------------------------------------
-#include <TestRandom.hpp>
-#include <TestSort.hpp>
-#include <iomanip>
+#include "PerfTest_ViewResize.hpp"
 
 namespace Test {
 
-TEST(openmp, SortUnsigned3D) {
-  Impl::test_3D_sort<Kokkos::OpenMP, unsigned>(171);
-}
+#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
+BENCHMARK(ViewResize_NoInit_Raw<Kokkos::LayoutLeft>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime()
+    ->Iterations(R);
 
-}  // namespace Test
-#else
-void KOKKOS_ALGORITHMS_UNITTESTS_TESTOPENMP_PREVENT_LINK_ERROR() {}
+BENCHMARK(ViewResize_NoInit_Raw<Kokkos::LayoutRight>)
+    ->ArgName("N")
+    ->Arg(N)
+    ->UseManualTime()
+    ->Iterations(R);
 #endif
+
+}  // namespace Test
diff --git a/packages/kokkos/core/perf_test/test_atomic.cpp b/packages/kokkos/core/perf_test/test_atomic.cpp
index 5f10afc45a18200caca69274bb968719929dd688..ce3059f47d32d9816c5303fcaa61accb52013b97 100644
--- a/packages/kokkos/core/perf_test/test_atomic.cpp
+++ b/packages/kokkos/core/perf_test/test_atomic.cpp
@@ -18,38 +18,14 @@
 #include <cstring>
 #include <cstdlib>
 
+#include <benchmark/benchmark.h>
+#include "Benchmark_Context.hpp"
+
 #include <Kokkos_Core.hpp>
 #include <Kokkos_Timer.hpp>
 
 using exec_space = Kokkos::DefaultExecutionSpace;
 
-#define RESET 0
-#define BRIGHT 1
-#define DIM 2
-#define UNDERLINE 3
-#define BLINK 4
-#define REVERSE 7
-#define HIDDEN 8
-
-#define BLACK 0
-#define RED 1
-#define GREEN 2
-#define YELLOW 3
-#define BLUE 4
-#define MAGENTA 5
-#define CYAN 6
-#define GREY 7
-#define WHITE 8
-
-void textcolor(int attr, int fg, int bg) {
-  char command[40];
-
-  /* Command is the control command to the terminal */
-  snprintf(command, 40, "%c[%d;%d;%dm", 0x1B, attr, fg + 30, bg + 40);
-  printf("%s", command);
-}
-void textcolor_standard() { textcolor(RESET, BLACK, WHITE); }
-
 template <class T, class DEVICE_TYPE>
 struct ZeroFunctor {
   using execution_space = DEVICE_TYPE;
@@ -370,7 +346,9 @@ T LoopVariantNonAtomic(int loop, int test) {
 }
 
 template <class T>
-void Loop(int loop, int test, const char* type_name) {
+void Loop(benchmark::State& state, int test) {
+  int loop = state.range(0);
+
   LoopVariant<T>(loop, test);
 
   Kokkos::Timer timer;
@@ -388,86 +366,36 @@ void Loop(int loop, int test, const char* type_name) {
   time *= 1e6 / loop;
   timeNonAtomic *= 1e6 / loop;
   timeSerial *= 1e6 / loop;
-  // textcolor_standard();
-  bool passed = true;
-  if (resSerial != res) passed = false;
-  // if(!passed) textcolor(RESET,BLACK,YELLOW);
-  printf(
-      "%s Test %i %s  --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e "
-      "%7.4e Size of Type %i)",
-      type_name, test, passed ? "PASSED" : "FAILED", loop, 1.0 * resSerial,
-      1.0 * res, 1.0 * resNonAtomic, timeSerial, time, timeNonAtomic,
-      (int)sizeof(T));
-  // if(!passed) textcolor_standard();
-  printf("\n");
-}
 
-template <class T>
-void Test(int loop, int test, const char* type_name) {
-  if (test == -1) {
-    Loop<T>(loop, 1, type_name);
-    Loop<T>(loop, 2, type_name);
-    Loop<T>(loop, 3, type_name);
-
-  } else
-    Loop<T>(loop, test, type_name);
-}
+  bool passed = (resSerial == res);
 
-int main(int argc, char* argv[]) {
-  int type = -1;
-  int loop = 100000;
-  int test = -1;
-
-  for (int i = 0; i < argc; i++) {
-    if ((strcmp(argv[i], "--test") == 0)) {
-      test = std::stoi(argv[++i]);
-      continue;
-    }
-    if ((strcmp(argv[i], "--type") == 0)) {
-      type = std::stoi(argv[++i]);
-      continue;
-    }
-    if ((strcmp(argv[i], "-l") == 0) || (strcmp(argv[i], "--loop") == 0)) {
-      loop = std::stoi(argv[++i]);
-      continue;
-    }
-  }
+  state.counters["Passed"]           = benchmark::Counter(passed);
+  state.counters["Value serial"]     = benchmark::Counter(resSerial);
+  state.counters["Value atomic"]     = benchmark::Counter(res);
+  state.counters["Value non-atomic"] = benchmark::Counter(resNonAtomic);
+  state.counters["Time serial"]      = benchmark::Counter(timeSerial);
+  state.counters["Time atomic"]      = benchmark::Counter(time);
+  state.counters["Time non-atomic"]  = benchmark::Counter(timeNonAtomic);
+  state.counters["Size of type"]     = benchmark::Counter(sizeof(T));
+}
 
-  Kokkos::initialize(argc, argv);
-
-  printf("Using %s\n", Kokkos::atomic_query_version());
-  bool all_tests = false;
-  if (type == -1) all_tests = true;
-  while (type < 100) {
-    if (type == 1) {
-      Test<int>(loop, test, "int                    ");
-    }
-    if (type == 2) {
-      Test<long int>(loop, test, "long int               ");
-    }
-    if (type == 3) {
-      Test<long long int>(loop, test, "long long int          ");
-    }
-    if (type == 4) {
-      Test<unsigned int>(loop, test, "unsigned int           ");
-    }
-    if (type == 5) {
-      Test<unsigned long int>(loop, test, "unsigned long int      ");
-    }
-    if (type == 6) {
-      Test<unsigned long long int>(loop, test, "unsigned long long int ");
-    }
-    if (type == 10) {
-      // Test<float>(loop,test,"float                  ");
-    }
-    if (type == 11) {
-      Test<double>(loop, test, "double                 ");
-    }
-    if (!all_tests)
-      type = 100;
-    else
-      type++;
+template <class T>
+static void Test_Atomic(benchmark::State& state) {
+  for (auto _ : state) {
+    Loop<T>(state, 1);
+    Loop<T>(state, 2);
+    Loop<T>(state, 3);
   }
-
-  Kokkos::finalize();
 }
+
+static constexpr int LOOP = 100'000;
+
+BENCHMARK(Test_Atomic<int>)->Arg(LOOP)->Iterations(10);
+BENCHMARK(Test_Atomic<long int>)->Arg(LOOP)->Iterations(10);
+BENCHMARK(Test_Atomic<long long int>)->Arg(LOOP)->Iterations(10);
+BENCHMARK(Test_Atomic<unsigned int>)->Arg(LOOP)->Iterations(10);
+BENCHMARK(Test_Atomic<unsigned long int>)->Arg(LOOP)->Iterations(10);
+BENCHMARK(Test_Atomic<unsigned long long int>)->Arg(LOOP)->Iterations(10);
+BENCHMARK(Test_Atomic<float>)->Arg(LOOP)->Iterations(10);
+BENCHMARK(Test_Atomic<double>)->Arg(LOOP)->Iterations(10);
+BENCHMARK(Test_Atomic<int>)->Arg(LOOP)->Iterations(10);
diff --git a/packages/kokkos/core/perf_test/test_atomic_minmax_simple.cpp b/packages/kokkos/core/perf_test/test_atomic_minmax_simple.cpp
index 4c2ae5c2d18144f91c420e2d8def00b2da710b2f..b838c8eccf02e7177c07886be5b775130b405253 100644
--- a/packages/kokkos/core/perf_test/test_atomic_minmax_simple.cpp
+++ b/packages/kokkos/core/perf_test/test_atomic_minmax_simple.cpp
@@ -21,240 +21,536 @@
 // core/src/libkokkoscore.a -ldl && OMP_NUM_THREADS=1
 // ./test_atomic_minmax_simple.x 10000000
 
-#include <cstdio>
-#include <cstdlib>
-
-#include <iostream>
-#include <typeinfo>
+#include <benchmark/benchmark.h>
 
+#include "Benchmark_Context.hpp"
+#include "PerfTest_Category.hpp"
 #include <Kokkos_Core.hpp>
-#include <Kokkos_Timer.hpp>
 
 using exec_space = Kokkos::DefaultExecutionSpace;
 
+constexpr int LENGTH = 1'000'000;
+
+template <typename T>
+Kokkos::View<T*, exec_space> prepare_input(const int length, const T value) {
+  Kokkos::View<T*, exec_space> input("input", length);
+  Kokkos::parallel_for(
+      length, KOKKOS_LAMBDA(const int i) { input(i) = value; });
+  Kokkos::fence();
+  return input;
+}
+
+int get_length(benchmark::State& state) {
+  return (Test::command_line_num_args() == 2)
+             ? std::stoi(Test::command_line_arg(1))
+             : state.range(0);
+}
+
 template <typename T>
-void test(const int length) {
+int check_errors_replacement(Kokkos::View<T*, exec_space> view) {
+  int errors = 0;
+  Kokkos::parallel_reduce(
+      view.size(),
+      KOKKOS_LAMBDA(const int i, int& inner) { inner += (view(i) != (T)i); },
+      errors);
+  Kokkos::fence();
+  return errors;
+}
+
+template <typename T>
+double atomic_min_replacement(Kokkos::View<T*, exec_space> input) {
+  const int length = input.size();
   Kokkos::Timer timer;
+  Kokkos::parallel_for(
+      length, KOKKOS_LAMBDA(const int i) {
+        (void)Kokkos::atomic_fetch_min(&(input(i)), (T)i);
+      });
+  Kokkos::fence();
+  return timer.seconds();
+}
 
-  using vector = Kokkos::View<T*, exec_space>;
-
-  vector inp("input", length);
-  T max = std::numeric_limits<T>::max();
-  T min = std::numeric_limits<T>::lowest();
-
-  // input is max values - all min atomics will replace
-  {
-    Kokkos::parallel_for(
-        length, KOKKOS_LAMBDA(const int i) { inp(i) = max; });
-    Kokkos::fence();
-
-    timer.reset();
-    Kokkos::parallel_for(
-        length, KOKKOS_LAMBDA(const int i) {
-          (void)Kokkos::atomic_fetch_min(&(inp(i)), (T)i);
-        });
-    Kokkos::fence();
-    double time = timer.seconds();
-
-    int errors(0);
-    Kokkos::parallel_reduce(
-        length,
-        KOKKOS_LAMBDA(const int i, int& inner) { inner += (inp(i) != (T)i); },
-        errors);
-    Kokkos::fence();
-
-    if (errors) {
-      std::cerr << "Error in 100% min replacements: " << errors << std::endl;
-      std::cerr << "inp(0)=" << inp(0) << std::endl;
+template <typename T>
+static void Atomic_MinReplacements(benchmark::State& state) {
+  const int length = get_length(state);
+  auto inp         = prepare_input(length, std::numeric_limits<T>::max());
+
+  for (auto _ : state) {
+    const auto time   = atomic_min_replacement(inp);
+    const auto errors = check_errors_replacement(inp);
+
+    // report results
+    state.SetIterationTime(time);
+    if (errors > 0) {
+      state.counters["Errors"] = benchmark::Counter(errors);
     }
-    std::cout << "Time for 100% min replacements: " << time << std::endl;
   }
+}
 
-  // input is min values - all max atomics will replace
-  {
-    Kokkos::parallel_for(
-        length, KOKKOS_LAMBDA(const int i) { inp(i) = min; });
-    Kokkos::fence();
-
-    timer.reset();
-    Kokkos::parallel_for(
-        length, KOKKOS_LAMBDA(const int i) {
-          (void)Kokkos::atomic_max_fetch(&(inp(i)), (T)i);
-        });
-    Kokkos::fence();
-    double time = timer.seconds();
-
-    int errors(0);
-    Kokkos::parallel_reduce(
-        length,
-        KOKKOS_LAMBDA(const int i, int& inner) { inner += (inp(i) != (T)i); },
-        errors);
-    Kokkos::fence();
-
-    if (errors) {
-      std::cerr << "Error in 100% max replacements: " << errors << std::endl;
-      std::cerr << "inp(0)=" << inp(0) << std::endl;
+template <typename T>
+double atomic_max_replacement(Kokkos::View<T*, exec_space> input) {
+  const int length = input.size();
+  Kokkos::Timer timer;
+  Kokkos::parallel_for(
+      length, KOKKOS_LAMBDA(const int i) {
+        (void)Kokkos::atomic_max_fetch(&(input(i)), (T)i);
+      });
+  Kokkos::fence();
+  return timer.seconds();
+}
+
+template <typename T>
+static void Atomic_MaxReplacements(benchmark::State& state) {
+  const auto length = get_length(state);
+  auto inp          = prepare_input(length, std::numeric_limits<T>::lowest());
+
+  for (auto _ : state) {
+    const auto time   = atomic_max_replacement(inp);
+    const auto errors = check_errors_replacement(inp);
+
+    // report results
+    state.SetIterationTime(time);
+    if (errors > 0) {
+      state.counters["Errors"] = benchmark::Counter(errors);
     }
-    std::cout << "Time for 100% max replacements: " << time << std::endl;
   }
+}
 
-  // input is max values - all max atomics will early exit
-  {
-    Kokkos::parallel_for(
-        length, KOKKOS_LAMBDA(const int i) { inp(i) = max; });
-    Kokkos::fence();
-
-    timer.reset();
-    Kokkos::parallel_for(
-        length, KOKKOS_LAMBDA(const int i) {
-          (void)Kokkos::atomic_max_fetch(&(inp(i)), (T)i);
-        });
-    Kokkos::fence();
-    double time = timer.seconds();
-
-    int errors(0);
-    Kokkos::parallel_reduce(
-        length,
-        KOKKOS_LAMBDA(const int i, int& inner) {
-          T ref = max;
-          inner += (inp(i) != ref);
-        },
-        errors);
-    Kokkos::fence();
-
-    if (errors) {
-      std::cerr << "Error in 100% max early exits: " << errors << std::endl;
-      std::cerr << "inp(0)=" << inp(0) << std::endl;
+template <typename T>
+int check_errors_early_exit(Kokkos::View<T*, exec_space> view, const T ref) {
+  int errors = 0;
+  Kokkos::parallel_reduce(
+      view.size(),
+      KOKKOS_LAMBDA(const int i, int& inner) { inner += (view(i) != ref); },
+      errors);
+  Kokkos::fence();
+  return errors;
+}
+
+template <typename T>
+static void Atomic_MaxEarlyExits(benchmark::State& state) {
+  const auto length = get_length(state);
+  auto inp          = prepare_input(length, std::numeric_limits<T>::max());
+
+  for (auto _ : state) {
+    const auto time = atomic_max_replacement(inp);
+    const auto errors =
+        check_errors_early_exit(inp, std::numeric_limits<T>::max());
+
+    // report results
+    state.SetIterationTime(time);
+    if (errors > 0) {
+      state.counters["Errors"] = benchmark::Counter(errors);
     }
-    std::cout << "Time for 100% max early exits: " << time << std::endl;
   }
+}
 
-  // input is min values - all min atomics will early exit
-  {
-    Kokkos::parallel_for(
-        length, KOKKOS_LAMBDA(const int i) { inp(i) = min; });
-    Kokkos::fence();
-
-    timer.reset();
-    Kokkos::parallel_for(
-        length, KOKKOS_LAMBDA(const int i) {
-          (void)Kokkos::atomic_min_fetch(&(inp(i)), (T)i);
-        });
-    Kokkos::fence();
-    double time = timer.seconds();
-
-    int errors(0);
-    Kokkos::parallel_reduce(
-        length,
-        KOKKOS_LAMBDA(const int i, int& inner) {
-          T ref = min;
-          inner += (inp(i) != ref);
-        },
-        errors);
-    Kokkos::fence();
-
-    if (errors) {
-      std::cerr << "Error in 100% min early exits: " << errors << std::endl;
-      std::cerr << "inp(0)=" << inp(0) << std::endl;
-      if (length > 9) std::cout << "inp(9)=" << inp(9) << std::endl;
+template <typename T>
+static void Atomic_MinEarlyExits(benchmark::State& state) {
+  const auto length = get_length(state);
+  auto inp          = prepare_input(length, std::numeric_limits<T>::lowest());
+
+  for (auto _ : state) {
+    const auto time = atomic_min_replacement(inp);
+    const auto errors =
+        check_errors_early_exit(inp, std::numeric_limits<T>::lowest());
+
+    // report results
+    state.SetIterationTime(time);
+    if (errors > 0) {
+      state.counters["Errors"] = benchmark::Counter(errors);
     }
-    std::cout << "Time for 100% min early exits: " << time << std::endl;
   }
+}
 
-  // limit iterations for contentious test, takes ~50x longer for same length
-  auto con_length = length / 5;
-  // input is min values - some max atomics will replace
-  {
-    Kokkos::parallel_for(
-        1, KOKKOS_LAMBDA(const int i) { inp(i) = min; });
-    Kokkos::fence();
-
-    T current(0);
-    timer.reset();
-    Kokkos::parallel_reduce(
-        con_length,
-        KOKKOS_LAMBDA(const int i, T& inner) {
-          inner = Kokkos::atomic_max_fetch(&(inp(0)), inner + 1);
-          if (i == con_length - 1) {
-            Kokkos::atomic_max_fetch(&(inp(0)), max);
-            inner = max;
-          }
-        },
-        Kokkos::Max<T>(current));
-    Kokkos::fence();
-    double time = timer.seconds();
-
-    if (current < max) {
-      std::cerr << "Error in contentious max replacements: " << std::endl;
-      std::cerr << "final=" << current << " inp(0)=" << inp(0) << " max=" << max
-                << std::endl;
-    }
-    std::cout << "Time for contentious max " << con_length
-              << " replacements: " << time << std::endl;
+template <typename T>
+void report_errors_contentious_replacement(benchmark::State& state,
+                                           const T final, const T first,
+                                           const T expected) {
+  state.counters["Errors"]   = benchmark::Counter(1);
+  state.counters["Final"]    = benchmark::Counter(final);
+  state.counters["First"]    = benchmark::Counter(first);
+  state.counters["Expected"] = benchmark::Counter(expected);
+}
+
+template <typename T>
+double atomic_contentious_max_replacement(benchmark::State& state,
+                                          Kokkos::View<T*, exec_space> input,
+                                          const int con_length) {
+  const auto max = std::numeric_limits<T>::max();
+  T current      = 0;
+
+  Kokkos::Timer timer;
+  Kokkos::parallel_reduce(
+      con_length,
+      KOKKOS_LAMBDA(const int i, T& inner) {
+        inner = Kokkos::atomic_max_fetch(&(input(0)), inner + 1);
+        if (i == con_length - 1) {
+          Kokkos::atomic_max_fetch(&(input(0)), max);
+          inner = max;
+        }
+      },
+      Kokkos::Max<T>(current));
+  Kokkos::fence();
+  const auto time = timer.seconds();
+
+  if (current < max) {
+    report_errors_contentious_replacement(state, current, input(0), max);
   }
 
-  // input is max values - some min atomics will replace
-  {
-    Kokkos::parallel_for(
-        1, KOKKOS_LAMBDA(const int i) { inp(i) = max; });
-    Kokkos::fence();
-
-    timer.reset();
-    T current(100000000);
-    Kokkos::parallel_reduce(
-        con_length,
-        KOKKOS_LAMBDA(const int i, T& inner) {
-          inner = Kokkos::atomic_min_fetch(&(inp(0)), inner - 1);
-          if (i == con_length - 1) {
-            Kokkos::atomic_min_fetch(&(inp(0)), min);
-            inner = min;
-          }
-        },
-        Kokkos::Min<T>(current));
-    Kokkos::fence();
-    double time = timer.seconds();
-
-    if (current > min) {
-      std::cerr << "Error in contentious min replacements: " << std::endl;
-      std::cerr << "final=" << current << " inp(0)=" << inp(0) << " min=" << min
-                << std::endl;
-    }
-    std::cout << "Time for contentious min " << con_length
-              << " replacements: " << time << std::endl;
+  return time;
+}
+
+template <typename T>
+static void Atomic_ContentiousMaxReplacements(benchmark::State& state) {
+  const auto length = get_length(state);
+  auto inp          = prepare_input(1, std::numeric_limits<T>::lowest());
+
+  for (auto _ : state) {
+    const auto time = atomic_contentious_max_replacement(state, inp, length);
+
+    state.SetIterationTime(time);
   }
 }
 
-int main(int argc, char* argv[]) {
-  Kokkos::initialize(argc, argv);
-  {
-    int length = 1000000;
-    if (argc == 2) {
-      length = std::stoi(argv[1]);
-    }
+template <typename T>
+double atomic_contentious_min_replacement(benchmark::State& state,
+                                          Kokkos::View<T*, exec_space> input,
+                                          const int con_length) {
+  const auto min = std::numeric_limits<T>::lowest();
+  T current      = 0;
 
-    if (length < 1) {
-      throw std::invalid_argument("");
-    }
+  Kokkos::Timer timer;
+  Kokkos::parallel_reduce(
+      con_length,
+      KOKKOS_LAMBDA(const int i, T& inner) {
+        inner = Kokkos::atomic_min_fetch(&(input(0)), inner - 1);
+        if (i == con_length - 1) {
+          Kokkos::atomic_min_fetch(&(input(0)), min);
+          inner = min;
+        }
+      },
+      Kokkos::Min<T>(current));
+  Kokkos::fence();
+  const auto time = timer.seconds();
+
+  if (current > min) {
+    report_errors_contentious_replacement(state, current, input(0), min);
+  }
 
-    std::cout << "================ int" << std::endl;
-    test<int>(length);
-    std::cout << "================ long" << std::endl;
-    test<long>(length);
-    std::cout << "================ long long" << std::endl;
-    test<long long>(length);
-
-    std::cout << "================ unsigned int" << std::endl;
-    test<unsigned int>(length);
-    std::cout << "================ unsigned long" << std::endl;
-    test<unsigned long>(length);
-    std::cout << "================ unsigned long long" << std::endl;
-    test<unsigned long long>(length);
-
-    std::cout << "================ float" << std::endl;
-    test<float>(length);
-    std::cout << "================ double" << std::endl;
-    test<double>(length);
+  return time;
+}
+
+template <typename T>
+static void Atomic_ContentiousMinReplacements(benchmark::State& state) {
+  const auto length = get_length(state);
+  auto inp          = prepare_input(1, std::numeric_limits<T>::max());
+
+  for (auto _ : state) {
+    const auto time = atomic_contentious_max_replacement(state, inp, length);
+
+    state.SetIterationTime(time);
   }
-  Kokkos::finalize();
-  return 0;
 }
+
+// int
+BENCHMARK(Atomic_MinReplacements<int>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_MaxReplacements<int>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_MaxEarlyExits<int>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_MinEarlyExits<int>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_ContentiousMaxReplacements<int>)
+    ->ArgName("Length")
+    ->Arg(LENGTH / 5)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_ContentiousMinReplacements<int>)
+    ->ArgName("Length")
+    ->Arg(LENGTH / 5)
+    ->UseManualTime()
+    ->Iterations(10);
+///////////////////////////////////////////////////////////////////////
+
+// long
+BENCHMARK(Atomic_MinReplacements<long>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_MaxReplacements<long>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_MaxEarlyExits<long>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_MinEarlyExits<long>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_ContentiousMaxReplacements<long>)
+    ->ArgName("Length")
+    ->Arg(LENGTH / 5)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_ContentiousMinReplacements<long>)
+    ->ArgName("Length")
+    ->Arg(LENGTH / 5)
+    ->UseManualTime()
+    ->Iterations(10);
+///////////////////////////////////////////////////////////////////////
+
+// long long
+BENCHMARK(Atomic_MinReplacements<long long>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_MaxReplacements<long long>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_MaxEarlyExits<long long>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_MinEarlyExits<long long>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_ContentiousMaxReplacements<long long>)
+    ->ArgName("Length")
+    ->Arg(LENGTH / 5)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_ContentiousMinReplacements<long long>)
+    ->ArgName("Length")
+    ->Arg(LENGTH / 5)
+    ->UseManualTime()
+    ->Iterations(10);
+///////////////////////////////////////////////////////////////////////
+
+// unsigned int
+BENCHMARK(Atomic_MinReplacements<unsigned int>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_MaxReplacements<unsigned int>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_MaxEarlyExits<unsigned int>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_MinEarlyExits<unsigned int>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_ContentiousMaxReplacements<unsigned int>)
+    ->ArgName("Length")
+    ->Arg(LENGTH / 5)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_ContentiousMinReplacements<unsigned int>)
+    ->ArgName("Length")
+    ->Arg(LENGTH / 5)
+    ->UseManualTime()
+    ->Iterations(10);
+///////////////////////////////////////////////////////////////////////
+
+// unsigned long
+BENCHMARK(Atomic_MinReplacements<unsigned long>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_MaxReplacements<unsigned long>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_MaxEarlyExits<unsigned long>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_MinEarlyExits<unsigned long>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_ContentiousMaxReplacements<unsigned long>)
+    ->ArgName("Length")
+    ->Arg(LENGTH / 5)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_ContentiousMinReplacements<unsigned long>)
+    ->ArgName("Length")
+    ->Arg(LENGTH / 5)
+    ->UseManualTime()
+    ->Iterations(10);
+///////////////////////////////////////////////////////////////////////
+
+// unsigned long long
+BENCHMARK(Atomic_MinReplacements<unsigned long long>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_MaxReplacements<unsigned long long>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_MaxEarlyExits<unsigned long long>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_MinEarlyExits<unsigned long long>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_ContentiousMaxReplacements<unsigned long long>)
+    ->ArgName("Length")
+    ->Arg(LENGTH / 5)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_ContentiousMinReplacements<unsigned long long>)
+    ->ArgName("Length")
+    ->Arg(LENGTH / 5)
+    ->UseManualTime()
+    ->Iterations(10);
+///////////////////////////////////////////////////////////////////////
+
+// float
+BENCHMARK(Atomic_MinReplacements<float>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_MaxReplacements<float>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_MaxEarlyExits<float>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_MinEarlyExits<float>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_ContentiousMaxReplacements<float>)
+    ->ArgName("Length")
+    ->Arg(LENGTH / 5)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_ContentiousMinReplacements<float>)
+    ->ArgName("Length")
+    ->Arg(LENGTH / 5)
+    ->UseManualTime()
+    ->Iterations(10);
+///////////////////////////////////////////////////////////////////////
+
+// double
+BENCHMARK(Atomic_MinReplacements<double>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_MaxReplacements<double>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_MaxEarlyExits<double>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_MinEarlyExits<double>)
+    ->ArgName("Length")
+    ->Arg(LENGTH)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_ContentiousMaxReplacements<double>)
+    ->ArgName("Length")
+    ->Arg(LENGTH / 5)
+    ->UseManualTime()
+    ->Iterations(10);
+
+BENCHMARK(Atomic_ContentiousMinReplacements<double>)
+    ->ArgName("Length")
+    ->Arg(LENGTH / 5)
+    ->UseManualTime()
+    ->Iterations(10);
diff --git a/packages/kokkos/core/perf_test/test_mempool.cpp b/packages/kokkos/core/perf_test/test_mempool.cpp
index e4e1b4c9a3944bd43ea855ce6a69c85c3af76fd8..9905740afb4d85e78dfe926fee81ac65bfe0949c 100644
--- a/packages/kokkos/core/perf_test/test_mempool.cpp
+++ b/packages/kokkos/core/perf_test/test_mempool.cpp
@@ -17,11 +17,16 @@
 #include <cstdio>
 #include <cstring>
 #include <cstdlib>
+#include <iostream>
 #include <limits>
 
+#include <benchmark/benchmark.h>
 #include <Kokkos_Core.hpp>
 #include <Kokkos_Timer.hpp>
 
+#include "Benchmark_Context.hpp"
+#include "PerfTest_Category.hpp"
+
 using ExecSpace   = Kokkos::DefaultExecutionSpace;
 using MemorySpace = Kokkos::DefaultExecutionSpace::memory_space;
 
@@ -146,53 +151,8 @@ struct TestFunctor {
   }
 };
 
-int main(int argc, char* argv[]) {
-  static const char help_flag[]         = "--help";
-  static const char alloc_size_flag[]   = "--alloc_size=";
-  static const char super_size_flag[]   = "--super_size=";
-  static const char chunk_span_flag[]   = "--chunk_span=";
-  static const char fill_stride_flag[]  = "--fill_stride=";
-  static const char fill_level_flag[]   = "--fill_level=";
-  static const char repeat_outer_flag[] = "--repeat_outer=";
-  static const char repeat_inner_flag[] = "--repeat_inner=";
-
-  long total_alloc_size   = 1000000;
-  int min_superblock_size = 10000;
-  int chunk_span          = 5;
-  int fill_stride         = 1;
-  int fill_level          = 70;
-  int repeat_outer        = 1;
-  int repeat_inner        = 1;
-
-  int ask_help = 0;
-
-  for (int i = 1; i < argc; i++) {
-    const char* const a = argv[i];
-
-    if (!strncmp(a, help_flag, strlen(help_flag))) ask_help = 1;
-
-    if (!strncmp(a, alloc_size_flag, strlen(alloc_size_flag)))
-      total_alloc_size = atol(a + strlen(alloc_size_flag));
-
-    if (!strncmp(a, super_size_flag, strlen(super_size_flag)))
-      min_superblock_size = std::stoi(a + strlen(super_size_flag));
-
-    if (!strncmp(a, fill_stride_flag, strlen(fill_stride_flag)))
-      fill_stride = std::stoi(a + strlen(fill_stride_flag));
-
-    if (!strncmp(a, fill_level_flag, strlen(fill_level_flag)))
-      fill_level = std::stoi(a + strlen(fill_level_flag));
-
-    if (!strncmp(a, chunk_span_flag, strlen(chunk_span_flag)))
-      chunk_span = std::stoi(a + strlen(chunk_span_flag));
-
-    if (!strncmp(a, repeat_outer_flag, strlen(repeat_outer_flag)))
-      repeat_outer = std::stoi(a + strlen(repeat_outer_flag));
-
-    if (!strncmp(a, repeat_inner_flag, strlen(repeat_inner_flag)))
-      repeat_inner = std::stoi(a + strlen(repeat_inner_flag));
-  }
-
+int get_number_alloc(int chunk_span, int min_superblock_size,
+                     long total_alloc_size, int fill_level) {
   int chunk_span_bytes = 0;
   for (int i = 0; i < chunk_span; ++i) {
     auto chunk_bytes = TestFunctor::chunk * (1 + i);
@@ -212,81 +172,85 @@ int main(int argc, char* argv[]) {
   auto bytes_wanted       = (actual_total_bytes * fill_level) / 100;
   auto chunk_spans        = bytes_wanted / chunk_span_bytes;
   auto number_alloc       = int(chunk_spans * chunk_span);
+  return number_alloc;
+}
+
+template <class T>
+T get_parameter(const char flag[], T default_value) {
+  auto argc  = Test::command_line_num_args();
+  auto value = default_value;
+
+  for (int i = 1; i < argc; i++) {
+    const char* const a = Test::command_line_arg(i);
 
-  if (ask_help) {
-    std::cout << "command line options:"
-              << " " << help_flag << " " << alloc_size_flag << "##"
-              << " " << super_size_flag << "##"
-              << " " << fill_stride_flag << "##"
-              << " " << fill_level_flag << "##"
-              << " " << chunk_span_flag << "##"
-              << " " << repeat_outer_flag << "##"
-              << " " << repeat_inner_flag << "##" << std::endl;
-    return 0;
+    if (!strncmp(a, flag, strlen(flag))) value = std::stoi(a + strlen(flag));
   }
 
-  Kokkos::initialize(argc, argv);
+  return value;
+}
 
-  double sum_fill_time  = 0;
-  double sum_cycle_time = 0;
-  double sum_both_time  = 0;
-  double min_fill_time  = std::numeric_limits<double>::max();
-  double min_cycle_time = std::numeric_limits<double>::max();
-  double min_both_time  = std::numeric_limits<double>::max();
-  // one alloc in fill, alloc/dealloc pair in repeat_inner
-  for (int i = 0; i < repeat_outer; ++i) {
+static void Mempool_Fill(benchmark::State& state) {
+  long total_alloc_size =
+      get_parameter("--alloc_size=", static_cast<long>(state.range(0)));
+  int min_superblock_size = get_parameter("--super_size=", state.range(1));
+  int chunk_span          = get_parameter("--chunk_span=", state.range(2));
+  int fill_stride         = get_parameter("--fill_stride=", state.range(3));
+  int fill_level          = get_parameter("--fill_level=", state.range(4));
+  int repeat_inner        = get_parameter("--repeat_inner=", state.range(5));
+  int number_alloc        = get_number_alloc(chunk_span, min_superblock_size,
+                                      total_alloc_size, fill_level);
+
+  for (auto _ : state) {
     TestFunctor functor(total_alloc_size, min_superblock_size, number_alloc,
                         fill_stride, chunk_span, repeat_inner);
-
     Kokkos::Timer timer;
 
     if (!functor.test_fill()) {
       Kokkos::abort("fill ");
     }
 
-    auto t0 = timer.seconds();
+    state.SetIterationTime(timer.seconds());
+    state.counters[KokkosBenchmark::benchmark_fom("fill ops per second")] =
+        benchmark::Counter(number_alloc,
+                           benchmark::Counter::kIsIterationInvariantRate);
+  }
+}
+
+static void Mempool_Alloc_Dealloc(benchmark::State& state) {
+  long total_alloc_size =
+      get_parameter("--alloc_size=", static_cast<long>(state.range(0)));
+  int min_superblock_size = get_parameter("--super_size=", state.range(1));
+  int chunk_span          = get_parameter("--chunk_span=", state.range(2));
+  int fill_stride         = get_parameter("--fill_stride=", state.range(3));
+  int fill_level          = get_parameter("--fill_level=", state.range(4));
+  int repeat_inner        = get_parameter("--repeat_inner=", state.range(5));
+  int number_alloc        = get_number_alloc(chunk_span, min_superblock_size,
+                                      total_alloc_size, fill_level);
+
+  for (auto _ : state) {
+    TestFunctor functor(total_alloc_size, min_superblock_size, number_alloc,
+                        fill_stride, chunk_span, repeat_inner);
+    Kokkos::Timer timer;
 
     if (!functor.test_alloc_dealloc()) {
       Kokkos::abort("alloc/dealloc ");
     }
 
-    auto t1              = timer.seconds();
-    auto this_fill_time  = t0;
-    auto this_cycle_time = t1 - t0;
-    auto this_both_time  = t1;
-    sum_fill_time += this_fill_time;
-    sum_cycle_time += this_cycle_time;
-    sum_both_time += this_both_time;
-    min_fill_time  = std::min(min_fill_time, this_fill_time);
-    min_cycle_time = std::min(min_cycle_time, this_cycle_time);
-    min_both_time  = std::min(min_both_time, this_both_time);
+    state.SetIterationTime(timer.seconds());
+    state.counters[KokkosBenchmark::benchmark_fom("cycle ops per second")] =
+        benchmark::Counter(2 * number_alloc * repeat_inner,
+                           benchmark::Counter::kIsIterationInvariantRate);
   }
+}
 
-  Kokkos::finalize();
-
-  printf(
-      "\"mempool: alloc super stride level span inner outer number\" %ld %d %d "
-      "%d %d %d %d %d\n",
-      total_alloc_size, min_superblock_size, fill_stride, fill_level,
-      chunk_span, repeat_inner, repeat_outer, number_alloc);
-
-  auto avg_fill_time  = sum_fill_time / repeat_outer;
-  auto avg_cycle_time = sum_cycle_time / repeat_outer;
-  auto avg_both_time  = sum_both_time / repeat_outer;
-
-  printf("\"mempool: fill time (min, avg)\" %.8f %.8f\n", min_fill_time,
-         avg_fill_time);
-
-  printf("\"mempool: cycle time (min, avg)\" %.8f %.8f\n", min_cycle_time,
-         avg_cycle_time);
-
-  printf("\"mempool: test time (min, avg)\" %.8f %.8f\n", min_both_time,
-         avg_both_time);
+const std::vector<std::string> ARG_NAMES = {
+    "total_alloc_size", "min_superblock_size", "chunk_span",
+    "fill_stride",      "fill_level",          "repeat_inner"};
+const std::vector<int64_t> ARGS = {1'000'000, 10'000, 5, 1, 70, 1};
 
-  printf("\"mempool: fill ops per second (max, avg)\" %g %g\n",
-         number_alloc / min_fill_time, number_alloc / avg_fill_time);
+BENCHMARK(Mempool_Fill)->ArgNames(ARG_NAMES)->Args(ARGS)->UseManualTime();
 
-  printf("\"mempool: cycle ops per second (max, avg)\" %g %g\n",
-         (2 * number_alloc * repeat_inner) / min_cycle_time,
-         (2 * number_alloc * repeat_inner) / avg_cycle_time);
-}
+BENCHMARK(Mempool_Alloc_Dealloc)
+    ->ArgNames(ARG_NAMES)
+    ->Args(ARGS)
+    ->UseManualTime();
diff --git a/packages/kokkos/core/perf_test/test_taskdag.cpp b/packages/kokkos/core/perf_test/test_taskdag.cpp
index bbb48af6c43e8bd16b350aad9ca4f0633f5fb8dc..fccaab64ddf1821169484b533b1f64dcab585485 100644
--- a/packages/kokkos/core/perf_test/test_taskdag.cpp
+++ b/packages/kokkos/core/perf_test/test_taskdag.cpp
@@ -14,6 +14,8 @@
 //
 //@HEADER
 
+#include <iostream>
+
 #include <Kokkos_Core.hpp>
 
 #if !defined(KOKKOS_ENABLE_TASKDAG) || \
diff --git a/packages/kokkos/core/src/CMakeLists.txt b/packages/kokkos/core/src/CMakeLists.txt
index 862c0c47dd2598815dd262fc2cb645addfa0245b..012af0a7d06ac5df551ceae49ca06bdee77802f6 100644
--- a/packages/kokkos/core/src/CMakeLists.txt
+++ b/packages/kokkos/core/src/CMakeLists.txt
@@ -3,13 +3,19 @@ KOKKOS_INCLUDE_DIRECTORIES(
   ${CMAKE_CURRENT_SOURCE_DIR}
   ${KOKKOS_TOP_BUILD_DIR}
 )
-IF (Kokkos_ENABLE_IMPL_DESUL_ATOMICS AND NOT desul_FOUND)
+IF (NOT desul_FOUND)
   IF(KOKKOS_ENABLE_CUDA)
     SET(DESUL_ATOMICS_ENABLE_CUDA ON)
   ENDIF()
+  IF(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE)
+    SET(DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION ON)
+  ENDIF()
   IF(KOKKOS_ENABLE_HIP)
     SET(DESUL_ATOMICS_ENABLE_HIP ON)
   ENDIF()
+  IF(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE)
+    SET(DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION ON)
+  ENDIF()
   IF(KOKKOS_ENABLE_SYCL)
     SET(DESUL_ATOMICS_ENABLE_SYCL ON)
   ENDIF()
@@ -88,8 +94,14 @@ IF (KOKKOS_ENABLE_SYCL)
   APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.hpp)
 ENDIF()
 
-IF (Kokkos_ENABLE_IMPL_DESUL_ATOMICS AND NOT desul_FOUND)
-  APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/src/*.cpp)
+IF (NOT desul_FOUND)
+  IF (KOKKOS_ENABLE_CUDA)
+    APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/src/Lock_Array_CUDA.cpp)
+  ELSEIF (KOKKOS_ENABLE_HIP)
+    APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/src/Lock_Array_HIP.cpp)
+  ELSEIF (KOKKOS_ENABLE_SYCL)
+    APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/src/Lock_Array_SYCL.cpp)
+  ENDIF()
   APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*.hpp)
   APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*/*.hpp)
   APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*/*/*.hpp)
@@ -125,7 +137,7 @@ KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscore
   ${CMAKE_CURRENT_BINARY_DIR}
   ${CMAKE_CURRENT_SOURCE_DIR}
 )
-IF (Kokkos_ENABLE_IMPL_DESUL_ATOMICS AND NOT desul_FOUND)
+IF (NOT desul_FOUND)
   KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscore
     ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include
   )
@@ -181,17 +193,18 @@ IF (NOT WIN32)
 ENDIF()
 IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE)
   KOKKOS_LINK_TPL(kokkoscore PUBLIC ROCM)
+  KOKKOS_LINK_TPL(kokkoscore PUBLIC ONEDPL)
 ENDIF()
 
 # FIXME: We need a proper solution to figure out whether to enable
 #        libatomic
 # Most compilers only require libatomic for 128-bit CAS
 # I (CT) had removed 128bit CAS from desul to not need libatomic.
-IF (Kokkos_ENABLE_IMPL_DESUL_ATOMICS AND KOKKOS_ENABLE_OPENMPTARGET)
+IF (KOKKOS_ENABLE_OPENMPTARGET)
   target_link_libraries(kokkoscore PUBLIC atomic)
 ENDIF()
 
-IF (Kokkos_ENABLE_IMPL_DESUL_ATOMICS AND desul_FOUND)
+IF (desul_FOUND)
   target_link_libraries(kokkoscore PUBLIC desul_atomics)
 ENDIF()
 
diff --git a/packages/kokkos/core/src/Kokkos_Cuda.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda.hpp
similarity index 91%
rename from packages/kokkos/core/src/Kokkos_Cuda.hpp
rename to packages/kokkos/core/src/Cuda/Kokkos_Cuda.hpp
index fce7351b3286ab74ab96635a94e5bc5ef84bbb9a..8bfaf8317b6594e0ce4e11f6183605714c1c93e9 100644
--- a/packages/kokkos/core/src/Kokkos_Cuda.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda.hpp
@@ -31,7 +31,7 @@ static_assert(false,
 #include <vector>
 
 #include <impl/Kokkos_AnalyzePolicy.hpp>
-#include <Kokkos_CudaSpace.hpp>
+#include <Cuda/Kokkos_CudaSpace.hpp>
 #include <Cuda/Kokkos_Cuda_Error.hpp>  // CUDA_SAFE_CALL
 
 #include <Kokkos_Parallel.hpp>
@@ -80,6 +80,9 @@ struct CudaDispatchProperties {
   CudaLaunchMechanism launch_mechanism = l;
 };
 }  // namespace Experimental
+
+enum class ManageStream : bool { no, yes };
+
 }  // namespace Impl
 /// \class Cuda
 /// \brief Kokkos Execution Space that uses CUDA to run on GPUs.
@@ -181,7 +184,10 @@ class Cuda {
 
   Cuda();
 
-  Cuda(cudaStream_t stream, bool manage_stream = false);
+  Cuda(cudaStream_t stream,
+       Impl::ManageStream manage_stream = Impl::ManageStream::no);
+
+  KOKKOS_DEPRECATED Cuda(cudaStream_t stream, bool manage_stream);
 
   //--------------------------------------------------------------------------
   //! Free any resources being consumed by the device.
@@ -241,28 +247,6 @@ struct DeviceTypeTraits<Cuda> {
 };
 }  // namespace Experimental
 }  // namespace Tools
-
-namespace Impl {
-
-template <class DT, class... DP>
-struct ZeroMemset<Kokkos::Cuda, DT, DP...> {
-  ZeroMemset(const Kokkos::Cuda& exec_space_instance,
-             const View<DT, DP...>& dst,
-             typename View<DT, DP...>::const_value_type&) {
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemsetAsync(
-        dst.data(), 0,
-        dst.size() * sizeof(typename View<DT, DP...>::value_type),
-        exec_space_instance.cuda_stream()));
-  }
-
-  ZeroMemset(const View<DT, DP...>& dst,
-             typename View<DT, DP...>::const_value_type&) {
-    KOKKOS_IMPL_CUDA_SAFE_CALL(
-        cudaMemset(dst.data(), 0,
-                   dst.size() * sizeof(typename View<DT, DP...>::value_type)));
-  }
-};
-}  // namespace Impl
 }  // namespace Kokkos
 
 /*--------------------------------------------------------------------------*/
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
index 87b4c8c00c999efc64e0edf9e0ffd1037c20d95d..c6512f44dadc974b381975ec67feda62789e4e4a 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
@@ -22,8 +22,8 @@
 #ifdef KOKKOS_ENABLE_CUDA
 
 #include <Kokkos_Core.hpp>
-#include <Kokkos_Cuda.hpp>
-#include <Kokkos_CudaSpace.hpp>
+#include <Cuda/Kokkos_Cuda.hpp>
+#include <Cuda/Kokkos_CudaSpace.hpp>
 
 #include <cstdlib>
 #include <iostream>
@@ -43,7 +43,8 @@
 cudaStream_t Kokkos::Impl::cuda_get_deep_copy_stream() {
   static cudaStream_t s = nullptr;
   if (s == nullptr) {
-    cudaStreamCreate(&s);
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        (CudaInternal::singleton().cuda_stream_create_wrapper(&s)));
   }
   return s;
 }
@@ -66,19 +67,22 @@ static std::atomic<int> num_uvm_allocations(0);
 }  // namespace
 
 void DeepCopyCuda(void *dst, const void *src, size_t n) {
-  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault));
+  KOKKOS_IMPL_CUDA_SAFE_CALL((CudaInternal::singleton().cuda_memcpy_wrapper(
+      dst, src, n, cudaMemcpyDefault)));
 }
 
 void DeepCopyAsyncCuda(const Cuda &instance, void *dst, const void *src,
                        size_t n) {
   KOKKOS_IMPL_CUDA_SAFE_CALL(
-      cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, instance.cuda_stream()));
+      (instance.impl_internal_space_instance()->cuda_memcpy_async_wrapper(
+          dst, src, n, cudaMemcpyDefault)));
 }
 
 void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) {
   cudaStream_t s = cuda_get_deep_copy_stream();
   KOKKOS_IMPL_CUDA_SAFE_CALL(
-      cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, s));
+      (CudaInternal::singleton().cuda_memcpy_async_wrapper(
+          dst, src, n, cudaMemcpyDefault, s)));
   Impl::cuda_stream_synchronize(
       s,
       Kokkos::Tools::Experimental::SpecialSynchronizationCases::
@@ -137,7 +141,7 @@ CudaUVMSpace::CudaUVMSpace() : m_device(Kokkos::Cuda().cuda_device()) {}
 
 CudaHostPinnedSpace::CudaHostPinnedSpace() {}
 
-int memory_threshold_g = 40000;  // 40 kB
+size_t memory_threshold_g = 40000;  // 40 kB
 
 //==============================================================================
 // <editor-fold desc="allocate()"> {{{1
@@ -170,25 +174,39 @@ void *impl_allocate_common(const Cuda &exec_space, const char *arg_label,
   cudaError_t error_code;
   if (arg_alloc_size >= memory_threshold_g) {
     if (exec_space_provided) {
-      cudaStream_t stream = exec_space.cuda_stream();
-      error_code          = cudaMallocAsync(&ptr, arg_alloc_size, stream);
-      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
+      error_code =
+          exec_space.impl_internal_space_instance()->cuda_malloc_async_wrapper(
+              &ptr, arg_alloc_size);
+      exec_space.fence("Kokkos::Cuda: backend fence after async malloc");
     } else {
-      error_code = cudaMallocAsync(&ptr, arg_alloc_size, 0);
-      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
+      error_code = Impl::CudaInternal::singleton().cuda_malloc_async_wrapper(
+          &ptr, arg_alloc_size);
+      Impl::cuda_device_synchronize(
+          "Kokkos::Cuda: backend fence after async malloc");
     }
   } else {
-    error_code = cudaMalloc(&ptr, arg_alloc_size);
+    error_code =
+        (exec_space_provided
+             ? exec_space.impl_internal_space_instance()->cuda_malloc_wrapper(
+                   &ptr, arg_alloc_size)
+             : Impl::CudaInternal::singleton().cuda_malloc_wrapper(
+                   &ptr, arg_alloc_size));
   }
 #else
-  (void)exec_space;
-  (void)exec_space_provided;
-  auto error_code = cudaMalloc(&ptr, arg_alloc_size);
+  cudaError_t error_code;
+  if (exec_space_provided) {
+    error_code = exec_space.impl_internal_space_instance()->cuda_malloc_wrapper(
+        &ptr, arg_alloc_size);
+  } else {
+    error_code = Impl::CudaInternal::singleton().cuda_malloc_wrapper(
+        &ptr, arg_alloc_size);
+  }
 #endif
   if (error_code != cudaSuccess) {  // TODO tag as unlikely branch
-    cudaGetLastError();  // This is the only way to clear the last error, which
-                         // we should do here since we're turning it into an
-                         // exception here
+    // This is the only way to clear the last error, which
+    // we should do here since we're turning it into an
+    // exception here
+    exec_space.impl_internal_space_instance()->cuda_get_last_error_wrapper();
     throw Experimental::CudaRawMemoryAllocationFailure(
         arg_alloc_size, error_code,
         Experimental::RawMemoryAllocationFailure::AllocationMechanism::
@@ -239,18 +257,22 @@ void *CudaUVMSpace::impl_allocate(
     Kokkos::Impl::num_uvm_allocations++;
 
     auto error_code =
-        cudaMallocManaged(&ptr, arg_alloc_size, cudaMemAttachGlobal);
+        Impl::CudaInternal::singleton().cuda_malloc_managed_wrapper(
+            &ptr, arg_alloc_size, cudaMemAttachGlobal);
 
 #ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST
     if (Kokkos::CudaUVMSpace::cuda_pin_uvm_to_host())
-      cudaMemAdvise(ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation,
-                    cudaCpuDeviceId);
+      KOKKOS_IMPL_CUDA_SAFE_CALL(
+          (Impl::CudaInternal::singleton().cuda_mem_advise_wrapper(
+              ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation,
+              cudaCpuDeviceId)));
 #endif
 
     if (error_code != cudaSuccess) {  // TODO tag as unlikely branch
-      cudaGetLastError();  // This is the only way to clear the last error,
-                           // which we should do here since we're turning it
-                           // into an exception here
+      // This is the only way to clear the last error, which
+      // we should do here since we're turning it into an
+      // exception here
+      Impl::CudaInternal::singleton().cuda_get_last_error_wrapper();
       throw Experimental::CudaRawMemoryAllocationFailure(
           arg_alloc_size, error_code,
           Experimental::RawMemoryAllocationFailure::AllocationMechanism::
@@ -280,11 +302,13 @@ void *CudaHostPinnedSpace::impl_allocate(
     const Kokkos::Tools::SpaceHandle arg_handle) const {
   void *ptr = nullptr;
 
-  auto error_code = cudaHostAlloc(&ptr, arg_alloc_size, cudaHostAllocDefault);
+  auto error_code = Impl::CudaInternal::singleton().cuda_host_alloc_wrapper(
+      &ptr, arg_alloc_size, cudaHostAllocDefault);
   if (error_code != cudaSuccess) {  // TODO tag as unlikely branch
-    cudaGetLastError();  // This is the only way to clear the last error, which
-                         // we should do here since we're turning it into an
-                         // exception here
+    // This is the only way to clear the last error, which
+    // we should do here since we're turning it into an
+    // exception here
+    Impl::CudaInternal::singleton().cuda_get_last_error_wrapper();
     throw Experimental::CudaRawMemoryAllocationFailure(
         arg_alloc_size, error_code,
         Experimental::RawMemoryAllocationFailure::AllocationMechanism::
@@ -324,14 +348,20 @@ void CudaSpace::impl_deallocate(
 #error CUDART_VERSION undefined!
 #elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020)
     if (arg_alloc_size >= memory_threshold_g) {
-      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
-      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeAsync(arg_alloc_ptr, 0));
-      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
+      Impl::cuda_device_synchronize(
+          "Kokkos::Cuda: backend fence before async free");
+      KOKKOS_IMPL_CUDA_SAFE_CALL(
+          (Impl::CudaInternal::singleton().cuda_free_async_wrapper(
+              arg_alloc_ptr)));
+      Impl::cuda_device_synchronize(
+          "Kokkos::Cuda: backend fence after async free");
     } else {
-      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(
+          (Impl::CudaInternal::singleton().cuda_free_wrapper(arg_alloc_ptr)));
     }
 #else
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        (Impl::CudaInternal::singleton().cuda_free_wrapper(arg_alloc_ptr)));
 #endif
   } catch (...) {
   }
@@ -350,10 +380,7 @@ void CudaUVMSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr,
 }
 void CudaUVMSpace::impl_deallocate(
     const char *arg_label, void *const arg_alloc_ptr,
-    const size_t arg_alloc_size
-
-    ,
-    const size_t arg_logical_size,
+    const size_t arg_alloc_size, const size_t arg_logical_size,
     const Kokkos::Tools::SpaceHandle arg_handle) const {
   Cuda::impl_static_fence(
       "Kokkos::CudaUVMSpace::impl_deallocate: Pre UVM Deallocation");
@@ -366,7 +393,8 @@ void CudaUVMSpace::impl_deallocate(
   try {
     if (arg_alloc_ptr != nullptr) {
       Kokkos::Impl::num_uvm_allocations--;
-      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(
+          (Impl::CudaInternal::singleton().cuda_free_wrapper(arg_alloc_ptr)));
     }
   } catch (...) {
   }
@@ -396,7 +424,8 @@ void CudaHostPinnedSpace::impl_deallocate(
                                       reported_size);
   }
   try {
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeHost(arg_alloc_ptr));
+    KOKKOS_IMPL_CUDA_SAFE_CALL((
+        Impl::CudaInternal::singleton().cuda_free_host_wrapper(arg_alloc_ptr)));
   } catch (...) {
   }
 }
@@ -420,49 +449,6 @@ SharedAllocationRecord<void, void>
     SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>::s_root_record;
 #endif
 
-::cudaTextureObject_t
-SharedAllocationRecord<Kokkos::CudaSpace, void>::attach_texture_object(
-    const unsigned sizeof_alias, void *const alloc_ptr,
-    size_t const alloc_size) {
-  enum { TEXTURE_BOUND_1D = 1u << 27 };
-
-  if ((alloc_ptr == nullptr) ||
-      (sizeof_alias * TEXTURE_BOUND_1D <= alloc_size)) {
-    std::ostringstream msg;
-    msg << "Kokkos::CudaSpace ERROR: Cannot attach texture object to"
-        << " alloc_ptr(" << alloc_ptr << ")"
-        << " alloc_size(" << alloc_size << ")"
-        << " max_size(" << (sizeof_alias * TEXTURE_BOUND_1D) << ")";
-    std::cerr << msg.str() << std::endl;
-    std::cerr.flush();
-    Kokkos::Impl::throw_runtime_exception(msg.str());
-  }
-
-  ::cudaTextureObject_t tex_obj;
-
-  struct cudaResourceDesc resDesc;
-  struct cudaTextureDesc texDesc;
-
-  memset(&resDesc, 0, sizeof(resDesc));
-  memset(&texDesc, 0, sizeof(texDesc));
-
-  resDesc.resType = cudaResourceTypeLinear;
-  resDesc.res.linear.desc =
-      (sizeof_alias == 4
-           ? cudaCreateChannelDesc<int>()
-           : (sizeof_alias == 8
-                  ? cudaCreateChannelDesc< ::int2>()
-                  :
-                  /* sizeof_alias == 16 */ cudaCreateChannelDesc< ::int4>()));
-  resDesc.res.linear.sizeInBytes = alloc_size;
-  resDesc.res.linear.devPtr      = alloc_ptr;
-
-  KOKKOS_IMPL_CUDA_SAFE_CALL(
-      cudaCreateTextureObject(&tex_obj, &resDesc, &texDesc, nullptr));
-
-  return tex_obj;
-}
-
 //==============================================================================
 // <editor-fold desc="SharedAllocationRecord destructors"> {{{1
 
@@ -521,7 +507,6 @@ SharedAllocationRecord<Kokkos::CudaSpace, void>::SharedAllocationRecord(
                                                arg_alloc_size),
           sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc,
           arg_label),
-      m_tex_obj(0),
       m_space(arg_space) {
 
   SharedAllocationHeader header;
@@ -552,7 +537,6 @@ SharedAllocationRecord<Kokkos::CudaSpace, void>::SharedAllocationRecord(
                                                arg_label, arg_alloc_size),
           sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc,
           arg_label),
-      m_tex_obj(0),
       m_space(arg_space) {
 
   SharedAllocationHeader header;
@@ -579,7 +563,6 @@ SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::SharedAllocationRecord(
                                                arg_alloc_size),
           sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc,
           arg_label),
-      m_tex_obj(0),
       m_space(arg_space) {
   this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr,
                                                   arg_label);
@@ -613,7 +596,9 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes,
                            bool to_device) {
   if ((ptr == nullptr) || (bytes == 0)) return;
   cudaPointerAttributes attr;
-  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaPointerGetAttributes(&attr, ptr));
+  KOKKOS_IMPL_CUDA_SAFE_CALL((
+      space.impl_internal_space_instance()->cuda_pointer_get_attributes_wrapper(
+          &attr, ptr)));
   // I measured this and it turns out prefetching towards the host slows
   // DualView syncs down. Probably because the latency is not too bad in the
   // first place for the pull down. If we want to change that provde
@@ -621,8 +606,9 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes,
   bool is_managed = attr.type == cudaMemoryTypeManaged;
   if (to_device && is_managed &&
       space.cuda_device_prop().concurrentManagedAccess) {
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemPrefetchAsync(
-        ptr, bytes, space.cuda_device(), space.cuda_stream()));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        (space.impl_internal_space_instance()->cuda_mem_prefetch_async_wrapper(
+            ptr, bytes, space.cuda_device())));
   }
 }
 
diff --git a/packages/kokkos/core/src/Kokkos_CudaSpace.hpp b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp
similarity index 92%
rename from packages/kokkos/core/src/Kokkos_CudaSpace.hpp
rename to packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp
index eec9999f616c9c32171c60f169c5e2042547b9a0..b8fa335cd3b229a3a7c5883fe6e74f5c3d84d701 100644
--- a/packages/kokkos/core/src/Kokkos_CudaSpace.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp
@@ -534,15 +534,10 @@ class SharedAllocationRecord<Kokkos::CudaSpace, void>
   SharedAllocationRecord(const SharedAllocationRecord&) = delete;
   SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete;
 
-  static ::cudaTextureObject_t attach_texture_object(
-      const unsigned sizeof_alias, void* const alloc_ptr,
-      const size_t alloc_size);
-
 #ifdef KOKKOS_ENABLE_DEBUG
   static RecordBase s_root_record;
 #endif
 
-  ::cudaTextureObject_t m_tex_obj = 0;
   const Kokkos::CudaSpace m_space;
 
  protected:
@@ -566,7 +561,6 @@ class SharedAllocationRecord<Kokkos::CudaSpace, void>
                                                  arg_alloc_size),
             sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc,
             arg_label),
-        m_tex_obj(0),
         m_space(arg_space) {
 
     SharedAllocationHeader header;
@@ -592,30 +586,6 @@ class SharedAllocationRecord<Kokkos::CudaSpace, void>
   // helper function to work around MSVC+NVCC issue
   // https://github.com/kokkos/kokkos/issues/5258
   static void deep_copy_header_no_exec(void*, const void*);
-
- public:
-  template <typename AliasType>
-  inline ::cudaTextureObject_t attach_texture_object() {
-    static_assert((std::is_same<AliasType, int>::value ||
-                   std::is_same<AliasType, ::int2>::value ||
-                   std::is_same<AliasType, ::int4>::value),
-                  "Cuda texture fetch only supported for alias types of int, "
-                  "::int2, or ::int4");
-
-    if (m_tex_obj == 0) {
-      m_tex_obj = attach_texture_object(sizeof(AliasType),
-                                        (void*)RecordBase::m_alloc_ptr,
-                                        RecordBase::m_alloc_size);
-    }
-
-    return m_tex_obj;
-  }
-
-  template <typename AliasType>
-  inline int attach_texture_object_offset(const AliasType* const ptr) {
-    // Texture object is attached to the entire allocation range
-    return ptr - reinterpret_cast<AliasType*>(RecordBase::m_alloc_ptr);
-  }
 };
 
 template <>
@@ -632,7 +602,6 @@ class SharedAllocationRecord<Kokkos::CudaUVMSpace, void>
 
   static RecordBase s_root_record;
 
-  ::cudaTextureObject_t m_tex_obj = 0;
   const Kokkos::CudaUVMSpace m_space;
 
  protected:
@@ -657,7 +626,6 @@ class SharedAllocationRecord<Kokkos::CudaUVMSpace, void>
                                                  arg_alloc_size),
             sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc,
             arg_label),
-        m_tex_obj(0),
         m_space(arg_space) {
     this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr,
                                                     arg_label);
@@ -667,31 +635,6 @@ class SharedAllocationRecord<Kokkos::CudaUVMSpace, void>
       const Kokkos::CudaUVMSpace& arg_space, const std::string& arg_label,
       const size_t arg_alloc_size,
       const RecordBase::function_type arg_dealloc = &base_t::deallocate);
-
- public:
-  template <typename AliasType>
-  inline ::cudaTextureObject_t attach_texture_object() {
-    static_assert((std::is_same<AliasType, int>::value ||
-                   std::is_same<AliasType, ::int2>::value ||
-                   std::is_same<AliasType, ::int4>::value),
-                  "Cuda texture fetch only supported for alias types of int, "
-                  "::int2, or ::int4");
-
-    if (m_tex_obj == 0) {
-      m_tex_obj = SharedAllocationRecord<Kokkos::CudaSpace, void>::
-          attach_texture_object(sizeof(AliasType),
-                                (void*)RecordBase::m_alloc_ptr,
-                                RecordBase::m_alloc_size);
-    }
-
-    return m_tex_obj;
-  }
-
-  template <typename AliasType>
-  inline int attach_texture_object_offset(const AliasType* const ptr) {
-    // Texture object is attached to the entire allocation range
-    return ptr - reinterpret_cast<AliasType*>(RecordBase::m_alloc_ptr);
-  }
 };
 
 template <>
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Atomic_Intrinsics.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Atomic_Intrinsics.hpp
deleted file mode 100644
index cb196f6e8ffa062716b583faf257b085d101a1b7..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Atomic_Intrinsics.hpp
+++ /dev/null
@@ -1,979 +0,0 @@
-/*
-@HEADER
-================================================================================
-
-ORIGINAL LICENSE
-----------------
-
-Copyright (c) 2018, NVIDIA Corporation
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-
-================================================================================
-
-LICENSE ASSOCIATED WITH SUBSEQUENT MODIFICATIONS
-------------------------------------------------
-
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-//
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// ************************************************************************
-@HEADER
-*/
-
-#include <Kokkos_Macros.hpp>
-#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS)
-
-#include <cassert>
-
-#ifndef _SIMT_DETAILS_CONFIG
-#define _SIMT_DETAILS_CONFIG
-
-namespace Kokkos {
-namespace Impl {
-
-#ifndef __simt_scope
-// Modification: Kokkos GPU atomics should default to `gpu` scope
-#define __simt_scope "gpu"
-#endif
-
-#define __simt_fence_signal_() asm volatile("" ::: "memory")
-#define __simt_fence_sc_() \
-  asm volatile("fence.sc." __simt_scope ";" ::: "memory")
-#define __simt_fence_() asm volatile("fence." __simt_scope ";" ::: "memory")
-
-#define __simt_load_acquire_8_as_32(ptr, ret)             \
-  asm volatile("ld.acquire." __simt_scope ".b8 %0, [%1];" \
-               : "=r"(ret)                                \
-               : "l"(ptr)                                 \
-               : "memory")
-#define __simt_load_relaxed_8_as_32(ptr, ret)             \
-  asm volatile("ld.relaxed." __simt_scope ".b8 %0, [%1];" \
-               : "=r"(ret)                                \
-               : "l"(ptr)                                 \
-               : "memory")
-#define __simt_store_release_8_as_32(ptr, desired)                    \
-  asm volatile("st.release." __simt_scope ".b8 [%0], %1;" ::"l"(ptr), \
-               "r"(desired)                                           \
-               : "memory")
-#define __simt_store_relaxed_8_as_32(ptr, desired)                    \
-  asm volatile("st.relaxed." __simt_scope ".b8 [%0], %1;" ::"l"(ptr), \
-               "r"(desired)                                           \
-               : "memory")
-
-#define __simt_load_acquire_16(ptr, ret)                   \
-  asm volatile("ld.acquire." __simt_scope ".b16 %0, [%1];" \
-               : "=h"(ret)                                 \
-               : "l"(ptr)                                  \
-               : "memory")
-#define __simt_load_relaxed_16(ptr, ret)                   \
-  asm volatile("ld.relaxed." __simt_scope ".b16 %0, [%1];" \
-               : "=h"(ret)                                 \
-               : "l"(ptr)                                  \
-               : "memory")
-#define __simt_store_release_16(ptr, desired)                          \
-  asm volatile("st.release." __simt_scope ".b16 [%0], %1;" ::"l"(ptr), \
-               "h"(desired)                                            \
-               : "memory")
-#define __simt_store_relaxed_16(ptr, desired)                          \
-  asm volatile("st.relaxed." __simt_scope ".b16 [%0], %1;" ::"l"(ptr), \
-               "h"(desired)                                            \
-               : "memory")
-
-#define __simt_load_acquire_32(ptr, ret)                   \
-  asm volatile("ld.acquire." __simt_scope ".b32 %0, [%1];" \
-               : "=r"(ret)                                 \
-               : "l"(ptr)                                  \
-               : "memory")
-#define __simt_load_relaxed_32(ptr, ret)                   \
-  asm volatile("ld.relaxed." __simt_scope ".b32 %0, [%1];" \
-               : "=r"(ret)                                 \
-               : "l"(ptr)                                  \
-               : "memory")
-#define __simt_store_release_32(ptr, desired)                          \
-  asm volatile("st.release." __simt_scope ".b32 [%0], %1;" ::"l"(ptr), \
-               "r"(desired)                                            \
-               : "memory")
-#define __simt_store_relaxed_32(ptr, desired)                          \
-  asm volatile("st.relaxed." __simt_scope ".b32 [%0], %1;" ::"l"(ptr), \
-               "r"(desired)                                            \
-               : "memory")
-#define __simt_exch_release_32(ptr, old, desired)                     \
-  asm volatile("atom.exch.release." __simt_scope ".b32 %0, [%1], %2;" \
-               : "=r"(old)                                            \
-               : "l"(ptr), "r"(desired)                               \
-               : "memory")
-#define __simt_exch_acquire_32(ptr, old, desired)                     \
-  asm volatile("atom.exch.acquire." __simt_scope ".b32 %0, [%1], %2;" \
-               : "=r"(old)                                            \
-               : "l"(ptr), "r"(desired)                               \
-               : "memory")
-#define __simt_exch_acq_rel_32(ptr, old, desired)                     \
-  asm volatile("atom.exch.acq_rel." __simt_scope ".b32 %0, [%1], %2;" \
-               : "=r"(old)                                            \
-               : "l"(ptr), "r"(desired)                               \
-               : "memory")
-#define __simt_exch_relaxed_32(ptr, old, desired)                     \
-  asm volatile("atom.exch.relaxed." __simt_scope ".b32 %0, [%1], %2;" \
-               : "=r"(old)                                            \
-               : "l"(ptr), "r"(desired)                               \
-               : "memory")
-#define __simt_cas_release_32(ptr, old, expected, desired)               \
-  asm volatile("atom.cas.release." __simt_scope ".b32 %0, [%1], %2, %3;" \
-               : "=r"(old)                                               \
-               : "l"(ptr), "r"(expected), "r"(desired)                   \
-               : "memory")
-#define __simt_cas_acquire_32(ptr, old, expected, desired)               \
-  asm volatile("atom.cas.acquire." __simt_scope ".b32 %0, [%1], %2, %3;" \
-               : "=r"(old)                                               \
-               : "l"(ptr), "r"(expected), "r"(desired)                   \
-               : "memory")
-#define __simt_cas_acq_rel_32(ptr, old, expected, desired)               \
-  asm volatile("atom.cas.acq_rel." __simt_scope ".b32 %0, [%1], %2, %3;" \
-               : "=r"(old)                                               \
-               : "l"(ptr), "r"(expected), "r"(desired)                   \
-               : "memory")
-#define __simt_cas_relaxed_32(ptr, old, expected, desired)               \
-  asm volatile("atom.cas.relaxed." __simt_scope ".b32 %0, [%1], %2, %3;" \
-               : "=r"(old)                                               \
-               : "l"(ptr), "r"(expected), "r"(desired)                   \
-               : "memory")
-#define __simt_add_release_32(ptr, old, addend)                      \
-  asm volatile("atom.add.release." __simt_scope ".u32 %0, [%1], %2;" \
-               : "=r"(old)                                           \
-               : "l"(ptr), "r"(addend)                               \
-               : "memory")
-#define __simt_add_acquire_32(ptr, old, addend)                      \
-  asm volatile("atom.add.acquire." __simt_scope ".u32 %0, [%1], %2;" \
-               : "=r"(old)                                           \
-               : "l"(ptr), "r"(addend)                               \
-               : "memory")
-#define __simt_add_acq_rel_32(ptr, old, addend)                      \
-  asm volatile("atom.add.acq_rel." __simt_scope ".u32 %0, [%1], %2;" \
-               : "=r"(old)                                           \
-               : "l"(ptr), "r"(addend)                               \
-               : "memory")
-#define __simt_add_relaxed_32(ptr, old, addend)                      \
-  asm volatile("atom.add.relaxed." __simt_scope ".u32 %0, [%1], %2;" \
-               : "=r"(old)                                           \
-               : "l"(ptr), "r"(addend)                               \
-               : "memory")
-#define __simt_and_release_32(ptr, old, andend)                      \
-  asm volatile("atom.and.release." __simt_scope ".b32 %0, [%1], %2;" \
-               : "=r"(old)                                           \
-               : "l"(ptr), "r"(andend)                               \
-               : "memory")
-#define __simt_and_acquire_32(ptr, old, andend)                      \
-  asm volatile("atom.and.acquire." __simt_scope ".b32 %0, [%1], %2;" \
-               : "=r"(old)                                           \
-               : "l"(ptr), "r"(andend)                               \
-               : "memory")
-#define __simt_and_acq_rel_32(ptr, old, andend)                      \
-  asm volatile("atom.and.acq_rel." __simt_scope ".b32 %0, [%1], %2;" \
-               : "=r"(old)                                           \
-               : "l"(ptr), "r"(andend)                               \
-               : "memory")
-#define __simt_and_relaxed_32(ptr, old, andend)                      \
-  asm volatile("atom.and.relaxed." __simt_scope ".b32 %0, [%1], %2;" \
-               : "=r"(old)                                           \
-               : "l"(ptr), "r"(andend)                               \
-               : "memory")
-#define __simt_or_release_32(ptr, old, orend)                       \
-  asm volatile("atom.or.release." __simt_scope ".b32 %0, [%1], %2;" \
-               : "=r"(old)                                          \
-               : "l"(ptr), "r"(orend)                               \
-               : "memory")
-#define __simt_or_acquire_32(ptr, old, orend)                       \
-  asm volatile("atom.or.acquire." __simt_scope ".b32 %0, [%1], %2;" \
-               : "=r"(old)                                          \
-               : "l"(ptr), "r"(orend)                               \
-               : "memory")
-#define __simt_or_acq_rel_32(ptr, old, orend)                       \
-  asm volatile("atom.or.acq_rel." __simt_scope ".b32 %0, [%1], %2;" \
-               : "=r"(old)                                          \
-               : "l"(ptr), "r"(orend)                               \
-               : "memory")
-#define __simt_or_relaxed_32(ptr, old, orend)                       \
-  asm volatile("atom.or.relaxed." __simt_scope ".b32 %0, [%1], %2;" \
-               : "=r"(old)                                          \
-               : "l"(ptr), "r"(orend)                               \
-               : "memory")
-#define __simt_xor_release_32(ptr, old, xorend)                      \
-  asm volatile("atom.xor.release." __simt_scope ".b32 %0, [%1], %2;" \
-               : "=r"(old)                                           \
-               : "l"(ptr), "r"(xorend)                               \
-               : "memory")
-#define __simt_xor_acquire_32(ptr, old, xorend)                      \
-  asm volatile("atom.xor.acquire." __simt_scope ".b32 %0, [%1], %2;" \
-               : "=r"(old)                                           \
-               : "l"(ptr), "r"(xorend)                               \
-               : "memory")
-#define __simt_xor_acq_rel_32(ptr, old, xorend)                      \
-  asm volatile("atom.xor.acq_rel." __simt_scope ".b32 %0, [%1], %2;" \
-               : "=r"(old)                                           \
-               : "l"(ptr), "r"(xorend)                               \
-               : "memory")
-#define __simt_xor_relaxed_32(ptr, old, xorend)                      \
-  asm volatile("atom.xor.relaxed." __simt_scope ".b32 %0, [%1], %2;" \
-               : "=r"(old)                                           \
-               : "l"(ptr), "r"(xorend)                               \
-               : "memory")
-
-#define __simt_load_acquire_64(ptr, ret)                   \
-  asm volatile("ld.acquire." __simt_scope ".b64 %0, [%1];" \
-               : "=l"(ret)                                 \
-               : "l"(ptr)                                  \
-               : "memory")
-#define __simt_load_relaxed_64(ptr, ret)                   \
-  asm volatile("ld.relaxed." __simt_scope ".b64 %0, [%1];" \
-               : "=l"(ret)                                 \
-               : "l"(ptr)                                  \
-               : "memory")
-#define __simt_store_release_64(ptr, desired)                          \
-  asm volatile("st.release." __simt_scope ".b64 [%0], %1;" ::"l"(ptr), \
-               "l"(desired)                                            \
-               : "memory")
-#define __simt_store_relaxed_64(ptr, desired)                          \
-  asm volatile("st.relaxed." __simt_scope ".b64 [%0], %1;" ::"l"(ptr), \
-               "l"(desired)                                            \
-               : "memory")
-#define __simt_exch_release_64(ptr, old, desired)                     \
-  asm volatile("atom.exch.release." __simt_scope ".b64 %0, [%1], %2;" \
-               : "=l"(old)                                            \
-               : "l"(ptr), "l"(desired)                               \
-               : "memory")
-#define __simt_exch_acquire_64(ptr, old, desired)                     \
-  asm volatile("atom.exch.acquire." __simt_scope ".b64 %0, [%1], %2;" \
-               : "=l"(old)                                            \
-               : "l"(ptr), "l"(desired)                               \
-               : "memory")
-#define __simt_exch_acq_rel_64(ptr, old, desired)                     \
-  asm volatile("atom.exch.acq_rel." __simt_scope ".b64 %0, [%1], %2;" \
-               : "=l"(old)                                            \
-               : "l"(ptr), "l"(desired)                               \
-               : "memory")
-#define __simt_exch_relaxed_64(ptr, old, desired)                     \
-  asm volatile("atom.exch.relaxed." __simt_scope ".b64 %0, [%1], %2;" \
-               : "=l"(old)                                            \
-               : "l"(ptr), "l"(desired)                               \
-               : "memory")
-#define __simt_cas_release_64(ptr, old, expected, desired)               \
-  asm volatile("atom.cas.release." __simt_scope ".b64 %0, [%1], %2, %3;" \
-               : "=l"(old)                                               \
-               : "l"(ptr), "l"(expected), "l"(desired)                   \
-               : "memory")
-#define __simt_cas_acquire_64(ptr, old, expected, desired)               \
-  asm volatile("atom.cas.acquire." __simt_scope ".b64 %0, [%1], %2, %3;" \
-               : "=l"(old)                                               \
-               : "l"(ptr), "l"(expected), "l"(desired)                   \
-               : "memory")
-#define __simt_cas_acq_rel_64(ptr, old, expected, desired)               \
-  asm volatile("atom.cas.acq_rel." __simt_scope ".b64 %0, [%1], %2, %3;" \
-               : "=l"(old)                                               \
-               : "l"(ptr), "l"(expected), "l"(desired)                   \
-               : "memory")
-#define __simt_cas_relaxed_64(ptr, old, expected, desired)               \
-  asm volatile("atom.cas.relaxed." __simt_scope ".b64 %0, [%1], %2, %3;" \
-               : "=l"(old)                                               \
-               : "l"(ptr), "l"(expected), "l"(desired)                   \
-               : "memory")
-#define __simt_add_release_64(ptr, old, addend)                      \
-  asm volatile("atom.add.release." __simt_scope ".u64 %0, [%1], %2;" \
-               : "=l"(old)                                           \
-               : "l"(ptr), "l"(addend)                               \
-               : "memory")
-#define __simt_add_acquire_64(ptr, old, addend)                      \
-  asm volatile("atom.add.acquire." __simt_scope ".u64 %0, [%1], %2;" \
-               : "=l"(old)                                           \
-               : "l"(ptr), "l"(addend)                               \
-               : "memory")
-#define __simt_add_acq_rel_64(ptr, old, addend)                      \
-  asm volatile("atom.add.acq_rel." __simt_scope ".u64 %0, [%1], %2;" \
-               : "=l"(old)                                           \
-               : "l"(ptr), "l"(addend)                               \
-               : "memory")
-#define __simt_add_relaxed_64(ptr, old, addend)                      \
-  asm volatile("atom.add.relaxed." __simt_scope ".u64 %0, [%1], %2;" \
-               : "=l"(old)                                           \
-               : "l"(ptr), "l"(addend)                               \
-               : "memory")
-#define __simt_and_release_64(ptr, old, andend)                      \
-  asm volatile("atom.and.release." __simt_scope ".b64 %0, [%1], %2;" \
-               : "=l"(old)                                           \
-               : "l"(ptr), "l"(andend)                               \
-               : "memory")
-#define __simt_and_acquire_64(ptr, old, andend)                      \
-  asm volatile("atom.and.acquire." __simt_scope ".b64 %0, [%1], %2;" \
-               : "=l"(old)                                           \
-               : "l"(ptr), "l"(andend)                               \
-               : "memory")
-#define __simt_and_acq_rel_64(ptr, old, andend)                      \
-  asm volatile("atom.and.acq_rel." __simt_scope ".b64 %0, [%1], %2;" \
-               : "=l"(old)                                           \
-               : "l"(ptr), "l"(andend)                               \
-               : "memory")
-#define __simt_and_relaxed_64(ptr, old, andend)                      \
-  asm volatile("atom.and.relaxed." __simt_scope ".b64 %0, [%1], %2;" \
-               : "=l"(old)                                           \
-               : "l"(ptr), "l"(andend)                               \
-               : "memory")
-#define __simt_or_release_64(ptr, old, orend)                       \
-  asm volatile("atom.or.release." __simt_scope ".b64 %0, [%1], %2;" \
-               : "=l"(old)                                          \
-               : "l"(ptr), "l"(orend)                               \
-               : "memory")
-#define __simt_or_acquire_64(ptr, old, orend)                       \
-  asm volatile("atom.or.acquire." __simt_scope ".b64 %0, [%1], %2;" \
-               : "=l"(old)                                          \
-               : "l"(ptr), "l"(orend)                               \
-               : "memory")
-#define __simt_or_acq_rel_64(ptr, old, orend)                       \
-  asm volatile("atom.or.acq_rel." __simt_scope ".b64 %0, [%1], %2;" \
-               : "=l"(old)                                          \
-               : "l"(ptr), "l"(orend)                               \
-               : "memory")
-#define __simt_or_relaxed_64(ptr, old, orend)                       \
-  asm volatile("atom.or.relaxed." __simt_scope ".b64 %0, [%1], %2;" \
-               : "=l"(old)                                          \
-               : "l"(ptr), "l"(orend)                               \
-               : "memory")
-#define __simt_xor_release_64(ptr, old, xorend)                      \
-  asm volatile("atom.xor.release." __simt_scope ".b64 %0, [%1], %2;" \
-               : "=l"(old)                                           \
-               : "l"(ptr), "l"(xorend)                               \
-               : "memory")
-#define __simt_xor_acquire_64(ptr, old, xorend)                      \
-  asm volatile("atom.xor.acquire." __simt_scope ".b64 %0, [%1], %2;" \
-               : "=l"(old)                                           \
-               : "l"(ptr), "l"(xorend)                               \
-               : "memory")
-#define __simt_xor_acq_rel_64(ptr, old, xorend)                      \
-  asm volatile("atom.xor.acq_rel." __simt_scope ".b64 %0, [%1], %2;" \
-               : "=l"(old)                                           \
-               : "l"(ptr), "l"(xorend)                               \
-               : "memory")
-#define __simt_xor_relaxed_64(ptr, old, xorend)                      \
-  asm volatile("atom.xor.relaxed." __simt_scope ".b64 %0, [%1], %2;" \
-               : "=l"(old)                                           \
-               : "l"(ptr), "l"(xorend)                               \
-               : "memory")
-
-#define __simt_nanosleep(timeout) \
-  asm volatile("nanosleep.u32 %0;" ::"r"(unsigned(timeout)) :)
-
-/*
-    definitions
-*/
-
-#ifndef __GCC_ATOMIC_BOOL_LOCK_FREE
-#define __GCC_ATOMIC_BOOL_LOCK_FREE 2
-#define __GCC_ATOMIC_CHAR_LOCK_FREE 2
-#define __GCC_ATOMIC_CHAR16_T_LOCK_FREE 2
-#define __GCC_ATOMIC_CHAR32_T_LOCK_FREE 2
-#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2
-#define __GCC_ATOMIC_SHORT_LOCK_FREE 2
-#define __GCC_ATOMIC_INT_LOCK_FREE 2
-#define __GCC_ATOMIC_LONG_LOCK_FREE 2
-#define __GCC_ATOMIC_LLONG_LOCK_FREE 2
-#define __GCC_ATOMIC_POINTER_LOCK_FREE 2
-#endif
-
-#ifndef __ATOMIC_RELAXED
-#define __ATOMIC_RELAXED 0
-#define __ATOMIC_CONSUME 1
-#define __ATOMIC_ACQUIRE 2
-#define __ATOMIC_RELEASE 3
-#define __ATOMIC_ACQ_REL 4
-#define __ATOMIC_SEQ_CST 5
-#endif
-
-inline __device__ int __stronger_order_simt_(int a, int b) {
-  if (b == __ATOMIC_SEQ_CST) return __ATOMIC_SEQ_CST;
-  if (b == __ATOMIC_RELAXED) return a;
-  switch (a) {
-    case __ATOMIC_SEQ_CST:
-    case __ATOMIC_ACQ_REL: return a;
-    case __ATOMIC_CONSUME:
-    case __ATOMIC_ACQUIRE:
-      if (b != __ATOMIC_ACQUIRE)
-        return __ATOMIC_ACQ_REL;
-      else
-        return __ATOMIC_ACQUIRE;
-    case __ATOMIC_RELEASE:
-      if (b != __ATOMIC_RELEASE)
-        return __ATOMIC_ACQ_REL;
-      else
-        return __ATOMIC_RELEASE;
-    case __ATOMIC_RELAXED: return b;
-    default: assert(0);
-  }
-  return __ATOMIC_SEQ_CST;
-}
-
-/*
-    base
-*/
-
-#define DO__atomic_load_simt_(bytes, bits)                                \
-  template <class type, std::enable_if_t<sizeof(type) == bytes, int> = 0> \
-  void __device__ __atomic_load_simt_(const type *ptr, type *ret,         \
-                                      int memorder) {                     \
-    int##bits##_t tmp = 0;                                                \
-    switch (memorder) {                                                   \
-      case __ATOMIC_SEQ_CST: __simt_fence_sc_();                          \
-      case __ATOMIC_CONSUME:                                              \
-      case __ATOMIC_ACQUIRE: __simt_load_acquire_##bits(ptr, tmp); break; \
-      case __ATOMIC_RELAXED: __simt_load_relaxed_##bits(ptr, tmp); break; \
-      default: assert(0);                                                 \
-    }                                                                     \
-    memcpy(ret, &tmp, bytes);                                             \
-  }
-DO__atomic_load_simt_(1, 32) DO__atomic_load_simt_(2, 16)
-    DO__atomic_load_simt_(4, 32) DO__atomic_load_simt_(8, 64)
-
-        template <class type>
-        type __device__ __atomic_load_n_simt_(const type *ptr, int memorder) {
-  type ret;
-  __atomic_load_simt_(ptr, &ret, memorder);
-  return ret;
-}
-
-#define DO__atomic_store_simt_(bytes, bits)                                  \
-  template <class type, std::enable_if_t<sizeof(type) == bytes, int> = 0>    \
-  void __device__ __atomic_store_simt_(type *ptr, type *val, int memorder) { \
-    int##bits##_t tmp = 0;                                                   \
-    memcpy(&tmp, val, bytes);                                                \
-    switch (memorder) {                                                      \
-      case __ATOMIC_RELEASE: __simt_store_release_##bits(ptr, tmp); break;   \
-      case __ATOMIC_SEQ_CST: __simt_fence_sc_();                             \
-      case __ATOMIC_RELAXED: __simt_store_relaxed_##bits(ptr, tmp); break;   \
-      default: assert(0);                                                    \
-    }                                                                        \
-  }
-DO__atomic_store_simt_(1, 32) DO__atomic_store_simt_(2, 16)
-    DO__atomic_store_simt_(4, 32) DO__atomic_store_simt_(8, 64)
-
-        template <class type>
-        void __device__
-    __atomic_store_n_simt_(type *ptr, type val, int memorder) {
-  __atomic_store_simt_(ptr, &val, memorder);
-}
-
-#define DO__atomic_compare_exchange_simt_(bytes, bits)                    \
-  template <class type, std::enable_if_t<sizeof(type) == bytes, int> = 0> \
-  bool __device__ __atomic_compare_exchange_simt_(                        \
-      type *ptr, type *expected, const type *desired, bool,               \
-      int success_memorder, int failure_memorder) {                       \
-    int##bits##_t tmp = 0, old = 0, old_tmp;                              \
-    memcpy(&tmp, desired, bytes);                                         \
-    memcpy(&old, expected, bytes);                                        \
-    old_tmp = old;                                                        \
-    switch (__stronger_order_simt_(success_memorder, failure_memorder)) { \
-      case __ATOMIC_SEQ_CST: __simt_fence_sc_();                          \
-      case __ATOMIC_CONSUME:                                              \
-      case __ATOMIC_ACQUIRE:                                              \
-        __simt_cas_acquire_##bits(ptr, old, old_tmp, tmp);                \
-        break;                                                            \
-      case __ATOMIC_ACQ_REL:                                              \
-        __simt_cas_acq_rel_##bits(ptr, old, old_tmp, tmp);                \
-        break;                                                            \
-      case __ATOMIC_RELEASE:                                              \
-        __simt_cas_release_##bits(ptr, old, old_tmp, tmp);                \
-        break;                                                            \
-      case __ATOMIC_RELAXED:                                              \
-        __simt_cas_relaxed_##bits(ptr, old, old_tmp, tmp);                \
-        break;                                                            \
-      default: assert(0);                                                 \
-    }                                                                     \
-    bool const ret = old == old_tmp;                                      \
-    memcpy(expected, &old, bytes);                                        \
-    return ret;                                                           \
-  }
-DO__atomic_compare_exchange_simt_(4, 32)
-    DO__atomic_compare_exchange_simt_(8, 64)
-
-        template <class type, std::enable_if_t<sizeof(type) <= 2, int> = 0>
-        bool __device__
-    __atomic_compare_exchange_simt_(type *ptr, type *expected,
-                                    const type *desired, bool,
-                                    int success_memorder,
-                                    int failure_memorder) {
-  using R = std::conditional_t<std::is_volatile<type>::value, volatile uint32_t,
-                               uint32_t>;
-  auto const aligned = (R *)((intptr_t)ptr & ~(sizeof(uint32_t) - 1));
-  auto const offset  = uint32_t((intptr_t)ptr & (sizeof(uint32_t) - 1)) * 8;
-  auto const mask    = ((1 << sizeof(type) * 8) - 1) << offset;
-
-  uint32_t old = *expected << offset, old_value;
-  while (1) {
-    old_value = (old & mask) >> offset;
-    if (old_value != *expected) break;
-    uint32_t const attempt = (old & ~mask) | (*desired << offset);
-    if (__atomic_compare_exchange_simt_(aligned, &old, &attempt, true,
-                                        success_memorder, failure_memorder))
-      return true;
-  }
-  *expected = old_value;
-  return false;
-}
-
-template <class type>
-bool __device__ __atomic_compare_exchange_n_simt_(type *ptr, type *expected,
-                                                  type desired, bool weak,
-                                                  int success_memorder,
-                                                  int failure_memorder) {
-  return __atomic_compare_exchange_simt_(ptr, expected, &desired, weak,
-                                         success_memorder, failure_memorder);
-}
-
-#define DO__atomic_exchange_simt_(bytes, bits)                                 \
-  template <class type, std::enable_if_t<sizeof(type) == bytes, int> = 0>      \
-  void __device__ __atomic_exchange_simt_(type *ptr, type *val, type *ret,     \
-                                          int memorder) {                      \
-    int##bits##_t tmp = 0;                                                     \
-    memcpy(&tmp, val, bytes);                                                  \
-    switch (memorder) {                                                        \
-      case __ATOMIC_SEQ_CST: __simt_fence_sc_();                               \
-      case __ATOMIC_CONSUME:                                                   \
-      case __ATOMIC_ACQUIRE: __simt_exch_acquire_##bits(ptr, tmp, tmp); break; \
-      case __ATOMIC_ACQ_REL: __simt_exch_acq_rel_##bits(ptr, tmp, tmp); break; \
-      case __ATOMIC_RELEASE: __simt_exch_release_##bits(ptr, tmp, tmp); break; \
-      case __ATOMIC_RELAXED: __simt_exch_relaxed_##bits(ptr, tmp, tmp); break; \
-      default: assert(0);                                                      \
-    }                                                                          \
-    memcpy(ret, &tmp, bytes);                                                  \
-  }
-DO__atomic_exchange_simt_(4, 32) DO__atomic_exchange_simt_(8, 64)
-
-    template <class type, std::enable_if_t<sizeof(type) <= 2, int> = 0>
-    void __device__
-    __atomic_exchange_simt_(type *ptr, type *val, type *ret, int memorder) {
-  type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED);
-  while (!__atomic_compare_exchange_simt_(ptr, &expected, val, true, memorder,
-                                          memorder))
-    ;
-  *ret = expected;
-}
-
-template <class type>
-type __device__ __atomic_exchange_n_simt_(type *ptr, type val, int memorder) {
-  type ret;
-  __atomic_exchange_simt_(ptr, &val, &ret, memorder);
-  return ret;
-}
-
-#define DO__atomic_fetch_add_simt_(bytes, bits)                               \
-  template <class type, class delta,                                          \
-            std::enable_if_t<sizeof(type) == bytes, int> = 0>                 \
-  type __device__ __atomic_fetch_add_simt_(type *ptr, delta val,              \
-                                           int memorder) {                    \
-    type ret;                                                                 \
-    switch (memorder) {                                                       \
-      case __ATOMIC_SEQ_CST: __simt_fence_sc_();                              \
-      case __ATOMIC_CONSUME:                                                  \
-      case __ATOMIC_ACQUIRE: __simt_add_acquire_##bits(ptr, ret, val); break; \
-      case __ATOMIC_ACQ_REL: __simt_add_acq_rel_##bits(ptr, ret, val); break; \
-      case __ATOMIC_RELEASE: __simt_add_release_##bits(ptr, ret, val); break; \
-      case __ATOMIC_RELAXED: __simt_add_relaxed_##bits(ptr, ret, val); break; \
-      default: assert(0);                                                     \
-    }                                                                         \
-    return ret;                                                               \
-  }
-DO__atomic_fetch_add_simt_(4, 32) DO__atomic_fetch_add_simt_(8, 64)
-
-    template <class type, class delta,
-              std::enable_if_t<sizeof(type) <= 2, int> = 0>
-    type __device__
-    __atomic_fetch_add_simt_(type *ptr, delta val, int memorder) {
-  type expected      = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED);
-  type const desired = expected + val;
-  while (!__atomic_compare_exchange_simt_(ptr, &expected, &desired, true,
-                                          memorder, memorder))
-    ;
-  return expected;
-}
-
-#define DO__atomic_fetch_sub_simt_(bytes, bits)                                \
-  template <class type, class delta,                                           \
-            std::enable_if_t<sizeof(type) == bytes, int> = 0>                  \
-  type __device__ __atomic_fetch_sub_simt_(type *ptr, delta val,               \
-                                           int memorder) {                     \
-    type ret;                                                                  \
-    switch (memorder) {                                                        \
-      case __ATOMIC_SEQ_CST: __simt_fence_sc_();                               \
-      case __ATOMIC_CONSUME:                                                   \
-      case __ATOMIC_ACQUIRE: __simt_add_acquire_##bits(ptr, ret, -val); break; \
-      case __ATOMIC_ACQ_REL: __simt_add_acq_rel_##bits(ptr, ret, -val); break; \
-      case __ATOMIC_RELEASE: __simt_add_release_##bits(ptr, ret, -val); break; \
-      case __ATOMIC_RELAXED: __simt_add_relaxed_##bits(ptr, ret, -val); break; \
-      default: assert(0);                                                      \
-    }                                                                          \
-    return ret;                                                                \
-  }
-DO__atomic_fetch_sub_simt_(4, 32) DO__atomic_fetch_sub_simt_(8, 64)
-
-    template <class type, class delta,
-              std::enable_if_t<sizeof(type) <= 2, int> = 0>
-    type __device__
-    __atomic_fetch_sub_simt_(type *ptr, delta val, int memorder) {
-  type expected      = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED);
-  type const desired = expected - val;
-  while (!__atomic_compare_exchange_simt_(ptr, &expected, &desired, true,
-                                          memorder, memorder))
-    ;
-  return expected;
-}
-
-#define DO__atomic_fetch_and_simt_(bytes, bits)                               \
-  template <class type, std::enable_if_t<sizeof(type) == bytes, int> = 0>     \
-  type __device__ __atomic_fetch_and_simt_(type *ptr, type val,               \
-                                           int memorder) {                    \
-    type ret;                                                                 \
-    switch (memorder) {                                                       \
-      case __ATOMIC_SEQ_CST: __simt_fence_sc_();                              \
-      case __ATOMIC_CONSUME:                                                  \
-      case __ATOMIC_ACQUIRE: __simt_and_acquire_##bits(ptr, ret, val); break; \
-      case __ATOMIC_ACQ_REL: __simt_and_acq_rel_##bits(ptr, ret, val); break; \
-      case __ATOMIC_RELEASE: __simt_and_release_##bits(ptr, ret, val); break; \
-      case __ATOMIC_RELAXED: __simt_and_relaxed_##bits(ptr, ret, val); break; \
-      default: assert(0);                                                     \
-    }                                                                         \
-    return ret;                                                               \
-  }
-DO__atomic_fetch_and_simt_(4, 32) DO__atomic_fetch_and_simt_(8, 64)
-
-    template <class type, class delta,
-              std::enable_if_t<sizeof(type) <= 2, int> = 0>
-    type __device__
-    __atomic_fetch_and_simt_(type *ptr, delta val, int memorder) {
-  type expected      = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED);
-  type const desired = expected & val;
-  while (!__atomic_compare_exchange_simt_(ptr, &expected, &desired, true,
-                                          memorder, memorder))
-    ;
-  return expected;
-}
-
-#define DO__atomic_fetch_xor_simt_(bytes, bits)                               \
-  template <class type, std::enable_if_t<sizeof(type) == bytes, int> = 0>     \
-  type __device__ __atomic_fetch_xor_simt_(type *ptr, type val,               \
-                                           int memorder) {                    \
-    type ret;                                                                 \
-    switch (memorder) {                                                       \
-      case __ATOMIC_SEQ_CST: __simt_fence_sc_();                              \
-      case __ATOMIC_CONSUME:                                                  \
-      case __ATOMIC_ACQUIRE: __simt_xor_acquire_##bits(ptr, ret, val); break; \
-      case __ATOMIC_ACQ_REL: __simt_xor_acq_rel_##bits(ptr, ret, val); break; \
-      case __ATOMIC_RELEASE: __simt_xor_release_##bits(ptr, ret, val); break; \
-      case __ATOMIC_RELAXED: __simt_xor_relaxed_##bits(ptr, ret, val); break; \
-      default: assert(0);                                                     \
-    }                                                                         \
-    return ret;                                                               \
-  }
-DO__atomic_fetch_xor_simt_(4, 32) DO__atomic_fetch_xor_simt_(8, 64)
-
-    template <class type, class delta,
-              std::enable_if_t<sizeof(type) <= 2, int> = 0>
-    type __device__
-    __atomic_fetch_xor_simt_(type *ptr, delta val, int memorder) {
-  type expected      = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED);
-  type const desired = expected ^ val;
-  while (!__atomic_compare_exchange_simt_(ptr, &expected, &desired, true,
-                                          memorder, memorder))
-    ;
-  return expected;
-}
-
-#define DO__atomic_fetch_or_simt_(bytes, bits)                                 \
-  template <class type, std::enable_if_t<sizeof(type) == bytes, int> = 0>      \
-  type __device__ __atomic_fetch_or_simt_(type *ptr, type val, int memorder) { \
-    type ret;                                                                  \
-    switch (memorder) {                                                        \
-      case __ATOMIC_SEQ_CST: __simt_fence_sc_();                               \
-      case __ATOMIC_CONSUME:                                                   \
-      case __ATOMIC_ACQUIRE: __simt_or_acquire_##bits(ptr, ret, val); break;   \
-      case __ATOMIC_ACQ_REL: __simt_or_acq_rel_##bits(ptr, ret, val); break;   \
-      case __ATOMIC_RELEASE: __simt_or_release_##bits(ptr, ret, val); break;   \
-      case __ATOMIC_RELAXED: __simt_or_relaxed_##bits(ptr, ret, val); break;   \
-      default: assert(0);                                                      \
-    }                                                                          \
-    return ret;                                                                \
-  }
-DO__atomic_fetch_or_simt_(4, 32) DO__atomic_fetch_or_simt_(8, 64)
-
-    template <class type, class delta,
-              std::enable_if_t<sizeof(type) <= 2, int> = 0>
-    type __device__
-    __atomic_fetch_or_simt_(type *ptr, delta val, int memorder) {
-  type expected      = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED);
-  type const desired = expected | val;
-  while (!__atomic_compare_exchange_simt_(ptr, &expected, &desired, true,
-                                          memorder, memorder))
-    ;
-  return expected;
-}
-
-template <class type>
-inline bool __device__ __atomic_test_and_set_simt_(type *ptr, int memorder) {
-  return __atomic_exchange_n_simt_((char *)ptr, (char)1, memorder) == 1;
-}
-template <class type>
-inline void __device__ __atomic_clear_simt_(type *ptr, int memorder) {
-  return __atomic_store_n_simt_((char *)ptr, (char)0, memorder);
-}
-
-inline constexpr __device__ bool __atomic_always_lock_free_simt_(size_t size,
-                                                                 void *) {
-  return size <= 8;
-}
-inline __device__ bool __atomic_is_lock_free_simt_(size_t size, void *ptr) {
-  return __atomic_always_lock_free_simt_(size, ptr);
-}
-
-/*
-    fences
-*/
-
-inline void __device__ __atomic_thread_fence_simt(int memorder) {
-  switch (memorder) {
-    case __ATOMIC_SEQ_CST: __simt_fence_sc_(); break;
-    case __ATOMIC_CONSUME:
-    case __ATOMIC_ACQUIRE:
-    case __ATOMIC_ACQ_REL:
-    case __ATOMIC_RELEASE: __simt_fence_(); break;
-    case __ATOMIC_RELAXED: break;
-    default: assert(0);
-  }
-}
-inline void __device__ __atomic_signal_fence_simt(int memorder) {
-  __atomic_thread_fence_simt(memorder);
-}
-
-/*
-    non-volatile
-*/
-
-template <class type>
-type __device__ __atomic_load_n_simt(const type *ptr, int memorder) {
-  return __atomic_load_n_simt_(const_cast<const type *>(ptr), memorder);
-}
-template <class type>
-void __device__ __atomic_load_simt(const type *ptr, type *ret, int memorder) {
-  __atomic_load_simt_(const_cast<const type *>(ptr), ret, memorder);
-}
-template <class type>
-void __device__ __atomic_store_n_simt(type *ptr, type val, int memorder) {
-  __atomic_store_n_simt_(const_cast<type *>(ptr), val, memorder);
-}
-template <class type>
-void __device__ __atomic_store_simt(type *ptr, type *val, int memorder) {
-  __atomic_store_simt_(const_cast<type *>(ptr), val, memorder);
-}
-template <class type>
-type __device__ __atomic_exchange_n_simt(type *ptr, type val, int memorder) {
-  return __atomic_exchange_n_simt_(const_cast<type *>(ptr), val, memorder);
-}
-template <class type>
-void __device__ __atomic_exchange_simt(type *ptr, type *val, type *ret,
-                                       int memorder) {
-  __atomic_exchange_simt_(const_cast<type *>(ptr), val, ret, memorder);
-}
-template <class type>
-bool __device__ __atomic_compare_exchange_n_simt(type *ptr, type *expected,
-                                                 type desired, bool weak,
-                                                 int success_memorder,
-                                                 int failure_memorder) {
-  return __atomic_compare_exchange_n_simt_(const_cast<type *>(ptr), expected,
-                                           desired, weak, success_memorder,
-                                           failure_memorder);
-}
-template <class type>
-bool __device__ __atomic_compare_exchange_simt(type *ptr, type *expected,
-                                               type *desired, bool weak,
-                                               int success_memorder,
-                                               int failure_memorder) {
-  return __atomic_compare_exchange_simt_(const_cast<type *>(ptr), expected,
-                                         desired, weak, success_memorder,
-                                         failure_memorder);
-}
-template <class type, class delta>
-type __device__ __atomic_fetch_add_simt(type *ptr, delta val, int memorder) {
-  return __atomic_fetch_add_simt_(const_cast<type *>(ptr), val, memorder);
-}
-template <class type, class delta>
-type __device__ __atomic_fetch_sub_simt(type *ptr, delta val, int memorder) {
-  return __atomic_fetch_sub_simt_(const_cast<type *>(ptr), val, memorder);
-}
-template <class type>
-type __device__ __atomic_fetch_and_simt(type *ptr, type val, int memorder) {
-  return __atomic_fetch_and_simt_(const_cast<type *>(ptr), val, memorder);
-}
-template <class type>
-type __device__ __atomic_fetch_xor_simt(type *ptr, type val, int memorder) {
-  return __atomic_fetch_xor_simt_(const_cast<type *>(ptr), val, memorder);
-}
-template <class type>
-type __device__ __atomic_fetch_or_simt(type *ptr, type val, int memorder) {
-  return __atomic_fetch_or_simt_(const_cast<type *>(ptr), val, memorder);
-}
-template <class type>
-bool __device__ __atomic_test_and_set_simt(void *ptr, int memorder) {
-  return __atomic_test_and_set_simt_(const_cast<void *>(ptr), memorder);
-}
-template <class type>
-void __device__ __atomic_clear_simt(void *ptr, int memorder) {
-  return __atomic_clear_simt_(const_cast<void *>(ptr), memorder);
-}
-inline bool __device__ __atomic_always_lock_free_simt(size_t size, void *ptr) {
-  return __atomic_always_lock_free_simt_(size, const_cast<void *>(ptr));
-}
-inline bool __device__ __atomic_is_lock_free_simt(size_t size, void *ptr) {
-  return __atomic_is_lock_free_simt_(size, const_cast<void *>(ptr));
-}
-
-/*
-    volatile
-*/
-
-template <class type>
-type __device__ __atomic_load_n_simt(const volatile type *ptr, int memorder) {
-  return __atomic_load_n_simt_(const_cast<const type *>(ptr), memorder);
-}
-template <class type>
-void __device__ __atomic_load_simt(const volatile type *ptr, type *ret,
-                                   int memorder) {
-  __atomic_load_simt_(const_cast<const type *>(ptr), ret, memorder);
-}
-template <class type>
-void __device__ __atomic_store_n_simt(volatile type *ptr, type val,
-                                      int memorder) {
-  __atomic_store_n_simt_(const_cast<type *>(ptr), val, memorder);
-}
-template <class type>
-void __device__ __atomic_store_simt(volatile type *ptr, type *val,
-                                    int memorder) {
-  __atomic_store_simt_(const_cast<type *>(ptr), val, memorder);
-}
-template <class type>
-type __device__ __atomic_exchange_n_simt(volatile type *ptr, type val,
-                                         int memorder) {
-  return __atomic_exchange_n_simt_(const_cast<type *>(ptr), val, memorder);
-}
-template <class type>
-void __device__ __atomic_exchange_simt(volatile type *ptr, type *val, type *ret,
-                                       int memorder) {
-  __atomic_exchange_simt_(const_cast<type *>(ptr), val, ret, memorder);
-}
-template <class type>
-bool __device__ __atomic_compare_exchange_n_simt(volatile type *ptr,
-                                                 type *expected, type desired,
-                                                 bool weak,
-                                                 int success_memorder,
-                                                 int failure_memorder) {
-  return __atomic_compare_exchange_n_simt_(const_cast<type *>(ptr), expected,
-                                           desired, weak, success_memorder,
-                                           failure_memorder);
-}
-template <class type>
-bool __device__ __atomic_compare_exchange_simt(volatile type *ptr,
-                                               type *expected, type *desired,
-                                               bool weak, int success_memorder,
-                                               int failure_memorder) {
-  return __atomic_compare_exchange_simt_(const_cast<type *>(ptr), expected,
-                                         desired, weak, success_memorder,
-                                         failure_memorder);
-}
-template <class type, class delta>
-type __device__ __atomic_fetch_add_simt(volatile type *ptr, delta val,
-                                        int memorder) {
-  return __atomic_fetch_add_simt_(const_cast<type *>(ptr), val, memorder);
-}
-template <class type, class delta>
-type __device__ __atomic_fetch_sub_simt(volatile type *ptr, delta val,
-                                        int memorder) {
-  return __atomic_fetch_sub_simt_(const_cast<type *>(ptr), val, memorder);
-}
-template <class type>
-type __device__ __atomic_fetch_and_simt(volatile type *ptr, type val,
-                                        int memorder) {
-  return __atomic_fetch_and_simt_(const_cast<type *>(ptr), val, memorder);
-}
-template <class type>
-type __device__ __atomic_fetch_xor_simt(volatile type *ptr, type val,
-                                        int memorder) {
-  return __atomic_fetch_xor_simt_(const_cast<type *>(ptr), val, memorder);
-}
-template <class type>
-type __device__ __atomic_fetch_or_simt(volatile type *ptr, type val,
-                                       int memorder) {
-  return __atomic_fetch_or_simt_(const_cast<type *>(ptr), val, memorder);
-}
-template <class type>
-bool __device__ __atomic_test_and_set_simt(volatile void *ptr, int memorder) {
-  return __atomic_test_and_set_simt_(const_cast<void *>(ptr), memorder);
-}
-template <class type>
-void __device__ __atomic_clear_simt(volatile void *ptr, int memorder) {
-  return __atomic_clear_simt_(const_cast<void *>(ptr), memorder);
-}
-
-}  // end namespace Impl
-}  // end namespace Kokkos
-
-#endif  //_SIMT_DETAILS_CONFIG
-
-#ifndef KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED
-/*
-    builtins
-*/
-
-#define __atomic_load_n __atomic_load_n_simt
-#define __atomic_load __atomic_load_simt
-#define __atomic_store_n __atomic_store_n_simt
-#define __atomic_store __atomic_store_simt
-#define __atomic_exchange_n __atomic_exchange_n_simt
-#define __atomic_exchange __atomic_exchange_simt
-#define __atomic_compare_exchange_n __atomic_compare_exchange_n_simt
-#define __atomic_compare_exchange __atomic_compare_exchange_simt
-#define __atomic_fetch_add __atomic_fetch_add_simt
-#define __atomic_fetch_sub __atomic_fetch_sub_simt
-#define __atomic_fetch_and __atomic_fetch_and_simt
-#define __atomic_fetch_xor __atomic_fetch_xor_simt
-#define __atomic_fetch_or __atomic_fetch_or_simt
-#define __atomic_test_and_set __atomic_test_and_set_simt
-#define __atomic_clear __atomic_clear_simt
-#define __atomic_always_lock_free __atomic_always_lock_free_simt
-#define __atomic_is_lock_free __atomic_is_lock_free_simt
-#define __atomic_thread_fence __atomic_thread_fence_simt
-#define __atomic_signal_fence __atomic_signal_fence_simt
-
-#define KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED
-
-#endif  //__CUDA_ARCH__ && KOKKOS_ENABLE_CUDA_ASM_ATOMICS
-#endif  // KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Atomic_Intrinsics_Restore_Builtins.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Atomic_Intrinsics_Restore_Builtins.hpp
deleted file mode 100644
index a7dfc15d7a74781cf1c36b6d5eb362714000d81e..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Atomic_Intrinsics_Restore_Builtins.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#ifdef KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED
-
-#undef __atomic_load_n
-#undef __atomic_load
-#undef __atomic_store_n
-#undef __atomic_store
-#undef __atomic_exchange_n
-#undef __atomic_exchange
-#undef __atomic_compare_exchange_n
-#undef __atomic_compare_exchange
-#undef __atomic_fetch_add
-#undef __atomic_fetch_sub
-#undef __atomic_fetch_and
-#undef __atomic_fetch_xor
-#undef __atomic_fetch_or
-#undef __atomic_test_and_set
-#undef __atomic_clear
-#undef __atomic_always_lock_free
-#undef __atomic_is_lock_free
-#undef __atomic_thread_fence
-#undef __atomic_signal_fence
-
-#undef KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED
-
-#endif  // KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index 75c1686dc2ccff2bf67e591f9a6ec07824f28eaf..3de8b2820ed13eee43d2bd2b9be682afb4af0674 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -183,9 +183,7 @@ int cuda_get_max_block_size(const CudaInternal* cuda_instance,
                             const FunctorType& f, const size_t vector_length,
                             const size_t shmem_block,
                             const size_t shmem_thread) {
-  (void)cuda_instance;
-
-  auto const& prop = Kokkos::Cuda().cuda_device_prop();
+  auto const& prop = cuda_instance->m_deviceProp;
 
   auto const block_size_to_dynamic_shmem = [&f, vector_length, shmem_block,
                                             shmem_thread](int block_size) {
@@ -209,9 +207,7 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
                             const FunctorType& f, const size_t vector_length,
                             const size_t shmem_block,
                             const size_t shmem_thread) {
-  (void)cuda_instance;
-
-  auto const& prop = Kokkos::Cuda().cuda_device_prop();
+  auto const& prop = cuda_instance->m_deviceProp;
 
   auto const block_size_to_dynamic_shmem = [&f, vector_length, shmem_block,
                                             shmem_thread](int block_size) {
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp
index 8e5d4a07068bfca839ac19de371796fb9c01ce5e..a4d064e544a79bec682aff97305f7a0a1e640e73 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp
@@ -30,8 +30,7 @@
 #include <Kokkos_Parallel_Reduce.hpp>
 #include <Kokkos_PointerOwnership.hpp>
 
-#include <Kokkos_Cuda.hpp>
-#include <cuda_runtime_api.h>
+#include <Cuda/Kokkos_Cuda.hpp>
 
 namespace Kokkos {
 namespace Impl {
@@ -133,8 +132,9 @@ template <class KernelType>
 struct get_graph_node_kernel_type<KernelType, Kokkos::ParallelReduceTag>
     : type_identity<GraphNodeKernelImpl<
           Kokkos::Cuda, typename KernelType::Policy,
-          typename KernelType::functor_type, Kokkos::ParallelReduceTag,
-          typename KernelType::reducer_type>> {};
+          CombinedFunctorReducer<typename KernelType::functor_type,
+                                 typename KernelType::reducer_type>,
+          Kokkos::ParallelReduceTag>> {};
 
 //==============================================================================
 // <editor-fold desc="get_cuda_graph_*() helper functions"> {{{1
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNode_Impl.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNode_Impl.hpp
index a586d30147a87f71fcd780f1e413ebd9b8bf9714..ff0aa0da0de2951e62854d8b46c2b441acbda2b9 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNode_Impl.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNode_Impl.hpp
@@ -25,8 +25,7 @@
 
 #include <impl/Kokkos_GraphImpl.hpp>  // GraphAccess needs to be complete
 
-#include <Kokkos_Cuda.hpp>
-#include <cuda_runtime_api.h>
+#include <Cuda/Kokkos_Cuda.hpp>
 
 namespace Kokkos {
 namespace Impl {
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp
index 26face64c91701638978ff5e39dcce6021f6be58..fcc3ff04ff58c24f3c6939c915b3ceeaeedf5776 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp
@@ -30,9 +30,9 @@
 #include <impl/Kokkos_GraphNodeImpl.hpp>
 #include <Cuda/Kokkos_Cuda_GraphNode_Impl.hpp>
 
-#include <Kokkos_Cuda.hpp>
-#include <cuda_runtime_api.h>
+#include <Cuda/Kokkos_Cuda.hpp>
 #include <Cuda/Kokkos_Cuda_Error.hpp>
+#include <Cuda/Kokkos_Cuda_Instance.hpp>
 
 namespace Kokkos {
 namespace Impl {
@@ -55,8 +55,11 @@ struct GraphImpl<Kokkos::Cuda> {
     constexpr size_t error_log_size = 256;
     cudaGraphNode_t error_node      = nullptr;
     char error_log[error_log_size];
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphInstantiate(
-        &m_graph_exec, m_graph, &error_node, error_log, error_log_size));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        (m_execution_space.impl_internal_space_instance()
+             ->cuda_graph_instantiate_wrapper(&m_graph_exec, m_graph,
+                                              &error_node, error_log,
+                                              error_log_size)));
     // TODO @graphs print out errors
   }
 
@@ -83,24 +86,31 @@ struct GraphImpl<Kokkos::Cuda> {
     m_execution_space.fence("Kokkos::GraphImpl::~GraphImpl: Graph Destruction");
     KOKKOS_EXPECTS(bool(m_graph))
     if (bool(m_graph_exec)) {
-      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphExecDestroy(m_graph_exec));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(
+          (m_execution_space.impl_internal_space_instance()
+               ->cuda_graph_exec_destroy_wrapper(m_graph_exec)));
     }
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphDestroy(m_graph));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        (m_execution_space.impl_internal_space_instance()
+             ->cuda_graph_destroy_wrapper(m_graph)));
   };
 
   explicit GraphImpl(Kokkos::Cuda arg_instance)
       : m_execution_space(std::move(arg_instance)) {
     KOKKOS_IMPL_CUDA_SAFE_CALL(
-        cudaGraphCreate(&m_graph, cuda_graph_flags_t{0}));
+        (m_execution_space.impl_internal_space_instance()
+             ->cuda_graph_create_wrapper(&m_graph, cuda_graph_flags_t{0})));
   }
 
   void add_node(std::shared_ptr<aggregate_node_impl_t> const& arg_node_ptr) {
     // All of the predecessors are just added as normal, so all we need to
     // do here is add an empty node
     KOKKOS_IMPL_CUDA_SAFE_CALL(
-        cudaGraphAddEmptyNode(&(arg_node_ptr->node_details_t::node), m_graph,
-                              /* dependencies = */ nullptr,
-                              /* numDependencies = */ 0));
+        (m_execution_space.impl_internal_space_instance()
+             ->cuda_graph_add_empty_node_wrapper(
+                 &(arg_node_ptr->node_details_t::node), m_graph,
+                 /* dependencies = */ nullptr,
+                 /* numDependencies = */ 0)));
   }
 
   template <class NodeImpl>
@@ -146,7 +156,9 @@ struct GraphImpl<Kokkos::Cuda> {
     KOKKOS_EXPECTS(bool(cuda_node))
 
     KOKKOS_IMPL_CUDA_SAFE_CALL(
-        cudaGraphAddDependencies(m_graph, &pred_cuda_node, &cuda_node, 1));
+        (m_execution_space.impl_internal_space_instance()
+             ->cuda_graph_add_dependencies_wrapper(m_graph, &pred_cuda_node,
+                                                   &cuda_node, 1)));
   }
 
   void submit() {
@@ -154,7 +166,8 @@ struct GraphImpl<Kokkos::Cuda> {
       _instantiate_graph();
     }
     KOKKOS_IMPL_CUDA_SAFE_CALL(
-        cudaGraphLaunch(m_graph_exec, m_execution_space.cuda_stream()));
+        (m_execution_space.impl_internal_space_instance()
+             ->cuda_graph_launch_wrapper(m_graph_exec)));
   }
 
   execution_space const& get_execution_space() const noexcept {
@@ -167,9 +180,11 @@ struct GraphImpl<Kokkos::Cuda> {
     auto rv = std::make_shared<root_node_impl_t>(
         get_execution_space(), _graph_node_is_root_ctor_tag{});
     KOKKOS_IMPL_CUDA_SAFE_CALL(
-        cudaGraphAddEmptyNode(&(rv->node_details_t::node), m_graph,
-                              /* dependencies = */ nullptr,
-                              /* numDependencies = */ 0));
+        (m_execution_space.impl_internal_space_instance()
+             ->cuda_graph_add_empty_node_wrapper(&(rv->node_details_t::node),
+                                                 m_graph,
+                                                 /* dependencies = */ nullptr,
+                                                 /* numDependencies = */ 0)));
     KOKKOS_ENSURES(bool(rv->node_details_t::node))
     return rv;
   }
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
index eaa6263c406fc2e0d76de023a193e8a3f5dcec93..59acaa6d7611d9e6ce98cf6b52e84d413b897fbd 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
@@ -260,10 +260,16 @@ KOKKOS_INLINE_FUNCTION
 
 /************************** bhalf conversions *********************************/
 // Go in this branch if CUDA version is >= 11.0.0 and less than 11.1.0 or if the
-// architecture is not Ampere
+// architecture is older than Ampere
+#if !defined(KOKKOS_ARCH_KEPLER) && !defined(KOKKOS_ARCH_MAXWELL) && \
+    !defined(KOKKOS_ARCH_PASCAL) && !defined(KOKKOS_ARCH_VOLTA) &&   \
+    !defined(KOKKOS_ARCH_TURING75)
+#define KOKKOS_IMPL_NVIDIA_GPU_ARCH_SUPPORT_BHALF
+#endif
+
 #if CUDA_VERSION >= 11000 && \
     (CUDA_VERSION < 11010 || \
-     !(defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)))
+     !defined(KOKKOS_IMPL_NVIDIA_GPU_ARCH_SUPPORT_BHALF))
 KOKKOS_INLINE_FUNCTION
 bhalf_t cast_to_bhalf(bhalf_t val) { return val; }
 
@@ -390,8 +396,7 @@ KOKKOS_INLINE_FUNCTION
 }
 #endif  // CUDA_VERSION >= 11000 && CUDA_VERSION < 11010
 
-#if CUDA_VERSION >= 11010 && \
-    ((defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)))
+#if CUDA_VERSION >= 11010 && defined(KOKKOS_IMPL_NVIDIA_GPU_ARCH_SUPPORT_BHALF)
 KOKKOS_INLINE_FUNCTION
 bhalf_t cast_to_bhalf(bhalf_t val) { return val; }
 KOKKOS_INLINE_FUNCTION
@@ -473,6 +478,8 @@ KOKKOS_INLINE_FUNCTION
   return static_cast<T>(cast_from_bhalf<unsigned long long>(val));
 }
 #endif  // CUDA_VERSION >= 11010
+
+#undef KOKKOS_IMPL_NVIDIA_GPU_ARCH_SUPPORT_BHALF
 }  // namespace Experimental
 
 #if (CUDA_VERSION >= 11000)
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
index 3dbe179d66ed24a7f88d6af074db8f1e45344b77..d7f853d9910264126160790b3c443fe667841d68 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -29,10 +29,10 @@
 #include <Cuda/Kokkos_Cuda_Error.hpp>
 #include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
 #include <Cuda/Kokkos_Cuda_Instance.hpp>
-#include <Cuda/Kokkos_Cuda_Locks.hpp>
 #include <Cuda/Kokkos_Cuda_UniqueToken.hpp>
 #include <impl/Kokkos_Error.hpp>
 #include <impl/Kokkos_Tools.hpp>
+#include <impl/Kokkos_CheckedIntegerOps.hpp>
 #include <impl/Kokkos_DeviceManagement.hpp>
 #include <impl/Kokkos_ExecSpaceManager.hpp>
 
@@ -101,16 +101,27 @@ int cuda_kernel_arch() {
   int arch    = 0;
   int *d_arch = nullptr;
 
-  cudaMalloc(reinterpret_cast<void **>(&d_arch), sizeof(int));
-  cudaMemcpy(d_arch, &arch, sizeof(int), cudaMemcpyDefault);
+  KOKKOS_IMPL_CUDA_SAFE_CALL((CudaInternal::singleton().cuda_malloc_wrapper(
+      reinterpret_cast<void **>(&d_arch), sizeof(int))));
+  KOKKOS_IMPL_CUDA_SAFE_CALL((CudaInternal::singleton().cuda_memcpy_wrapper(
+      d_arch, &arch, sizeof(int), cudaMemcpyDefault)));
 
   query_cuda_kernel_arch<<<1, 1>>>(d_arch);
 
-  cudaMemcpy(&arch, d_arch, sizeof(int), cudaMemcpyDefault);
-  cudaFree(d_arch);
+  KOKKOS_IMPL_CUDA_SAFE_CALL((CudaInternal::singleton().cuda_memcpy_wrapper(
+      &arch, d_arch, sizeof(int), cudaMemcpyDefault)));
+  KOKKOS_IMPL_CUDA_SAFE_CALL(
+      (CudaInternal::singleton().cuda_free_wrapper(d_arch)));
   return arch;
 }
 
+constexpr auto sizeScratchGrain =
+    sizeof(Cuda::size_type[Impl::CudaTraits::WarpSize]);
+
+std::size_t scratch_count(const std::size_t size) {
+  return (size + sizeScratchGrain - 1) / sizeScratchGrain;
+}
+
 }  // namespace
 
 Kokkos::View<uint32_t *, Kokkos::CudaSpace> cuda_global_unique_token_locks(
@@ -124,14 +135,25 @@ Kokkos::View<uint32_t *, Kokkos::CudaSpace> cuda_global_unique_token_locks(
   return locks;
 }
 
+// FIXME_CUDA_MULTIPLE_DEVICES
 void cuda_device_synchronize(const std::string &name) {
   Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Cuda>(
       name,
       Kokkos::Tools::Experimental::SpecialSynchronizationCases::
           GlobalDeviceSynchronization,
-      []() {  // TODO: correct device ID
-        KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
+#if defined(KOKKOS_COMPILER_CLANG)
+      // annotate with __host__ silence a clang warning about using
+      // cudaDeviceSynchronize in device code
+      [] __host__() {
+        KOKKOS_IMPL_CUDA_SAFE_CALL(
+            (CudaInternal::singleton().cuda_device_synchronize_wrapper()));
       });
+#else
+      []() {
+        KOKKOS_IMPL_CUDA_SAFE_CALL(
+            (CudaInternal::singleton().cuda_device_synchronize_wrapper()));
+      });
+#endif
 }
 
 void cuda_stream_synchronize(const cudaStream_t stream, const CudaInternal *ptr,
@@ -140,8 +162,9 @@ void cuda_stream_synchronize(const cudaStream_t stream, const CudaInternal *ptr,
       name,
       Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{
           ptr->impl_get_instance_id()},
-      [&]() {  // TODO: correct device ID
-        KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
+      [&]() {
+        KOKKOS_IMPL_CUDA_SAFE_CALL(
+            (ptr->cuda_stream_synchronize_wrapper(stream)));
       });
 }
 
@@ -150,16 +173,20 @@ void cuda_stream_synchronize(
     Kokkos::Tools::Experimental::SpecialSynchronizationCases reason,
     const std::string &name) {
   Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Cuda>(
-      name, reason, [&]() {  // TODO: correct device ID
-        KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
+      name, reason, [&]() {
+        KOKKOS_IMPL_CUDA_SAFE_CALL(
+            (Impl::CudaInternal::singleton().cuda_stream_synchronize_wrapper(
+                stream)));
       });
 }
 
 void cuda_internal_error_throw(cudaError e, const char *name, const char *file,
                                const int line) {
   std::ostringstream out;
-  out << name << " error( " << cudaGetErrorName(e)
-      << "): " << cudaGetErrorString(e);
+  out << name << " error( "
+      << CudaInternal::singleton().cuda_get_error_name_wrapper<false>(e)
+      << "): "
+      << CudaInternal::singleton().cuda_get_error_string_wrapper<false>(e);
   if (file) {
     out << " " << file << ":" << line;
   }
@@ -169,8 +196,10 @@ void cuda_internal_error_throw(cudaError e, const char *name, const char *file,
 void cuda_internal_error_abort(cudaError e, const char *name, const char *file,
                                const int line) {
   std::ostringstream out;
-  out << name << " error( " << cudaGetErrorName(e)
-      << "): " << cudaGetErrorString(e);
+  out << name << " error( "
+      << CudaInternal::singleton().cuda_get_error_name_wrapper<false>(e)
+      << "): "
+      << CudaInternal::singleton().cuda_get_error_string_wrapper<false>(e);
   if (file) {
     out << " " << file << ":" << line;
   }
@@ -246,7 +275,9 @@ CudaInternalDevices::CudaInternalDevices() {
   // See 'cudaSetDeviceFlags' for host-device thread interaction
   // Section 4.4.2.6 of the CUDA Toolkit Reference Manual
 
-  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&m_cudaDevCount));
+  KOKKOS_IMPL_CUDA_SAFE_CALL(
+      (CudaInternal::singleton().cuda_get_device_count_wrapper<false>(
+          &m_cudaDevCount)));
 
   if (m_cudaDevCount > MAXIMUM_DEVICE_COUNT) {
     Kokkos::abort(
@@ -254,7 +285,9 @@ CudaInternalDevices::CudaInternalDevices() {
         "have. Please report this to github.com/kokkos/kokkos.");
   }
   for (int i = 0; i < m_cudaDevCount; ++i) {
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceProperties(m_cudaProp + i, i));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        (CudaInternal::singleton().cuda_get_device_properties_wrapper<false>(
+            m_cudaProp + i, i)));
   }
 }
 
@@ -331,86 +364,47 @@ CudaInternal &CudaInternal::singleton() {
   return self;
 }
 void CudaInternal::fence(const std::string &name) const {
-  Impl::cuda_stream_synchronize(m_stream, this, name);
+  Impl::cuda_stream_synchronize(get_stream(), this, name);
 }
 void CudaInternal::fence() const {
   fence("Kokkos::CudaInternal::fence(): Unnamed Instance Fence");
 }
 
 void CudaInternal::initialize(cudaStream_t stream, bool manage_stream) {
+  KOKKOS_EXPECTS(!is_initialized());
+
   if (was_finalized)
     Kokkos::abort("Calling Cuda::initialize after Cuda::finalize is illegal\n");
   was_initialized = true;
-  if (is_initialized()) return;
-
-#ifndef KOKKOS_IMPL_TURN_OFF_CUDA_HOST_INIT_CHECK
-  if (!HostSpace::execution_space::impl_is_initialized()) {
-    const std::string msg(
-        "Cuda::initialize ERROR : HostSpace::execution_space is not "
-        "initialized");
-    throw_runtime_exception(msg);
-  }
-#endif
-
-  const bool ok_init = nullptr == m_scratchSpace || nullptr == m_scratchFlags;
-
-  if (ok_init) {
-    //----------------------------------
-    // Multiblock reduction uses scratch flags for counters
-    // and scratch space for partial reduction values.
-    // Allocate some initial space.  This will grow as needed.
-
-    {
-      const unsigned reduce_block_count =
-          m_maxWarpCount * Impl::CudaTraits::WarpSize;
-
-      (void)scratch_unified(16 * sizeof(size_type));
-      (void)scratch_flags(reduce_block_count * 2 * sizeof(size_type));
-      (void)scratch_space(reduce_block_count * 16 * sizeof(size_type));
-    }
-  } else {
-    std::ostringstream msg;
-    msg << "Kokkos::Cuda::initialize(" << m_cudaDev
-        << ") FAILED : Already initialized";
-    Kokkos::Impl::throw_runtime_exception(msg.str());
-  }
 
-#ifdef KOKKOS_ENABLE_CUDA_UVM
-  const char *env_force_device_alloc =
-      getenv("CUDA_MANAGED_FORCE_DEVICE_ALLOC");
-  bool force_device_alloc;
-  if (env_force_device_alloc == nullptr)
-    force_device_alloc = false;
-  else
-    force_device_alloc = std::stoi(env_force_device_alloc) != 0;
+  //----------------------------------
+  // Multiblock reduction uses scratch flags for counters
+  // and scratch space for partial reduction values.
+  // Allocate some initial space.  This will grow as needed.
 
-  const char *env_visible_devices = getenv("CUDA_VISIBLE_DEVICES");
-  bool visible_devices_one        = true;
-  if (env_visible_devices == nullptr) visible_devices_one = false;
+  {
+    const unsigned reduce_block_count =
+        m_maxWarpCount * Impl::CudaTraits::WarpSize;
 
-  if (Kokkos::show_warnings() &&
-      (!visible_devices_one && !force_device_alloc)) {
-    std::cerr << R"warning(
-Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
-                                  without setting CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 or
-                                  setting CUDA_VISIBLE_DEVICES.
-                                  This could on multi GPU systems lead to severe performance"
-                                  penalties.)warning"
-              << std::endl;
+    (void)scratch_unified(16 * sizeof(size_type));
+    (void)scratch_flags(reduce_block_count * 2 * sizeof(size_type));
+    (void)scratch_space(reduce_block_count * 16 * sizeof(size_type));
   }
-#endif
 
   // Init the array for used for arbitrarily sized atomics
-  if (this == &singleton()) Impl::initialize_host_cuda_lock_arrays();
+  if (this == &singleton()) {
+    desul::Impl::init_lock_arrays();  // FIXME
+  }
 
   // Allocate a staging buffer for constant mem in pinned host memory
   // and an event to avoid overwriting driver for previous kernel launches
   if (this == &singleton()) {
-    KOKKOS_IMPL_CUDA_SAFE_CALL(
-        cudaMallocHost(reinterpret_cast<void **>(&constantMemHostStaging),
-                       CudaTraits::ConstantMemoryUsage));
+    KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_malloc_host_wrapper(
+        reinterpret_cast<void **>(&constantMemHostStaging),
+        CudaTraits::ConstantMemoryUsage)));
 
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaEventCreate(&constantMemReusable));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        (cuda_event_create_wrapper(&constantMemReusable)));
   }
 
   m_stream        = stream;
@@ -420,37 +414,37 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
     m_team_scratch_ptr[i]          = nullptr;
   }
 
+  m_num_scratch_locks = concurrency();
   KOKKOS_IMPL_CUDA_SAFE_CALL(
-      cudaMalloc(&m_scratch_locks, sizeof(int32_t) * concurrency()));
-  KOKKOS_IMPL_CUDA_SAFE_CALL(
-      cudaMemset(m_scratch_locks, 0, sizeof(int32_t) * concurrency()));
+      (cuda_malloc_wrapper(reinterpret_cast<void **>(&m_scratch_locks),
+                           sizeof(int32_t) * m_num_scratch_locks)));
+  KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_memset_wrapper(
+      m_scratch_locks, 0, sizeof(int32_t) * m_num_scratch_locks)));
 }
 
 //----------------------------------------------------------------------------
 
-using ScratchGrain = Cuda::size_type[Impl::CudaTraits::WarpSize];
-enum { sizeScratchGrain = sizeof(ScratchGrain) };
-
 Cuda::size_type *CudaInternal::scratch_flags(const std::size_t size) const {
   if (verify_is_initialized("scratch_flags") &&
-      m_scratchFlagsCount * sizeScratchGrain < size) {
-    m_scratchFlagsCount = (size + sizeScratchGrain - 1) / sizeScratchGrain;
+      m_scratchFlagsCount < scratch_count(size)) {
+    m_scratchFlagsCount = scratch_count(size);
 
     using Record =
         Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>;
 
     if (m_scratchFlags) Record::decrement(Record::get_record(m_scratchFlags));
 
-    Record *const r =
-        Record::allocate(Kokkos::CudaSpace(), "Kokkos::InternalScratchFlags",
-                         (sizeof(ScratchGrain) * m_scratchFlagsCount));
+    std::size_t alloc_size =
+        multiply_overflow_abort(m_scratchFlagsCount, sizeScratchGrain);
+    Record *const r = Record::allocate(
+        Kokkos::CudaSpace(), "Kokkos::InternalScratchFlags", alloc_size);
 
     Record::increment(r);
 
     m_scratchFlags = reinterpret_cast<size_type *>(r->data());
 
     KOKKOS_IMPL_CUDA_SAFE_CALL(
-        cudaMemset(m_scratchFlags, 0, m_scratchFlagsCount * sizeScratchGrain));
+        (cuda_memset_wrapper(m_scratchFlags, 0, alloc_size)));
   }
 
   return m_scratchFlags;
@@ -458,17 +452,18 @@ Cuda::size_type *CudaInternal::scratch_flags(const std::size_t size) const {
 
 Cuda::size_type *CudaInternal::scratch_space(const std::size_t size) const {
   if (verify_is_initialized("scratch_space") &&
-      m_scratchSpaceCount * sizeScratchGrain < size) {
-    m_scratchSpaceCount = (size + sizeScratchGrain - 1) / sizeScratchGrain;
+      m_scratchSpaceCount < scratch_count(size)) {
+    m_scratchSpaceCount = scratch_count(size);
 
     using Record =
         Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>;
 
     if (m_scratchSpace) Record::decrement(Record::get_record(m_scratchSpace));
 
-    Record *const r =
-        Record::allocate(Kokkos::CudaSpace(), "Kokkos::InternalScratchSpace",
-                         (sizeof(ScratchGrain) * m_scratchSpaceCount));
+    std::size_t alloc_size =
+        multiply_overflow_abort(m_scratchSpaceCount, sizeScratchGrain);
+    Record *const r = Record::allocate(
+        Kokkos::CudaSpace(), "Kokkos::InternalScratchSpace", alloc_size);
 
     Record::increment(r);
 
@@ -479,9 +474,9 @@ Cuda::size_type *CudaInternal::scratch_space(const std::size_t size) const {
 }
 
 Cuda::size_type *CudaInternal::scratch_unified(const std::size_t size) const {
-  if (verify_is_initialized("scratch_unified") && m_scratchUnifiedSupported &&
-      m_scratchUnifiedCount * sizeScratchGrain < size) {
-    m_scratchUnifiedCount = (size + sizeScratchGrain - 1) / sizeScratchGrain;
+  if (verify_is_initialized("scratch_unified") &&
+      m_scratchUnifiedCount < scratch_count(size)) {
+    m_scratchUnifiedCount = scratch_count(size);
 
     using Record =
         Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>;
@@ -489,9 +484,11 @@ Cuda::size_type *CudaInternal::scratch_unified(const std::size_t size) const {
     if (m_scratchUnified)
       Record::decrement(Record::get_record(m_scratchUnified));
 
-    Record *const r = Record::allocate(
-        Kokkos::CudaHostPinnedSpace(), "Kokkos::InternalScratchUnified",
-        (sizeof(ScratchGrain) * m_scratchUnifiedCount));
+    std::size_t alloc_size =
+        multiply_overflow_abort(m_scratchUnifiedCount, sizeScratchGrain);
+    Record *const r =
+        Record::allocate(Kokkos::CudaHostPinnedSpace(),
+                         "Kokkos::InternalScratchUnified", alloc_size);
 
     Record::increment(r);
 
@@ -574,15 +571,18 @@ void CudaInternal::finalize() {
   // Only finalize this if we're the singleton
   if (this == &singleton()) {
     (void)Impl::cuda_global_unique_token_locks(true);
-    Impl::finalize_host_cuda_lock_arrays();
+    desul::Impl::finalize_lock_arrays();  // FIXME
 
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeHost(constantMemHostStaging));
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaEventDestroy(constantMemReusable));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        (cuda_free_host_wrapper(constantMemHostStaging)));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        (cuda_event_destroy_wrapper(constantMemReusable)));
     auto &deep_copy_space =
         Kokkos::Impl::cuda_get_deep_copy_space(/*initialize*/ false);
     if (deep_copy_space)
       deep_copy_space->impl_internal_space_instance()->finalize();
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(cuda_get_deep_copy_stream()));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        (cuda_stream_destroy_wrapper(cuda_get_deep_copy_stream())));
   }
 
   if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) {
@@ -602,8 +602,8 @@ void CudaInternal::finalize() {
       Kokkos::kokkos_free<Kokkos::CudaSpace>(m_team_scratch_ptr[i]);
   }
 
-  if (m_manage_stream && m_stream != nullptr)
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(m_stream));
+  if (m_manage_stream && get_stream() != nullptr)
+    KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_stream_destroy_wrapper(m_stream)));
 
   m_scratchSpaceCount   = 0;
   m_scratchFlagsCount   = 0;
@@ -617,8 +617,9 @@ void CudaInternal::finalize() {
     m_team_scratch_ptr[i]          = nullptr;
   }
 
-  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(m_scratch_locks));
-  m_scratch_locks = nullptr;
+  KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_free_wrapper(m_scratch_locks)));
+  m_scratch_locks     = nullptr;
+  m_num_scratch_locks = 0;
 }
 
 //----------------------------------------------------------------------------
@@ -647,10 +648,6 @@ std::array<Cuda::size_type, 3> cuda_internal_maximum_grid_count() {
   return CudaInternal::singleton().m_maxBlock;
 }
 
-Cuda::size_type cuda_internal_maximum_shared_words() {
-  return CudaInternal::singleton().m_maxSharedWords;
-}
-
 Cuda::size_type *cuda_internal_scratch_space(const Cuda &instance,
                                              const std::size_t size) {
   return instance.impl_internal_space_instance()->scratch_space(size);
@@ -693,114 +690,115 @@ void Cuda::impl_initialize(InitializationSettings const &settings) {
   const int cuda_device_id = Impl::get_gpu(settings);
   const auto &dev_info     = Impl::CudaInternalDevices::singleton();
 
-  // Need device capability 3.0 or better
-  const bool ok_dev = 3 <= dev_info.m_cudaProp[cuda_device_id].major &&
-                      0 <= dev_info.m_cudaProp[cuda_device_id].minor;
-  if (ok_dev) {
-    const struct cudaDeviceProp &cudaProp = dev_info.m_cudaProp[cuda_device_id];
-
-    Impl::CudaInternal::m_cudaDev    = cuda_device_id;
-    Impl::CudaInternal::m_deviceProp = cudaProp;
-
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device_id));
-    Kokkos::Impl::cuda_device_synchronize(
-        "Kokkos::CudaInternal::initialize: Fence on space initialization");
-
-    // Query what compute capability architecture a kernel executes:
-    Impl::CudaInternal::m_cudaArch = Impl::cuda_kernel_arch();
-
-    if (Impl::CudaInternal::m_cudaArch == 0) {
-      std::stringstream ss;
-      ss << "Kokkos::Cuda::initialize ERROR: likely mismatch of architecture\n";
-      std::string msg = ss.str();
-      Kokkos::abort(msg.c_str());
-    }
-
-    int compiled_major = Impl::CudaInternal::m_cudaArch / 100;
-    int compiled_minor = (Impl::CudaInternal::m_cudaArch % 100) / 10;
-
-    if ((compiled_major > cudaProp.major) ||
-        ((compiled_major == cudaProp.major) &&
-         (compiled_minor > cudaProp.minor))) {
-      std::stringstream ss;
-      ss << "Kokkos::Cuda::initialize ERROR: running kernels compiled for "
-            "compute capability "
-         << compiled_major << "." << compiled_minor
-         << " on device with compute capability " << cudaProp.major << "."
-         << cudaProp.minor << " is not supported by CUDA!\n";
-      std::string msg = ss.str();
-      Kokkos::abort(msg.c_str());
-    }
-    if (Kokkos::show_warnings() && (compiled_major != cudaProp.major ||
-                                    compiled_minor != cudaProp.minor)) {
-      std::cerr << "Kokkos::Cuda::initialize WARNING: running kernels compiled "
-                   "for compute capability "
-                << compiled_major << "." << compiled_minor
-                << " on device with compute capability " << cudaProp.major
-                << "." << cudaProp.minor
-                << " , this will likely reduce potential performance."
-                << std::endl;
-    }
-
-    // number of multiprocessors
-    Impl::CudaInternal::m_multiProcCount = cudaProp.multiProcessorCount;
-
-    //----------------------------------
-    // Maximum number of warps,
-    // at most one warp per thread in a warp for reduction.
-    Impl::CudaInternal::m_maxWarpCount =
-        cudaProp.maxThreadsPerBlock / Impl::CudaTraits::WarpSize;
-
-    if (Impl::CudaTraits::WarpSize < Impl::CudaInternal::m_maxWarpCount) {
-      Impl::CudaInternal::m_maxWarpCount = Impl::CudaTraits::WarpSize;
-    }
-
-    constexpr auto WordSize = sizeof(size_type);
-    Impl::CudaInternal::m_maxSharedWords =
-        cudaProp.sharedMemPerBlock / WordSize;
-
-    //----------------------------------
-    // Maximum number of blocks:
-
-    Impl::CudaInternal::m_maxBlock[0] = cudaProp.maxGridSize[0];
-    Impl::CudaInternal::m_maxBlock[1] = cudaProp.maxGridSize[1];
-    Impl::CudaInternal::m_maxBlock[2] = cudaProp.maxGridSize[2];
-
-    Impl::CudaInternal::m_shmemPerSM = cudaProp.sharedMemPerMultiprocessor;
-    Impl::CudaInternal::m_maxShmemPerBlock = cudaProp.sharedMemPerBlock;
-    Impl::CudaInternal::m_maxBlocksPerSM =
-        Impl::CudaInternal::m_cudaArch < 500
-            ? 16
-            : (Impl::CudaInternal::m_cudaArch < 750
-                   ? 32
-                   : (Impl::CudaInternal::m_cudaArch == 750 ? 16 : 32));
-    Impl::CudaInternal::m_maxThreadsPerSM =
-        cudaProp.maxThreadsPerMultiProcessor;
-    Impl::CudaInternal::m_maxThreadsPerBlock = cudaProp.maxThreadsPerBlock;
-
-    //----------------------------------
-
-    Impl::CudaInternal::m_scratchUnifiedSupported = cudaProp.unifiedAddressing;
-
-    if (Kokkos::show_warnings() &&
-        !Impl::CudaInternal::m_scratchUnifiedSupported) {
-      std::cerr << "Kokkos::Cuda device " << cudaProp.name << " capability "
-                << cudaProp.major << "." << cudaProp.minor
-                << " does not support unified virtual address space"
-                << std::endl;
-    }
-  } else {
-    std::ostringstream msg;
-    msg << "Kokkos::Cuda::initialize(" << cuda_device_id << ") FAILED: Device ";
-    msg << dev_info.m_cudaProp[cuda_device_id].major;
-    msg << ".";
-    msg << dev_info.m_cudaProp[cuda_device_id].minor;
-    msg << " has insufficient capability, required 3.0 or better";
-    Kokkos::Impl::throw_runtime_exception(msg.str());
+  const struct cudaDeviceProp &cudaProp = dev_info.m_cudaProp[cuda_device_id];
+
+  Impl::CudaInternal::m_cudaDev    = cuda_device_id;
+  Impl::CudaInternal::m_deviceProp = cudaProp;
+
+  Kokkos::Impl::cuda_device_synchronize(
+      "Kokkos::CudaInternal::initialize: Fence on space initialization");
+
+  // Query what compute capability architecture a kernel executes:
+  Impl::CudaInternal::m_cudaArch = Impl::cuda_kernel_arch();
+
+  if (Impl::CudaInternal::m_cudaArch == 0) {
+    std::stringstream ss;
+    ss << "Kokkos::Cuda::initialize ERROR: likely mismatch of architecture\n";
+    std::string msg = ss.str();
+    Kokkos::abort(msg.c_str());
+  }
+
+  int compiled_major = Impl::CudaInternal::m_cudaArch / 100;
+  int compiled_minor = (Impl::CudaInternal::m_cudaArch % 100) / 10;
+
+  if ((compiled_major > cudaProp.major) ||
+      ((compiled_major == cudaProp.major) &&
+       (compiled_minor > cudaProp.minor))) {
+    std::stringstream ss;
+    ss << "Kokkos::Cuda::initialize ERROR: running kernels compiled for "
+          "compute capability "
+       << compiled_major << "." << compiled_minor
+       << " on device with compute capability " << cudaProp.major << "."
+       << cudaProp.minor << " is not supported by CUDA!\n";
+    std::string msg = ss.str();
+    Kokkos::abort(msg.c_str());
   }
+  if (Kokkos::show_warnings() &&
+      (compiled_major != cudaProp.major || compiled_minor != cudaProp.minor)) {
+    std::cerr << "Kokkos::Cuda::initialize WARNING: running kernels compiled "
+                 "for compute capability "
+              << compiled_major << "." << compiled_minor
+              << " on device with compute capability " << cudaProp.major << "."
+              << cudaProp.minor
+              << " , this will likely reduce potential performance."
+              << std::endl;
+  }
+
+  //----------------------------------
+
+#ifdef KOKKOS_ENABLE_CUDA_UVM
+  const char *env_force_device_alloc =
+      getenv("CUDA_MANAGED_FORCE_DEVICE_ALLOC");
+  bool force_device_alloc;
+  if (env_force_device_alloc == nullptr)
+    force_device_alloc = false;
+  else
+    force_device_alloc = std::stoi(env_force_device_alloc) != 0;
+
+  const char *env_visible_devices = getenv("CUDA_VISIBLE_DEVICES");
+  bool visible_devices_one        = true;
+  if (env_visible_devices == nullptr) visible_devices_one = false;
+
+  if (Kokkos::show_warnings() &&
+      (!visible_devices_one && !force_device_alloc)) {
+    std::cerr << R"warning(
+Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
+                                  without setting CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 or
+                                  setting CUDA_VISIBLE_DEVICES.
+                                  This could on multi GPU systems lead to severe performance"
+                                  penalties.)warning"
+              << std::endl;
+  }
+#endif
+
+  //----------------------------------
+  // number of multiprocessors
+  Impl::CudaInternal::m_multiProcCount = cudaProp.multiProcessorCount;
+
+  //----------------------------------
+  // Maximum number of warps,
+  // at most one warp per thread in a warp for reduction.
+  Impl::CudaInternal::m_maxWarpCount =
+      cudaProp.maxThreadsPerBlock / Impl::CudaTraits::WarpSize;
+
+  if (Impl::CudaTraits::WarpSize < Impl::CudaInternal::m_maxWarpCount) {
+    Impl::CudaInternal::m_maxWarpCount = Impl::CudaTraits::WarpSize;
+  }
+
+  //----------------------------------
+  // Maximum number of blocks:
+
+  Impl::CudaInternal::m_maxBlock[0] = cudaProp.maxGridSize[0];
+  Impl::CudaInternal::m_maxBlock[1] = cudaProp.maxGridSize[1];
+  Impl::CudaInternal::m_maxBlock[2] = cudaProp.maxGridSize[2];
+
+  Impl::CudaInternal::m_shmemPerSM       = cudaProp.sharedMemPerMultiprocessor;
+  Impl::CudaInternal::m_maxShmemPerBlock = cudaProp.sharedMemPerBlock;
+  Impl::CudaInternal::m_maxBlocksPerSM =
+      Impl::CudaInternal::m_cudaArch < 500
+          ? 16
+          : (Impl::CudaInternal::m_cudaArch < 750
+                 ? 32
+                 : (Impl::CudaInternal::m_cudaArch == 750 ? 16 : 32));
+  Impl::CudaInternal::m_maxThreadsPerSM = cudaProp.maxThreadsPerMultiProcessor;
+  Impl::CudaInternal::m_maxThreadsPerBlock = cudaProp.maxThreadsPerBlock;
+
+  //----------------------------------
 
   cudaStream_t singleton_stream;
-  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&singleton_stream));
+  KOKKOS_IMPL_CUDA_SAFE_CALL(
+      (Impl::CudaInternal::singleton().cuda_stream_create_wrapper(
+          &singleton_stream)));
 
   auto &cuda_singleton = Impl::CudaInternal::singleton();
   cuda_singleton.initialize(singleton_stream, /*manage*/ true);
@@ -842,28 +840,24 @@ Cuda::Cuda()
       "Cuda instance constructor");
 }
 
-Cuda::Cuda(cudaStream_t stream, bool manage_stream)
+KOKKOS_DEPRECATED Cuda::Cuda(cudaStream_t stream, bool manage_stream)
+    : Cuda(stream,
+           manage_stream ? Impl::ManageStream::yes : Impl::ManageStream::no) {}
+
+Cuda::Cuda(cudaStream_t stream, Impl::ManageStream manage_stream)
     : m_space_instance(new Impl::CudaInternal, [](Impl::CudaInternal *ptr) {
         ptr->finalize();
         delete ptr;
       }) {
   Impl::CudaInternal::singleton().verify_is_initialized(
       "Cuda instance constructor");
-  m_space_instance->initialize(stream, manage_stream);
+  m_space_instance->initialize(stream, static_cast<bool>(manage_stream));
 }
 
 void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const {
   os << "Device Execution Space:\n";
   os << "  KOKKOS_ENABLE_CUDA: yes\n";
 
-  os << "Cuda Atomics:\n";
-  os << "  KOKKOS_ENABLE_CUDA_ATOMICS: ";
-#ifdef KOKKOS_ENABLE_CUDA_ATOMICS
-  os << "yes\n";
-#else
-  os << "no\n";
-#endif
-
   os << "Cuda Options:\n";
   os << "  KOKKOS_ENABLE_CUDA_LAMBDA: ";
 #ifdef KOKKOS_ENABLE_CUDA_LAMBDA
@@ -893,6 +887,12 @@ void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const {
 #else
   os << "no\n";
 #endif
+  os << "  KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC: ";
+#ifdef KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC
+  os << "yes\n";
+#else
+  os << "no\n";
+#endif
 
   os << "\nCuda Runtime Configuration:\n";
 
@@ -912,7 +912,9 @@ uint32_t Cuda::impl_instance_id() const noexcept {
   return m_space_instance->impl_get_instance_id();
 }
 
-cudaStream_t Cuda::cuda_stream() const { return m_space_instance->m_stream; }
+cudaStream_t Cuda::cuda_stream() const {
+  return m_space_instance->get_stream();
+}
 int Cuda::cuda_device() const { return m_space_instance->m_cudaDev; }
 const cudaDeviceProp &Cuda::cuda_device_prop() const {
   return m_space_instance->m_deviceProp;
@@ -927,6 +929,16 @@ int g_cuda_space_factory_initialized =
 
 }  // namespace Kokkos
 
+void Kokkos::Impl::create_Cuda_instances(std::vector<Cuda> &instances) {
+  for (int s = 0; s < int(instances.size()); s++) {
+    cudaStream_t stream;
+    KOKKOS_IMPL_CUDA_SAFE_CALL((
+        instances[s].impl_internal_space_instance()->cuda_stream_create_wrapper(
+            &stream)));
+    instances[s] = Cuda(stream, ManageStream::yes);
+  }
+}
+
 #else
 
 void KOKKOS_CORE_SRC_CUDA_IMPL_PREVENT_LINK_ERROR() {}
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp
index af34a5b9db345e7ef3225d7a019e5f92b3c816c6..a324adecfeb03e4a0b56111c7978717b9ed77ea9 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp
@@ -21,6 +21,7 @@
 #include <impl/Kokkos_Tools.hpp>
 #include <atomic>
 #include <Cuda/Kokkos_Cuda_Error.hpp>
+#include <cuda_runtime_api.h>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -72,7 +73,6 @@ struct CudaTraits {
 CudaSpace::size_type cuda_internal_multiprocessor_count();
 CudaSpace::size_type cuda_internal_maximum_warp_count();
 std::array<CudaSpace::size_type, 3> cuda_internal_maximum_grid_count();
-CudaSpace::size_type cuda_internal_maximum_shared_words();
 
 CudaSpace::size_type cuda_internal_maximum_concurrent_block_count();
 
@@ -108,7 +108,6 @@ class CudaInternal {
   inline static unsigned m_multiProcCount           = 0;
   inline static unsigned m_maxWarpCount             = 0;
   inline static std::array<size_type, 3> m_maxBlock = {0, 0, 0};
-  inline static unsigned m_maxSharedWords           = 0;
   inline static int m_shmemPerSM                    = 0;
   inline static int m_maxShmemPerBlock              = 0;
   inline static int m_maxBlocksPerSM                = 0;
@@ -124,7 +123,6 @@ class CudaInternal {
   mutable std::size_t m_scratchUnifiedCount;
   mutable std::size_t m_scratchFunctorSize;
 
-  inline static size_type m_scratchUnifiedSupported = 0;
   mutable size_type* m_scratchSpace;
   mutable size_type* m_scratchFlags;
   mutable size_type* m_scratchUnified;
@@ -138,7 +136,8 @@ class CudaInternal {
   mutable int64_t m_team_scratch_current_size[10];
   mutable void* m_team_scratch_ptr[10];
   mutable std::atomic_int m_team_scratch_pool[10];
-  std::int32_t* m_scratch_locks;
+  int32_t* m_scratch_locks;
+  size_t m_num_scratch_locks;
 
   bool was_initialized = false;
   bool was_finalized   = false;
@@ -192,6 +191,338 @@ class CudaInternal {
     }
   }
 
+  // Using cudaAPI function/objects will be w.r.t. device 0 unless
+  // cudaSetDevice(device_id) is called with the correct device_id.
+  // The correct device_id is stored in the variable
+  // CudaInternal::m_cudaDev set in Cuda::impl_initialize(). It is not
+  // sufficient to call cudaSetDevice(m_cudaDev) during cuda initialization
+  // only, however, since if a user creates a new thread, that thread will be
+  // given the default cuda env with device_id=0, causing errors when
+  // device_id!=0 is requested by the user. To ensure against this, almost all
+  // cudaAPI calls, as well as using cudaStream_t variables, must be proceeded
+  // by cudaSetDevice(device_id).
+
+  // This function sets device in cudaAPI to device requested at runtime (set in
+  // m_cudaDev).
+  void set_cuda_device() const {
+    verify_is_initialized("set_cuda_device");
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_cudaDev));
+  }
+
+  // Return the class stream, optionally setting the device id.
+  template <bool setCudaDevice = true>
+  cudaStream_t get_stream() const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return m_stream;
+  }
+
+  // The following are wrappers for cudaAPI functions (C and C++ routines) which
+  // set the correct device id directly before the cudaAPI call (unless
+  // explicitly disabled by providing setCudaDevice=false template).
+  // setCudaDevice=true should be used for all API calls which take a stream
+  // unless it is guarenteed to be from a cuda instance with the correct device
+  // set already (e.g., back-to-back cudaAPI calls in a single function). For
+  // cudaAPI functions that take a stream, an optional input stream is
+  // available. If no stream is given, the stream for the CudaInternal instance
+  // is used. All cudaAPI calls should be wrapped in these interface functions
+  // to ensure safety when using threads.
+
+  // Helper function for selecting the correct input stream
+  cudaStream_t get_input_stream(cudaStream_t s) const {
+    return s == nullptr ? get_stream<false>() : s;
+  }
+
+  // C API routines
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_device_get_limit_wrapper(size_t* pValue,
+                                            cudaLimit limit) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaDeviceGetLimit(pValue, limit);
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_device_set_limit_wrapper(cudaLimit limit,
+                                            size_t value) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaDeviceSetLimit(limit, value);
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_device_synchronize_wrapper() const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaDeviceSynchronize();
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_event_create_wrapper(cudaEvent_t* event) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaEventCreate(event);
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_event_destroy_wrapper(cudaEvent_t event) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaEventDestroy(event);
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_event_record_wrapper(cudaEvent_t event,
+                                        cudaStream_t stream = nullptr) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaEventRecord(event, get_input_stream(stream));
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_event_synchronize_wrapper(cudaEvent_t event) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaEventSynchronize(event);
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_free_wrapper(void* devPtr) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaFree(devPtr);
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_free_host_wrapper(void* ptr) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaFreeHost(ptr);
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_get_device_count_wrapper(int* count) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaGetDeviceCount(count);
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_get_device_properties_wrapper(cudaDeviceProp* prop,
+                                                 int device) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaGetDeviceProperties(prop, device);
+  }
+
+  template <bool setCudaDevice = true>
+  const char* cuda_get_error_name_wrapper(cudaError_t error) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaGetErrorName(error);
+  }
+
+  template <bool setCudaDevice = true>
+  const char* cuda_get_error_string_wrapper(cudaError_t error) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaGetErrorString(error);
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_get_last_error_wrapper() const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaGetLastError();
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_graph_add_dependencies_wrapper(
+      cudaGraph_t graph, const cudaGraphNode_t* from, const cudaGraphNode_t* to,
+      size_t numDependencies) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaGraphAddDependencies(graph, from, to, numDependencies);
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_graph_add_empty_node_wrapper(
+      cudaGraphNode_t* pGraphNode, cudaGraph_t graph,
+      const cudaGraphNode_t* pDependencies, size_t numDependencies) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaGraphAddEmptyNode(pGraphNode, graph, pDependencies,
+                                 numDependencies);
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_graph_add_kernel_node_wrapper(
+      cudaGraphNode_t* pGraphNode, cudaGraph_t graph,
+      const cudaGraphNode_t* pDependencies, size_t numDependencies,
+      const cudaKernelNodeParams* pNodeParams) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaGraphAddKernelNode(pGraphNode, graph, pDependencies,
+                                  numDependencies, pNodeParams);
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_graph_create_wrapper(cudaGraph_t* pGraph,
+                                        unsigned int flags) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaGraphCreate(pGraph, flags);
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_graph_destroy_wrapper(cudaGraph_t graph) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaGraphDestroy(graph);
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_graph_exec_destroy_wrapper(cudaGraphExec_t graphExec) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaGraphExecDestroy(graphExec);
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_graph_launch_wrapper(cudaGraphExec_t graphExec,
+                                        cudaStream_t stream = nullptr) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaGraphLaunch(graphExec, get_input_stream(stream));
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_host_alloc_wrapper(void** pHost, size_t size,
+                                      unsigned int flags) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaHostAlloc(pHost, size, flags);
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_malloc_wrapper(void** devPtr, size_t size) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaMalloc(devPtr, size);
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_malloc_host_wrapper(void** ptr, size_t size) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaMallocHost(ptr, size);
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_malloc_managed_wrapper(
+      void** devPtr, size_t size,
+      unsigned int flags = cudaMemAttachGlobal) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaMallocManaged(devPtr, size, flags);
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_mem_advise_wrapper(const void* devPtr, size_t count,
+                                      cudaMemoryAdvise advice,
+                                      int device) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaMemAdvise(devPtr, count, advice, device);
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_mem_prefetch_async_wrapper(
+      const void* devPtr, size_t count, int dstDevice,
+      cudaStream_t stream = nullptr) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaMemPrefetchAsync(devPtr, count, dstDevice,
+                                get_input_stream(stream));
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_memcpy_wrapper(void* dst, const void* src, size_t count,
+                                  cudaMemcpyKind kind) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaMemcpy(dst, src, count, kind);
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_memcpy_async_wrapper(void* dst, const void* src,
+                                        size_t count, cudaMemcpyKind kind,
+                                        cudaStream_t stream = nullptr) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaMemcpyAsync(dst, src, count, kind, get_input_stream(stream));
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_memcpy_to_symbol_async_wrapper(
+      const void* symbol, const void* src, size_t count, size_t offset,
+      cudaMemcpyKind kind, cudaStream_t stream = nullptr) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaMemcpyToSymbolAsync(symbol, src, count, offset, kind,
+                                   get_input_stream(stream));
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_memset_wrapper(void* devPtr, int value, size_t count) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaMemset(devPtr, value, count);
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_memset_async_wrapper(void* devPtr, int value, size_t count,
+                                        cudaStream_t stream = nullptr) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaMemsetAsync(devPtr, value, count, get_input_stream(stream));
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_pointer_get_attributes_wrapper(
+      cudaPointerAttributes* attributes, const void* ptr) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaPointerGetAttributes(attributes, ptr);
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_stream_create_wrapper(cudaStream_t* pStream) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaStreamCreate(pStream);
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_stream_destroy_wrapper(cudaStream_t stream) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaStreamDestroy(stream);
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_stream_synchronize_wrapper(cudaStream_t stream) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaStreamSynchronize(stream);
+  }
+
+  // The following are only available for cuda 11.2 and greater
+#if (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020)
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_malloc_async_wrapper(void** devPtr, size_t size,
+                                        cudaStream_t hStream = nullptr) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaMallocAsync(devPtr, size, get_input_stream(hStream));
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_free_async_wrapper(void* devPtr,
+                                      cudaStream_t hStream = nullptr) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaFreeAsync(devPtr, get_input_stream(hStream));
+  }
+#endif
+
+  // C++ API routines
+  template <typename T, bool setCudaDevice = true>
+  cudaError_t cuda_func_get_attributes_wrapper(cudaFuncAttributes* attr,
+                                               T* entry) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaFuncGetAttributes(attr, entry);
+  }
+
+  template <typename T, bool setCudaDevice = true>
+  cudaError_t cuda_func_set_attributes_wrapper(T* entry, cudaFuncAttribute attr,
+                                               int value) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaFuncSetAttributes(entry, attr, value);
+  }
+
+  template <bool setCudaDevice = true>
+  cudaError_t cuda_graph_instantiate_wrapper(cudaGraphExec_t* pGraphExec,
+                                             cudaGraph_t graph,
+                                             cudaGraphNode_t* pErrorNode,
+                                             char* pLogBuffer,
+                                             size_t bufferSize) const {
+    if constexpr (setCudaDevice) set_cuda_device();
+    return cudaGraphInstantiate(pGraphExec, graph, pErrorNode, pLogBuffer,
+                                bufferSize);
+  }
+
   // Resizing of reduction related scratch spaces
   size_type* scratch_space(const std::size_t size) const;
   size_type* scratch_flags(const std::size_t size) const;
@@ -205,6 +536,7 @@ class CudaInternal {
   void release_team_scratch_space(int scratch_pool_id);
 };
 
+void create_Cuda_instances(std::vector<Cuda>& instances);
 }  // Namespace Impl
 
 namespace Experimental {
@@ -213,34 +545,26 @@ namespace Experimental {
 //   Customization point for backends
 //   Default behavior is to return the passed in instance
 
-namespace Impl {
-inline void create_Cuda_instances(std::vector<Cuda>& instances) {
-  for (int s = 0; s < int(instances.size()); s++) {
-    cudaStream_t stream;
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&stream));
-    instances[s] = Cuda(stream, true);
-  }
-}
-}  // namespace Impl
-
 template <class... Args>
 std::vector<Cuda> partition_space(const Cuda&, Args...) {
   static_assert(
       (... && std::is_arithmetic_v<Args>),
       "Kokkos Error: partitioning arguments must be integers or floats");
   std::vector<Cuda> instances(sizeof...(Args));
-  Impl::create_Cuda_instances(instances);
+  Kokkos::Impl::create_Cuda_instances(instances);
   return instances;
 }
 
 template <class T>
-std::vector<Cuda> partition_space(const Cuda&, std::vector<T>& weights) {
+std::vector<Cuda> partition_space(const Cuda&, std::vector<T> const& weights) {
   static_assert(
       std::is_arithmetic<T>::value,
       "Kokkos Error: partitioning arguments must be integers or floats");
 
+  // We only care about the number of instances to create and ignore weights
+  // otherwise.
   std::vector<Cuda> instances(weights.size());
-  Impl::create_Cuda_instances(instances);
+  Kokkos::Impl::create_Cuda_instances(instances);
   return instances;
 }
 }  // namespace Experimental
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index 5afad7a6a3d8c113cb3bdf6e14afadbd702bb857..82a72b690218246f231ff86946effcad26ffc9ae 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -28,7 +28,6 @@
 #include <impl/Kokkos_Error.hpp>
 #include <Cuda/Kokkos_Cuda_abort.hpp>
 #include <Cuda/Kokkos_Cuda_Error.hpp>
-#include <Cuda/Kokkos_Cuda_Locks.hpp>
 #include <Cuda/Kokkos_Cuda_Instance.hpp>
 #include <impl/Kokkos_GraphImpl_fwd.hpp>
 #include <Cuda/Kokkos_Cuda_GraphNodeKernel.hpp>
@@ -129,7 +128,7 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) {
 // These functions need to be templated on DriverType and LaunchBounds
 // so that the static bool is unique for each type combo
 // KernelFuncPtr does not necessarily contain that type information.
-
+// FIXME_CUDA_MULTIPLE_DEVICES
 template <class DriverType, class LaunchBounds, class KernelFuncPtr>
 const cudaFuncAttributes& get_cuda_kernel_func_attributes(
     const KernelFuncPtr& func) {
@@ -137,7 +136,9 @@ const cudaFuncAttributes& get_cuda_kernel_func_attributes(
   // by leveraging static variable initialization rules
   auto wrap_get_attributes = [&]() -> cudaFuncAttributes {
     cudaFuncAttributes attr;
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        (CudaInternal::singleton().cuda_func_get_attributes_wrapper(&attr,
+                                                                    func)));
     return attr;
   };
   static cudaFuncAttributes func_attr = wrap_get_attributes();
@@ -218,9 +219,11 @@ inline void configure_shmem_preference(const KernelFuncPtr& func,
   if (carveout > 100) carveout = 100;
 
   // Set the carveout, but only call it once per kernel or when it changes
+  // FIXME_CUDA_MULTIPLE_DEVICES
   auto set_cache_config = [&] {
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetAttribute(
-        func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        (CudaInternal::singleton().cuda_func_set_attributes_wrapper(
+            func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout)));
     return carveout;
   };
   // Store the value in a static variable so we only reset if needed
@@ -362,9 +365,8 @@ struct CudaParallelLaunchKernelInvoker<
   static void invoke_kernel(DriverType const& driver, dim3 const& grid,
                             dim3 const& block, int shmem,
                             CudaInternal const* cuda_instance) {
-    (base_t::
-         get_kernel_func())<<<grid, block, shmem, cuda_instance->m_stream>>>(
-        driver);
+    (base_t::get_kernel_func())<<<grid, block, shmem,
+                                  cuda_instance->get_stream()>>>(driver);
   }
 
   inline static void create_parallel_launch_graph_node(
@@ -400,15 +402,17 @@ struct CudaParallelLaunchKernelInvoker<
       params.kernelParams   = (void**)args;
       params.extra          = nullptr;
 
-      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphAddKernelNode(
-          &graph_node, graph, /* dependencies = */ nullptr,
-          /* numDependencies = */ 0, &params));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(
+          (cuda_instance->cuda_graph_add_kernel_node_wrapper(
+              &graph_node, graph, /* dependencies = */ nullptr,
+              /* numDependencies = */ 0, &params)));
     } else {
       // We still need an empty node for the dependency structure
       KOKKOS_IMPL_CUDA_SAFE_CALL(
-          cudaGraphAddEmptyNode(&graph_node, graph,
-                                /* dependencies = */ nullptr,
-                                /* numDependencies = */ 0));
+          (cuda_instance->cuda_graph_add_empty_node_wrapper(
+              &graph_node, graph,
+              /* dependencies = */ nullptr,
+              /* numDependencies = */ 0)));
     }
     KOKKOS_ENSURES(bool(graph_node))
   }
@@ -459,11 +463,10 @@ struct CudaParallelLaunchKernelInvoker<
     DriverType* driver_ptr = reinterpret_cast<DriverType*>(
         cuda_instance->scratch_functor(sizeof(DriverType)));
 
-    cudaMemcpyAsync(driver_ptr, &driver, sizeof(DriverType), cudaMemcpyDefault,
-                    cuda_instance->m_stream);
-    (base_t::
-         get_kernel_func())<<<grid, block, shmem, cuda_instance->m_stream>>>(
-        driver_ptr);
+    KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_instance->cuda_memcpy_async_wrapper(
+        driver_ptr, &driver, sizeof(DriverType), cudaMemcpyDefault)));
+    (base_t::get_kernel_func())<<<grid, block, shmem,
+                                  cuda_instance->get_stream()>>>(driver_ptr);
   }
 
   inline static void create_parallel_launch_graph_node(
@@ -495,8 +498,8 @@ struct CudaParallelLaunchKernelInvoker<
       // which is guaranteed to be alive until the graph instance itself is
       // destroyed, where there should be a fence ensuring that the allocation
       // associated with this kernel on the device side isn't deleted.
-      cudaMemcpyAsync(driver_ptr, &driver, sizeof(DriverType),
-                      cudaMemcpyDefault, cuda_instance->m_stream);
+      KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_instance->cuda_memcpy_async_wrapper(
+          driver_ptr, &driver, sizeof(DriverType), cudaMemcpyDefault)));
 
       void const* args[] = {&driver_ptr};
 
@@ -509,15 +512,17 @@ struct CudaParallelLaunchKernelInvoker<
       params.kernelParams   = (void**)args;
       params.extra          = nullptr;
 
-      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphAddKernelNode(
-          &graph_node, graph, /* dependencies = */ nullptr,
-          /* numDependencies = */ 0, &params));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(
+          (cuda_instance->cuda_graph_add_kernel_node_wrapper(
+              &graph_node, graph, /* dependencies = */ nullptr,
+              /* numDependencies = */ 0, &params)));
     } else {
       // We still need an empty node for the dependency structure
       KOKKOS_IMPL_CUDA_SAFE_CALL(
-          cudaGraphAddEmptyNode(&graph_node, graph,
-                                /* dependencies = */ nullptr,
-                                /* numDependencies = */ 0));
+          (cuda_instance->cuda_graph_add_empty_node_wrapper(
+              &graph_node, graph,
+              /* dependencies = */ nullptr,
+              /* numDependencies = */ 0)));
     }
     KOKKOS_ENSURES(bool(graph_node))
   }
@@ -573,26 +578,26 @@ struct CudaParallelLaunchKernelInvoker<
                             CudaInternal const* cuda_instance) {
     // Wait until the previous kernel that uses the constant buffer is done
     std::lock_guard<std::mutex> lock(CudaInternal::constantMemMutex);
-    KOKKOS_IMPL_CUDA_SAFE_CALL(
-        cudaEventSynchronize(CudaInternal::constantMemReusable));
+    KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_instance->cuda_event_synchronize_wrapper(
+        CudaInternal::constantMemReusable)));
 
     // Copy functor (synchronously) to staging buffer in pinned host memory
     unsigned long* staging = cuda_instance->constantMemHostStaging;
     memcpy(staging, &driver, sizeof(DriverType));
 
     // Copy functor asynchronously from there to constant memory on the device
-    cudaMemcpyToSymbolAsync(kokkos_impl_cuda_constant_memory_buffer, staging,
-                            sizeof(DriverType), 0, cudaMemcpyHostToDevice,
-                            cudaStream_t(cuda_instance->m_stream));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        (cuda_instance->cuda_memcpy_to_symbol_async_wrapper(
+            kokkos_impl_cuda_constant_memory_buffer, staging,
+            sizeof(DriverType), 0, cudaMemcpyHostToDevice)));
 
     // Invoke the driver function on the device
-    (base_t::
-         get_kernel_func())<<<grid, block, shmem, cuda_instance->m_stream>>>();
+    (base_t::get_kernel_func())<<<grid, block, shmem,
+                                  cuda_instance->get_stream()>>>();
 
     // Record an event that says when the constant buffer can be reused
-    KOKKOS_IMPL_CUDA_SAFE_CALL(
-        cudaEventRecord(CudaInternal::constantMemReusable,
-                        cudaStream_t(cuda_instance->m_stream)));
+    KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_instance->cuda_event_record_wrapper(
+        CudaInternal::constantMemReusable)));
   }
 
   inline static void create_parallel_launch_graph_node(
@@ -664,13 +669,14 @@ struct CudaParallelLaunchImpl<
             shmem, desired_occupancy);
       }
 
-      ensure_cuda_lock_arrays_on_device();
+      desul::ensure_cuda_lock_arrays_on_device();
 
       // Invoke the driver function on the device
       base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance);
 
 #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
-      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError());
+      KOKKOS_IMPL_CUDA_SAFE_CALL(
+          (cuda_instance->cuda_get_last_error_wrapper()));
       cuda_instance->fence(
           "Kokkos::Impl::launch_kernel: Debug Only Check for Execution Error");
 #endif
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp
deleted file mode 100644
index b18fda80f06193152b23004468b00a3b806acecb..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp
+++ /dev/null
@@ -1,92 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
-#define KOKKOS_IMPL_PUBLIC_INCLUDE
-#endif
-
-#include <Kokkos_Core.hpp>
-#ifdef KOKKOS_ENABLE_CUDA
-#include <Cuda/Kokkos_Cuda_Locks.hpp>
-#include <Cuda/Kokkos_Cuda_Error.hpp>
-
-#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-namespace Kokkos {
-namespace Impl {
-__device__ __constant__ CudaLockArrays g_device_cuda_lock_arrays = {nullptr, 0};
-}
-}  // namespace Kokkos
-#endif
-
-namespace Kokkos {
-
-namespace {
-
-__global__ void init_lock_array_kernel_atomic() {
-  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
-  if (i < CUDA_SPACE_ATOMIC_MASK + 1) {
-    Kokkos::Impl::g_device_cuda_lock_arrays.atomic[i] = 0;
-  }
-}
-
-}  // namespace
-
-namespace Impl {
-
-CudaLockArrays g_host_cuda_lock_arrays = {nullptr, 0};
-
-void initialize_host_cuda_lock_arrays() {
-#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
-  desul::Impl::init_lock_arrays();
-  desul::ensure_cuda_lock_arrays_on_device();
-#endif
-  if (g_host_cuda_lock_arrays.atomic != nullptr) return;
-  KOKKOS_IMPL_CUDA_SAFE_CALL(
-      cudaMalloc(&g_host_cuda_lock_arrays.atomic,
-                 sizeof(int) * (CUDA_SPACE_ATOMIC_MASK + 1)));
-  Impl::cuda_device_synchronize(
-      "Kokkos::Impl::initialize_host_cuda_lock_arrays: Pre Init Lock Arrays");
-  g_host_cuda_lock_arrays.n = CudaInternal::concurrency();
-  copy_cuda_lock_arrays_to_device();
-  init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256,
-                                  256>>>();
-  Impl::cuda_device_synchronize(
-      "Kokkos::Impl::initialize_host_cuda_lock_arrays: Post Init Lock Arrays");
-}
-
-void finalize_host_cuda_lock_arrays() {
-#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
-  desul::Impl::finalize_lock_arrays();
-#endif
-
-  if (g_host_cuda_lock_arrays.atomic == nullptr) return;
-  cudaFree(g_host_cuda_lock_arrays.atomic);
-  g_host_cuda_lock_arrays.atomic = nullptr;
-  g_host_cuda_lock_arrays.n      = 0;
-#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-  copy_cuda_lock_arrays_to_device();
-#endif
-}
-
-}  // namespace Impl
-
-}  // namespace Kokkos
-
-#else
-
-void KOKKOS_CORE_SRC_CUDA_CUDA_LOCKS_PREVENT_LINK_ERROR() {}
-
-#endif
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp
deleted file mode 100644
index 3916ae2c539008bcadb5a3ffacdb16043a89b4e3..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp
+++ /dev/null
@@ -1,169 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#ifndef KOKKOS_CUDA_LOCKS_HPP
-#define KOKKOS_CUDA_LOCKS_HPP
-
-#include <Kokkos_Macros.hpp>
-
-#ifdef KOKKOS_ENABLE_CUDA
-
-#include <cstdint>
-
-#include <Cuda/Kokkos_Cuda_Error.hpp>
-
-#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
-#include <desul/atomics/Lock_Array_CUDA.hpp>
-#endif
-
-namespace Kokkos {
-namespace Impl {
-
-struct CudaLockArrays {
-  std::int32_t* atomic;
-  std::int32_t n;
-};
-
-/// \brief This global variable in Host space is the central definition
-///        of these arrays.
-extern CudaLockArrays g_host_cuda_lock_arrays;
-
-/// \brief After this call, the g_host_cuda_lock_arrays variable has
-///        valid, initialized arrays.
-///
-/// This call is idempotent.
-void initialize_host_cuda_lock_arrays();
-
-/// \brief After this call, the g_host_cuda_lock_arrays variable has
-///        all null pointers, and all array memory has been freed.
-///
-/// This call is idempotent.
-void finalize_host_cuda_lock_arrays();
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-namespace Kokkos {
-namespace Impl {
-
-/// \brief This global variable in CUDA space is what kernels use
-///        to get access to the lock arrays.
-///
-/// When relocatable device code is enabled, there can be one single
-/// instance of this global variable for the entire executable,
-/// whose definition will be in Kokkos_Cuda_Locks.cpp (and whose declaration
-/// here must then be extern.
-/// This one instance will be initialized by initialize_host_cuda_lock_arrays
-/// and need not be modified afterwards.
-///
-/// When relocatable device code is disabled, an instance of this variable
-/// will be created in every translation unit that sees this header file
-/// (we make this clear by marking it static, meaning no other translation
-///  unit can link to it).
-/// Since the Kokkos_Cuda_Locks.cpp translation unit cannot initialize the
-/// instances in other translation units, we must update this CUDA global
-/// variable based on the Host global variable prior to running any kernels
-/// that will use it.
-/// That is the purpose of the ensure_cuda_lock_arrays_on_device function.
-__device__
-#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-    __constant__ extern
-#endif
-    CudaLockArrays g_device_cuda_lock_arrays;
-
-#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
-
-/// \brief Acquire a lock for the address
-///
-/// This function tries to acquire the lock for the hash value derived
-/// from the provided ptr. If the lock is successfully acquired the
-/// function returns true. Otherwise it returns false.
-__device__ inline bool lock_address_cuda_space(void* ptr) {
-  size_t offset = size_t(ptr);
-  offset        = offset >> 2;
-  offset        = offset & CUDA_SPACE_ATOMIC_MASK;
-  return (0 == atomicCAS(&g_device_cuda_lock_arrays.atomic[offset], 0, 1));
-}
-
-/// \brief Release lock for the address
-///
-/// This function releases the lock for the hash value derived
-/// from the provided ptr. This function should only be called
-/// after previously successfully acquiring a lock with
-/// lock_address.
-__device__ inline void unlock_address_cuda_space(void* ptr) {
-  size_t offset = size_t(ptr);
-  offset        = offset >> 2;
-  offset        = offset & CUDA_SPACE_ATOMIC_MASK;
-  atomicExch(&g_device_cuda_lock_arrays.atomic[offset], 0);
-}
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-// Make lock_array_copied an explicit translation unit scope thingy
-namespace Kokkos {
-namespace Impl {
-namespace {
-static int lock_array_copied = 0;
-inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
-}  // namespace
-
-#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-inline
-#else
-inline static
-#endif
-    void
-    copy_cuda_lock_arrays_to_device() {
-  if (lock_array_copied == 0) {
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemcpyToSymbol(g_device_cuda_lock_arrays,
-                                                  &g_host_cuda_lock_arrays,
-                                                  sizeof(CudaLockArrays)));
-  }
-  lock_array_copied = 1;
-}
-
-#ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
-
-#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-inline void ensure_cuda_lock_arrays_on_device() {}
-#else
-inline static void ensure_cuda_lock_arrays_on_device() {
-  copy_cuda_lock_arrays_to_device();
-}
-#endif
-
-#else
-
-#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-inline void ensure_cuda_lock_arrays_on_device() {}
-#else
-// Still Need COPY_CUDA_LOCK_ARRAYS for team scratch etc.
-inline static void ensure_cuda_lock_arrays_on_device() {
-  copy_cuda_lock_arrays_to_device();
-  desul::ensure_cuda_lock_arrays_on_device();
-}
-#endif
-
-#endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-#endif /* defined( KOKKOS_ENABLE_CUDA ) */
-
-#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
index 0015d1ea14d7755cef5f7a93c3c762cfd1f50d90..8aae27d091f268eb464eed1f9408624d6261cc7d 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
@@ -188,11 +188,13 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
       : m_functor(arg_functor), m_rp(arg_policy) {}
 };
 
-template <class FunctorType, class ReducerType, class... Traits>
-class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
-                     Kokkos::Cuda> {
+template <class CombinedFunctorReducerType, class... Traits>
+class ParallelReduce<CombinedFunctorReducerType,
+                     Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
  public:
-  using Policy = Kokkos::MDRangePolicy<Traits...>;
+  using Policy      = Kokkos::MDRangePolicy<Traits...>;
+  using FunctorType = typename CombinedFunctorReducerType::functor_type;
+  using ReducerType = typename CombinedFunctorReducerType::reducer_type;
 
  private:
   using array_index_type = typename Policy::array_index_type;
@@ -202,37 +204,41 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
   using Member       = typename Policy::member_type;
   using LaunchBounds = typename Policy::launch_bounds;
 
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using WorkTagFwd =
-      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                  WorkTag, void>::type;
-
-  using Analysis =
-      Kokkos::Impl::FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy,
-                                    ReducerTypeFwd>;
-
  public:
-  using pointer_type   = typename Analysis::pointer_type;
-  using value_type     = typename Analysis::value_type;
-  using reference_type = typename Analysis::reference_type;
+  using pointer_type   = typename ReducerType::pointer_type;
+  using value_type     = typename ReducerType::value_type;
+  using reference_type = typename ReducerType::reference_type;
   using functor_type   = FunctorType;
   using size_type      = Cuda::size_type;
   using reducer_type   = ReducerType;
 
+  // Conditionally set word_size_type to int16_t or int8_t if value_type is
+  // smaller than int32_t (Kokkos::Cuda::size_type)
+  // word_size_type is used to determine the word count, shared memory buffer
+  // size, and global memory buffer size before the reduction is performed.
+  // Within the reduction, the word count is recomputed based on word_size_type
+  // and when calculating indexes into the shared/global memory buffers for
+  // performing the reduction, word_size_type is used again.
+  // For scalars > 4 bytes in size, indexing into shared/global memory relies
+  // on the block and grid dimensions to ensure that we index at the correct
+  // offset rather than at every 4 byte word; such that, when the join is
+  // performed, we have the correct data that was copied over in chunks of 4
+  // bytes.
+  static_assert(sizeof(size_type) == 4);
+  using word_size_type = std::conditional_t<
+      sizeof(value_type) < 4,
+      std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;
+
   // Algorithmic constraints: blockSize is a power of two AND blockDim.y ==
   // blockDim.z == 1
 
-  const FunctorType m_functor;
+  const CombinedFunctorReducerType m_functor_reducer;
   const Policy m_policy;  // used for workrange and nwork
-  const ReducerType m_reducer;
   const pointer_type m_result_ptr;
   const bool m_result_ptr_device_accessible;
-  size_type* m_scratch_space;
+  word_size_type* m_scratch_space;
   size_type* m_scratch_flags;
-  size_type* m_unified_space;
+  word_size_type* m_unified_space;
 
   using DeviceIteratePattern = typename Kokkos::Impl::Reduce::DeviceIterateTile<
       Policy::rank, Policy, FunctorType, typename Policy::work_tag,
@@ -241,7 +247,7 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
   // Shall we use the shfl based reduction or not (only use it for static sized
   // types of more than 128bit
   static constexpr bool UseShflReduction = false;
-  //((sizeof(value_type)>2*sizeof(double)) && Analysis::StaticValueSize)
+  //((sizeof(value_type)>2*sizeof(double)) && ReducerType::static_value_size())
   // Some crutch to do function overloading
 
  public:
@@ -253,29 +259,28 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
   inline __device__ void exec_range(reference_type update) const {
     Kokkos::Impl::Reduce::DeviceIterateTile<Policy::rank, Policy, FunctorType,
                                             typename Policy::work_tag,
-                                            reference_type>(m_policy, m_functor,
-                                                            update)
+                                            reference_type>(
+        m_policy, m_functor_reducer.get_functor(), update)
         .exec_range();
   }
 
   inline __device__ void operator()() const {
-    typename Analysis::Reducer final_reducer(
-        &ReducerConditional::select(m_functor, m_reducer));
-    const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
-                                                   sizeof(size_type)>
-        word_count(Analysis::value_size(
-                       ReducerConditional::select(m_functor, m_reducer)) /
-                   sizeof(size_type));
+    const integral_nonzero_constant<word_size_type,
+                                    ReducerType::static_value_size() /
+                                        sizeof(word_size_type)>
+        word_count(m_functor_reducer.get_reducer().value_size() /
+                   sizeof(word_size_type));
 
     {
-      reference_type value = final_reducer.init(reinterpret_cast<pointer_type>(
-          kokkos_impl_cuda_shared_memory<size_type>() +
-          threadIdx.y * word_count.value));
+      reference_type value =
+          m_functor_reducer.get_reducer().init(reinterpret_cast<pointer_type>(
+              kokkos_impl_cuda_shared_memory<word_size_type>() +
+              threadIdx.y * word_count.value));
 
       // Number of blocks is bounded so that the reduction can be limited to two
       // passes. Each thread block is given an approximately equal amount of
       // work to perform. Accumulate the values for this block. The accumulation
-      // ordering does not match the final pass, but is arithmatically
+      // ordering does not match the final pass, but is arithmetically
       // equivalent.
 
       this->exec_range(value);
@@ -284,20 +289,22 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
     // Reduce with final value at blockDim.y - 1 location.
     // Problem: non power-of-two blockDim
     if (cuda_single_inter_block_reduce_scan<false>(
-            final_reducer, blockIdx.x, gridDim.x,
-            kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space,
+            m_functor_reducer.get_reducer(), blockIdx.x, gridDim.x,
+            kokkos_impl_cuda_shared_memory<word_size_type>(), m_scratch_space,
             m_scratch_flags)) {
       // This is the final block with the final result at the final threads'
       // location
-      size_type* const shared = kokkos_impl_cuda_shared_memory<size_type>() +
-                                (blockDim.y - 1) * word_count.value;
-      size_type* const global =
+      word_size_type* const shared =
+          kokkos_impl_cuda_shared_memory<word_size_type>() +
+          (blockDim.y - 1) * word_count.value;
+      word_size_type* const global =
           m_result_ptr_device_accessible
-              ? reinterpret_cast<size_type*>(m_result_ptr)
+              ? reinterpret_cast<word_size_type*>(m_result_ptr)
               : (m_unified_space ? m_unified_space : m_scratch_space);
 
       if (threadIdx.y == 0) {
-        final_reducer.final(reinterpret_cast<value_type*>(shared));
+        m_functor_reducer.get_reducer().final(
+            reinterpret_cast<value_type*>(shared));
       }
 
       if (CudaTraits::WarpSize < word_count.value) {
@@ -314,9 +321,11 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
   inline unsigned local_block_size(const FunctorType& f) {
     unsigned n = CudaTraits::WarpSize * 8;
     int shmem_size =
-        cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
+        cuda_single_inter_block_reduce_scan_shmem<false, WorkTag, value_type>(
             f, n);
-    using closure_type = Impl::ParallelReduce<FunctorType, Policy, ReducerType>;
+    using closure_type =
+        Impl::ParallelReduce<CombinedFunctorReducer<FunctorType, ReducerType>,
+                             Policy, Kokkos::Cuda>;
     cudaFuncAttributes attr =
         CudaParallelLaunch<closure_type,
                            LaunchBounds>::get_cuda_func_attributes();
@@ -330,39 +339,39 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
                  m_policy.space().impl_internal_space_instance(), attr, f, 1,
                  shmem_size, 0)))) {
       n >>= 1;
-      shmem_size = cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
-                                                             WorkTag>(f, n);
+      shmem_size =
+          cuda_single_inter_block_reduce_scan_shmem<false, WorkTag, value_type>(
+              f, n);
     }
     return n;
   }
 
   inline void execute() {
-    typename Analysis::Reducer final_reducer(
-        &ReducerConditional::select(m_functor, m_reducer));
-
     const auto nwork = m_policy.m_num_tiles;
     if (nwork) {
       int block_size = m_policy.m_prod_tile_dims;
       // CONSTRAINT: Algorithm requires block_size >= product of tile dimensions
       // Nearest power of two
-      int exponent_pow_two    = std::ceil(std::log2(block_size));
-      block_size              = std::pow(2, exponent_pow_two);
-      int suggested_blocksize = local_block_size(m_functor);
+      int exponent_pow_two = std::ceil(std::log2(block_size));
+      block_size           = std::pow(2, exponent_pow_two);
+      int suggested_blocksize =
+          local_block_size(m_functor_reducer.get_functor());
 
       block_size = (block_size > suggested_blocksize)
                        ? block_size
                        : suggested_blocksize;  // Note: block_size must be less
                                                // than or equal to 512
 
-      m_scratch_space = cuda_internal_scratch_space(
-          m_policy.space(), Analysis::value_size(ReducerConditional::select(
-                                m_functor, m_reducer)) *
-                                block_size /* block_size == max block_count */);
+      m_scratch_space =
+          reinterpret_cast<word_size_type*>(cuda_internal_scratch_space(
+              m_policy.space(),
+              m_functor_reducer.get_reducer().value_size() *
+                  block_size /* block_size == max block_count */));
       m_scratch_flags =
           cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type));
-      m_unified_space = cuda_internal_scratch_unified(
-          m_policy.space(), Analysis::value_size(ReducerConditional::select(
-                                m_functor, m_reducer)));
+      m_unified_space =
+          reinterpret_cast<word_size_type*>(cuda_internal_scratch_unified(
+              m_policy.space(), m_functor_reducer.get_reducer().value_size()));
 
       // REQUIRED ( 1 , N , 1 )
       const dim3 block(1, block_size, 1);
@@ -373,9 +382,9 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
       const int shmem =
           UseShflReduction
               ? 0
-              : cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
-                                                          WorkTag>(m_functor,
-                                                                   block.y);
+              : cuda_single_inter_block_reduce_scan_shmem<false, WorkTag,
+                                                          value_type>(
+                    m_functor_reducer.get_functor(), block.y);
 
       CudaParallelLaunch<ParallelReduce, LaunchBounds>(
           *this, grid, block, shmem,
@@ -389,14 +398,12 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
                 "Kokkos::Impl::ParallelReduce<Cuda, MDRangePolicy>::execute: "
                 "Result Not Device Accessible");
 
-            const int count = Analysis::value_count(
-                ReducerConditional::select(m_functor, m_reducer));
+            const int count = m_functor_reducer.get_reducer().value_count();
             for (int i = 0; i < count; ++i) {
               m_result_ptr[i] = pointer_type(m_unified_space)[i];
             }
           } else {
-            const int size = Analysis::value_size(
-                ReducerConditional::select(m_functor, m_reducer));
+            const int size = m_functor_reducer.get_reducer().value_size();
             DeepCopy<HostSpace, CudaSpace, Cuda>(m_policy.space(), m_result_ptr,
                                                  m_scratch_space, size);
           }
@@ -405,19 +412,16 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
     } else {
       if (m_result_ptr) {
         // TODO @graph We need to effectively insert this in to the graph
-        final_reducer.init(m_result_ptr);
+        m_functor_reducer.get_reducer().init(m_result_ptr);
       }
     }
   }
 
   template <class ViewType>
-  ParallelReduce(
-      const FunctorType& arg_functor, const Policy& arg_policy,
-      const ViewType& arg_result,
-      std::enable_if_t<Kokkos::is_view<ViewType>::value, void*> = nullptr)
-      : m_functor(arg_functor),
+  ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer,
+                 const Policy& arg_policy, const ViewType& arg_result)
+      : m_functor_reducer(arg_functor_reducer),
         m_policy(arg_policy),
-        m_reducer(InvalidType()),
         m_result_ptr(arg_result.data()),
         m_result_ptr_device_accessible(
             MemorySpaceAccess<Kokkos::CudaSpace,
@@ -425,23 +429,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
         m_scratch_space(nullptr),
         m_scratch_flags(nullptr),
         m_unified_space(nullptr) {
-    check_reduced_view_shmem_size<WorkTag>(m_policy, m_functor);
-  }
-
-  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
-                 const ReducerType& reducer)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()),
-        m_result_ptr_device_accessible(
-            MemorySpaceAccess<Kokkos::CudaSpace,
-                              typename ReducerType::result_view_type::
-                                  memory_space>::accessible),
-        m_scratch_space(nullptr),
-        m_scratch_flags(nullptr),
-        m_unified_space(nullptr) {
-    check_reduced_view_shmem_size<WorkTag>(m_policy, m_functor);
+    check_reduced_view_shmem_size<WorkTag, value_type>(
+        m_policy, m_functor_reducer.get_functor());
   }
 };
 }  // namespace Impl
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
index c5e89fc3da1df33cdd001db579d5eb1d3f4bedfd..5226c48bd9acc842e0095e9da334e230f1466fbd 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
@@ -114,11 +114,13 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
       : m_functor(arg_functor), m_policy(arg_policy) {}
 };
 
-template <class FunctorType, class ReducerType, class... Traits>
-class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
+template <class CombinedFunctorReducerType, class... Traits>
+class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
                      Kokkos::Cuda> {
  public:
-  using Policy = Kokkos::RangePolicy<Traits...>;
+  using Policy      = Kokkos::RangePolicy<Traits...>;
+  using FunctorType = typename CombinedFunctorReducerType::functor_type;
+  using ReducerType = typename CombinedFunctorReducerType::reducer_type;
 
  private:
   using WorkRange    = typename Policy::WorkRange;
@@ -126,22 +128,10 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   using Member       = typename Policy::member_type;
   using LaunchBounds = typename Policy::launch_bounds;
 
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using WorkTagFwd =
-      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                  WorkTag, void>::type;
-
-  using Analysis =
-      Kokkos::Impl::FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy,
-                                    ReducerTypeFwd>;
-
  public:
-  using pointer_type   = typename Analysis::pointer_type;
-  using value_type     = typename Analysis::value_type;
-  using reference_type = typename Analysis::reference_type;
+  using pointer_type   = typename ReducerType::pointer_type;
+  using value_type     = typename ReducerType::value_type;
+  using reference_type = typename ReducerType::reference_type;
   using functor_type   = FunctorType;
   // Conditionally set word_size_type to int16_t or int8_t if value_type is
   // smaller than int32_t (Kokkos::Cuda::size_type)
@@ -165,9 +155,8 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   // Algorithmic constraints: blockSize is a power of two AND blockDim.y ==
   // blockDim.z == 1
 
-  const FunctorType m_functor;
+  const CombinedFunctorReducerType m_functor_reducer;
   const Policy m_policy;
-  const ReducerType m_reducer;
   const pointer_type m_result_ptr;
   const bool m_result_ptr_device_accessible;
   const bool m_result_ptr_host_accessible;
@@ -179,7 +168,7 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
 
   // FIXME_CUDA Shall we use the shfl based reduction or not (only use it for
   // static sized types of more than 128bit:
-  // sizeof(value_type)>2*sizeof(double)) && Analysis::StaticValueSize)
+  // sizeof(value_type)>2*sizeof(double)) && ReducerType::static_value_size())
   static constexpr bool UseShflReduction = false;
 
  public:
@@ -189,34 +178,32 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   template <class TagType>
   __device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_range(
       const Member& i, reference_type update) const {
-    m_functor(i, update);
+    m_functor_reducer.get_functor()(i, update);
   }
 
   template <class TagType>
   __device__ inline std::enable_if_t<!std::is_void<TagType>::value> exec_range(
       const Member& i, reference_type update) const {
-    m_functor(TagType(), i, update);
+    m_functor_reducer.get_functor()(TagType(), i, update);
   }
 
   __device__ inline void operator()() const {
-    typename Analysis::Reducer final_reducer(
-        &ReducerConditional::select(m_functor, m_reducer));
-
-    const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
-                                                        sizeof(word_size_type)>
-        word_count(Analysis::value_size(
-                       ReducerConditional::select(m_functor, m_reducer)) /
+    const integral_nonzero_constant<word_size_type,
+                                    ReducerType::static_value_size() /
+                                        sizeof(word_size_type)>
+        word_count(m_functor_reducer.get_reducer().value_size() /
                    sizeof(word_size_type));
 
     {
-      reference_type value = final_reducer.init(reinterpret_cast<pointer_type>(
-          kokkos_impl_cuda_shared_memory<word_size_type>() +
-          threadIdx.y * word_count.value));
+      reference_type value =
+          m_functor_reducer.get_reducer().init(reinterpret_cast<pointer_type>(
+              kokkos_impl_cuda_shared_memory<word_size_type>() +
+              threadIdx.y * word_count.value));
 
       // Number of blocks is bounded so that the reduction can be limited to two
       // passes. Each thread block is given an approximately equal amount of
       // work to perform. Accumulate the values for this block. The accumulation
-      // ordering does not match the final pass, but is arithmatically
+      // ordering does not match the final pass, but is arithmetically
       // equivalent.
 
       const WorkRange range(m_policy, blockIdx.x, gridDim.x);
@@ -233,7 +220,7 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
     bool do_final_reduction = true;
     if (!zero_length)
       do_final_reduction = cuda_single_inter_block_reduce_scan<false>(
-          final_reducer, blockIdx.x, gridDim.x,
+          m_functor_reducer.get_reducer(), blockIdx.x, gridDim.x,
           kokkos_impl_cuda_shared_memory<word_size_type>(), m_scratch_space,
           m_scratch_flags);
 
@@ -250,7 +237,8 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
               : (m_unified_space ? m_unified_space : m_scratch_space);
 
       if (threadIdx.y == 0) {
-        final_reducer.final(reinterpret_cast<value_type*>(shared));
+        m_functor_reducer.get_reducer().final(
+            reinterpret_cast<value_type*>(shared));
       }
 
       if (CudaTraits::WarpSize < word_count.value) {
@@ -267,9 +255,11 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   inline unsigned local_block_size(const FunctorType& f) {
     unsigned n = CudaTraits::WarpSize * 8;
     int shmem_size =
-        cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
+        cuda_single_inter_block_reduce_scan_shmem<false, WorkTag, value_type>(
             f, n);
-    using closure_type = Impl::ParallelReduce<FunctorType, Policy, ReducerType>;
+    using closure_type =
+        Impl::ParallelReduce<CombinedFunctorReducer<FunctorType, ReducerType>,
+                             Policy, Kokkos::Cuda>;
     cudaFuncAttributes attr =
         CudaParallelLaunch<closure_type,
                            LaunchBounds>::get_cuda_func_attributes();
@@ -283,31 +273,28 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                  m_policy.space().impl_internal_space_instance(), attr, f, 1,
                  shmem_size, 0)))) {
       n >>= 1;
-      shmem_size = cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
-                                                             WorkTag>(f, n);
+      shmem_size =
+          cuda_single_inter_block_reduce_scan_shmem<false, WorkTag, value_type>(
+              f, n);
     }
     return n;
   }
 
   inline void execute() {
-    typename Analysis::Reducer final_reducer(
-        &ReducerConditional::select(m_functor, m_reducer));
-
     const index_type nwork     = m_policy.end() - m_policy.begin();
-    const bool need_device_set = Analysis::has_init_member_function ||
-                                 Analysis::has_final_member_function ||
+    const bool need_device_set = ReducerType::has_init_member_function() ||
+                                 ReducerType::has_final_member_function() ||
                                  !m_result_ptr_host_accessible ||
                                  Policy::is_graph_kernel::value ||
                                  !std::is_same<ReducerType, InvalidType>::value;
     if ((nwork > 0) || need_device_set) {
-      const int block_size = local_block_size(m_functor);
+      const int block_size = local_block_size(m_functor_reducer.get_functor());
 
       KOKKOS_ASSERT(block_size > 0);
 
       // TODO: down casting these uses more space than required?
       m_scratch_space = (word_size_type*)cuda_internal_scratch_space(
-          m_policy.space(), Analysis::value_size(ReducerConditional::select(
-                                m_functor, m_reducer)) *
+          m_policy.space(), m_functor_reducer.get_reducer().value_size() *
                                 block_size /* block_size == max block_count */);
 
       // Intentionally do not downcast to word_size_type since we use Cuda
@@ -316,8 +303,7 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                                                     sizeof(Cuda::size_type));
       m_unified_space =
           reinterpret_cast<word_size_type*>(cuda_internal_scratch_unified(
-              m_policy.space(), Analysis::value_size(ReducerConditional::select(
-                                    m_functor, m_reducer))));
+              m_policy.space(), m_functor_reducer.get_reducer().value_size()));
 
       // REQUIRED ( 1 , N , 1 )
       dim3 block(1, block_size, 1);
@@ -329,9 +315,9 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
       const int shmem =
           UseShflReduction
               ? 0
-              : cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
-                                                          WorkTag>(m_functor,
-                                                                   block.y);
+              : cuda_single_inter_block_reduce_scan_shmem<false, WorkTag,
+                                                          value_type>(
+                    m_functor_reducer.get_functor(), block.y);
 
       if ((nwork == 0)
 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
@@ -354,14 +340,12 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                 "Kokkos::Impl::ParallelReduce<Cuda, RangePolicy>::execute: "
                 "Result "
                 "Not Device Accessible");
-            const int count = Analysis::value_count(
-                ReducerConditional::select(m_functor, m_reducer));
+            const int count = m_functor_reducer.get_reducer().value_count();
             for (int i = 0; i < count; ++i) {
               m_result_ptr[i] = pointer_type(m_unified_space)[i];
             }
           } else {
-            const int size = Analysis::value_size(
-                ReducerConditional::select(m_functor, m_reducer));
+            const int size = m_functor_reducer.get_reducer().value_size();
             DeepCopy<HostSpace, CudaSpace, Cuda>(m_policy.space(), m_result_ptr,
                                                  m_scratch_space, size);
           }
@@ -370,19 +354,16 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
     } else {
       if (m_result_ptr) {
         // TODO @graph We need to effectively insert this in to the graph
-        final_reducer.init(m_result_ptr);
+        m_functor_reducer.get_reducer().init(m_result_ptr);
       }
     }
   }
 
   template <class ViewType>
-  ParallelReduce(
-      const FunctorType& arg_functor, const Policy& arg_policy,
-      const ViewType& arg_result,
-      std::enable_if_t<Kokkos::is_view<ViewType>::value, void*> = nullptr)
-      : m_functor(arg_functor),
+  ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer,
+                 const Policy& arg_policy, const ViewType& arg_result)
+      : m_functor_reducer(arg_functor_reducer),
         m_policy(arg_policy),
-        m_reducer(InvalidType()),
         m_result_ptr(arg_result.data()),
         m_result_ptr_device_accessible(
             MemorySpaceAccess<Kokkos::CudaSpace,
@@ -393,27 +374,8 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
         m_scratch_space(nullptr),
         m_scratch_flags(nullptr),
         m_unified_space(nullptr) {
-    check_reduced_view_shmem_size<WorkTag>(m_policy, m_functor);
-  }
-
-  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
-                 const ReducerType& reducer)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()),
-        m_result_ptr_device_accessible(
-            MemorySpaceAccess<Kokkos::CudaSpace,
-                              typename ReducerType::result_view_type::
-                                  memory_space>::accessible),
-        m_result_ptr_host_accessible(
-            MemorySpaceAccess<Kokkos::HostSpace,
-                              typename ReducerType::result_view_type::
-                                  memory_space>::accessible),
-        m_scratch_space(nullptr),
-        m_scratch_flags(nullptr),
-        m_unified_space(nullptr) {
-    check_reduced_view_shmem_size<WorkTag>(m_policy, m_functor);
+    check_reduced_view_shmem_size<WorkTag, value_type>(
+        m_policy, m_functor_reducer.get_functor());
   }
 };
 
@@ -429,7 +391,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
   using LaunchBounds = typename Policy::launch_bounds;
 
   using Analysis = Kokkos::Impl::FunctorAnalysis<FunctorPatternInterface::SCAN,
-                                                 Policy, FunctorType>;
+                                                 Policy, FunctorType, void>;
 
  public:
   using pointer_type   = typename Analysis::pointer_type;
@@ -460,7 +422,8 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
   //  (c) gridDim.x  <= blockDim.y * blockDim.y
   //  (d) gridDim.y  == gridDim.z == 1
 
-  const FunctorType m_functor;
+  const CombinedFunctorReducer<FunctorType, typename Analysis::Reducer>
+      m_functor_reducer;
   const Policy m_policy;
   word_size_type* m_scratch_space;
   size_type* m_scratch_flags;
@@ -472,23 +435,25 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
   template <class TagType>
   __device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_range(
       const Member& i, reference_type update, const bool final_result) const {
-    m_functor(i, update, final_result);
+    m_functor_reducer.get_functor()(i, update, final_result);
   }
 
   template <class TagType>
   __device__ inline std::enable_if_t<!std::is_void<TagType>::value> exec_range(
       const Member& i, reference_type update, const bool final_result) const {
-    m_functor(TagType(), i, update, final_result);
+    m_functor_reducer.get_functor()(TagType(), i, update, final_result);
   }
 
   //----------------------------------------
 
   __device__ inline void initial() const {
-    typename Analysis::Reducer final_reducer(&m_functor);
+    const typename Analysis::Reducer& final_reducer =
+        m_functor_reducer.get_reducer();
 
     const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
                                                         sizeof(word_size_type)>
-        word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
+        word_count(Analysis::value_size(m_functor_reducer.get_functor()) /
+                   sizeof(word_size_type));
 
     word_size_type* const shared_value =
         kokkos_impl_cuda_shared_memory<word_size_type>() +
@@ -499,7 +464,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
     // Number of blocks is bounded so that the reduction can be limited to two
     // passes. Each thread block is given an approximately equal amount of work
     // to perform. Accumulate the values for this block. The accumulation
-    // ordering does not match the final pass, but is arithmatically equivalent.
+    // ordering does not match the final pass, but is arithmetically equivalent.
 
     const WorkRange range(m_policy, blockIdx.x, gridDim.x);
 
@@ -524,11 +489,13 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
   //----------------------------------------
 
   __device__ inline void final() const {
-    typename Analysis::Reducer final_reducer(&m_functor);
+    const typename Analysis::Reducer& final_reducer =
+        m_functor_reducer.get_reducer();
 
     const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
                                                         sizeof(word_size_type)>
-        word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
+        word_count(Analysis::value_size(m_functor_reducer.get_functor()) /
+                   sizeof(word_size_type));
 
     // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
     // value[2] , ... }
@@ -643,11 +610,12 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
     // testing
 
     unsigned n = CudaTraits::WarpSize * 4;
-    while (n && unsigned(m_policy.space()
-                             .impl_internal_space_instance()
-                             ->m_maxShmemPerBlock) <
-                    cuda_single_inter_block_reduce_scan_shmem<true, FunctorType,
-                                                              WorkTag>(f, n)) {
+    while (n &&
+           unsigned(m_policy.space()
+                        .impl_internal_space_instance()
+                        ->m_maxShmemPerBlock) <
+               cuda_single_inter_block_reduce_scan_shmem<true, WorkTag,
+                                                         value_type>(f, n)) {
       n >>= 1;
     }
     return n;
@@ -658,7 +626,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
     if (nwork) {
       constexpr int GridMaxComputeCapability_2x = 0x0ffff;
 
-      const int block_size = local_block_size(m_functor);
+      const int block_size = local_block_size(m_functor_reducer.get_functor());
       KOKKOS_ASSERT(block_size > 0);
 
       const int grid_max =
@@ -678,13 +646,15 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
 
       m_scratch_space =
           reinterpret_cast<word_size_type*>(cuda_internal_scratch_space(
-              m_policy.space(), Analysis::value_size(m_functor) * grid_x));
+              m_policy.space(),
+              Analysis::value_size(m_functor_reducer.get_functor()) * grid_x));
       m_scratch_flags =
           cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1);
 
       dim3 grid(grid_x, 1, 1);
       dim3 block(1, block_size, 1);  // REQUIRED DIMENSIONS ( 1 , N , 1 )
-      const int shmem = Analysis::value_size(m_functor) * (block_size + 2);
+      const int shmem = Analysis::value_size(m_functor_reducer.get_functor()) *
+                        (block_size + 2);
 
 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
       if (m_run_serial) {
@@ -709,7 +679,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
   }
 
   ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy)
-      : m_functor(arg_functor),
+      : m_functor_reducer(arg_functor, typename Analysis::Reducer{arg_functor}),
         m_policy(arg_policy),
         m_scratch_space(nullptr),
         m_scratch_flags(nullptr),
@@ -735,8 +705,9 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
   using WorkRange    = typename Policy::WorkRange;
   using LaunchBounds = typename Policy::launch_bounds;
 
-  using Analysis = Kokkos::Impl::FunctorAnalysis<FunctorPatternInterface::SCAN,
-                                                 Policy, FunctorType>;
+  using Analysis =
+      Kokkos::Impl::FunctorAnalysis<FunctorPatternInterface::SCAN, Policy,
+                                    FunctorType, ReturnType>;
 
  public:
   using value_type     = typename Analysis::value_type;
@@ -767,7 +738,8 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
   //  (c) gridDim.x  <= blockDim.y * blockDim.y
   //  (d) gridDim.y  == gridDim.z == 1
 
-  const FunctorType m_functor;
+  const CombinedFunctorReducer<FunctorType, typename Analysis::Reducer>
+      m_functor_reducer;
   const Policy m_policy;
   word_size_type* m_scratch_space;
   size_type* m_scratch_flags;
@@ -782,23 +754,25 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
   template <class TagType>
   __device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_range(
       const Member& i, reference_type update, const bool final_result) const {
-    m_functor(i, update, final_result);
+    m_functor_reducer.get_functor()(i, update, final_result);
   }
 
   template <class TagType>
   __device__ inline std::enable_if_t<!std::is_void<TagType>::value> exec_range(
       const Member& i, reference_type update, const bool final_result) const {
-    m_functor(TagType(), i, update, final_result);
+    m_functor_reducer.get_functor()(TagType(), i, update, final_result);
   }
 
   //----------------------------------------
 
   __device__ inline void initial() const {
-    typename Analysis::Reducer final_reducer(&m_functor);
+    const typename Analysis::Reducer& final_reducer =
+        m_functor_reducer.get_reducer();
 
     const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
                                                         sizeof(word_size_type)>
-        word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
+        word_count(Analysis::value_size(m_functor_reducer.get_functor()) /
+                   sizeof(word_size_type));
 
     word_size_type* const shared_value =
         kokkos_impl_cuda_shared_memory<word_size_type>() +
@@ -809,7 +783,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
     // Number of blocks is bounded so that the reduction can be limited to two
     // passes. Each thread block is given an approximately equal amount of work
     // to perform. Accumulate the values for this block. The accumulation
-    // ordering does not match the final pass, but is arithmatically equivalent.
+    // ordering does not match the final pass, but is arithmetically equivalent.
 
     const WorkRange range(m_policy, blockIdx.x, gridDim.x);
 
@@ -834,11 +808,12 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
   //----------------------------------------
 
   __device__ inline void final() const {
-    typename Analysis::Reducer final_reducer(&m_functor);
+    const typename Analysis::Reducer& final_reducer =
+        m_functor_reducer.get_reducer();
 
     const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
                                                         sizeof(word_size_type)>
-        word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
+        word_count(final_reducer.value_size() / sizeof(word_size_type));
 
     // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
     // value[2] , ... }
@@ -959,11 +934,12 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
     // testing
 
     unsigned n = CudaTraits::WarpSize * 4;
-    while (n && unsigned(m_policy.space()
-                             .impl_internal_space_instance()
-                             ->m_maxShmemPerBlock) <
-                    cuda_single_inter_block_reduce_scan_shmem<true, FunctorType,
-                                                              WorkTag>(f, n)) {
+    while (n &&
+           unsigned(m_policy.space()
+                        .impl_internal_space_instance()
+                        ->m_maxShmemPerBlock) <
+               cuda_single_inter_block_reduce_scan_shmem<true, WorkTag,
+                                                         value_type>(f, n)) {
       n >>= 1;
     }
     return n;
@@ -974,7 +950,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
     if (nwork) {
       enum { GridMaxComputeCapability_2x = 0x0ffff };
 
-      const int block_size = local_block_size(m_functor);
+      const int block_size = local_block_size(m_functor_reducer.get_functor());
       KOKKOS_ASSERT(block_size > 0);
 
       const int grid_max =
@@ -992,15 +968,17 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
       // How many block are really needed for this much work:
       const int grid_x = (nwork + work_per_block - 1) / work_per_block;
 
+      const typename Analysis::Reducer& final_reducer =
+          m_functor_reducer.get_reducer();
       m_scratch_space =
           reinterpret_cast<word_size_type*>(cuda_internal_scratch_space(
-              m_policy.space(), Analysis::value_size(m_functor) * grid_x));
+              m_policy.space(), final_reducer.value_size() * grid_x));
       m_scratch_flags =
           cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1);
 
       dim3 grid(grid_x, 1, 1);
       dim3 block(1, block_size, 1);  // REQUIRED DIMENSIONS ( 1 , N , 1 )
-      const int shmem = Analysis::value_size(m_functor) * (block_size + 2);
+      const int shmem = final_reducer.value_size() * (block_size + 2);
 
 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
       if (m_run_serial) {
@@ -1021,7 +999,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
           m_policy.space()
               .impl_internal_space_instance());  // copy to device and execute
 
-      const int size = Analysis::value_size(m_functor);
+      const int size = final_reducer.value_size();
 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
       if (m_run_serial)
         DeepCopy<HostSpace, CudaSpace, Cuda>(m_policy.space(), &m_returnvalue,
@@ -1042,7 +1020,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
   ParallelScanWithTotal(const FunctorType& arg_functor,
                         const Policy& arg_policy,
                         const ViewType& arg_result_view)
-      : m_functor(arg_functor),
+      : m_functor_reducer(arg_functor, typename Analysis::Reducer{arg_functor}),
         m_policy(arg_policy),
         m_scratch_space(nullptr),
         m_scratch_flags(nullptr),
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
index cdff86ccfc48d54dfc6a8c8c35256787959fed64..498e57f94a7536863eaa73fb4bf89e43adeb40f2 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
@@ -31,7 +31,6 @@
 #include <Cuda/Kokkos_Cuda_KernelLaunch.hpp>
 #include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
 #include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
-#include <Cuda/Kokkos_Cuda_Locks.hpp>
 #include <Cuda/Kokkos_Cuda_Team.hpp>
 #include <Kokkos_MinMaxClamp.hpp>
 #include <Kokkos_Vectorization.hpp>
@@ -115,13 +114,11 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
                            const ParallelReduceTag&) const {
     using functor_analysis_type =
         Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
-                              TeamPolicyInternal, FunctorType>;
-    using reducer_type = typename Impl::ParallelReduceReturnValue<
-        void, typename functor_analysis_type::value_type,
-        FunctorType>::reducer_type;
-    using closure_type =
-        Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
-                             reducer_type>;
+                              TeamPolicyInternal, FunctorType, void>;
+    using closure_type = Impl::ParallelReduce<
+        CombinedFunctorReducer<FunctorType,
+                               typename functor_analysis_type::Reducer>,
+        TeamPolicy<Properties...>, Kokkos::Cuda>;
     return internal_team_size_max<closure_type>(f);
   }
 
@@ -129,8 +126,8 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
   inline int team_size_max(const FunctorType& f, const ReducerType& /*r*/,
                            const ParallelReduceTag&) const {
     using closure_type =
-        Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
-                             ReducerType>;
+        Impl::ParallelReduce<CombinedFunctorReducer<FunctorType, ReducerType>,
+                             TeamPolicy<Properties...>, Kokkos::Cuda>;
     return internal_team_size_max<closure_type>(f);
   }
 
@@ -156,13 +153,11 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
                                    const ParallelReduceTag&) const {
     using functor_analysis_type =
         Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
-                              TeamPolicyInternal, FunctorType>;
-    using reducer_type = typename Impl::ParallelReduceReturnValue<
-        void, typename functor_analysis_type::value_type,
-        FunctorType>::reducer_type;
-    using closure_type =
-        Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
-                             reducer_type>;
+                              TeamPolicyInternal, FunctorType, void>;
+    using closure_type = Impl::ParallelReduce<
+        CombinedFunctorReducer<FunctorType,
+                               typename functor_analysis_type::Reducer>,
+        TeamPolicy<Properties...>, Kokkos::Cuda>;
     return internal_team_size_recommended<closure_type>(f);
   }
 
@@ -170,8 +165,8 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
   int team_size_recommended(const FunctorType& f, const ReducerType&,
                             const ParallelReduceTag&) const {
     using closure_type =
-        Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
-                             ReducerType>;
+        Impl::ParallelReduce<CombinedFunctorReducer<FunctorType, ReducerType>,
+                             TeamPolicy<Properties...>, Kokkos::Cuda>;
     return internal_team_size_recommended<closure_type>(f);
   }
 
@@ -370,7 +365,7 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
         typename Impl::DeduceFunctorPatternInterface<ClosureType>::type;
     using Analysis =
         Impl::FunctorAnalysis<Interface, typename ClosureType::Policy,
-                              FunctorType>;
+                              FunctorType, void>;
 
     cudaFuncAttributes attr =
         CudaParallelLaunch<closure_type, typename traits::launch_bounds>::
@@ -408,14 +403,15 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
 };
 
 __device__ inline int64_t cuda_get_scratch_index(Cuda::size_type league_size,
-                                                 int32_t* scratch_locks) {
+                                                 int32_t* scratch_locks,
+                                                 size_t num_scratch_locks) {
   int64_t threadid = 0;
   __shared__ int64_t base_thread_id;
   if (threadIdx.x == 0 && threadIdx.y == 0) {
     int64_t const wraparound_len = Kokkos::max(
-        int64_t(1), Kokkos::min(int64_t(league_size),
-                                (int64_t(g_device_cuda_lock_arrays.n)) /
-                                    (blockDim.x * blockDim.y)));
+        int64_t(1),
+        Kokkos::min(int64_t(league_size),
+                    int64_t(num_scratch_locks) / (blockDim.x * blockDim.y)));
     threadid = (blockIdx.x * blockDim.z + threadIdx.z) % wraparound_len;
     threadid *= blockDim.x * blockDim.y;
     int done = 0;
@@ -477,6 +473,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
   size_t m_scratch_size[2];
   int m_scratch_pool_id = -1;
   int32_t* m_scratch_locks;
+  size_t m_num_scratch_locks;
 
   template <class TagType>
   __device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_team(
@@ -497,7 +494,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
     // Iterate this block through the league
     int64_t threadid = 0;
     if (m_scratch_size[1] > 0) {
-      threadid = cuda_get_scratch_index(m_league_size, m_scratch_locks);
+      threadid = cuda_get_scratch_index(m_league_size, m_scratch_locks,
+                                        m_num_scratch_locks);
     }
 
     const int int_league_size = (int)m_league_size;
@@ -556,9 +554,10 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
     m_shmem_size =
         (m_policy.scratch_size(0, m_team_size) +
          FunctorTeamShmemSize<FunctorType>::value(m_functor, m_team_size));
-    m_scratch_size[0] = m_policy.scratch_size(0, m_team_size);
-    m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
-    m_scratch_locks   = internal_space_instance->m_scratch_locks;
+    m_scratch_size[0]   = m_policy.scratch_size(0, m_team_size);
+    m_scratch_size[1]   = m_policy.scratch_size(1, m_team_size);
+    m_scratch_locks     = internal_space_instance->m_scratch_locks;
+    m_num_scratch_locks = internal_space_instance->m_num_scratch_locks;
 
     // Functor's reduce memory, team scan memory, and team shared memory depend
     // upon team size.
@@ -605,32 +604,22 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
   }
 };
 
-template <class FunctorType, class ReducerType, class... Properties>
-class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
-                     ReducerType, Kokkos::Cuda> {
+template <class CombinedFunctorReducerType, class... Properties>
+class ParallelReduce<CombinedFunctorReducerType,
+                     Kokkos::TeamPolicy<Properties...>, Kokkos::Cuda> {
  public:
-  using Policy = TeamPolicy<Properties...>;
+  using Policy      = TeamPolicy<Properties...>;
+  using FunctorType = typename CombinedFunctorReducerType::functor_type;
+  using ReducerType = typename CombinedFunctorReducerType::reducer_type;
 
  private:
   using Member       = typename Policy::member_type;
   using WorkTag      = typename Policy::work_tag;
   using LaunchBounds = typename Policy::launch_bounds;
 
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using WorkTagFwd =
-      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                  WorkTag, void>::type;
-
-  using Analysis =
-      Kokkos::Impl::FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy,
-                                    ReducerTypeFwd>;
-
-  using pointer_type   = typename Analysis::pointer_type;
-  using reference_type = typename Analysis::reference_type;
-  using value_type     = typename Analysis::value_type;
+  using pointer_type   = typename ReducerType::pointer_type;
+  using reference_type = typename ReducerType::reference_type;
+  using value_type     = typename ReducerType::value_type;
 
  public:
   using functor_type = FunctorType;
@@ -638,7 +627,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   using reducer_type = ReducerType;
 
   static constexpr bool UseShflReduction =
-      (true && (Analysis::StaticValueSize != 0));
+      (true && (ReducerType::static_value_size() != 0));
 
  private:
   struct ShflReductionTag {};
@@ -652,9 +641,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   //  [ team   shared space ]
   //
 
-  const FunctorType m_functor;
+  const CombinedFunctorReducerType m_functor_reducer;
   const Policy m_policy;
-  const ReducerType m_reducer;
   const pointer_type m_result_ptr;
   const bool m_result_ptr_device_accessible;
   const bool m_result_ptr_host_accessible;
@@ -668,6 +656,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   size_t m_scratch_size[2];
   int m_scratch_pool_id = -1;
   int32_t* m_scratch_locks;
+  size_t m_num_scratch_locks;
   const size_type m_league_size;
   int m_team_size;
   const size_type m_vector_size;
@@ -675,13 +664,13 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   template <class TagType>
   __device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_team(
       const Member& member, reference_type update) const {
-    m_functor(member, update);
+    m_functor_reducer.get_functor()(member, update);
   }
 
   template <class TagType>
   __device__ inline std::enable_if_t<!std::is_void<TagType>::value> exec_team(
       const Member& member, reference_type update) const {
-    m_functor(TagType(), member, update);
+    m_functor_reducer.get_functor()(TagType(), member, update);
   }
 
  public:
@@ -690,7 +679,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   __device__ inline void operator()() const {
     int64_t threadid = 0;
     if (m_scratch_size[1] > 0) {
-      threadid = cuda_get_scratch_index(m_league_size, m_scratch_locks);
+      threadid = cuda_get_scratch_index(m_league_size, m_scratch_locks,
+                                        m_num_scratch_locks);
     }
 
     using ReductionTag = std::conditional_t<UseShflReduction, ShflReductionTag,
@@ -702,18 +692,14 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   }
 
   __device__ inline void run(SHMEMReductionTag&, const int& threadid) const {
-    typename Analysis::Reducer final_reducer(
-        &ReducerConditional::select(m_functor, m_reducer));
-
-    const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
-                                                   sizeof(size_type)>
-        word_count(Analysis::value_size(
-                       ReducerConditional::select(m_functor, m_reducer)) /
+    const integral_nonzero_constant<
+        size_type, ReducerType::static_value_size() / sizeof(size_type)>
+        word_count(m_functor_reducer.get_reducer().value_size() /
                    sizeof(size_type));
 
-    reference_type value =
-        final_reducer.init(kokkos_impl_cuda_shared_memory<size_type>() +
-                           threadIdx.y * word_count.value);
+    reference_type value = m_functor_reducer.get_reducer().init(
+        kokkos_impl_cuda_shared_memory<size_type>() +
+        threadIdx.y * word_count.value);
 
     // Iterate this block through the league
     const int int_league_size = (int)m_league_size;
@@ -734,7 +720,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     bool do_final_reduction = true;
     if (!zero_length)
       do_final_reduction = cuda_single_inter_block_reduce_scan<false>(
-          final_reducer, blockIdx.x, gridDim.x,
+          m_functor_reducer.get_reducer(), blockIdx.x, gridDim.x,
           kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space,
           m_scratch_flags);
 
@@ -750,7 +736,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
               : (m_unified_space ? m_unified_space : m_scratch_space);
 
       if (threadIdx.y == 0) {
-        final_reducer.final(reinterpret_cast<value_type*>(shared));
+        m_functor_reducer.get_reducer().final(
+            reinterpret_cast<value_type*>(shared));
       }
 
       if (CudaTraits::WarpSize < word_count.value) {
@@ -764,11 +751,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   }
 
   __device__ inline void run(ShflReductionTag, const int& threadid) const {
-    typename Analysis::Reducer final_reducer(
-        &ReducerConditional::select(m_functor, m_reducer));
-
     value_type value;
-    final_reducer.init(&value);
+    m_functor_reducer.get_reducer().init(&value);
 
     // Iterate this block through the league
     const int int_league_size = (int)m_league_size;
@@ -791,29 +775,26 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
                                              : m_scratch_space);
 
     value_type init;
-    final_reducer.init(&init);
+    m_functor_reducer.get_reducer().init(&init);
 
     if (int_league_size == 0) {
-      final_reducer.final(&value);
+      m_functor_reducer.get_reducer().final(&value);
       *result = value;
-    } else if (Impl::cuda_inter_block_reduction(value, init, final_reducer,
-                                                m_scratch_space, result,
-                                                m_scratch_flags, blockDim.y)) {
+    } else if (Impl::cuda_inter_block_reduction(
+                   value, init, m_functor_reducer.get_reducer(),
+                   m_scratch_space, result, m_scratch_flags, blockDim.y)) {
       const unsigned id = threadIdx.y * blockDim.x + threadIdx.x;
       if (id == 0) {
-        final_reducer.final(&value);
+        m_functor_reducer.get_reducer().final(&value);
         *result = value;
       }
     }
   }
 
   inline void execute() {
-    typename Analysis::Reducer final_reducer(
-        &ReducerConditional::select(m_functor, m_reducer));
-
     const bool is_empty_range  = m_league_size == 0 || m_team_size == 0;
-    const bool need_device_set = Analysis::has_init_member_function ||
-                                 Analysis::has_final_member_function ||
+    const bool need_device_set = ReducerType::has_init_member_function() ||
+                                 ReducerType::has_final_member_function() ||
                                  !m_result_ptr_host_accessible ||
                                  Policy::is_graph_kernel::value ||
                                  !std::is_same<ReducerType, InvalidType>::value;
@@ -823,14 +804,12 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
                                : std::min(int(m_league_size), m_team_size));
 
       m_scratch_space = cuda_internal_scratch_space(
-          m_policy.space(), Analysis::value_size(ReducerConditional::select(
-                                m_functor, m_reducer)) *
-                                block_count);
+          m_policy.space(),
+          m_functor_reducer.get_reducer().value_size() * block_count);
       m_scratch_flags =
           cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type));
       m_unified_space = cuda_internal_scratch_unified(
-          m_policy.space(), Analysis::value_size(ReducerConditional::select(
-                                m_functor, m_reducer)));
+          m_policy.space(), m_functor_reducer.get_reducer().value_size());
 
       dim3 block(m_vector_size, m_team_size, 1);
       dim3 grid(block_count, 1, 1);
@@ -857,14 +836,12 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
 
         if (m_result_ptr) {
           if (m_unified_space) {
-            const int count = Analysis::value_count(
-                ReducerConditional::select(m_functor, m_reducer));
+            const int count = m_functor_reducer.get_reducer().value_count();
             for (int i = 0; i < count; ++i) {
               m_result_ptr[i] = pointer_type(m_unified_space)[i];
             }
           } else {
-            const int size = Analysis::value_size(
-                ReducerConditional::select(m_functor, m_reducer));
+            const int size = m_functor_reducer.get_reducer().value_size();
             DeepCopy<HostSpace, CudaSpace>(m_result_ptr, m_scratch_space, size);
           }
         }
@@ -872,19 +849,16 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     } else {
       if (m_result_ptr) {
         // TODO @graph We need to effectively insert this in to the graph
-        final_reducer.init(m_result_ptr);
+        m_functor_reducer.get_reducer().init(m_result_ptr);
       }
     }
   }
 
   template <class ViewType>
-  ParallelReduce(
-      const FunctorType& arg_functor, const Policy& arg_policy,
-      const ViewType& arg_result,
-      std::enable_if_t<Kokkos::is_view<ViewType>::value, void*> = nullptr)
-      : m_functor(arg_functor),
+  ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer,
+                 const Policy& arg_policy, const ViewType& arg_result)
+      : m_functor_reducer(arg_functor_reducer),
         m_policy(arg_policy),
-        m_reducer(InvalidType()),
         m_result_ptr(arg_result.data()),
         m_result_ptr_device_accessible(
             MemorySpaceAccess<Kokkos::CudaSpace,
@@ -911,7 +885,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
         m_team_size >= 0
             ? m_team_size
             : Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>(
-                  internal_space_instance, attr, m_functor, m_vector_size,
+                  internal_space_instance, attr,
+                  m_functor_reducer.get_functor(), m_vector_size,
                   m_policy.team_scratch_size(0),
                   m_policy.thread_scratch_size(0)) /
                   m_vector_size;
@@ -919,16 +894,17 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     m_team_begin =
         UseShflReduction
             ? 0
-            : cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
-                                                        WorkTag>(arg_functor,
-                                                                 m_team_size);
+            : cuda_single_inter_block_reduce_scan_shmem<false, WorkTag,
+                                                        value_type>(
+                  arg_functor_reducer.get_functor(), m_team_size);
     m_shmem_begin = sizeof(double) * (m_team_size + 2);
-    m_shmem_size =
-        m_policy.scratch_size(0, m_team_size) +
-        FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
-    m_scratch_size[0] = m_shmem_size;
-    m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
-    m_scratch_locks   = internal_space_instance->m_scratch_locks;
+    m_shmem_size  = m_policy.scratch_size(0, m_team_size) +
+                   FunctorTeamShmemSize<FunctorType>::value(
+                       arg_functor_reducer.get_functor(), m_team_size);
+    m_scratch_size[0]   = m_shmem_size;
+    m_scratch_size[1]   = m_policy.scratch_size(1, m_team_size);
+    m_scratch_locks     = internal_space_instance->m_scratch_locks;
+    m_num_scratch_locks = internal_space_instance->m_num_scratch_locks;
     if (m_team_size <= 0) {
       m_scratch_ptr[1] = nullptr;
     } else {
@@ -974,112 +950,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     }
 
     if (int(m_team_size) >
-        arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) {
-      Kokkos::Impl::throw_runtime_exception(
-          std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too "
-                      "large team size."));
-    }
-  }
-
-  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
-                 const ReducerType& reducer)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()),
-        m_result_ptr_device_accessible(
-            MemorySpaceAccess<Kokkos::CudaSpace,
-                              typename ReducerType::result_view_type::
-                                  memory_space>::accessible),
-        m_result_ptr_host_accessible(
-            MemorySpaceAccess<Kokkos::HostSpace,
-                              typename ReducerType::result_view_type::
-                                  memory_space>::accessible),
-        m_scratch_space(nullptr),
-        m_scratch_flags(nullptr),
-        m_unified_space(nullptr),
-        m_team_begin(0),
-        m_shmem_begin(0),
-        m_shmem_size(0),
-        m_scratch_ptr{nullptr, nullptr},
-        m_league_size(arg_policy.league_size()),
-        m_team_size(arg_policy.team_size()),
-        m_vector_size(arg_policy.impl_vector_length()) {
-    auto internal_space_instance =
-        m_policy.space().impl_internal_space_instance();
-    cudaFuncAttributes attr =
-        CudaParallelLaunch<ParallelReduce,
-                           LaunchBounds>::get_cuda_func_attributes();
-
-    // Valid team size not provided, deduce team size
-    m_team_size =
-        m_team_size >= 0
-            ? m_team_size
-            : Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>(
-                  internal_space_instance, attr, m_functor, m_vector_size,
-                  m_policy.team_scratch_size(0),
-                  m_policy.thread_scratch_size(0)) /
-                  m_vector_size;
-
-    m_team_begin =
-        UseShflReduction
-            ? 0
-            : cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
-                                                        WorkTag>(arg_functor,
-                                                                 m_team_size);
-    m_shmem_begin = sizeof(double) * (m_team_size + 2);
-    m_shmem_size =
-        m_policy.scratch_size(0, m_team_size) +
-        FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
-    m_scratch_size[0] = m_shmem_size;
-    m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
-    m_scratch_locks   = internal_space_instance->m_scratch_locks;
-    if (m_team_size <= 0) {
-      m_scratch_ptr[1] = nullptr;
-    } else {
-      m_scratch_pool_id = internal_space_instance->acquire_team_scratch_space();
-      m_scratch_ptr[1]  = internal_space_instance->resize_team_scratch_space(
-          m_scratch_pool_id,
-          static_cast<std::int64_t>(m_scratch_size[1]) *
-              (std::min(
-                  static_cast<std::int64_t>(Cuda().concurrency() /
-                                            (m_team_size * m_vector_size)),
-                  static_cast<std::int64_t>(m_league_size))));
-    }
-
-    // The global parallel_reduce does not support vector_length other than 1 at
-    // the moment
-    if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction)
-      Impl::throw_runtime_exception(
-          "Kokkos::parallel_reduce with a TeamPolicy using a vector length of "
-          "greater than 1 is not currently supported for CUDA for dynamic "
-          "sized reduction types.");
-
-    if ((m_team_size < 32) && !UseShflReduction)
-      Impl::throw_runtime_exception(
-          "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller "
-          "than 32 is not currently supported with CUDA for dynamic sized "
-          "reduction types.");
-
-    // Functor's reduce memory, team scan memory, and team shared memory depend
-    // upon team size.
-
-    const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size;
-
-    if ((!Kokkos::Impl::is_integral_power_of_two(m_team_size) &&
-         !UseShflReduction) ||
-        internal_space_instance->m_maxShmemPerBlock < shmem_size_total) {
-      Kokkos::Impl::throw_runtime_exception(
-          std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size"));
-    }
-
-    size_type team_size_max =
-        Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>(
-            internal_space_instance, attr, m_functor, m_vector_size,
-            m_policy.team_scratch_size(0), m_policy.thread_scratch_size(0)) /
-        m_vector_size;
-
-    if ((int)m_team_size > (int)team_size_max) {
+        arg_policy.team_size_max(m_functor_reducer.get_functor(),
+                                 m_functor_reducer.get_reducer(),
+                                 ParallelReduceTag())) {
       Kokkos::Impl::throw_runtime_exception(
           std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too "
                       "large team size."));
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
index 59fdd1351386c705861da0030eedd60a3c31fb0c..7ccedbfe28daf16ecd985601b78c52df4bf83686 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@@ -315,7 +315,7 @@ struct CudaReductionsFunctor<FunctorType, false, false> {
     __syncwarp(mask);
 
     for (int delta = skip_vector ? blockDim.x : 1; delta < width; delta *= 2) {
-      if (lane_id + delta < 32) {
+      if ((lane_id + delta < 32) && (lane_id % (delta * 2) == 0)) {
         functor.join(value, value + delta);
       }
       __syncwarp(mask);
@@ -672,34 +672,35 @@ __device__ bool cuda_single_inter_block_reduce_scan(
 }
 
 // Size in bytes required for inter block reduce or scan
-template <bool DoScan, class FunctorType, class ArgTag>
+template <bool DoScan, class ArgTag, class ValueType, class FunctorType>
 inline std::enable_if_t<DoScan, unsigned>
 cuda_single_inter_block_reduce_scan_shmem(const FunctorType& functor,
                                           const unsigned BlockSize) {
   using Analysis =
       Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
-                            RangePolicy<Cuda, ArgTag>, FunctorType>;
+                            RangePolicy<Cuda, ArgTag>, FunctorType, ValueType>;
 
   return (BlockSize + 2) * Analysis::value_size(functor);
 }
 
-template <bool DoScan, class FunctorType, class ArgTag>
+template <bool DoScan, class ArgTag, class ValueType, class FunctorType>
 inline std::enable_if_t<!DoScan, unsigned>
 cuda_single_inter_block_reduce_scan_shmem(const FunctorType& functor,
                                           const unsigned BlockSize) {
   using Analysis =
       Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
-                            RangePolicy<Cuda, ArgTag>, FunctorType>;
+                            RangePolicy<Cuda, ArgTag>, FunctorType, ValueType>;
 
   return (BlockSize + 2) * Analysis::value_size(functor);
 }
 
-template <typename WorkTag, typename Policy, typename FunctorType>
+template <typename WorkTag, typename ValueType, typename Policy,
+          typename FunctorType>
 inline void check_reduced_view_shmem_size(const Policy& policy,
                                           const FunctorType& functor) {
   size_t minBlockSize = CudaTraits::WarpSize * 1;
   unsigned reqShmemSize =
-      cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
+      cuda_single_inter_block_reduce_scan_shmem<false, WorkTag, ValueType>(
           functor, minBlockSize);
   size_t maxShmemPerBlock =
       policy.space().impl_internal_space_instance()->m_maxShmemPerBlock;
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
index 76e4122af2e0636ede640a96e20f01231dd39bad..baff7ef3f553ade083ce82721a5714f68964209d 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
@@ -222,6 +222,7 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
     }
   }
 
+  // FIXME_CUDA_MULTIPLE_DEVICES
   static void execute(scheduler_type const& scheduler) {
     const int shared_per_warp = 2048;
     const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1);
@@ -245,7 +246,8 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
 
     size_t previous_stack_size = 0;
     KOKKOS_IMPL_CUDA_SAFE_CALL(
-        cudaDeviceGetLimit(&previous_stack_size, cudaLimitStackSize));
+        (CudaInternal::singleton().cuda_device_get_limit_wrapper(
+            &previous_stack_size, cudaLimitStackSize)));
 
     // If not large enough then set the stack size, in bytes:
 
@@ -253,13 +255,15 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
 
     if (previous_stack_size < larger_stack_size) {
       KOKKOS_IMPL_CUDA_SAFE_CALL(
-          cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size));
+          (CudaInternal::singleton().cuda_device_set_limit_wrapper(
+              cudaLimitStackSize, larger_stack_size)));
     }
 
     cuda_task_queue_execute<<<grid, block, shared_total, stream>>>(
         scheduler, shared_per_warp);
 
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError());
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        (CudaInternal::singleton().cuda_get_last_error_wrapper()));
 
     Impl::cuda_device_synchronize(
         "Kokkos::Impl::TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::"
@@ -267,7 +271,8 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
 
     if (previous_stack_size < larger_stack_size) {
       KOKKOS_IMPL_CUDA_SAFE_CALL(
-          cudaDeviceSetLimit(cudaLimitStackSize, previous_stack_size));
+          (CudaInternal::singleton().cuda_device_set_limit_wrapper(
+              cudaLimitStackSize, previous_stack_size)));
     }
   }
 
@@ -295,7 +300,8 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
     set_cuda_task_base_apply_function_pointer<TaskType>
         <<<1, 1>>>(ptr_ptr, dtor_ptr);
 
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError());
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        (CudaInternal::singleton().cuda_get_last_error_wrapper()));
     Impl::cuda_device_synchronize(
         "Kokkos::Impl::TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::"
         "Cuda>::execute: Post Get Function Pointer for Tasks");
@@ -456,6 +462,7 @@ class TaskQueueSpecializationConstrained<
     } while (1);
   }
 
+  // FIXME_CUDA_MULTIPLE_DEVICES
   static void execute(scheduler_type const& scheduler) {
     const int shared_per_warp = 2048;
     const int warps_per_block = 4;
@@ -476,7 +483,8 @@ class TaskQueueSpecializationConstrained<
 
     size_t previous_stack_size = 0;
     KOKKOS_IMPL_CUDA_SAFE_CALL(
-        cudaDeviceGetLimit(&previous_stack_size, cudaLimitStackSize));
+        (CudaInternal::singleton().cuda_device_get_limit_wrapper(
+            &previous_stack_size, cudaLimitStackSize)));
 
     // If not large enough then set the stack size, in bytes:
 
@@ -484,13 +492,15 @@ class TaskQueueSpecializationConstrained<
 
     if (previous_stack_size < larger_stack_size) {
       KOKKOS_IMPL_CUDA_SAFE_CALL(
-          cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size));
+          (CudaInternal::singleton().cuda_device_set_limit_wrapper(
+              cudaLimitStackSize, larger_stack_size)));
     }
 
     cuda_task_queue_execute<<<grid, block, shared_total, stream>>>(
         scheduler, shared_per_warp);
 
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError());
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        (CudaInternal::singleton().cuda_get_last_error_wrapper()));
 
     Impl::cuda_device_synchronize(
         "Kokkos::Impl::TaskQueueSpecializationConstrained<SimpleTaskScheduler<"
@@ -498,7 +508,8 @@ class TaskQueueSpecializationConstrained<
 
     if (previous_stack_size < larger_stack_size) {
       KOKKOS_IMPL_CUDA_SAFE_CALL(
-          cudaDeviceSetLimit(cudaLimitStackSize, previous_stack_size));
+          (CudaInternal::singleton().cuda_device_set_limit_wrapper(
+              cudaLimitStackSize, previous_stack_size)));
     }
   }
 
@@ -521,7 +532,8 @@ class TaskQueueSpecializationConstrained<
     set_cuda_task_base_apply_function_pointer<TaskType>
         <<<1, 1>>>(ptr_ptr, dtor_ptr);
 
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError());
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        (CudaInternal::singleton().cuda_get_last_error_wrapper()));
     Impl::cuda_device_synchronize(
         "Kokkos::Impl::TaskQueueSpecializationConstrained<SimpleTaskScheduler<"
         "Kokkos::Cuda>::get_function_pointer: Post Get Function Pointer");
@@ -1042,7 +1054,8 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
   // Extract value_type from closure
 
   using value_type = typename Kokkos::Impl::FunctorAnalysis<
-      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure,
+      void>::value_type;
 
   if (1 < loop_boundaries.thread.team_size()) {
     // make sure all threads perform all loop iterations
@@ -1107,7 +1120,8 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
   // Extract value_type from closure
 
   using value_type = typename Kokkos::Impl::FunctorAnalysis<
-      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure,
+      void>::value_type;
 
   if (1 < loop_boundaries.thread.team_size()) {
     // make sure all threads perform all loop iterations
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
index fc3f46bce6a62e32955b662850fc0b96481d93b2..c2b5f1fa78945beba56fcfdd78052a8b3e8de7f4 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
@@ -196,9 +196,10 @@ class CudaTeamMember {
     (void)reducer;
     (void)value;
     KOKKOS_IF_ON_DEVICE(
-        (typename Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
-                                        TeamPolicy<Cuda>, ReducerType>::Reducer
-             wrapped_reducer(&reducer);
+        (typename Impl::FunctorAnalysis<
+             Impl::FunctorPatternInterface::REDUCE, TeamPolicy<Cuda>,
+             ReducerType, typename ReducerType::value_type>::Reducer
+             wrapped_reducer(reducer);
          cuda_intra_block_reduction(value, wrapped_reducer, blockDim.y);
          reducer.reference() = value;))
   }
@@ -228,7 +229,8 @@ class CudaTeamMember {
         Impl::CudaJoinFunctor<Type> cuda_join_functor;
         typename Impl::FunctorAnalysis<
             Impl::FunctorPatternInterface::SCAN, TeamPolicy<Cuda>,
-            Impl::CudaJoinFunctor<Type>>::Reducer reducer(&cuda_join_functor);
+            Impl::CudaJoinFunctor<Type>, Type>::Reducer
+            reducer(cuda_join_functor);
         Impl::cuda_intra_block_reduce_scan<true>(reducer, base_data + 1);
 
         if (global_accum) {
@@ -376,18 +378,10 @@ struct ThreadVectorRangeBoundariesStruct<iType, CudaTeamMember> {
   ThreadVectorRangeBoundariesStruct(const CudaTeamMember, index_type count)
       : start(static_cast<index_type>(0)), end(count) {}
 
-  KOKKOS_INLINE_FUNCTION
-  ThreadVectorRangeBoundariesStruct(index_type count)
-      : start(static_cast<index_type>(0)), end(count) {}
-
   KOKKOS_INLINE_FUNCTION
   ThreadVectorRangeBoundariesStruct(const CudaTeamMember, index_type arg_begin,
                                     index_type arg_end)
       : start(arg_begin), end(arg_end) {}
-
-  KOKKOS_INLINE_FUNCTION
-  ThreadVectorRangeBoundariesStruct(index_type arg_begin, index_type arg_end)
-      : start(arg_begin), end(arg_end) {}
 };
 
 }  // namespace Impl
@@ -492,9 +486,6 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
 parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
                     iType, Impl::CudaTeamMember>& loop_boundaries,
                 const Closure& closure, const ReducerType& reducer) {
-  (void)loop_boundaries;
-  (void)closure;
-  (void)reducer;
   KOKKOS_IF_ON_DEVICE(
       (typename ReducerType::value_type value;
 
@@ -504,6 +495,11 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
             i < loop_boundaries.end; i += blockDim.y) { closure(i, value); }
 
        loop_boundaries.member.team_reduce(reducer, value);))
+  // Avoid bogus warning about reducer value being uninitialized with combined
+  // reducers
+  KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure;
+                     reducer.init(reducer.reference());
+                     Kokkos::abort("Should only run on the device!");));
 }
 
 /** \brief  Inter-thread parallel_reduce assuming summation.
@@ -552,9 +548,6 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
 parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
                     iType, Impl::CudaTeamMember>& loop_boundaries,
                 const Closure& closure, const ReducerType& reducer) {
-  (void)loop_boundaries;
-  (void)closure;
-  (void)reducer;
   KOKKOS_IF_ON_DEVICE((typename ReducerType::value_type value;
                        reducer.init(value);
 
@@ -565,6 +558,11 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
 
                        loop_boundaries.member.vector_reduce(reducer, value);
                        loop_boundaries.member.team_reduce(reducer, value);))
+  // Avoid bogus warning about reducer value being uninitialized with combined
+  // reducers
+  KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure;
+                     reducer.init(reducer.reference());
+                     Kokkos::abort("Should only run on the device!");));
 }
 
 template <typename iType, class Closure, typename ValueType>
@@ -632,9 +630,6 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value>
 parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct<
                     iType, Impl::CudaTeamMember> const& loop_boundaries,
                 Closure const& closure, ReducerType const& reducer) {
-  (void)loop_boundaries;
-  (void)closure;
-  (void)reducer;
   KOKKOS_IF_ON_DEVICE((
 
       reducer.init(reducer.reference());
@@ -646,6 +641,11 @@ parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct<
       Impl::CudaTeamMember::vector_reduce(reducer);
 
       ))
+  // Avoid bogus warning about reducer value being uninitialized with combined
+  // reducers
+  KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure;
+                     reducer.init(reducer.reference());
+                     Kokkos::abort("Should only run on the device!");));
 }
 
 /** \brief  Intra-thread vector parallel_reduce.
@@ -689,15 +689,17 @@ parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct<
  *  final == true.
  */
 // This is the same code as in HIP and largely the same as in OpenMPTarget
-template <typename iType, typename FunctorType>
+template <typename iType, typename FunctorType, typename ValueType>
 KOKKOS_INLINE_FUNCTION void parallel_scan(
     const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::CudaTeamMember>&
         loop_bounds,
-    const FunctorType& lambda) {
-  // Extract value_type from lambda
-  using value_type = typename Kokkos::Impl::FunctorAnalysis<
-      Kokkos::Impl::FunctorPatternInterface::SCAN, void,
-      FunctorType>::value_type;
+    const FunctorType& lambda, ValueType& return_val) {
+  // Extract ValueType from the Functor
+  using functor_value_type = typename Kokkos::Impl::FunctorAnalysis<
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void, FunctorType,
+      void>::value_type;
+  static_assert(std::is_same<functor_value_type, ValueType>::value,
+                "Non-matching value types of functor and return type");
 
   const auto start     = loop_bounds.start;
   const auto end       = loop_bounds.end;
@@ -705,12 +707,12 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
   const auto team_size = member.team_size();
   const auto team_rank = member.team_rank();
   const auto nchunk    = (end - start + team_size - 1) / team_size;
-  value_type accum     = 0;
+  ValueType accum      = 0;
   // each team has to process one or more chunks of the prefix scan
   for (iType i = 0; i < nchunk; ++i) {
     auto ii = start + i * team_size + team_rank;
     // local accumulation for this chunk
-    value_type local_accum = 0;
+    ValueType local_accum = 0;
     // user updates value with prefix value
     if (ii < loop_bounds.end) lambda(ii, local_accum, false);
     // perform team scan
@@ -724,6 +726,29 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
     // broadcast last value to rest of the team
     member.team_broadcast(accum, team_size - 1);
   }
+  return_val = accum;
+}
+
+/** \brief  Inter-thread parallel exclusive prefix sum.
+ *
+ *  Executes closure(iType i, ValueType & val, bool final) for each i=[0..N)
+ *
+ *  The range [0..N) is mapped to each rank in the team (whose global rank is
+ *  less than N) and a scan operation is performed. The last call to closure has
+ *  final == true.
+ */
+template <typename iType, typename FunctorType>
+KOKKOS_INLINE_FUNCTION void parallel_scan(
+    const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::CudaTeamMember>&
+        loop_bounds,
+    const FunctorType& lambda) {
+  // Extract value_type from functor
+  using value_type = typename Kokkos::Impl::FunctorAnalysis<
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void, FunctorType,
+      void>::value_type;
+
+  value_type dummy;
+  parallel_scan(loop_bounds, lambda, dummy);
 }
 
 //----------------------------------------------------------------------------
@@ -782,7 +807,7 @@ parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<
         // exclusive scan -- the final accumulation
         // of i's val will be included in the second
         // closure call later.
-        if (i < loop_boundaries.end && threadIdx.x > 0) {
+        if (i - 1 < loop_boundaries.end && threadIdx.x > 0) {
           closure(i - 1, val, false);
         }
 
@@ -814,6 +839,8 @@ parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<
         Impl::in_place_shfl(accum, val, mask, blockDim.x, active_mask);
       }
 
+      reducer.reference() = accum;
+
       ))
 }
 
@@ -833,11 +860,38 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
         loop_boundaries,
     const Closure& closure) {
   using value_type = typename Kokkos::Impl::FunctorAnalysis<
-      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure,
+      void>::value_type;
   value_type dummy;
   parallel_scan(loop_boundaries, closure, Kokkos::Sum<value_type>(dummy));
 }
 
+/** \brief  Intra-thread vector parallel exclusive prefix sum.
+ *
+ *  Executes closure(iType i, ValueType & val, bool final) for each i=[0..N)
+ *
+ *  The range [0..N) is mapped to all vector lanes in the
+ *  thread and a scan operation is performed.
+ *  The last call to closure has final == true.
+ */
+template <typename iType, class Closure, typename ValueType>
+KOKKOS_INLINE_FUNCTION void parallel_scan(
+    const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::CudaTeamMember>&
+        loop_boundaries,
+    const Closure& closure, ValueType& return_val) {
+  // Extract ValueType from the Closure
+  using closure_value_type = typename Kokkos::Impl::FunctorAnalysis<
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure,
+      ValueType>::value_type;
+  static_assert(std::is_same<closure_value_type, ValueType>::value,
+                "Non-matching value types of closure and return type");
+
+  ValueType accum;
+  parallel_scan(loop_boundaries, closure, Kokkos::Sum<ValueType>(accum));
+
+  return_val = accum;
+}
+
 }  // namespace Kokkos
 
 namespace Kokkos {
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp
index 1ade4c34b6e1bc11c1d775c299257adb22b315dd..abb747e39a1066d8f826e4ed51de1faaaaa6930b 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp
@@ -20,7 +20,7 @@
 #include <Kokkos_Macros.hpp>
 #ifdef KOKKOS_ENABLE_CUDA
 
-#include <Kokkos_CudaSpace.hpp>
+#include <Cuda/Kokkos_CudaSpace.hpp>
 #include <Kokkos_UniqueToken.hpp>
 #include <impl/Kokkos_SharedAlloc.hpp>
 
@@ -104,13 +104,9 @@ class UniqueToken<Cuda, UniqueTokenScope::Global> {
       idx = idx % size();
     }
 #endif
-// Make sure that all writes in the previous lock owner are visible to me
-#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+    // Make sure that all writes in the previous lock owner are visible to me
     desul::atomic_thread_fence(desul::MemoryOrderAcquire(),
                                desul::MemoryScopeDevice());
-#else
-    Kokkos::memory_fence();
-#endif
     return idx;
   }
 
@@ -125,13 +121,9 @@ class UniqueToken<Cuda, UniqueTokenScope::Global> {
   /// \brief release an acquired value
   KOKKOS_INLINE_FUNCTION
   void release(size_type idx) const noexcept {
-// Make sure my writes are visible to the next lock owner
-#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+    // Make sure my writes are visible to the next lock owner
     desul::atomic_thread_fence(desul::MemoryOrderRelease(),
                                desul::MemoryScopeDevice());
-#else
-    Kokkos::memory_fence();
-#endif
     (void)Kokkos::atomic_exchange(&m_locks(idx), 0);
   }
 };
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
index d5d11f499a2bc857aa2e26e826eeb3f6b1e7ac01..a945a716bc336b3fe691a9db7909502491ff25ec 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
@@ -17,7 +17,7 @@
 #ifndef KOKKOS_CUDA_WORKGRAPHPOLICY_HPP
 #define KOKKOS_CUDA_WORKGRAPHPOLICY_HPP
 
-#include <Kokkos_Cuda.hpp>
+#include <Cuda/Kokkos_Cuda.hpp>
 #include <Cuda/Kokkos_Cuda_KernelLaunch.hpp>
 
 namespace Kokkos {
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c7f0d12d914bf37f82fe30219cf0d8b9e31a28c2
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp
@@ -0,0 +1,50 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+#ifndef KOKKOS_CUDA_ZEROMEMSET_HPP
+#define KOKKOS_CUDA_ZEROMEMSET_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <Cuda/Kokkos_Cuda.hpp>
+#include <impl/Kokkos_ZeroMemset_fwd.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+template <class T, class... P>
+struct ZeroMemset<Kokkos::Cuda, View<T, P...>> {
+  ZeroMemset(const Kokkos::Cuda& exec_space_instance, const View<T, P...>& dst,
+             typename View<T, P...>::const_value_type&) {
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        (exec_space_instance.impl_internal_space_instance()
+             ->cuda_memset_async_wrapper(
+                 dst.data(), 0,
+                 dst.size() * sizeof(typename View<T, P...>::value_type))));
+  }
+
+  ZeroMemset(const View<T, P...>& dst,
+             typename View<T, P...>::const_value_type&) {
+    // FIXME_CUDA_MULTIPLE_DEVICES
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        (Kokkos::Impl::CudaInternal::singleton().cuda_memset_wrapper(
+            dst.data(), 0,
+            dst.size() * sizeof(typename View<T, P...>::value_type))));
+  }
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif  // !defined(KOKKOS_CUDA_ZEROMEMSET_HPP)
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP.cpp
index 766f815c72c97da7ea5e12e069c3940b65669a8e..f78bfd28b2f2d6a6cf61eedbd916bab386b6ffc4 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP.cpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP.cpp
@@ -20,7 +20,6 @@
 
 #include <HIP/Kokkos_HIP.hpp>
 #include <HIP/Kokkos_HIP_Instance.hpp>
-#include <HIP/Kokkos_HIP_Locks.hpp>
 
 #include <impl/Kokkos_DeviceManagement.hpp>
 #include <impl/Kokkos_ExecSpaceManager.hpp>
@@ -61,8 +60,6 @@ void HIP::impl_initialize(InitializationSettings const& settings) {
   if (Impl::HIPTraits::WarpSize < Impl::HIPInternal::m_maxWarpCount) {
     Impl::HIPInternal::m_maxWarpCount = Impl::HIPTraits::WarpSize;
   }
-  int constexpr WordSize              = sizeof(size_type);
-  Impl::HIPInternal::m_maxSharedWords = hipProp.sharedMemPerBlock / WordSize;
 
   //----------------------------------
   // Maximum number of blocks
@@ -79,7 +76,7 @@ void HIP::impl_initialize(InitializationSettings const& settings) {
       Impl::HIPInternal::m_maxWavesPerCU * Impl::HIPTraits::WarpSize;
 
   // Init the array for used for arbitrarily sized atomics
-  Impl::initialize_host_hip_lock_arrays();
+  desul::Impl::init_lock_arrays();  // FIXME
 
   // Allocate a staging buffer for constant mem in pinned host memory
   // and an event to avoid overwriting driver for previous kernel launches
@@ -104,16 +101,20 @@ HIP::HIP()
       "HIP instance constructor");
 }
 
-HIP::HIP(hipStream_t const stream, bool manage_stream)
+HIP::HIP(hipStream_t const stream, Impl::ManageStream manage_stream)
     : m_space_instance(new Impl::HIPInternal, [](Impl::HIPInternal* ptr) {
         ptr->finalize();
         delete ptr;
       }) {
   Impl::HIPInternal::singleton().verify_is_initialized(
       "HIP instance constructor");
-  m_space_instance->initialize(stream, manage_stream);
+  m_space_instance->initialize(stream, static_cast<bool>(manage_stream));
 }
 
+KOKKOS_DEPRECATED HIP::HIP(hipStream_t const stream, bool manage_stream)
+    : HIP(stream,
+          manage_stream ? Impl::ManageStream::yes : Impl::ManageStream::no) {}
+
 void HIP::print_configuration(std::ostream& os, bool /*verbose*/) const {
   os << "Device Execution Space:\n";
   os << "  KOKKOS_ENABLE_HIP: yes\n";
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP.hpp
index d48be3228088f63f6c341a99f724d0af2fbcdfca..61ed346b21825cd92d37b35c11b61a59a0c9fef7 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP.hpp
@@ -27,7 +27,8 @@
 namespace Kokkos {
 namespace Impl {
 class HIPInternal;
-}
+enum class ManageStream : bool { no, yes };
+}  // namespace Impl
 /// \class HIP
 /// \brief Kokkos device for multicore processors in the host memory space.
 class HIP {
@@ -47,7 +48,9 @@ class HIP {
   using scratch_memory_space = ScratchMemorySpace<HIP>;
 
   HIP();
-  HIP(hipStream_t stream, bool manage_stream = false);
+  HIP(hipStream_t stream,
+      Impl::ManageStream manage_stream = Impl::ManageStream::no);
+  KOKKOS_DEPRECATED HIP(hipStream_t stream, bool manage_stream);
 
   //@}
   //------------------------------------
@@ -137,26 +140,6 @@ struct DeviceTypeTraits<HIP> {
 };
 }  // namespace Experimental
 }  // namespace Tools
-
-namespace Impl {
-template <class DT, class... DP>
-struct ZeroMemset<HIP, DT, DP...> {
-  ZeroMemset(const HIP& exec_space, const View<DT, DP...>& dst,
-             typename View<DT, DP...>::const_value_type&) {
-    KOKKOS_IMPL_HIP_SAFE_CALL(hipMemsetAsync(
-        dst.data(), 0,
-        dst.size() * sizeof(typename View<DT, DP...>::value_type),
-        exec_space.hip_stream()));
-  }
-
-  ZeroMemset(const View<DT, DP...>& dst,
-             typename View<DT, DP...>::const_value_type&) {
-    KOKKOS_IMPL_HIP_SAFE_CALL(
-        hipMemset(dst.data(), 0,
-                  dst.size() * sizeof(typename View<DT, DP...>::value_type)));
-  }
-};
-}  // namespace Impl
 }  // namespace Kokkos
 
 #endif
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Atomic.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Atomic.hpp
deleted file mode 100644
index 49f89ed332e30a587fbd681a84d1f36354d2d0ec..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Atomic.hpp
+++ /dev/null
@@ -1,590 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#ifndef KOKKOS_HIP_ATOMIC_HPP
-#define KOKKOS_HIP_ATOMIC_HPP
-
-#include <impl/Kokkos_Atomic_Memory_Order.hpp>
-#include <impl/Kokkos_Memory_Fence.hpp>
-#include <HIP/Kokkos_HIP_Locks.hpp>
-
-#if defined(KOKKOS_ENABLE_HIP_ATOMICS)
-namespace Kokkos {
-// HIP can do:
-// Types int/unsigned int
-// variants:
-// atomic_exchange/compare_exchange/fetch_add/fetch_sub/fetch_max/fetch_min/fetch_and/fetch_or/fetch_xor/fetch_inc/fetch_dec
-
-// atomic_exchange -------------------------------------------------------------
-
-__inline__ __device__ int atomic_exchange(volatile int *const dest,
-                                          const int val) {
-  return atomicExch(const_cast<int *>(dest), val);
-}
-
-__inline__ __device__ unsigned int atomic_exchange(
-    volatile unsigned int *const dest, const unsigned int val) {
-  return atomicExch(const_cast<unsigned int *>(dest), val);
-}
-
-__inline__ __device__ unsigned long long int atomic_exchange(
-    volatile unsigned long long int *const dest,
-    const unsigned long long int val) {
-  return atomicExch(const_cast<unsigned long long *>(dest), val);
-}
-
-__inline__ __device__ float atomic_exchange(volatile float *const dest,
-                                            const float val) {
-  return atomicExch(const_cast<float *>(dest), val);
-}
-
-template <typename T>
-__inline__ __device__ T
-atomic_exchange(volatile T *const dest,
-                std::enable_if_t<sizeof(T) == sizeof(int), const T &> val) {
-  int tmp = atomicExch(reinterpret_cast<int *>(const_cast<T *>(dest)),
-                       *reinterpret_cast<int *>(const_cast<T *>(&val)));
-  return reinterpret_cast<T &>(tmp);
-}
-
-template <typename T>
-__inline__ __device__ T atomic_exchange(
-    volatile T *const dest,
-    std::enable_if_t<sizeof(T) != sizeof(int) &&
-                         sizeof(T) == sizeof(unsigned long long int),
-                     const T &>
-        val) {
-  using type = unsigned long long int;
-
-  type tmp = atomicExch(reinterpret_cast<type *>(const_cast<T *>(dest)),
-                        *reinterpret_cast<type *>(const_cast<T *>(&val)));
-  return reinterpret_cast<T &>(tmp);
-}
-
-template <typename T>
-__inline__ __device__ T atomic_exchange(
-    volatile T *const dest,
-    std::enable_if_t<sizeof(T) != sizeof(int) && sizeof(T) != sizeof(long long),
-                     const T> &val) {
-  T return_val;
-  int done                 = 0;
-  unsigned int active      = __ballot(1);
-  unsigned int done_active = 0;
-  while (active != done_active) {
-    if (!done) {
-      if (Impl::lock_address_hip_space((void *)dest)) {
-        return_val = *dest;
-        *dest      = val;
-        Impl::unlock_address_hip_space((void *)dest);
-        done = 1;
-      }
-    }
-    done_active = __ballot(done);
-  }
-  return return_val;
-}
-
-// atomic_assign ---------------------------------------------------------------
-
-template <typename T>
-__inline__ __device__ void atomic_assign(
-    volatile T *const dest,
-    std::enable_if_t<sizeof(T) == sizeof(int), const T &> val) {
-  atomicExch(reinterpret_cast<int *>(const_cast<T *>(dest)),
-             *reinterpret_cast<int *>(const_cast<T *>(&val)));
-}
-
-template <typename T>
-__inline__ __device__ void atomic_assign(
-    volatile T *const dest,
-    std::enable_if_t<sizeof(T) != sizeof(int) &&
-                         sizeof(T) == sizeof(unsigned long long int),
-                     const T &>
-        val) {
-  using type = unsigned long long int;
-  atomicExch(reinterpret_cast<type *>(const_cast<T *>(dest)),
-             *reinterpret_cast<type *>(const_cast<T *>(&val)));
-}
-
-template <typename T>
-__inline__ __device__ void atomic_assign(
-    volatile T *const dest,
-    std::enable_if_t<sizeof(T) != sizeof(int) &&
-                         sizeof(T) != sizeof(unsigned long long int),
-                     const T &>
-        val) {
-  atomic_exchange(dest, val);
-}
-
-// atomic_compare_exchange -----------------------------------------------------
-
-inline __device__ int atomic_compare_exchange(volatile int *dest, int compare,
-                                              const int &val) {
-  return atomicCAS(const_cast<int *>(dest), compare, val);
-}
-
-inline __device__ unsigned int atomic_compare_exchange(
-    volatile unsigned int *dest, unsigned int compare,
-    const unsigned int &val) {
-  return atomicCAS(const_cast<unsigned int *>(dest), compare, val);
-}
-
-inline __device__ unsigned long long int atomic_compare_exchange(
-    volatile unsigned long long int *dest, unsigned long long int compare,
-    const unsigned long long int &val) {
-  return atomicCAS(const_cast<unsigned long long int *>(dest), compare, val);
-}
-
-template <class T>
-__inline__ __device__ T atomic_compare_exchange(
-    volatile T *dest, T compare,
-    std::enable_if_t<sizeof(T) == sizeof(int), const T &> val) {
-  // FIXME_HIP UB
-  union U {
-    int i;
-    T f;
-    __inline__ __device__ U() {}
-  } idest, icompare, ival;
-  icompare.f = compare;
-  ival.f     = val;
-  idest.i    = atomicCAS(reinterpret_cast<int *>(const_cast<T *>(dest)),
-                      icompare.i, ival.i);
-  return idest.f;
-}
-
-template <class T>
-__inline__ __device__ T atomic_compare_exchange(
-    volatile T *dest, T compare,
-    std::enable_if_t<sizeof(T) == sizeof(unsigned long long int), const T &>
-        val) {
-  // FIXME_HIP UB
-  union U {
-    unsigned long long int i;
-    T f;
-    __inline__ __device__ U() {}
-  } idest, icompare, ival;
-  icompare.f = compare;
-  ival.f     = val;
-  idest.i    = atomicCAS(
-      reinterpret_cast<unsigned long long int *>(const_cast<T *>(dest)),
-      icompare.i, ival.i);
-  return idest.f;
-}
-
-template <typename T>
-__inline__ __device__ T atomic_compare_exchange(
-    volatile T *const dest, const T &compare,
-    std::enable_if_t<sizeof(T) != sizeof(int) && sizeof(T) != sizeof(long long),
-                     const T> &val) {
-  T return_val;
-  int done                 = 0;
-  unsigned int active      = __ballot(1);
-  unsigned int done_active = 0;
-  while (active != done_active) {
-    if (!done) {
-      if (Impl::lock_address_hip_space((void *)dest)) {
-        return_val = *dest;
-        if (return_val == compare) *dest = val;
-        Impl::unlock_address_hip_space((void *)dest);
-        done = 1;
-      }
-    }
-    done_active = __ballot(done);
-  }
-  return return_val;
-}
-
-// atomic_fetch_add ------------------------------------------------------------
-
-inline __device__ int atomic_fetch_add(volatile int *dest, const int &val) {
-  return atomicAdd(const_cast<int *>(dest), val);
-}
-
-inline __device__ unsigned int atomic_fetch_add(volatile unsigned int *dest,
-                                                const unsigned int &val) {
-  return atomicAdd(const_cast<unsigned int *>(dest), val);
-}
-
-inline __device__ unsigned long long atomic_fetch_add(
-    volatile unsigned long long *dest, const unsigned long long &val) {
-  return atomicAdd(const_cast<unsigned long long *>(dest), val);
-}
-
-inline __device__ float atomic_fetch_add(volatile float *dest,
-                                         const float &val) {
-  return atomicAdd(const_cast<float *>(dest), val);
-}
-
-template <typename T>
-inline __device__ T
-atomic_fetch_add(volatile T *const dest,
-                 std::enable_if_t<sizeof(T) == sizeof(int), const T> val) {
-  // FIXME_HIP UB
-  union U {
-    int i;
-    T t;
-    __inline__ __device__ U() {}
-  } assume, oldval, newval;
-
-  oldval.t = *dest;
-
-  do {
-    assume.i = oldval.i;
-    newval.t = assume.t + val;
-    oldval.i = atomicCAS(reinterpret_cast<int *>(const_cast<T *>(dest)),
-                         assume.i, newval.i);
-  } while (assume.i != oldval.i);
-
-  return oldval.t;
-}
-
-template <typename T>
-inline __device__ T atomic_fetch_add(
-    volatile T *const dest,
-    std::enable_if_t<sizeof(T) == sizeof(long long), const T> val) {
-  // FIXME_HIP UB
-  union U {
-    unsigned long long i;
-    T t;
-    __inline__ __device__ U() {}
-  } assume, oldval, newval;
-
-  oldval.t = *dest;
-
-  do {
-    assume.i = oldval.i;
-    newval.t = assume.t + val;
-    oldval.i = atomic_compare_exchange(
-        reinterpret_cast<volatile unsigned long long *>(dest), assume.i,
-        newval.i);
-  } while (assume.i != oldval.i);
-
-  return oldval.t;
-}
-
-__inline__ __device__ char atomic_fetch_add(volatile char *dest,
-                                            const char &val) {
-  unsigned int oldval, newval, assume;
-  oldval = *reinterpret_cast<volatile unsigned int *>(&dest);
-
-  do {
-    assume = oldval;
-    newval = assume & 0x7fffff00 + ((assume & 0xff) + val) & 0xff;
-    oldval =
-        atomicCAS(reinterpret_cast<unsigned int *>(const_cast<char *>(dest)),
-                  assume, newval);
-  } while (assume != oldval);
-
-  return oldval;
-}
-
-__inline__ __device__ short atomic_fetch_add(volatile short *dest,
-                                             const short &val) {
-  unsigned int oldval, newval, assume;
-  oldval = *reinterpret_cast<volatile unsigned int *>(&dest);
-
-  do {
-    assume = oldval;
-    newval = assume & 0x7fff0000 + ((assume & 0xffff) + val) & 0xffff;
-    oldval =
-        atomicCAS(reinterpret_cast<unsigned int *>(const_cast<short *>(dest)),
-                  assume, newval);
-  } while (assume != oldval);
-
-  return oldval;
-}
-
-__inline__ __device__ long long atomic_fetch_add(volatile long long *dest,
-                                                 const long long &val) {
-  return atomicAdd(
-      reinterpret_cast<unsigned long long *>(const_cast<long long *>(dest)),
-      val);
-}
-
-template <class T>
-__inline__ __device__ T atomic_fetch_add(
-    volatile T *dest,
-    std::enable_if_t<sizeof(T) != sizeof(int) && sizeof(T) != sizeof(long long),
-                     const T &>
-        val) {
-  T return_val;
-  int done                 = 0;
-  unsigned int active      = __ballot(1);
-  unsigned int done_active = 0;
-  while (active != done_active) {
-    if (!done) {
-      if (Kokkos::Impl::lock_address_hip_space((void *)dest)) {
-        return_val = *dest;
-        *dest      = return_val + val;
-        Kokkos::Impl::unlock_address_hip_space((void *)dest);
-        done = 1;
-      }
-    }
-    done_active = __ballot(done);
-  }
-  return return_val;
-}
-
-// atmic_fetch_sub -------------------------------------------------------------
-
-__inline__ __device__ int atomic_fetch_sub(volatile int *dest, int const &val) {
-  return atomicSub(const_cast<int *>(dest), val);
-}
-
-__inline__ __device__ unsigned int atomic_fetch_sub(volatile unsigned int *dest,
-                                                    unsigned int const &val) {
-  return atomicSub(const_cast<unsigned int *>(dest), val);
-}
-
-__inline__ __device__ unsigned long long atomic_fetch_sub(
-    unsigned long long *dest, int64_t const &val) {
-  return atomicAdd(reinterpret_cast<unsigned long long *>(dest),
-                   -reinterpret_cast<unsigned long long const &>(val));
-}
-
-__inline__ __device__ char atomic_fetch_sub(volatile char *dest,
-                                            const char &val) {
-  unsigned int oldval, newval, assume;
-  oldval = *reinterpret_cast<volatile unsigned int *>(dest);
-
-  do {
-    assume = oldval;
-    newval = assume & 0x7fffff00 + ((assume & 0xff) - val) & 0xff;
-    oldval =
-        atomicCAS(reinterpret_cast<unsigned int *>(const_cast<char *>(dest)),
-                  assume, newval);
-  } while (assume != oldval);
-
-  return oldval;
-}
-
-__inline__ __device__ short atomic_fetch_sub(volatile short *dest,
-                                             const short &val) {
-  unsigned int oldval, newval, assume;
-  oldval = *reinterpret_cast<volatile unsigned int *>(dest);
-
-  do {
-    assume = oldval;
-    newval = assume & 0x7fff0000 + ((assume & 0xffff) - val) & 0xffff;
-    oldval =
-        atomicCAS(reinterpret_cast<unsigned int *>(const_cast<short *>(dest)),
-                  assume, newval);
-  } while (assume != oldval);
-
-  return oldval;
-}
-
-__inline__ __device__ long long atomic_fetch_sub(volatile long long *dest,
-                                                 const long long &val) {
-  return static_cast<long long>(atomicAdd(
-      reinterpret_cast<unsigned long long int *>(const_cast<long long *>(dest)),
-      -reinterpret_cast<unsigned long long int const &>(val)));
-}
-
-template <class T>
-__inline__ __device__ T atomic_fetch_sub(
-    volatile T *dest, std::enable_if_t<sizeof(T) == sizeof(int), T> val) {
-  // FIXME_HIP UB
-  union U {
-    int i;
-    T t;
-    __inline__ __device__ U() {}
-  } assume, oldval, newval;
-
-  oldval.t = *dest;
-
-  do {
-    assume.i = oldval.i;
-    newval.t = assume.t - val;
-    oldval.i = atomic_compare_exchange(reinterpret_cast<volatile int *>(dest),
-                                       assume.i, newval.i);
-  } while (assume.i != oldval.i);
-
-  return oldval.t;
-}
-
-template <typename T>
-inline __device__ T atomic_fetch_sub(
-    volatile T *const dest,
-    std::enable_if_t<sizeof(T) == sizeof(long long), const T> val) {
-  // FIXME_HIP UB
-  union U {
-    unsigned long long i;
-    T t;
-    __inline__ __device__ U() {}
-  } assume, oldval, newval;
-
-  oldval.t = *dest;
-
-  do {
-    assume.i = oldval.i;
-    newval.t = assume.t - val;
-    oldval.i = atomic_compare_exchange(
-        reinterpret_cast<volatile unsigned long long *>(dest), assume.i,
-        newval.i);
-  } while (assume.i != oldval.i);
-
-  return oldval.t;
-}
-
-template <class T>
-__inline__ __device__ T atomic_fetch_sub(
-    volatile T *dest, std::enable_if_t<sizeof(T) == sizeof(char), T> val) {
-  unsigned int oldval, newval, assume;
-  oldval = *reinterpret_cast<volatile unsigned int *>(dest);
-
-  do {
-    assume = oldval;
-    newval = assume & 0x7fffff00 + ((assume & 0xff) - val) & 0xff;
-    oldval = atomicCAS(reinterpret_cast<unsigned int *>(dest), assume, newval);
-  } while (assume != oldval);
-
-  return reinterpret_cast<T>(oldval) & 0xff;
-}
-
-template <class T>
-__inline__ __device__ T atomic_fetch_sub(
-    volatile T *dest, std::enable_if_t<sizeof(T) == sizeof(short), T> val) {
-  unsigned int oldval, newval, assume;
-  oldval = *reinterpret_cast<int *>(dest);
-
-  do {
-    assume = oldval;
-    newval = assume & 0x7fff0000 + ((assume & 0xffff) - val) & 0xffff;
-    oldval = atomicCAS(reinterpret_cast<unsigned int *>(dest), assume, newval);
-  } while (assume != oldval);
-
-  return reinterpret_cast<T>(oldval) & 0xffff;
-}
-
-template <typename T>
-__inline__ __device__ T atomic_fetch_sub(
-    volatile T *const dest,
-    std::enable_if_t<sizeof(T) != sizeof(int) && sizeof(T) != sizeof(long long),
-                     const T> &val) {
-  T return_val;
-  int done                 = 0;
-  unsigned int active      = __ballot(1);
-  unsigned int done_active = 0;
-  while (active != done_active) {
-    if (!done) {
-      if (Impl::lock_address_hip_space((void *)dest)) {
-        return_val = *dest;
-        *dest      = return_val - val;
-        Impl::unlock_address_hip_space((void *)dest);
-        done = 1;
-      }
-    }
-    done_active = __ballot(done);
-  }
-  return return_val;
-}
-
-// atomic_fetch_or -------------------------------------------------------------
-
-__inline__ __device__ int atomic_fetch_or(volatile int *const dest,
-                                          int const val) {
-  return atomicOr(const_cast<int *>(dest), val);
-}
-
-__inline__ __device__ unsigned int atomic_fetch_or(
-    volatile unsigned int *const dest, unsigned int const val) {
-  return atomicOr(const_cast<unsigned int *>(dest), val);
-}
-
-__inline__ __device__ unsigned long long int atomic_fetch_or(
-    volatile unsigned long long int *const dest,
-    unsigned long long int const val) {
-  return atomicOr(const_cast<unsigned long long int *>(dest), val);
-}
-
-// atomic_fetch_and ------------------------------------------------------------
-
-__inline__ __device__ int atomic_fetch_and(volatile int *const dest,
-                                           int const val) {
-  return atomicAnd(const_cast<int *>(dest), val);
-}
-
-__inline__ __device__ unsigned int atomic_fetch_and(
-    volatile unsigned int *const dest, unsigned int const val) {
-  return atomicAnd(const_cast<unsigned int *>(dest), val);
-}
-
-__inline__ __device__ unsigned long long int atomic_fetch_and(
-    volatile unsigned long long int *const dest,
-    unsigned long long int const val) {
-  return atomicAnd(const_cast<unsigned long long int *>(dest), val);
-}
-
-namespace Impl {
-
-template <typename T>
-__inline__ __device__ void _atomic_store(T *ptr, T val,
-                                         memory_order_relaxed_t) {
-  (void)atomic_exchange(ptr, val);
-}
-
-template <typename T>
-__inline__ __device__ void _atomic_store(T *ptr, T val,
-                                         memory_order_seq_cst_t) {
-  memory_fence();
-  atomic_store(ptr, val, memory_order_relaxed);
-  memory_fence();
-}
-
-template <typename T>
-__inline__ __device__ void _atomic_store(T *ptr, T val,
-                                         memory_order_release_t) {
-  memory_fence();
-  atomic_store(ptr, val, memory_order_relaxed);
-}
-
-template <typename T>
-__inline__ __device__ void _atomic_store(T *ptr, T val) {
-  atomic_store(ptr, val, memory_order_relaxed);
-}
-
-template <typename T>
-__inline__ __device__ T _atomic_load(T *ptr, memory_order_relaxed_t) {
-  T dummy{};
-  return atomic_compare_exchange(ptr, dummy, dummy);
-}
-
-template <typename T>
-__inline__ __device__ T _atomic_load(T *ptr, memory_order_seq_cst_t) {
-  memory_fence();
-  T rv = atomic_load(ptr, memory_order_relaxed);
-  memory_fence();
-  return rv;
-}
-
-template <typename T>
-__inline__ __device__ T _atomic_load(T *ptr, memory_order_acquire_t) {
-  T rv = atomic_load(ptr, memory_order_relaxed);
-  memory_fence();
-  return rv;
-}
-
-template <typename T>
-__inline__ __device__ T _atomic_load(T *ptr) {
-  return atomic_load(ptr, memory_order_relaxed);
-}
-
-}  // namespace Impl
-}  // namespace Kokkos
-#endif
-
-#endif
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp
index e09382c7056fb5ffdce8f40e553fd66910484005..43d63c090b3712afc0908fb4da5f7ef8fac2ed31 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp
@@ -70,7 +70,7 @@ class HIPRawMemoryAllocationFailure : public RawMemoryAllocationFailure {
 
   void append_additional_error_information(std::ostream& o) const override {
     if (m_error_code != hipSuccess) {
-      o << "  The HIP allocation returned the error code \"\""
+      o << "  The HIP allocation returned the error code \""
         << hipGetErrorName(m_error_code) << "\".";
     }
   }
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..576c53426bca1feb38af7afb7ffdae7103d9257c
--- /dev/null
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp
@@ -0,0 +1,161 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_HIP_GRAPHNODEKERNEL_HPP
+#define KOKKOS_HIP_GRAPHNODEKERNEL_HPP
+
+#include <Kokkos_Graph_fwd.hpp>
+
+#include <impl/Kokkos_GraphImpl.hpp>
+#include <impl/Kokkos_SharedAlloc.hpp>
+
+#include <Kokkos_Parallel.hpp>
+#include <Kokkos_Parallel_Reduce.hpp>
+#include <Kokkos_PointerOwnership.hpp>
+
+#include <HIP/Kokkos_HIP_SharedAllocationRecord.hpp>
+#include <HIP/Kokkos_HIP_GraphNode_Impl.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+template <typename PolicyType, typename Functor, typename PatternTag,
+          typename... Args>
+class GraphNodeKernelImpl<Kokkos::HIP, PolicyType, Functor, PatternTag, Args...>
+    : public PatternImplSpecializationFromTag<PatternTag, Functor, PolicyType,
+                                              Args..., Kokkos::HIP>::type {
+ public:
+  using Policy       = PolicyType;
+  using graph_kernel = GraphNodeKernelImpl;
+  using base_t =
+      typename PatternImplSpecializationFromTag<PatternTag, Functor, Policy,
+                                                Args..., Kokkos::HIP>::type;
+  using Record = Kokkos::Impl::SharedAllocationRecord<Kokkos::HIPSpace, void>;
+
+  // TODO use the name and executionspace
+  template <typename PolicyDeduced, typename... ArgsDeduced>
+  GraphNodeKernelImpl(std::string, Kokkos::HIP const&, Functor arg_functor,
+                      PolicyDeduced&& arg_policy, ArgsDeduced&&... args)
+      : base_t(std::move(arg_functor), (PolicyDeduced &&) arg_policy,
+               (ArgsDeduced &&) args...) {}
+
+  template <typename PolicyDeduced>
+  GraphNodeKernelImpl(Kokkos::HIP const& exec_space, Functor arg_functor,
+                      PolicyDeduced&& arg_policy)
+      : GraphNodeKernelImpl("", exec_space, std::move(arg_functor),
+                            (PolicyDeduced &&) arg_policy) {}
+
+  ~GraphNodeKernelImpl() {
+    if (m_driver_storage) {
+      Record::decrement(Record::get_record(m_driver_storage));
+    }
+  }
+
+  void set_hip_graph_ptr(hipGraph_t* arg_graph_ptr) {
+    m_graph_ptr = arg_graph_ptr;
+  }
+
+  void set_hip_graph_node_ptr(hipGraphNode_t* arg_node_ptr) {
+    m_graph_node_ptr = arg_node_ptr;
+  }
+
+  hipGraphNode_t* get_hip_graph_node_ptr() const { return m_graph_node_ptr; }
+
+  hipGraph_t const* get_hip_graph_ptr() const { return m_graph_ptr; }
+
+  Kokkos::ObservingRawPtr<base_t> allocate_driver_memory_buffer() const {
+    KOKKOS_EXPECTS(m_driver_storage == nullptr);
+
+    auto* record = Record::allocate(
+        Kokkos::HIPSpace{}, "GraphNodeKernel global memory functor storage",
+        sizeof(base_t));
+
+    Record::increment(record);
+    m_driver_storage = reinterpret_cast<base_t*>(record->data());
+    KOKKOS_ENSURES(m_driver_storage != nullptr);
+
+    return m_driver_storage;
+  }
+
+ private:
+  Kokkos::ObservingRawPtr<const hipGraph_t> m_graph_ptr    = nullptr;
+  Kokkos::ObservingRawPtr<hipGraphNode_t> m_graph_node_ptr = nullptr;
+  Kokkos::OwningRawPtr<base_t> m_driver_storage            = nullptr;
+};
+
+struct HIPGraphNodeAggregateKernel {
+  using graph_kernel = HIPGraphNodeAggregateKernel;
+
+  // Aggregates don't need a policy, but for the purposes of checking the static
+  // assertions about graph kernels,
+  struct Policy {
+    using is_graph_kernel = std::true_type;
+  };
+};
+
+template <typename KernelType,
+          typename Tag =
+              typename PatternTagFromImplSpecialization<KernelType>::type>
+struct get_graph_node_kernel_type
+    : type_identity<
+          GraphNodeKernelImpl<Kokkos::HIP, typename KernelType::Policy,
+                              typename KernelType::functor_type, Tag>> {};
+
+template <typename KernelType>
+struct get_graph_node_kernel_type<KernelType, Kokkos::ParallelReduceTag>
+    : type_identity<GraphNodeKernelImpl<
+          Kokkos::HIP, typename KernelType::Policy,
+          CombinedFunctorReducer<typename KernelType::functor_type,
+                                 typename KernelType::reducer_type>,
+          Kokkos::ParallelReduceTag>> {};
+
+template <typename KernelType>
+auto* allocate_driver_storage_for_kernel(KernelType const& kernel) {
+  using graph_node_kernel_t =
+      typename get_graph_node_kernel_type<KernelType>::type;
+  auto const& kernel_as_graph_kernel =
+      static_cast<graph_node_kernel_t const&>(kernel);
+
+  return kernel_as_graph_kernel.allocate_driver_memory_buffer();
+}
+
+template <typename KernelType>
+auto const& get_hip_graph_from_kernel(KernelType const& kernel) {
+  using graph_node_kernel_t =
+      typename get_graph_node_kernel_type<KernelType>::type;
+  auto const& kernel_as_graph_kernel =
+      static_cast<graph_node_kernel_t const&>(kernel);
+  hipGraph_t const* graph_ptr = kernel_as_graph_kernel.get_hip_graph_ptr();
+  KOKKOS_EXPECTS(graph_ptr != nullptr);
+
+  return *graph_ptr;
+}
+
+template <typename KernelType>
+auto& get_hip_graph_node_from_kernel(KernelType const& kernel) {
+  using graph_node_kernel_t =
+      typename get_graph_node_kernel_type<KernelType>::type;
+  auto const& kernel_as_graph_kernel =
+      static_cast<graph_node_kernel_t const&>(kernel);
+  auto* graph_node_ptr = kernel_as_graph_kernel.get_hip_graph_node_ptr();
+  KOKKOS_EXPECTS(graph_node_ptr != nullptr);
+
+  return *graph_node_ptr;
+}
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_GraphNode_Impl.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_GraphNode_Impl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..819ee12f396f62b2f8c68af6624d91c725a028c5
--- /dev/null
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_GraphNode_Impl.hpp
@@ -0,0 +1,54 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_HIP_GRAPHNODE_IMPL_HPP
+#define KOKKOS_HIP_GRAPHNODE_IMPL_HPP
+
+#include <Kokkos_Graph_fwd.hpp>
+
+#include <impl/Kokkos_GraphImpl.hpp>
+
+#include <HIP/Kokkos_HIP.hpp>
+
+namespace Kokkos {
+namespace Impl {
+template <>
+struct GraphNodeBackendSpecificDetails<Kokkos::HIP> {
+  hipGraphNode_t node = nullptr;
+
+  explicit GraphNodeBackendSpecificDetails() = default;
+
+  explicit GraphNodeBackendSpecificDetails(
+      _graph_node_is_root_ctor_tag) noexcept {}
+};
+
+template <typename Kernel, typename PredecessorRef>
+struct GraphNodeBackendDetailsBeforeTypeErasure<Kokkos::HIP, Kernel,
+                                                PredecessorRef> {
+ protected:
+  GraphNodeBackendDetailsBeforeTypeErasure(
+      Kokkos::HIP const &, Kernel &, PredecessorRef const &,
+      GraphNodeBackendSpecificDetails<Kokkos::HIP> &) noexcept {}
+
+  GraphNodeBackendDetailsBeforeTypeErasure(
+      Kokkos::HIP const &, _graph_node_is_root_ctor_tag,
+      GraphNodeBackendSpecificDetails<Kokkos::HIP> &) noexcept {}
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3bde15444c7a847df6a92396bd379a30e08cf724
--- /dev/null
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp
@@ -0,0 +1,187 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_HIP_GRAPH_IMPL_HPP
+#define KOKKOS_HIP_GRAPH_IMPL_HPP
+
+#include <Kokkos_Macros.hpp>
+
+#include <Kokkos_Graph_fwd.hpp>
+
+#include <impl/Kokkos_GraphImpl.hpp>
+#include <impl/Kokkos_GraphNodeImpl.hpp>
+
+#include <HIP/Kokkos_HIP_GraphNodeKernel.hpp>
+
+namespace Kokkos {
+namespace Impl {
+template <>
+class GraphImpl<Kokkos::HIP> {
+ public:
+  using node_details_t = GraphNodeBackendSpecificDetails<Kokkos::HIP>;
+  using root_node_impl_t =
+      GraphNodeImpl<Kokkos::HIP, Kokkos::Experimental::TypeErasedTag,
+                    Kokkos::Experimental::TypeErasedTag>;
+  using aggregate_kernel_impl_t = HIPGraphNodeAggregateKernel;
+  using aggregate_node_impl_t =
+      GraphNodeImpl<Kokkos::HIP, aggregate_kernel_impl_t,
+                    Kokkos::Experimental::TypeErasedTag>;
+
+  // Not moveable or copyable; it spends its whole life as a shared_ptr in the
+  // Graph object.
+  GraphImpl()                 = delete;
+  GraphImpl(GraphImpl const&) = delete;
+  GraphImpl(GraphImpl&&)      = delete;
+  GraphImpl& operator=(GraphImpl const&) = delete;
+  GraphImpl& operator=(GraphImpl&&) = delete;
+
+  ~GraphImpl();
+
+  explicit GraphImpl(Kokkos::HIP instance);
+
+  void add_node(std::shared_ptr<aggregate_node_impl_t> const& arg_node_ptr);
+
+  template <class NodeImpl>
+  void add_node(std::shared_ptr<NodeImpl> const& arg_node_ptr);
+
+  template <class NodeImplPtr, class PredecessorRef>
+  void add_predecessor(NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref);
+
+  void submit();
+
+  Kokkos::HIP const& get_execution_space() const noexcept;
+
+  auto create_root_node_ptr();
+
+  template <class... PredecessorRefs>
+  auto create_aggregate_ptr(PredecessorRefs&&...);
+
+ private:
+  void instantiate_graph() {
+    constexpr size_t error_log_size = 256;
+    hipGraphNode_t error_node       = nullptr;
+    char error_log[error_log_size];
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphInstantiate(
+        &m_graph_exec, m_graph, &error_node, error_log, error_log_size));
+  }
+
+  Kokkos::HIP m_execution_space;
+  hipGraph_t m_graph          = nullptr;
+  hipGraphExec_t m_graph_exec = nullptr;
+};
+
+GraphImpl<Kokkos::HIP>::~GraphImpl() {
+  m_execution_space.fence("Kokkos::GraphImpl::~GraphImpl: Graph Destruction");
+  KOKKOS_EXPECTS(m_graph);
+  if (m_graph_exec) {
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphExecDestroy(m_graph_exec));
+  }
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphDestroy(m_graph));
+}
+
+GraphImpl<Kokkos::HIP>::GraphImpl(Kokkos::HIP instance)
+    : m_execution_space(std::move(instance)) {
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphCreate(&m_graph, 0));
+}
+
+void GraphImpl<Kokkos::HIP>::add_node(
+    std::shared_ptr<aggregate_node_impl_t> const& arg_node_ptr) {
+  // All of the predecessors are just added as normal, so all we need to
+  // do here is add an empty node
+  KOKKOS_IMPL_HIP_SAFE_CALL(
+      hipGraphAddEmptyNode(&(arg_node_ptr->node_details_t::node), m_graph,
+                           /* dependencies = */ nullptr,
+                           /* numDependencies = */ 0));
+}
+
+// Requires NodeImplPtr is a shared_ptr to specialization of GraphNodeImpl
+// Also requires that the kernel has the graph node tag in it's policy
+template <class NodeImpl>
+void GraphImpl<Kokkos::HIP>::add_node(
+    std::shared_ptr<NodeImpl> const& arg_node_ptr) {
+  static_assert(NodeImpl::kernel_type::Policy::is_graph_kernel::value);
+  KOKKOS_EXPECTS(arg_node_ptr);
+  // The Kernel launch from the execute() method has been shimmed to insert
+  // the node into the graph
+  auto& kernel = arg_node_ptr->get_kernel();
+  auto& node   = static_cast<node_details_t*>(arg_node_ptr.get())->node;
+  KOKKOS_EXPECTS(!node);
+  kernel.set_hip_graph_ptr(&m_graph);
+  kernel.set_hip_graph_node_ptr(&node);
+  kernel.execute();
+  KOKKOS_ENSURES(node);
+}
+
+// Requires PredecessorRef is a specialization of GraphNodeRef that has
+// already been added to this graph and NodeImpl is a specialization of
+// GraphNodeImpl that has already been added to this graph.
+template <class NodeImplPtr, class PredecessorRef>
+void GraphImpl<Kokkos::HIP>::add_predecessor(NodeImplPtr arg_node_ptr,
+                                             PredecessorRef arg_pred_ref) {
+  KOKKOS_EXPECTS(arg_node_ptr);
+  auto pred_ptr = GraphAccess::get_node_ptr(arg_pred_ref);
+  KOKKOS_EXPECTS(pred_ptr);
+
+  auto const& pred_node = pred_ptr->node_details_t::node;
+  KOKKOS_EXPECTS(pred_node);
+
+  auto const& node = arg_node_ptr->node_details_t::node;
+  KOKKOS_EXPECTS(node);
+
+  KOKKOS_IMPL_HIP_SAFE_CALL(
+      hipGraphAddDependencies(m_graph, &pred_node, &node, 1));
+}
+
+void GraphImpl<Kokkos::HIP>::submit() {
+  if (!m_graph_exec) {
+    instantiate_graph();
+  }
+  KOKKOS_IMPL_HIP_SAFE_CALL(
+      hipGraphLaunch(m_graph_exec, m_execution_space.hip_stream()));
+}
+
+Kokkos::HIP const& GraphImpl<Kokkos::HIP>::get_execution_space() const
+    noexcept {
+  return m_execution_space;
+}
+
+auto GraphImpl<Kokkos::HIP>::create_root_node_ptr() {
+  KOKKOS_EXPECTS(m_graph);
+  KOKKOS_EXPECTS(!m_graph_exec);
+  auto rv = std::make_shared<root_node_impl_t>(get_execution_space(),
+                                               _graph_node_is_root_ctor_tag{});
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphAddEmptyNode(&(rv->node_details_t::node),
+                                                 m_graph,
+                                                 /* dependencies = */ nullptr,
+                                                 /* numDependencies = */ 0));
+  KOKKOS_ENSURES(rv->node_details_t::node);
+  return rv;
+}
+
+template <class... PredecessorRefs>
+auto GraphImpl<Kokkos::HIP>::create_aggregate_ptr(PredecessorRefs&&...) {
+  // The attachment to predecessors, which is all we really need, happens
+  // in the generic layer, which calls through to add_predecessor for
+  // each predecessor ref, so all we need to do here is create the (trivial)
+  // aggregate node.
+  return std::make_shared<aggregate_node_impl_t>(m_execution_space,
+                                                 _graph_node_kernel_ctor_tag{},
+                                                 aggregate_kernel_impl_t{});
+}
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp
index 28c9c1cb6ad19dad002ac1ddae807dba36e6b459..7f04eb721cb4e707fea6d5481935ff39124266b4 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp
@@ -26,6 +26,7 @@
 #include <HIP/Kokkos_HIP_Instance.hpp>
 #include <HIP/Kokkos_HIP.hpp>
 #include <HIP/Kokkos_HIP_Space.hpp>
+#include <impl/Kokkos_CheckedIntegerOps.hpp>
 #include <impl/Kokkos_Error.hpp>
 
 /*--------------------------------------------------------------------------*/
@@ -59,14 +60,24 @@ Kokkos::View<uint32_t *, HIPSpace> hip_global_unique_token_locks(
 }  // namespace Kokkos
 
 namespace Kokkos {
-
 namespace Impl {
 
+namespace {
+
+using ScratchGrain = Kokkos::HIP::size_type[Impl::HIPTraits::WarpSize];
+constexpr auto sizeScratchGrain = sizeof(ScratchGrain);
+
+std::size_t scratch_count(const std::size_t size) {
+  return (size + sizeScratchGrain - 1) / sizeScratchGrain;
+}
+
+}  // namespace
+
 //----------------------------------------------------------------------------
 
 int HIPInternal::concurrency() {
-  static int const concurrency = m_deviceProp.maxThreadsPerMultiProcessor *
-                                 m_deviceProp.multiProcessorCount;
+  static int const concurrency = m_maxThreadsPerSM * m_multiProcCount;
+
   return concurrency;
 }
 
@@ -84,12 +95,17 @@ void HIPInternal::print_configuration(std::ostream &s) const {
   for (int i = 0; i < hipDevCount; ++i) {
     hipDeviceProp_t hipProp;
     KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceProperties(&hipProp, i));
+    std::string gpu_type = hipProp.integrated == 1 ? "APU" : "dGPU";
 
     s << "Kokkos::HIP[ " << i << " ] "
-      << "gcnArch " << hipProp.gcnArch << ", Total Global Memory: "
+      << "gcnArch " << hipProp.gcnArchName << ", Total Global Memory: "
       << ::Kokkos::Impl::human_memory_size(hipProp.totalGlobalMem)
       << ", Shared Memory per Block: "
-      << ::Kokkos::Impl::human_memory_size(hipProp.sharedMemPerBlock);
+      << ::Kokkos::Impl::human_memory_size(hipProp.sharedMemPerBlock)
+      << ", APU or dGPU: " << gpu_type
+      << ", Is Large Bar: " << hipProp.isLargeBar
+      << ", Supports Managed Memory: " << hipProp.managedMemory
+      << ", Wavefront Size: " << hipProp.warpSize;
     if (m_hipDev == i) s << " : Selected";
     s << '\n';
   }
@@ -144,65 +160,48 @@ void HIPInternal::fence(const std::string &name) const {
 }
 
 void HIPInternal::initialize(hipStream_t stream, bool manage_stream) {
+  KOKKOS_EXPECTS(!is_initialized());
+
   if (was_finalized)
     Kokkos::abort("Calling HIP::initialize after HIP::finalize is illegal\n");
 
-  if (is_initialized()) return;
-
-  if (!HostSpace::execution_space::impl_is_initialized()) {
-    const std::string msg(
-        "HIP::initialize ERROR : HostSpace::execution_space "
-        "is not initialized");
-    Kokkos::Impl::throw_runtime_exception(msg);
-  }
+  m_stream        = stream;
+  m_manage_stream = manage_stream;
 
-  const bool ok_init = nullptr == m_scratchSpace || nullptr == m_scratchFlags;
+  //----------------------------------
+  // Multiblock reduction uses scratch flags for counters
+  // and scratch space for partial reduction values.
+  // Allocate some initial space.  This will grow as needed.
+  {
+    const unsigned reduce_block_count =
+        m_maxWarpCount * Impl::HIPTraits::WarpSize;
 
-  if (ok_init) {
-    m_stream        = stream;
-    m_manage_stream = manage_stream;
-
-    //----------------------------------
-    // Multiblock reduction uses scratch flags for counters
-    // and scratch space for partial reduction values.
-    // Allocate some initial space.  This will grow as needed.
-    {
-      const unsigned reduce_block_count =
-          m_maxWarpCount * Impl::HIPTraits::WarpSize;
-
-      (void)scratch_flags(reduce_block_count * 2 * sizeof(size_type));
-      (void)scratch_space(reduce_block_count * 16 * sizeof(size_type));
-    }
-  } else {
-    std::ostringstream msg;
-    msg << "Kokkos::HIP::initialize(" << m_hipDev
-        << ") FAILED : Already initialized";
-    Kokkos::Impl::throw_runtime_exception(msg.str());
+    (void)scratch_flags(reduce_block_count * 2 * sizeof(size_type));
+    (void)scratch_space(reduce_block_count * 16 * sizeof(size_type));
   }
 
+  m_num_scratch_locks = concurrency();
   KOKKOS_IMPL_HIP_SAFE_CALL(
-      hipMalloc(&m_scratch_locks, sizeof(int32_t) * concurrency()));
+      hipMalloc(&m_scratch_locks, sizeof(int32_t) * m_num_scratch_locks));
   KOKKOS_IMPL_HIP_SAFE_CALL(
-      hipMemset(m_scratch_locks, 0, sizeof(int32_t) * concurrency()));
+      hipMemset(m_scratch_locks, 0, sizeof(int32_t) * m_num_scratch_locks));
 }
 
 //----------------------------------------------------------------------------
 
-using ScratchGrain = Kokkos::HIP::size_type[Impl::HIPTraits::WarpSize];
-enum { sizeScratchGrain = sizeof(ScratchGrain) };
-
 Kokkos::HIP::size_type *HIPInternal::scratch_space(const std::size_t size) {
   if (verify_is_initialized("scratch_space") &&
-      m_scratchSpaceCount * sizeScratchGrain < size) {
-    m_scratchSpaceCount = (size + sizeScratchGrain - 1) / sizeScratchGrain;
+      m_scratchSpaceCount < scratch_count(size)) {
+    m_scratchSpaceCount = scratch_count(size);
 
     using Record = Kokkos::Impl::SharedAllocationRecord<Kokkos::HIPSpace, void>;
 
     if (m_scratchSpace) Record::decrement(Record::get_record(m_scratchSpace));
 
-    Record *const r =
-        Record::allocate(Kokkos::HIPSpace(), "Kokkos::InternalScratchSpace",
-                         (sizeScratchGrain * m_scratchSpaceCount));
+    std::size_t alloc_size =
+        multiply_overflow_abort(m_scratchSpaceCount, sizeScratchGrain);
+    Record *const r = Record::allocate(
+        Kokkos::HIPSpace(), "Kokkos::InternalScratchSpace", alloc_size);
 
     Record::increment(r);
 
@@ -214,23 +213,23 @@ Kokkos::HIP::size_type *HIPInternal::scratch_space(const std::size_t size) {
 
 Kokkos::HIP::size_type *HIPInternal::scratch_flags(const std::size_t size) {
   if (verify_is_initialized("scratch_flags") &&
-      m_scratchFlagsCount * sizeScratchGrain < size) {
-    m_scratchFlagsCount = (size + sizeScratchGrain - 1) / sizeScratchGrain;
+      m_scratchFlagsCount < scratch_count(size)) {
+    m_scratchFlagsCount = scratch_count(size);
 
     using Record = Kokkos::Impl::SharedAllocationRecord<Kokkos::HIPSpace, void>;
 
     if (m_scratchFlags) Record::decrement(Record::get_record(m_scratchFlags));
 
-    Record *const r =
-        Record::allocate(Kokkos::HIPSpace(), "Kokkos::InternalScratchFlags",
-                         (sizeScratchGrain * m_scratchFlagsCount));
+    std::size_t alloc_size =
+        multiply_overflow_abort(m_scratchFlagsCount, sizeScratchGrain);
+    Record *const r = Record::allocate(
+        Kokkos::HIPSpace(), "Kokkos::InternalScratchFlags", alloc_size);
 
     Record::increment(r);
 
     m_scratchFlags = reinterpret_cast<size_type *>(r->data());
 
-    KOKKOS_IMPL_HIP_SAFE_CALL(
-        hipMemset(m_scratchFlags, 0, m_scratchFlagsCount * sizeScratchGrain));
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipMemset(m_scratchFlags, 0, alloc_size));
   }
 
   return m_scratchFlags;
@@ -326,7 +325,7 @@ void HIPInternal::finalize() {
 
   if (this == &singleton()) {
     (void)Kokkos::Impl::hip_global_unique_token_locks(true);
-    Impl::finalize_host_hip_lock_arrays();
+    desul::Impl::finalize_lock_arrays();  // FIXME
 
     KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(constantMemHostStaging));
     KOKKOS_IMPL_HIP_SAFE_CALL(hipEventDestroy(constantMemReusable));
@@ -363,7 +362,8 @@ void HIPInternal::finalize() {
   }
 
   KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(m_scratch_locks));
-  m_scratch_locks = nullptr;
+  m_scratch_locks     = nullptr;
+  m_num_scratch_locks = 0;
 }
 
 //----------------------------------------------------------------------------
@@ -397,14 +397,6 @@ Kokkos::HIP::size_type *hip_internal_scratch_flags(const HIP &instance,
 
 namespace Kokkos {
 namespace Impl {
-void hip_device_synchronize(const std::string &name) {
-  Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::HIP>(
-      name,
-      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
-          GlobalDeviceSynchronization,
-      [&]() { KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize()); });
-}
-
 void hip_internal_error_throw(hipError_t e, const char *name, const char *file,
                               const int line) {
   std::ostringstream out;
@@ -420,6 +412,16 @@ void hip_internal_error_throw(hipError_t e, const char *name, const char *file,
 
 //----------------------------------------------------------------------------
 
+void Kokkos::Impl::create_HIP_instances(std::vector<HIP> &instances) {
+  for (int s = 0; s < int(instances.size()); s++) {
+    hipStream_t stream;
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&stream));
+    instances[s] = HIP(stream, ManageStream::yes);
+  }
+}
+
+//----------------------------------------------------------------------------
+
 namespace Kokkos {
 HIP::size_type HIP::detect_device_count() {
   int hipDevCount;
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp
index 06fab84b56d9b3548945c78f7542f7fb6e31556f..ef140ec46c061d6d90afb9bf9afee312b9d0527d 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp
@@ -22,28 +22,30 @@
 #include <HIP/Kokkos_HIP_Space.hpp>
 #include <HIP/Kokkos_HIP_Error.hpp>
 
+#include <atomic>
 #include <mutex>
 
 namespace Kokkos {
 namespace Impl {
 
 struct HIPTraits {
-#if defined(KOKKOS_ARCH_VEGA)
-  static int constexpr WarpSize       = 64;
-  static int constexpr WarpIndexMask  = 0x003f; /* hexadecimal for 63 */
-  static int constexpr WarpIndexShift = 6;      /* WarpSize == 1 << WarpShift*/
-#elif defined(KOKKOS_ARCH_NAVI)
-  static int constexpr WarpSize       = 32;
-  static int constexpr WarpIndexMask  = 0x001f; /* hexadecimal for 31 */
-  static int constexpr WarpIndexShift = 5;      /* WarpSize == 1 << WarpShift*/
+#if defined(KOKKOS_ARCH_AMD_GFX906) || defined(KOKKOS_ARCH_AMD_GFX908) || \
+    defined(KOKKOS_ARCH_AMD_GFX90A) || defined(KOKKOS_ARCH_AMD_GFX942)
+  static constexpr int WarpSize       = 64;
+  static constexpr int WarpIndexMask  = 0x003f; /* hexadecimal for 63 */
+  static constexpr int WarpIndexShift = 6;      /* WarpSize == 1 << WarpShift*/
+#elif defined(KOKKOS_ARCH_AMD_GFX1030) || defined(KOKKOS_ARCH_AMD_GFX1100)
+  static constexpr int WarpSize       = 32;
+  static constexpr int WarpIndexMask  = 0x001f; /* hexadecimal for 31 */
+  static constexpr int WarpIndexShift = 5;      /* WarpSize == 1 << WarpShift*/
 #endif
-  static int constexpr ConservativeThreadsPerBlock =
+  static constexpr int ConservativeThreadsPerBlock =
       256;  // conservative fallback blocksize in case of spills
-  static int constexpr MaxThreadsPerBlock =
+  static constexpr int MaxThreadsPerBlock =
       1024;  // the maximum we can fit in a block
-  static int constexpr ConstantMemoryUsage        = 0x008000; /* 32k bytes */
-  static int constexpr KernelArgumentLimit        = 0x001000; /*  4k bytes */
-  static int constexpr ConstantMemoryUseThreshold = 0x000200; /* 512 bytes */
+  static constexpr int ConstantMemoryUsage        = 0x008000; /* 32k bytes */
+  static constexpr int KernelArgumentLimit        = 0x001000; /*  4k bytes */
+  static constexpr int ConstantMemoryUseThreshold = 0x000200; /* 512 bytes */
 };
 
 //----------------------------------------------------------------------------
@@ -68,12 +70,10 @@ class HIPInternal {
   using size_type = ::Kokkos::HIP::size_type;
 
   inline static int m_hipDev                        = -1;
-  inline static int m_hipArch                       = -1;
   inline static unsigned m_multiProcCount           = 0;
   inline static unsigned m_maxWarpCount             = 0;
   inline static std::array<size_type, 3> m_maxBlock = {0, 0, 0};
   inline static unsigned m_maxWavesPerCU            = 0;
-  inline static unsigned m_maxSharedWords           = 0;
   inline static int m_shmemPerSM                    = 0;
   inline static int m_maxShmemPerBlock              = 0;
   inline static int m_maxThreadsPerSM               = 0;
@@ -104,7 +104,8 @@ class HIPInternal {
   mutable int64_t m_team_scratch_current_size[10] = {};
   mutable void *m_team_scratch_ptr[10]            = {};
   mutable std::atomic_int m_team_scratch_pool[10] = {};
-  std::int32_t *m_scratch_locks;
+  int32_t *m_scratch_locks                        = nullptr;
+  size_t m_num_scratch_locks                      = 0;
 
   bool was_finalized = false;
 
@@ -147,6 +148,7 @@ class HIPInternal {
   void release_team_scratch_space(int scratch_pool_id);
 };
 
+void create_HIP_instances(std::vector<HIP> &instances);
 }  // namespace Impl
 
 namespace Experimental {
@@ -155,16 +157,6 @@ namespace Experimental {
 //   Customization point for backends
 //   Default behavior is to return the passed in instance
 
-namespace Impl {
-inline void create_HIP_instances(std::vector<HIP> &instances) {
-  for (int s = 0; s < int(instances.size()); s++) {
-    hipStream_t stream;
-    KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&stream));
-    instances[s] = HIP(stream, true);
-  }
-}
-}  // namespace Impl
-
 template <class... Args>
 std::vector<HIP> partition_space(const HIP &, Args...) {
   static_assert(
@@ -172,18 +164,20 @@ std::vector<HIP> partition_space(const HIP &, Args...) {
       "Kokkos Error: partitioning arguments must be integers or floats");
 
   std::vector<HIP> instances(sizeof...(Args));
-  Impl::create_HIP_instances(instances);
+  Kokkos::Impl::create_HIP_instances(instances);
   return instances;
 }
 
 template <class T>
-std::vector<HIP> partition_space(const HIP &, std::vector<T> &weights) {
+std::vector<HIP> partition_space(const HIP &, std::vector<T> const &weights) {
   static_assert(
       std::is_arithmetic<T>::value,
       "Kokkos Error: partitioning arguments must be integers or floats");
 
+  // We only care about the number of instances to create and ignore weights
+  // otherwise.
   std::vector<HIP> instances(weights.size());
-  Impl::create_HIP_instances(instances);
+  Kokkos::Impl::create_HIP_instances(instances);
   return instances;
 }
 }  // namespace Experimental
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp
index 8e8895f65a960c000da6941f8d67ac57bd64817a..7cd0afcf47fc004776554f44e64b213b7c1a12e2 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp
@@ -24,7 +24,15 @@
 #include <HIP/Kokkos_HIP_Error.hpp>
 #include <HIP/Kokkos_HIP_Instance.hpp>
 #include <HIP/Kokkos_HIP_Space.hpp>
-#include <HIP/Kokkos_HIP_Locks.hpp>
+
+#if !((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 2))
+#define KOKKOS_IMPL_HIP_GRAPH_ENABLED
+#endif
+
+#ifdef KOKKOS_IMPL_HIP_GRAPH_ENABLED
+#include <HIP/Kokkos_HIP_GraphNodeKernel.hpp>
+#include <impl/Kokkos_GraphImpl_fwd.hpp>
+#endif
 
 // Must use global variable on the device with HIP-Clang
 #ifdef __HIP__
@@ -194,6 +202,13 @@ struct HIPParallelLaunchKernelFuncData {
   }
 };
 
+//---------------------------------------------------------------//
+// Helper function                                               //
+//---------------------------------------------------------------//
+inline bool is_empty_launch(dim3 const &grid, dim3 const &block) {
+  return (grid.x == 0) || ((block.x * block.y * block.z) == 0);
+}
+
 //---------------------------------------------------------------//
 // HIPParallelLaunchKernelFunc structure and its specializations //
 //---------------------------------------------------------------//
@@ -368,6 +383,42 @@ struct HIPParallelLaunchKernelInvoker<DriverType, LaunchBounds,
     (base_t::get_kernel_func())<<<grid, block, shmem, hip_instance->m_stream>>>(
         driver);
   }
+
+#ifdef KOKKOS_IMPL_HIP_GRAPH_ENABLED
+  static void create_parallel_launch_graph_node(
+      DriverType const &driver, dim3 const &grid, dim3 const &block, int shmem,
+      HIPInternal const * /*hip_instance*/) {
+    auto const &graph = get_hip_graph_from_kernel(driver);
+    KOKKOS_EXPECTS(graph);
+    auto &graph_node = get_hip_graph_node_from_kernel(driver);
+    // Expect node not yet initialized
+    KOKKOS_EXPECTS(!graph_node);
+
+    if (!is_empty_launch(grid, block)) {
+      void const *args[] = {&driver};
+
+      hipKernelNodeParams params = {};
+
+      params.blockDim       = block;
+      params.gridDim        = grid;
+      params.sharedMemBytes = shmem;
+      params.func           = (void *)base_t::get_kernel_func();
+      params.kernelParams   = (void **)args;
+      params.extra          = nullptr;
+
+      KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphAddKernelNode(
+          &graph_node, graph, /* dependencies = */ nullptr,
+          /* numDependencies = */ 0, &params));
+    } else {
+      // We still need an empty node for the dependency structure
+      KOKKOS_IMPL_HIP_SAFE_CALL(
+          hipGraphAddEmptyNode(&graph_node, graph,
+                               /* dependencies = */ nullptr,
+                               /* numDependencies = */ 0));
+    }
+    KOKKOS_ENSURES(graph_node);
+  }
+#endif
 };
 
 // HIPLaunchMechanism::GlobalMemory specialization
@@ -390,6 +441,52 @@ struct HIPParallelLaunchKernelInvoker<DriverType, LaunchBounds,
     (base_t::get_kernel_func())<<<grid, block, shmem, hip_instance->m_stream>>>(
         driver_ptr);
   }
+
+#ifdef KOKKOS_IMPL_HIP_GRAPH_ENABLED
+  static void create_parallel_launch_graph_node(
+      DriverType const &driver, dim3 const &grid, dim3 const &block, int shmem,
+      HIPInternal const *hip_instance) {
+    auto const &graph = get_hip_graph_from_kernel(driver);
+    KOKKOS_EXPECTS(graph);
+    auto &graph_node = get_hip_graph_node_from_kernel(driver);
+    // Expect node not yet initialized
+    KOKKOS_EXPECTS(!graph_node);
+
+    if (!Impl::is_empty_launch(grid, block)) {
+      auto *driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
+
+      // Unlike in the non-graph case, we can get away with doing an async copy
+      // here because the `DriverType` instance is held in the GraphNodeImpl
+      // which is guaranteed to be alive until the graph instance itself is
+      // destroyed, where there should be a fence ensuring that the allocation
+      // associated with this kernel on the device side isn't deleted.
+      hipMemcpyAsync(driver_ptr, &driver, sizeof(DriverType), hipMemcpyDefault,
+                     hip_instance->m_stream);
+
+      void const *args[] = {&driver_ptr};
+
+      hipKernelNodeParams params = {};
+
+      params.blockDim       = block;
+      params.gridDim        = grid;
+      params.sharedMemBytes = shmem;
+      params.func           = (void *)base_t::get_kernel_func();
+      params.kernelParams   = (void **)args;
+      params.extra          = nullptr;
+
+      KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphAddKernelNode(
+          &graph_node, graph, /* dependencies = */ nullptr,
+          /* numDependencies = */ 0, &params));
+    } else {
+      // We still need an empty node for the dependency structure
+      KOKKOS_IMPL_HIP_SAFE_CALL(
+          hipGraphAddEmptyNode(&graph_node, graph,
+                               /* dependencies = */ nullptr,
+                               /* numDependencies = */ 0));
+    }
+    KOKKOS_ENSURES(bool(graph_node))
+  }
+#endif
 };
 
 // HIPLaunchMechanism::ConstantMemory specializations
@@ -463,7 +560,7 @@ struct HIPParallelLaunch<
             "HIPParallelLaunch FAILED: shared memory request is too large");
       }
 
-      KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE();
+      desul::ensure_hip_lock_arrays_on_device();
 
       // Invoke the driver function on the device
       base_t::invoke_kernel(driver, grid, block, shmem, hip_instance);
@@ -482,42 +579,57 @@ struct HIPParallelLaunch<
 // al.
 template <typename DriverType, typename LaunchBounds = Kokkos::LaunchBounds<>,
           HIPLaunchMechanism LaunchMechanism =
-              DeduceHIPLaunchMechanism<DriverType>::launch_mechanism>
+              DeduceHIPLaunchMechanism<DriverType>::launch_mechanism,
+          bool DoGraph = DriverType::Policy::is_graph_kernel::value>
 void hip_parallel_launch(const DriverType &driver, const dim3 &grid,
                          const dim3 &block, const int shmem,
                          const HIPInternal *hip_instance,
                          const bool prefer_shmem) {
+#ifdef KOKKOS_IMPL_HIP_GRAPH_ENABLED
+  if constexpr (DoGraph) {
+    // Graph launch
+    using base_t = HIPParallelLaunchKernelInvoker<DriverType, LaunchBounds,
+                                                  LaunchMechanism>;
+    base_t::create_parallel_launch_graph_node(driver, grid, block, shmem,
+                                              hip_instance);
+  } else
+#endif
+  {
+    // Regular kernel launch
 #ifndef KOKKOS_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS
-  HIPParallelLaunch<DriverType, LaunchBounds, LaunchMechanism>(
-      driver, grid, block, shmem, hip_instance, prefer_shmem);
-#else
-  if constexpr (!HIPParallelLaunch<DriverType, LaunchBounds,
-                                   LaunchMechanism>::default_launchbounds()) {
-    // for user defined, we *always* honor the request
     HIPParallelLaunch<DriverType, LaunchBounds, LaunchMechanism>(
         driver, grid, block, shmem, hip_instance, prefer_shmem);
-  } else {
-    // we can do what we like
-    const unsigned flat_block_size = block.x * block.y * block.z;
-    if (flat_block_size <= HIPTraits::ConservativeThreadsPerBlock) {
-      // we have to use the large blocksize
-      HIPParallelLaunch<
-          DriverType,
-          Kokkos::LaunchBounds<HIPTraits::ConservativeThreadsPerBlock, 1>,
-          LaunchMechanism>(driver, grid, block, shmem, hip_instance,
-                           prefer_shmem);
+#else
+    if constexpr (!HIPParallelLaunch<DriverType, LaunchBounds,
+                                     LaunchMechanism>::default_launchbounds()) {
+      // for user defined, we *always* honor the request
+      HIPParallelLaunch<DriverType, LaunchBounds, LaunchMechanism>(
+          driver, grid, block, shmem, hip_instance, prefer_shmem);
     } else {
-      HIPParallelLaunch<DriverType,
-                        Kokkos::LaunchBounds<HIPTraits::MaxThreadsPerBlock, 1>,
-                        LaunchMechanism>(driver, grid, block, shmem,
-                                         hip_instance, prefer_shmem);
+      // we can do what we like
+      const unsigned flat_block_size = block.x * block.y * block.z;
+      if (flat_block_size <= HIPTraits::ConservativeThreadsPerBlock) {
+        // we have to use the large blocksize
+        HIPParallelLaunch<
+            DriverType,
+            Kokkos::LaunchBounds<HIPTraits::ConservativeThreadsPerBlock, 1>,
+            LaunchMechanism>(driver, grid, block, shmem, hip_instance,
+                             prefer_shmem);
+      } else {
+        HIPParallelLaunch<
+            DriverType, Kokkos::LaunchBounds<HIPTraits::MaxThreadsPerBlock, 1>,
+            LaunchMechanism>(driver, grid, block, shmem, hip_instance,
+                             prefer_shmem);
+      }
     }
-  }
 #endif
+  }
 }
 }  // namespace Impl
 }  // namespace Kokkos
 
+#undef KOKKOS_IMPL_HIP_GRAPH_ENABLED
+
 #endif
 
 #endif
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp
deleted file mode 100644
index 76d3f6f5c8ef4079f2854ee7525f513159fb01a5..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
-#define KOKKOS_IMPL_PUBLIC_INCLUDE
-#endif
-
-#include <Kokkos_Macros.hpp>
-
-#include <HIP/Kokkos_HIP_Locks.hpp>
-#include <HIP/Kokkos_HIP_Error.hpp>
-#include <HIP/Kokkos_HIP.hpp>
-#include <HIP/Kokkos_HIP_Instance.hpp>
-
-#include <hip/hip_runtime.h>
-
-#include <iostream>
-
-namespace Kokkos {
-
-#ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
-namespace Impl {
-__device__ __constant__ HIPLockArrays g_device_hip_lock_arrays = {nullptr, 0};
-}
-#endif
-
-namespace {
-
-__global__ void init_lock_array_kernel_atomic() {
-  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
-  if (i < KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK + 1) {
-    Kokkos::Impl::g_device_hip_lock_arrays.atomic[i] = 0;
-  }
-}
-
-}  // namespace
-
-namespace Impl {
-
-HIPLockArrays g_host_hip_lock_arrays = {nullptr, 0};
-
-void initialize_host_hip_lock_arrays() {
-#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
-  desul::Impl::init_lock_arrays();
-
-  DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE();
-#endif
-
-  if (g_host_hip_lock_arrays.atomic != nullptr) return;
-  KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(
-      &g_host_hip_lock_arrays.atomic,
-      sizeof(std::int32_t) * (KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK + 1)));
-
-  g_host_hip_lock_arrays.n = HIPInternal::concurrency();
-
-  KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE();
-  init_lock_array_kernel_atomic<<<
-      (KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256, 0, nullptr>>>();
-}
-
-void finalize_host_hip_lock_arrays() {
-#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
-  desul::Impl::finalize_lock_arrays();
-#endif
-
-  if (g_host_hip_lock_arrays.atomic == nullptr) return;
-  KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.atomic));
-  g_host_hip_lock_arrays.atomic = nullptr;
-  g_host_hip_lock_arrays.n      = 0;
-#ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
-  KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE();
-#endif
-}
-
-}  // namespace Impl
-
-}  // namespace Kokkos
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp
deleted file mode 100644
index fbed4afd3f434c8f1fdefdb34fb7be4f20a96024..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp
+++ /dev/null
@@ -1,157 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#ifndef KOKKOS_HIP_LOCKS_HPP
-#define KOKKOS_HIP_LOCKS_HPP
-
-#include <Kokkos_Macros.hpp>
-
-#include <cstdint>
-
-#include <HIP/Kokkos_HIP_Error.hpp>
-
-#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
-#include <desul/atomics/Lock_Array_HIP.hpp>
-#endif
-
-namespace Kokkos {
-namespace Impl {
-
-struct HIPLockArrays {
-  std::int32_t* atomic;
-  std::int32_t n;
-};
-
-/// \brief This global variable in Host space is the central definition
-///        of these arrays.
-extern HIPLockArrays g_host_hip_lock_arrays;
-
-/// \brief After this call, the g_host_hip_lock_arrays variable has
-///        valid, initialized arrays.
-///
-/// This call is idempotent.
-void initialize_host_hip_lock_arrays();
-
-/// \brief After this call, the g_host_hip_lock_arrays variable has
-///        all null pointers, and all array memory has been freed.
-///
-/// This call is idempotent.
-void finalize_host_hip_lock_arrays();
-
-#if defined(__HIPCC__)
-
-/// \brief This global variable in HIP space is what kernels use
-///        to get access to the lock arrays.
-///
-/// When relocatable device code is enabled, there can be one single
-/// instance of this global variable for the entire executable,
-/// whose definition will be in Kokkos_HIP_Locks.cpp (and whose declaration
-/// here must then be extern).
-/// This one instance will be initialized by initialize_host_HIP_lock_arrays
-/// and need not be modified afterwards.
-///
-/// When relocatable device code is disabled, an instance of this variable
-/// will be created in every translation unit that sees this header file.
-/// Since the Kokkos_HIP_Locks.cpp translation unit cannot initialize the
-/// instances in other translation units, we must update this HIP global
-/// variable based on the Host global variable prior to running any kernels
-/// that will use it.
-/// That is the purpose of the KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE macro.
-__device__
-#ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
-    __constant__ extern
-#endif
-    HIPLockArrays g_device_hip_lock_arrays;
-
-#define KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK 0x1FFFF
-
-/// \brief Acquire a lock for the address
-///
-/// This function tries to acquire the lock for the hash value derived
-/// from the provided ptr. If the lock is successfully acquired the
-/// function returns true. Otherwise it returns false.
-__device__ inline bool lock_address_hip_space(void* ptr) {
-  auto offset = reinterpret_cast<size_t>(ptr);
-  offset      = offset >> 2;
-  offset      = offset & KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK;
-  return (0 == atomicCAS(&g_device_hip_lock_arrays.atomic[offset], 0, 1));
-}
-
-/// \brief Release lock for the address
-///
-/// This function releases the lock for the hash value derived
-/// from the provided ptr. This function should only be called
-/// after previously successfully aquiring a lock with
-/// lock_address.
-__device__ inline void unlock_address_hip_space(void* ptr) {
-  auto offset = reinterpret_cast<size_t>(ptr);
-  offset      = offset >> 2;
-  offset      = offset & KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK;
-  atomicExch(&g_device_hip_lock_arrays.atomic[offset], 0);
-}
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-// Make lock_array_copied an explicit translation unit scope thingy
-namespace Kokkos {
-namespace Impl {
-namespace {
-static int lock_array_copied = 0;
-inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
-}  // namespace
-}  // namespace Impl
-}  // namespace Kokkos
-
-/* Dan Ibanez: it is critical that this code be a macro, so that it will
-   capture the right address for g_device_hip_lock_arrays!
-   putting this in an inline function will NOT do the right thing! */
-#define KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE()                 \
-  {                                                             \
-    if (::Kokkos::Impl::lock_array_copied == 0) {               \
-      KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpyToSymbol(              \
-          HIP_SYMBOL(::Kokkos::Impl::g_device_hip_lock_arrays), \
-          &::Kokkos::Impl::g_host_hip_lock_arrays,              \
-          sizeof(::Kokkos::Impl::HIPLockArrays)));              \
-    }                                                           \
-    ::Kokkos::Impl::lock_array_copied = 1;                      \
-  }
-
-#ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
-
-#ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
-#define KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE()
-#else
-#define KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE() \
-  KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE()
-#endif
-
-#else
-
-#ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
-#define KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE()
-#else
-// Still Need COPY_CUDA_LOCK_ARRAYS for team scratch etc.
-#define KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE() \
-  KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE()         \
-  DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE()
-#endif
-
-#endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */
-
-#endif /* defined( __HIPCC__ ) */
-
-#endif /* #ifndef KOKKOS_HIP_LOCKS_HPP */
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp
index 10ec301d15d1fcfd735b809a43aad858362f3fad..0fa325cb12c7250919fc0176f3533fca76f35bd5 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp
@@ -30,7 +30,8 @@ namespace Impl {
 template <class FunctorType, class... Traits>
 class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, HIP> {
  public:
-  using Policy = Kokkos::MDRangePolicy<Traits...>;
+  using Policy       = Kokkos::MDRangePolicy<Traits...>;
+  using functor_type = FunctorType;
 
  private:
   using array_index_type = typename Policy::array_index_type;
@@ -40,10 +41,11 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, HIP> {
   const FunctorType m_functor;
   const Policy m_policy;
 
-  ParallelFor()        = delete;
+ public:
+  ParallelFor()                   = delete;
+  ParallelFor(ParallelFor const&) = default;
   ParallelFor& operator=(ParallelFor const&) = delete;
 
- public:
   inline __device__ void operator()() const {
     Kokkos::Impl::DeviceIterateTile<Policy::rank, Policy, FunctorType,
                                     typename Policy::work_tag>(m_policy,
@@ -165,11 +167,13 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, HIP> {
 };
 
 // ParallelReduce
-template <class FunctorType, class ReducerType, class... Traits>
-class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
-                     HIP> {
+template <class CombinedFunctorReducerType, class... Traits>
+class ParallelReduce<CombinedFunctorReducerType,
+                     Kokkos::MDRangePolicy<Traits...>, HIP> {
  public:
-  using Policy = Kokkos::MDRangePolicy<Traits...>;
+  using Policy      = Kokkos::MDRangePolicy<Traits...>;
+  using FunctorType = typename CombinedFunctorReducerType::functor_type;
+  using ReducerType = typename CombinedFunctorReducerType::reducer_type;
 
  private:
   using array_index_type = typename Policy::array_index_type;
@@ -179,34 +183,38 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
   using Member       = typename Policy::member_type;
   using LaunchBounds = typename Policy::launch_bounds;
 
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using WorkTagFwd =
-      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                  WorkTag, void>::type;
-
-  using Analysis =
-      Kokkos::Impl::FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy,
-                                    ReducerTypeFwd>;
-
  public:
-  using pointer_type   = typename Analysis::pointer_type;
-  using value_type     = typename Analysis::value_type;
-  using reference_type = typename Analysis::reference_type;
+  using pointer_type   = typename ReducerType::pointer_type;
+  using value_type     = typename ReducerType::value_type;
+  using reference_type = typename ReducerType::reference_type;
   using functor_type   = FunctorType;
   using size_type      = HIP::size_type;
 
+  // Conditionally set word_size_type to int16_t or int8_t if value_type is
+  // smaller than int32_t (Kokkos::HIP::size_type)
+  // word_size_type is used to determine the word count, shared memory buffer
+  // size, and global memory buffer size before the reduction is performed.
+  // Within the reduction, the word count is recomputed based on word_size_type
+  // and when calculating indexes into the shared/global memory buffers for
+  // performing the reduction, word_size_type is used again.
+  // For scalars > 4 bytes in size, indexing into shared/global memory relies
+  // on the block and grid dimensions to ensure that we index at the correct
+  // offset rather than at every 4 byte word; such that, when the join is
+  // performed, we have the correct data that was copied over in chunks of 4
+  // bytes.
+  static_assert(sizeof(size_type) == 4);
+  using word_size_type = std::conditional_t<
+      sizeof(value_type) < 4,
+      std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;
+
   // Algorithmic constraints: blockSize is a power of two AND blockDim.y ==
   // blockDim.z == 1
 
-  const FunctorType m_functor;
+  const CombinedFunctorReducerType m_functor_reducer;
   const Policy m_policy;  // used for workrange and nwork
-  const ReducerType m_reducer;
   const pointer_type m_result_ptr;
   const bool m_result_ptr_device_accessible;
-  size_type* m_scratch_space;
+  word_size_type* m_scratch_space;
   size_type* m_scratch_flags;
 
   using DeviceIteratePattern = typename Kokkos::Impl::Reduce::DeviceIterateTile<
@@ -214,28 +222,27 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
 
  public:
   inline __device__ void exec_range(reference_type update) const {
-    DeviceIteratePattern(m_policy, m_functor, update).exec_range();
+    DeviceIteratePattern(m_policy, m_functor_reducer.get_functor(), update)
+        .exec_range();
   }
 
   inline __device__ void operator()() const {
-    typename Analysis::Reducer final_reducer(
-        &ReducerConditional::select(m_functor, m_reducer));
+    const ReducerType& reducer = m_functor_reducer.get_reducer();
 
-    const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
-                                                   sizeof(size_type)>
-        word_count(Analysis::value_size(
-                       ReducerConditional::select(m_functor, m_reducer)) /
-                   sizeof(size_type));
+    const integral_nonzero_constant<word_size_type,
+                                    ReducerType::static_value_size() /
+                                        sizeof(word_size_type)>
+        word_count(reducer.value_size() / sizeof(word_size_type));
 
     {
-      reference_type value = final_reducer.init(reinterpret_cast<pointer_type>(
-          kokkos_impl_hip_shared_memory<size_type>() +
+      reference_type value = reducer.init(reinterpret_cast<pointer_type>(
+          kokkos_impl_hip_shared_memory<word_size_type>() +
           threadIdx.y * word_count.value));
 
       // Number of blocks is bounded so that the reduction can be limited to two
       // passes. Each thread block is given an approximately equal amount of
       // work to perform. Accumulate the values for this block. The accumulation
-      // ordering does not match the final pass, but is arithmatically
+      // ordering does not match the final pass, but is arithmetically
       // equivalent.
 
       this->exec_range(value);
@@ -244,19 +251,21 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
     // Reduce with final value at blockDim.y - 1 location.
     // Problem: non power-of-two blockDim
     if (::Kokkos::Impl::hip_single_inter_block_reduce_scan<false>(
-            final_reducer, blockIdx.x, gridDim.x,
-            kokkos_impl_hip_shared_memory<size_type>(), m_scratch_space,
+            reducer, blockIdx.x, gridDim.x,
+            kokkos_impl_hip_shared_memory<word_size_type>(), m_scratch_space,
             m_scratch_flags)) {
       // This is the final block with the final result at the final threads'
       // location
-      size_type* const shared = kokkos_impl_hip_shared_memory<size_type>() +
-                                (blockDim.y - 1) * word_count.value;
-      size_type* const global = m_result_ptr_device_accessible
-                                    ? reinterpret_cast<size_type*>(m_result_ptr)
-                                    : m_scratch_space;
+      word_size_type* const shared =
+          kokkos_impl_hip_shared_memory<word_size_type>() +
+          (blockDim.y - 1) * word_count.value;
+      word_size_type* const global =
+          m_result_ptr_device_accessible
+              ? reinterpret_cast<word_size_type*>(m_result_ptr)
+              : m_scratch_space;
 
       if (threadIdx.y == 0) {
-        final_reducer.final(reinterpret_cast<value_type*>(shared));
+        reducer.final(reinterpret_cast<value_type*>(shared));
       }
 
       if (Impl::HIPTraits::WarpSize < word_count.value) {
@@ -274,13 +283,12 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
   inline unsigned local_block_size(const FunctorType& f) {
     const auto& instance = m_policy.space().impl_internal_space_instance();
     auto shmem_functor   = [&f](unsigned n) {
-      return hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
-                                                      WorkTag>(f, n);
+      return hip_single_inter_block_reduce_scan_shmem<false, WorkTag,
+                                                      value_type>(f, n);
     };
-    using closure_type = ParallelReduce<FunctorType, Policy, ReducerType, HIP>;
 
     unsigned block_size =
-        Kokkos::Impl::hip_get_preferred_blocksize<closure_type, LaunchBounds>(
+        Kokkos::Impl::hip_get_preferred_blocksize<ParallelReduce, LaunchBounds>(
             instance, shmem_functor);
     if (block_size == 0) {
       Kokkos::Impl::throw_runtime_exception(
@@ -291,29 +299,28 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
   }
 
   inline void execute() {
-    typename Analysis::Reducer final_reducer(
-        &ReducerConditional::select(m_functor, m_reducer));
+    ReducerType reducer = m_functor_reducer.get_reducer();
 
-    using ClosureType =
-        ParallelReduce<FunctorType, Policy, ReducerType, Kokkos::HIP>;
     const auto nwork = m_policy.m_num_tiles;
     if (nwork) {
       int block_size = m_policy.m_prod_tile_dims;
       // CONSTRAINT: Algorithm requires block_size >= product of tile dimensions
       // Nearest power of two
-      int exponent_pow_two    = std::ceil(std::log2(block_size));
-      block_size              = std::pow(2, exponent_pow_two);
-      int suggested_blocksize = local_block_size(m_functor);
+      int exponent_pow_two = std::ceil(std::log2(block_size));
+      block_size           = std::pow(2, exponent_pow_two);
+      int suggested_blocksize =
+          local_block_size(m_functor_reducer.get_functor());
 
       block_size = (block_size > suggested_blocksize)
                        ? block_size
                        : suggested_blocksize;  // Note: block_size must be less
                                                // than or equal to 512
 
-      m_scratch_space = hip_internal_scratch_space(
-          m_policy.space(), Analysis::value_size(ReducerConditional::select(
-                                m_functor, m_reducer)) *
-                                block_size /* block_size == max block_count */);
+      m_scratch_space =
+          reinterpret_cast<word_size_type*>(hip_internal_scratch_space(
+              m_policy.space(),
+              reducer.value_size() *
+                  block_size /* block_size == max block_count */));
       m_scratch_flags =
           hip_internal_scratch_flags(m_policy.space(), sizeof(size_type));
 
@@ -326,34 +333,31 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
 
       const int shmem =
           ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem<
-              false, FunctorType, WorkTag>(m_functor, block.y);
+              false, WorkTag, value_type>(m_functor_reducer.get_functor(),
+                                          block.y);
 
-      hip_parallel_launch<ClosureType, LaunchBounds>(
+      hip_parallel_launch<ParallelReduce, LaunchBounds>(
           *this, grid, block, shmem,
           m_policy.space().impl_internal_space_instance(),
           false);  // copy to device and execute
 
       if (!m_result_ptr_device_accessible && m_result_ptr) {
-        const int size = Analysis::value_size(
-            ReducerConditional::select(m_functor, m_reducer));
+        const int size = reducer.value_size();
         DeepCopy<HostSpace, HIPSpace, HIP>(m_policy.space(), m_result_ptr,
                                            m_scratch_space, size);
       }
     } else {
       if (m_result_ptr) {
-        final_reducer.init(m_result_ptr);
+        reducer.init(m_result_ptr);
       }
     }
   }
 
   template <class ViewType>
-  ParallelReduce(
-      const FunctorType& arg_functor, const Policy& arg_policy,
-      const ViewType& arg_result,
-      std::enable_if_t<Kokkos::is_view<ViewType>::value, void*> = nullptr)
-      : m_functor(arg_functor),
+  ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer,
+                 const Policy& arg_policy, const ViewType& arg_result)
+      : m_functor_reducer(arg_functor_reducer),
         m_policy(arg_policy),
-        m_reducer(InvalidType()),
         m_result_ptr(arg_result.data()),
         m_result_ptr_device_accessible(
             MemorySpaceAccess<HIPSpace,
@@ -361,23 +365,10 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
         m_scratch_space(nullptr),
         m_scratch_flags(nullptr) {}
 
-  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
-                 const ReducerType& reducer)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()),
-        m_result_ptr_device_accessible(
-            MemorySpaceAccess<HIPSpace, typename ReducerType::result_view_type::
-                                            memory_space>::accessible),
-        m_scratch_space(nullptr),
-        m_scratch_flags(nullptr) {}
-
   template <typename Policy, typename Functor>
   static int max_tile_size_product(const Policy&, const Functor&) {
-    using closure_type =
-        ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>,
-                       ReducerType, HIP>;
+    using closure_type  = ParallelReduce<CombinedFunctorReducerType,
+                                        Kokkos::MDRangePolicy<Traits...>, HIP>;
     unsigned block_size = hip_get_max_blocksize<closure_type, LaunchBounds>();
     if (block_size == 0) {
       Kokkos::Impl::throw_runtime_exception(
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp
index d8c52aa95f7e991e0c65c250c41d6cd743590604..26e8be4698a85bf7075f5e2e24f6426c64acb827 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp
@@ -43,9 +43,6 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::HIP> {
   const FunctorType m_functor;
   const Policy m_policy;
 
-  ParallelFor()        = delete;
-  ParallelFor& operator=(const ParallelFor&) = delete;
-
   template <class TagType>
   inline __device__ std::enable_if_t<std::is_void<TagType>::value> exec_range(
       const Member i) const {
@@ -61,6 +58,10 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::HIP> {
  public:
   using functor_type = FunctorType;
 
+  ParallelFor()                   = delete;
+  ParallelFor(ParallelFor const&) = default;
+  ParallelFor& operator=(ParallelFor const&) = delete;
+
   inline __device__ void operator()() const {
     const Member work_stride = blockDim.y * gridDim.x;
     const Member work_end    = m_policy.end();
@@ -101,11 +102,13 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::HIP> {
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
-template <class FunctorType, class ReducerType, class... Traits>
-class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
+template <class CombinedFunctorReducerType, class... Traits>
+class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
                      Kokkos::HIP> {
  public:
-  using Policy = Kokkos::RangePolicy<Traits...>;
+  using Policy      = Kokkos::RangePolicy<Traits...>;
+  using FunctorType = typename CombinedFunctorReducerType::functor_type;
+  using ReducerType = typename CombinedFunctorReducerType::reducer_type;
 
  private:
   using WorkRange    = typename Policy::WorkRange;
@@ -113,40 +116,42 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   using Member       = typename Policy::member_type;
   using LaunchBounds = typename Policy::launch_bounds;
 
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using WorkTagFwd =
-      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                  WorkTag, void>::type;
-
-  using Analysis =
-      Kokkos::Impl::FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy,
-                                    ReducerTypeFwd>;
-
  public:
-  using pointer_type   = typename Analysis::pointer_type;
-  using value_type     = typename Analysis::value_type;
-  using reference_type = typename Analysis::reference_type;
+  using pointer_type   = typename ReducerType::pointer_type;
+  using value_type     = typename ReducerType::value_type;
+  using reference_type = typename ReducerType::reference_type;
   using functor_type   = FunctorType;
+  using reducer_type   = ReducerType;
   using size_type      = Kokkos::HIP::size_type;
   using index_type     = typename Policy::index_type;
+  // Conditionally set word_size_type to int16_t or int8_t if value_type is
+  // smaller than int32_t (Kokkos::HIP::size_type)
+  // word_size_type is used to determine the word count, shared memory buffer
+  // size, and global memory buffer size before the scan is performed.
+  // Within the scan, the word count is recomputed based on word_size_type
+  // and when calculating indexes into the shared/global memory buffers for
+  // performing the scan, word_size_type is used again.
+  // For scalars > 4 bytes in size, indexing into shared/global memory relies
+  // on the block and grid dimensions to ensure that we index at the correct
+  // offset rather than at every 4 byte word; such that, when the join is
+  // performed, we have the correct data that was copied over in chunks of 4
+  // bytes.
+  using word_size_type = std::conditional_t<
+      sizeof(value_type) < sizeof(size_type),
+      std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;
 
   // Algorithmic constraints: blockSize is a power of two AND blockDim.y ==
   // blockDim.z == 1
 
-  const FunctorType m_functor;
+  const CombinedFunctorReducerType m_functor_reducer;
   const Policy m_policy;
-  const ReducerType m_reducer;
   const pointer_type m_result_ptr;
   const bool m_result_ptr_device_accessible;
   const bool m_result_ptr_host_accessible;
-  size_type* m_scratch_space = nullptr;
-  size_type* m_scratch_flags = nullptr;
+  word_size_type* m_scratch_space = nullptr;
+  size_type* m_scratch_flags      = nullptr;
 
-  static bool constexpr UseShflReduction =
-      static_cast<bool>(Analysis::StaticValueSize);
+  static constexpr bool UseShflReduction = false;
 
  private:
   struct ShflReductionTag {};
@@ -156,13 +161,13 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   template <class TagType>
   __device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_range(
       const Member& i, reference_type update) const {
-    m_functor(i, update);
+    m_functor_reducer.get_functor()(i, update);
   }
 
   template <class TagType>
   __device__ inline std::enable_if_t<!std::is_void<TagType>::value> exec_range(
       const Member& i, reference_type update) const {
-    m_functor(TagType(), i, update);
+    m_functor_reducer.get_functor()(TagType(), i, update);
   }
 
  public:
@@ -173,17 +178,15 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   }
 
   __device__ inline void run(SHMEMReductionTag) const {
-    const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
-                                                   sizeof(size_type)>
-        word_count(Analysis::value_size(
-                       ReducerConditional::select(m_functor, m_reducer)) /
-                   sizeof(size_type));
-
-    typename Analysis::Reducer final_reducer(
-        &ReducerConditional::select(m_functor, m_reducer));
+    const ReducerType& reducer = m_functor_reducer.get_reducer();
+    const integral_nonzero_constant<word_size_type,
+                                    ReducerType::static_value_size() /
+                                        sizeof(word_size_type)>
+        word_count(reducer.value_size() / sizeof(word_size_type));
+
     {
-      reference_type value = final_reducer.init(reinterpret_cast<pointer_type>(
-          ::Kokkos::kokkos_impl_hip_shared_memory<size_type>() +
+      reference_type value = reducer.init(reinterpret_cast<pointer_type>(
+          ::Kokkos::kokkos_impl_hip_shared_memory<word_size_type>() +
           threadIdx.y * word_count.value));
 
       // Number of blocks is bounded so that the reduction can be limited to two
@@ -205,22 +208,23 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
     bool do_final_reduction = m_policy.begin() == m_policy.end();
     if (!do_final_reduction)
       do_final_reduction = hip_single_inter_block_reduce_scan<false>(
-          final_reducer, blockIdx.x, gridDim.x,
-          ::Kokkos::kokkos_impl_hip_shared_memory<size_type>(), m_scratch_space,
-          m_scratch_flags);
+          reducer, blockIdx.x, gridDim.x,
+          ::Kokkos::kokkos_impl_hip_shared_memory<word_size_type>(),
+          m_scratch_space, m_scratch_flags);
     if (do_final_reduction) {
       // This is the final block with the final result at the final threads'
       // location
 
-      size_type* const shared =
-          ::Kokkos::kokkos_impl_hip_shared_memory<size_type>() +
+      word_size_type* const shared =
+          ::Kokkos::kokkos_impl_hip_shared_memory<word_size_type>() +
           (blockDim.y - 1) * word_count.value;
-      size_type* const global = m_result_ptr_device_accessible
-                                    ? reinterpret_cast<size_type*>(m_result_ptr)
-                                    : m_scratch_space;
+      word_size_type* const global =
+          m_result_ptr_device_accessible
+              ? reinterpret_cast<word_size_type*>(m_result_ptr)
+              : m_scratch_space;
 
       if (threadIdx.y == 0) {
-        final_reducer.final(reinterpret_cast<value_type*>(shared));
+        reducer.final(reinterpret_cast<value_type*>(shared));
       }
 
       if (::Kokkos::Impl::HIPTraits::WarpSize < word_count.value) {
@@ -234,11 +238,10 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   }
 
   __device__ inline void run(ShflReductionTag) const {
-    typename Analysis::Reducer final_reducer(
-        &ReducerConditional::select(m_functor, m_reducer));
+    const ReducerType& reducer = m_functor_reducer.get_reducer();
 
     value_type value;
-    final_reducer.init(&value);
+    reducer.init(&value);
     // Number of blocks is bounded so that the reduction can be limited to two
     // passes. Each thread block is given an approximately equal amount of work
     // to perform. Accumulate the values for this block. The accumulation
@@ -262,18 +265,18 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
         (max_active_thread == 0) ? blockDim.y : max_active_thread;
 
     value_type init;
-    final_reducer.init(&init);
+    reducer.init(&init);
     if (m_policy.begin() == m_policy.end()) {
-      final_reducer.final(&value);
+      reducer.final(&value);
       pointer_type const final_result =
           m_result_ptr_device_accessible ? m_result_ptr : result;
       *final_result = value;
     } else if (Impl::hip_inter_block_shuffle_reduction<>(
-                   value, init, final_reducer, m_scratch_space, result,
+                   value, init, reducer, m_scratch_space, result,
                    m_scratch_flags, max_active_thread)) {
       unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x;
       if (id == 0) {
-        final_reducer.final(&value);
+        reducer.final(&value);
         pointer_type const final_result =
             m_result_ptr_device_accessible ? m_result_ptr : result;
         *final_result = value;
@@ -285,45 +288,73 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   inline unsigned local_block_size(const FunctorType& f) {
     const auto& instance = m_policy.space().impl_internal_space_instance();
     auto shmem_functor   = [&f](unsigned n) {
-      return hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
-                                                      WorkTag>(f, n);
+      return hip_single_inter_block_reduce_scan_shmem<false, WorkTag,
+                                                      value_type>(f, n);
     };
-    using DriverType =
-        ParallelReduce<FunctorType, Policy, ReducerType, Kokkos::HIP>;
-    return Kokkos::Impl::hip_get_preferred_blocksize<DriverType, LaunchBounds>(
+    return Kokkos::Impl::hip_get_preferred_blocksize<ParallelReduce,
+                                                     LaunchBounds>(
         instance, shmem_functor);
   }
 
   inline void execute() {
-    typename Analysis::Reducer final_reducer(
-        &ReducerConditional::select(m_functor, m_reducer));
+    const ReducerType& reducer = m_functor_reducer.get_reducer();
 
     const index_type nwork     = m_policy.end() - m_policy.begin();
-    const bool need_device_set = Analysis::has_init_member_function ||
-                                 Analysis::has_final_member_function ||
+    const bool need_device_set = ReducerType::has_init_member_function() ||
+                                 ReducerType::has_final_member_function() ||
                                  !m_result_ptr_host_accessible ||
                                  !std::is_same<ReducerType, InvalidType>::value;
     if ((nwork > 0) || need_device_set) {
-      const int block_size = local_block_size(m_functor);
+      const int block_size = local_block_size(m_functor_reducer.get_functor());
       if (block_size == 0) {
         Kokkos::Impl::throw_runtime_exception(
             std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a "
                         "valid execution configuration."));
       }
 
-      m_scratch_space = ::Kokkos::Impl::hip_internal_scratch_space(
-          m_policy.space(), Analysis::value_size(ReducerConditional::select(
-                                m_functor, m_reducer)) *
-                                block_size /* block_size == max block_count */);
-      m_scratch_flags = ::Kokkos::Impl::hip_internal_scratch_flags(
-          m_policy.space(), sizeof(size_type));
-
       // REQUIRED ( 1 , N , 1 )
       dim3 block(1, block_size, 1);
+      // use a slightly less constrained, but still well bounded limit for
+      // scratch
+      int nblocks = (nwork + block.y - 1) / block.y;
+      // Heuristic deciding the value of nblocks.
+      // The general idea here is we want to:
+      //    1. Not undersubscribe the device (i.e., we want at least
+      //    preferred_block_min blocks)
+      //    2. Have each thread reduce > 1 value to minimize overheads
+      //    3. Limit the total # of blocks, to avoid unbounded scratch space
+      constexpr int block_max           = 4096;
+      constexpr int preferred_block_min = 1024;
+
+      if (nblocks < preferred_block_min) {
+        // keep blocks as is, already have low parallelism
+      } else if (nblocks > block_max) {
+        // "large dispatch" -> already have lots of parallelism
+        nblocks = block_max;
+      } else {
+        // in the intermediate range, try to have each thread process multiple
+        // items to offset the cost of the reduction (with not enough
+        // parallelism to hide it)
+        int items_per_thread =
+            (nwork + nblocks * block_size - 1) / (nblocks * block_size);
+        if (items_per_thread < 4) {
+          int ratio = std::min(
+              (nblocks + preferred_block_min - 1) / preferred_block_min,
+              (4 + items_per_thread - 1) / items_per_thread);
+          nblocks /= ratio;
+        }
+      }
+
+      // TODO: down casting these uses more space than required?
+      m_scratch_space =
+          (word_size_type*)::Kokkos::Impl::hip_internal_scratch_space(
+              m_policy.space(), reducer.value_size() * nblocks);
+      // Intentionally do not downcast to word_size_type since we use HIP
+      // atomics in Kokkos_HIP_ReduceScan.hpp
+      m_scratch_flags = ::Kokkos::Impl::hip_internal_scratch_flags(
+          m_policy.space(), sizeof(size_type));
       // Required grid.x <= block.y
-      dim3 grid(std::min(block.y, static_cast<uint32_t>((nwork + block.y - 1) /
-                                                        block.y)),
-                1, 1);
+      dim3 grid(nblocks, 1, 1);
 
       if (nwork == 0) {
         block = dim3(1, 1, 1);
@@ -332,38 +363,32 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
       const int shmem =
           UseShflReduction
               ? 0
-              : hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
-                                                         WorkTag>(m_functor,
-                                                                  block.y);
+              : hip_single_inter_block_reduce_scan_shmem<false, WorkTag,
+                                                         value_type>(
+                    m_functor_reducer.get_functor(), block.y);
 
-      using DriverType =
-          ParallelReduce<FunctorType, Policy, ReducerType, Kokkos::HIP>;
-      Kokkos::Impl::hip_parallel_launch<DriverType, LaunchBounds>(
+      Kokkos::Impl::hip_parallel_launch<ParallelReduce, LaunchBounds>(
           *this, grid, block, shmem,
           m_policy.space().impl_internal_space_instance(),
           false);  // copy to device and execute
 
       if (!m_result_ptr_device_accessible && m_result_ptr) {
-        const int size = Analysis::value_size(
-            ReducerConditional::select(m_functor, m_reducer));
+        const int size = reducer.value_size();
         DeepCopy<HostSpace, HIPSpace, HIP>(m_policy.space(), m_result_ptr,
                                            m_scratch_space, size);
       }
     } else {
       if (m_result_ptr) {
-        final_reducer.init(m_result_ptr);
+        reducer.init(m_result_ptr);
       }
     }
   }
 
   template <class ViewType>
-  ParallelReduce(
-      const FunctorType& arg_functor, const Policy& arg_policy,
-      const ViewType& arg_result,
-      std::enable_if_t<Kokkos::is_view<ViewType>::value, void*> = nullptr)
-      : m_functor(arg_functor),
+  ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer,
+                 const Policy& arg_policy, const ViewType& arg_result)
+      : m_functor_reducer(arg_functor_reducer),
         m_policy(arg_policy),
-        m_reducer(InvalidType()),
         m_result_ptr(arg_result.data()),
         m_result_ptr_device_accessible(
             MemorySpaceAccess<HIPSpace,
@@ -371,23 +396,9 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
         m_result_ptr_host_accessible(
             MemorySpaceAccess<Kokkos::HostSpace,
                               typename ViewType::memory_space>::accessible) {}
-
-  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
-                 const ReducerType& reducer)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()),
-        m_result_ptr_device_accessible(
-            MemorySpaceAccess<HIPSpace, typename ReducerType::result_view_type::
-                                            memory_space>::accessible),
-        m_result_ptr_host_accessible(
-            MemorySpaceAccess<Kokkos::HostSpace,
-                              typename ReducerType::result_view_type::
-                                  memory_space>::accessible) {}
 };
 
-template <class FunctorType, class... Traits>
+template <class FunctorType, class ValueType, class... Traits>
 class ParallelScanHIPBase {
  public:
   using Policy = Kokkos::RangePolicy<Traits...>;
@@ -398,8 +409,9 @@ class ParallelScanHIPBase {
   using WorkRange    = typename Policy::WorkRange;
   using LaunchBounds = typename Policy::launch_bounds;
 
-  using Analysis = Kokkos::Impl::FunctorAnalysis<FunctorPatternInterface::SCAN,
-                                                 Policy, FunctorType>;
+  using Analysis =
+      Kokkos::Impl::FunctorAnalysis<FunctorPatternInterface::SCAN, Policy,
+                                    FunctorType, ValueType>;
 
  public:
   using value_type     = typename Analysis::value_type;
@@ -431,7 +443,8 @@ class ParallelScanHIPBase {
   //  (c) gridDim.x  <= blockDim.y * blockDim.y
   //  (d) gridDim.y  == gridDim.z == 1
 
-  const FunctorType m_functor;
+  const CombinedFunctorReducer<FunctorType, typename Analysis::Reducer>
+      m_functor_reducer;
   const Policy m_policy;
   const pointer_type m_result_ptr;
   const bool m_result_ptr_device_accessible;
@@ -444,23 +457,24 @@ class ParallelScanHIPBase {
   template <class TagType>
   __device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_range(
       const Member& i, reference_type update, const bool final_result) const {
-    m_functor(i, update, final_result);
+    m_functor_reducer.get_functor()(i, update, final_result);
   }
 
   template <class TagType>
   __device__ inline std::enable_if_t<!std::is_void<TagType>::value> exec_range(
       const Member& i, reference_type update, const bool final_result) const {
-    m_functor(TagType(), i, update, final_result);
+    m_functor_reducer.get_functor()(TagType(), i, update, final_result);
   }
 
   //----------------------------------------
 
   __device__ inline void initial() const {
-    typename Analysis::Reducer final_reducer(&m_functor);
+    const typename Analysis::Reducer& final_reducer =
+        m_functor_reducer.get_reducer();
 
     const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
                                                         sizeof(word_size_type)>
-        word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
+        word_count(final_reducer.value_size() / sizeof(word_size_type));
 
     pointer_type const shared_value = reinterpret_cast<pointer_type>(
         kokkos_impl_hip_shared_memory<word_size_type>() +
@@ -494,11 +508,12 @@ class ParallelScanHIPBase {
   //----------------------------------------
 
   __device__ inline void final() const {
-    typename Analysis::Reducer final_reducer(&m_functor);
+    const typename Analysis::Reducer& final_reducer =
+        m_functor_reducer.get_reducer();
 
     const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
                                                         sizeof(word_size_type)>
-        word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
+        word_count(final_reducer.value_size() / sizeof(word_size_type));
 
     // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
     // value[2] , ... }
@@ -609,27 +624,28 @@ class ParallelScanHIPBase {
       // How many block are really needed for this much work:
       m_grid_x = (nwork + work_per_block - 1) / work_per_block;
 
+      const typename Analysis::Reducer& final_reducer =
+          m_functor_reducer.get_reducer();
       m_scratch_space =
           reinterpret_cast<word_size_type*>(Impl::hip_internal_scratch_space(
-              m_policy.space(), Analysis::value_size(m_functor) * m_grid_x));
+              m_policy.space(), final_reducer.value_size() * m_grid_x));
       m_scratch_flags = Impl::hip_internal_scratch_flags(m_policy.space(),
                                                          sizeof(size_type) * 1);
 
       dim3 grid(m_grid_x, 1, 1);
       dim3 block(1, block_size, 1);  // REQUIRED DIMENSIONS ( 1 , N , 1 )
-      const int shmem = Analysis::value_size(m_functor) * (block_size + 2);
+      const int shmem = final_reducer.value_size() * (block_size + 2);
 
       m_final = false;
       // these ones are OK to be just the base because the specializations
       // do not modify the kernel at all
-      using DriverType = ParallelScanHIPBase<FunctorType, Traits...>;
-      Impl::hip_parallel_launch<DriverType, LaunchBounds>(
+      Impl::hip_parallel_launch<ParallelScanHIPBase, LaunchBounds>(
           *this, grid, block, shmem,
           m_policy.space().impl_internal_space_instance(),
           false);  // copy to device and execute
 
       m_final = true;
-      Impl::hip_parallel_launch<DriverType, LaunchBounds>(
+      Impl::hip_parallel_launch<ParallelScanHIPBase, LaunchBounds>(
           *this, grid, block, shmem,
           m_policy.space().impl_internal_space_instance(),
           false);  // copy to device and execute
@@ -639,7 +655,7 @@ class ParallelScanHIPBase {
   ParallelScanHIPBase(const FunctorType& arg_functor, const Policy& arg_policy,
                       pointer_type arg_result_ptr,
                       bool arg_result_ptr_device_accessible)
-      : m_functor(arg_functor),
+      : m_functor_reducer(arg_functor, typename Analysis::Reducer{arg_functor}),
         m_policy(arg_policy),
         m_result_ptr(arg_result_ptr),
         m_result_ptr_device_accessible(arg_result_ptr_device_accessible) {}
@@ -647,13 +663,14 @@ class ParallelScanHIPBase {
 
 template <class FunctorType, class... Traits>
 class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, HIP>
-    : public ParallelScanHIPBase<FunctorType, Traits...> {
+    : public ParallelScanHIPBase<FunctorType, void, Traits...> {
  public:
-  using Base = ParallelScanHIPBase<FunctorType, Traits...>;
+  using Base = ParallelScanHIPBase<FunctorType, void, Traits...>;
   using Base::operator();
 
   inline void execute() {
-    const int block_size = static_cast<int>(local_block_size(Base::m_functor));
+    const int block_size = static_cast<int>(
+        local_block_size(Base::m_functor_reducer.get_functor()));
     if (block_size == 0) {
       Kokkos::Impl::throw_runtime_exception(
           std::string("Kokkos::Impl::ParallelScan< HIP > could not find a "
@@ -674,9 +691,8 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, HIP>
     const auto& instance =
         Base::m_policy.space().impl_internal_space_instance();
     auto shmem_functor = [&f](unsigned n) {
-      return hip_single_inter_block_reduce_scan_shmem<true, FunctorType,
-                                                      typename Base::WorkTag>(
-          f, n);
+      return hip_single_inter_block_reduce_scan_shmem<
+          true, typename Base::WorkTag, void>(f, n);
     };
     using DriverType = ParallelScan<FunctorType, typename Base::Policy, HIP>;
     return Impl::hip_get_preferred_blocksize<DriverType,
@@ -690,13 +706,14 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, HIP>
 template <class FunctorType, class ReturnType, class... Traits>
 class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
                             ReturnType, HIP>
-    : public ParallelScanHIPBase<FunctorType, Traits...> {
+    : public ParallelScanHIPBase<FunctorType, ReturnType, Traits...> {
  public:
-  using Base = ParallelScanHIPBase<FunctorType, Traits...>;
+  using Base = ParallelScanHIPBase<FunctorType, ReturnType, Traits...>;
   using Base::operator();
 
   inline void execute() {
-    const int block_size = static_cast<int>(local_block_size(Base::m_functor));
+    const int block_size = static_cast<int>(
+        local_block_size(Base::m_functor_reducer.get_functor()));
     if (block_size == 0) {
       Kokkos::Impl::throw_runtime_exception(
           std::string("Kokkos::Impl::ParallelScan< HIP > could not find a "
@@ -707,7 +724,8 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
 
     const auto nwork = Base::m_policy.end() - Base::m_policy.begin();
     if (nwork && !Base::m_result_ptr_device_accessible) {
-      const int size = Base::Analysis::value_size(Base::m_functor);
+      const int size =
+          Base::Analysis::value_size(Base::m_functor_reducer.get_functor());
       DeepCopy<HostSpace, HIPSpace, HIP>(
           Base::m_policy.space(), Base::m_result_ptr,
           Base::m_scratch_space + (Base::m_grid_x - 1) * size /
@@ -731,9 +749,8 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
     const auto& instance =
         Base::m_policy.space().impl_internal_space_instance();
     auto shmem_functor = [&f](unsigned n) {
-      return hip_single_inter_block_reduce_scan_shmem<true, FunctorType,
-                                                      typename Base::WorkTag>(
-          f, n);
+      return hip_single_inter_block_reduce_scan_shmem<
+          true, typename Base::WorkTag, ReturnType>(f, n);
     };
     using DriverType = ParallelScanWithTotal<FunctorType, typename Base::Policy,
                                              ReturnType, HIP>;
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp
index 442ca8aef290a309a5caa27961b311fdebfb2dfd..3fe568ac361f547bfd3eeb19526c1ad289b70380 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp
@@ -22,7 +22,6 @@
 #if defined(__HIPCC__)
 
 #include <HIP/Kokkos_HIP_KernelLaunch.hpp>
-#include <HIP/Kokkos_HIP_Locks.hpp>
 #include <HIP/Kokkos_HIP_Team.hpp>
 #include <HIP/Kokkos_HIP_Instance.hpp>
 #include <Kokkos_MinMaxClamp.hpp>
@@ -75,7 +74,7 @@ class TeamPolicyInternal<HIP, Properties...>
     using closure_type =
         Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>;
 
-    return internal_team_size_common<BlockType::Max, closure_type>(f);
+    return internal_team_size_common<BlockType::Max, closure_type, void>(f);
   }
 
   template <class FunctorType>
@@ -83,23 +82,24 @@ class TeamPolicyInternal<HIP, Properties...>
                            const ParallelReduceTag&) const {
     using functor_analysis_type =
         Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
-                              TeamPolicyInternal, FunctorType>;
-    using reducer_type = typename Impl::ParallelReduceReturnValue<
-        void, typename functor_analysis_type::value_type,
-        FunctorType>::reducer_type;
-    using closure_type =
-        Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
-                             reducer_type>;
-    return internal_team_size_common<BlockType::Max, closure_type>(f);
+                              TeamPolicyInternal, FunctorType, void>;
+    using closure_type = Impl::ParallelReduce<
+        CombinedFunctorReducer<FunctorType,
+                               typename functor_analysis_type::Reducer>,
+        TeamPolicy<Properties...>, Kokkos::HIP>;
+    return internal_team_size_common<
+        BlockType::Max, closure_type,
+        typename functor_analysis_type::value_type>(f);
   }
 
   template <typename FunctorType, typename ReducerType>
   inline int team_size_max(const FunctorType& f, const ReducerType&,
                            const ParallelReduceTag&) const {
     using closure_type =
-        Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
-                             ReducerType>;
-    return internal_team_size_common<BlockType::Max, closure_type>(f);
+        Impl::ParallelReduce<CombinedFunctorReducer<FunctorType, ReducerType>,
+                             TeamPolicy<Properties...>, Kokkos::HIP>;
+    return internal_team_size_common<BlockType::Max, closure_type,
+                                     typename ReducerType::value_type>(f);
   }
 
   template <typename FunctorType>
@@ -107,7 +107,8 @@ class TeamPolicyInternal<HIP, Properties...>
     using closure_type =
         Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>;
 
-    return internal_team_size_common<BlockType::Preferred, closure_type>(f);
+    return internal_team_size_common<BlockType::Preferred, closure_type, void>(
+        f);
   }
 
   template <typename FunctorType>
@@ -115,23 +116,24 @@ class TeamPolicyInternal<HIP, Properties...>
                                    ParallelReduceTag const&) const {
     using functor_analysis_type =
         Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
-                              TeamPolicyInternal, FunctorType>;
-    using reducer_type = typename Impl::ParallelReduceReturnValue<
-        void, typename functor_analysis_type::value_type,
-        FunctorType>::reducer_type;
-    using closure_type =
-        Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
-                             reducer_type>;
-    return internal_team_size_common<BlockType::Preferred, closure_type>(f);
+                              TeamPolicyInternal, FunctorType, void>;
+    using closure_type = Impl::ParallelReduce<
+        CombinedFunctorReducer<FunctorType,
+                               typename functor_analysis_type::Reducer>,
+        TeamPolicy<Properties...>, Kokkos::HIP>;
+    return internal_team_size_common<
+        BlockType::Preferred, closure_type,
+        typename functor_analysis_type::value_type>(f);
   }
 
   template <typename FunctorType, typename ReducerType>
   int team_size_recommended(FunctorType const& f, ReducerType const&,
                             ParallelReduceTag const&) const {
     using closure_type =
-        Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
-                             ReducerType>;
-    return internal_team_size_common<BlockType::Preferred, closure_type>(f);
+        Impl::ParallelReduce<CombinedFunctorReducer<FunctorType, ReducerType>,
+                             TeamPolicy<Properties...>, Kokkos::HIP>;
+    return internal_team_size_common<BlockType::Preferred, closure_type,
+                                     typename ReducerType::value_type>(f);
   }
 
   inline bool impl_auto_vector_length() const { return m_tune_vector_length; }
@@ -145,7 +147,7 @@ class TeamPolicyInternal<HIP, Properties...>
     // Allow only power-of-two vector_length
     if (!(is_integral_power_of_two(test_vector_length))) {
       int test_pow2           = 1;
-      int constexpr warp_size = HIPTraits::WarpSize;
+      constexpr int warp_size = HIPTraits::WarpSize;
       while (test_pow2 < warp_size) {
         test_pow2 <<= 1;
         if (test_pow2 > test_vector_length) {
@@ -330,7 +332,8 @@ class TeamPolicyInternal<HIP, Properties...>
   using member_type = Kokkos::Impl::HIPTeamMember;
 
  protected:
-  template <BlockType BlockSize, class ClosureType, class FunctorType>
+  template <BlockType BlockSize, class ClosureType, class ValueType,
+            class FunctorType>
   int internal_team_size_common(FunctorType const& f) const {
     const unsigned shmem_block = team_scratch_size(0) + 2 * sizeof(double);
     unsigned shmem_thread      = thread_scratch_size(0) + sizeof(double);
@@ -340,7 +343,7 @@ class TeamPolicyInternal<HIP, Properties...>
           typename Impl::DeduceFunctorPatternInterface<ClosureType>::type;
       using Analysis =
           Impl::FunctorAnalysis<Interface, typename ClosureType::Policy,
-                                FunctorType>;
+                                FunctorType, ValueType>;
       shmem_thread +=
           ((Analysis::StaticValueSize != 0) ? 0 : Analysis::value_size(f));
     }
@@ -384,14 +387,14 @@ class TeamPolicyInternal<HIP, Properties...>
 };
 
 __device__ inline int64_t hip_get_scratch_index(HIP::size_type league_size,
-                                                int32_t* scratch_locks) {
+                                                int32_t* scratch_locks,
+                                                size_t num_scratch_locks) {
   int64_t threadid = 0;
   __shared__ int64_t base_thread_id;
   if (threadIdx.x == 0 && threadIdx.y == 0) {
     int64_t const wraparound_len =
         Kokkos::min(int64_t(league_size),
-                    (int64_t(Kokkos::Impl::g_device_hip_lock_arrays.n)) /
-                        (blockDim.x * blockDim.y));
+                    int64_t(num_scratch_locks) / (blockDim.x * blockDim.y));
     threadid = (blockIdx.x * blockDim.z + threadIdx.z) % wraparound_len;
     threadid *= blockDim.x * blockDim.y;
     int done = 0;
@@ -422,7 +425,7 @@ __device__ inline void hip_release_scratch_index(int32_t* scratch_locks,
 template <typename FunctorType, typename... Properties>
 class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, HIP> {
  public:
-  using Policy       = TeamPolicyInternal<HIP, Properties...>;
+  using Policy       = TeamPolicy<Properties...>;
   using functor_type = FunctorType;
   using size_type    = HIP::size_type;
 
@@ -448,6 +451,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, HIP> {
   size_t m_scratch_size[2];
   int m_scratch_pool_id = -1;
   int32_t* m_scratch_locks;
+  size_t m_num_scratch_locks;
 
   template <typename TagType>
   __device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_team(
@@ -462,11 +466,16 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, HIP> {
   }
 
  public:
+  ParallelFor()                   = delete;
+  ParallelFor(ParallelFor const&) = default;
+  ParallelFor& operator=(ParallelFor const&) = delete;
+
   __device__ inline void operator()() const {
     // Iterate this block through the league
     int64_t threadid = 0;
     if (m_scratch_size[1] > 0) {
-      threadid = hip_get_scratch_index(m_league_size, m_scratch_locks);
+      threadid = hip_get_scratch_index(m_league_size, m_scratch_locks,
+                                       m_num_scratch_locks);
     }
 
     int const int_league_size = static_cast<int>(m_league_size);
@@ -514,9 +523,10 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, HIP> {
     m_shmem_size =
         (m_policy.scratch_size(0, m_team_size) +
          FunctorTeamShmemSize<FunctorType>::value(m_functor, m_team_size));
-    m_scratch_size[0] = m_policy.scratch_size(0, m_team_size);
-    m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
-    m_scratch_locks   = internal_space_instance->m_scratch_locks;
+    m_scratch_size[0]   = m_policy.scratch_size(0, m_team_size);
+    m_scratch_size[1]   = m_policy.scratch_size(1, m_team_size);
+    m_scratch_locks     = internal_space_instance->m_scratch_locks;
+    m_num_scratch_locks = internal_space_instance->m_num_scratch_locks;
 
     // Functor's reduce memory, team scan memory, and team shared memory depend
     // upon team size.
@@ -559,37 +569,32 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, HIP> {
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
-template <class FunctorType, class ReducerType, class... Properties>
-class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
-                     ReducerType, HIP> {
+template <class CombinedFunctorReducerType, class... Properties>
+class ParallelReduce<CombinedFunctorReducerType,
+                     Kokkos::TeamPolicy<Properties...>, HIP> {
  public:
-  using Policy = TeamPolicyInternal<HIP, Properties...>;
+  using Policy      = TeamPolicyInternal<HIP, Properties...>;
+  using FunctorType = typename CombinedFunctorReducerType::functor_type;
+  using ReducerType = typename CombinedFunctorReducerType::reducer_type;
 
  private:
   using member_type   = typename Policy::member_type;
   using work_tag      = typename Policy::work_tag;
   using launch_bounds = typename Policy::launch_bounds;
 
-  using reducer_conditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using reducer_type_fwd = typename reducer_conditional::type;
-  using work_tag_fwd =
-      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                  work_tag, void>::type;
-
-  using analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
-                                         Policy, reducer_type_fwd>;
-
-  using pointer_type   = typename analysis::pointer_type;
-  using reference_type = typename analysis::reference_type;
-  using value_type     = typename analysis::value_type;
+  using pointer_type   = typename ReducerType::pointer_type;
+  using reference_type = typename ReducerType::reference_type;
+  using value_type     = typename ReducerType::value_type;
 
  public:
   using functor_type = FunctorType;
   using size_type    = HIP::size_type;
 
-  static int constexpr UseShflReduction = (analysis::StaticValueSize != 0);
+  // static int constexpr UseShflReduction = false;
+  // FIXME_HIP This should be disabled unconditionally for best performance, but
+  // it currently causes tests to fail.
+  static constexpr int UseShflReduction =
+      (ReducerType::static_value_size() != 0);
 
  private:
   struct ShflReductionTag {};
@@ -603,9 +608,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   //  [ team   shared space ]
   //
 
-  const FunctorType m_functor;
+  const CombinedFunctorReducerType m_functor_reducer;
   const Policy m_policy;
-  const ReducerType m_reducer;
   const pointer_type m_result_ptr;
   const bool m_result_ptr_device_accessible;
   const bool m_result_ptr_host_accessible;
@@ -618,6 +622,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   size_t m_scratch_size[2];
   int m_scratch_pool_id = -1;
   int32_t* m_scratch_locks;
+  size_t m_num_scratch_locks;
   const size_type m_league_size;
   int m_team_size;
   const size_type m_vector_size;
@@ -625,13 +630,13 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   template <class TagType>
   __device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_team(
       member_type const& member, reference_type update) const {
-    m_functor(member, update);
+    m_functor_reducer.get_functor()(member, update);
   }
 
   template <class TagType>
   __device__ inline std::enable_if_t<!std::is_void<TagType>::value> exec_team(
       member_type const& member, reference_type update) const {
-    m_functor(TagType(), member, update);
+    m_functor_reducer.get_functor()(TagType(), member, update);
   }
 
   __device__ inline void iterate_through_league(int const threadid,
@@ -652,11 +657,41 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     }
   }
 
+  int compute_block_count() const {
+    constexpr auto light_weight =
+        Kokkos::Experimental::WorkItemProperty::HintLightWeight;
+    constexpr typename Policy::work_item_property property;
+    // Numbers were tuned on MI210 using dot product and yAx benchmarks
+    constexpr int block_max =
+        (property & light_weight) == light_weight ? 2097152 : 65536;
+    constexpr int preferred_block_min = 1024;
+    int block_count                   = m_league_size;
+    if (block_count < preferred_block_min) {
+      // keep blocks as is, already low parallelism
+    } else if (block_count >= block_max) {
+      block_count = block_max;
+
+    } else {
+      int nwork = m_league_size * m_team_size;
+      int items_per_thread =
+          (nwork + block_count * m_team_size - 1) / (block_count * m_team_size);
+      if (items_per_thread < 4) {
+        int ratio = std::min(
+            (block_count + preferred_block_min - 1) / preferred_block_min,
+            (4 + items_per_thread - 1) / items_per_thread);
+        block_count /= ratio;
+      }
+    }
+
+    return block_count;
+  }
+
  public:
   __device__ inline void operator()() const {
     int64_t threadid = 0;
     if (m_scratch_size[1] > 0) {
-      threadid = hip_get_scratch_index(m_league_size, m_scratch_locks);
+      threadid = hip_get_scratch_index(m_league_size, m_scratch_locks,
+                                       m_num_scratch_locks);
     }
 
     using ReductionTag = std::conditional_t<UseShflReduction, ShflReductionTag,
@@ -669,19 +704,15 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   }
 
   __device__ inline void run(SHMEMReductionTag, int const threadid) const {
-    typename analysis::Reducer final_reducer(
-        &reducer_conditional::select(m_functor, m_reducer));
+    const ReducerType& reducer = m_functor_reducer.get_reducer();
 
-    integral_nonzero_constant<size_type, analysis::StaticValueSize /
+    integral_nonzero_constant<size_type, ReducerType::static_value_size() /
                                              sizeof(size_type)> const
-        word_count(analysis::value_size(
-                       reducer_conditional::select(m_functor, m_reducer)) /
-                   sizeof(size_type));
+        word_count(reducer.value_size() / sizeof(size_type));
 
     reference_type value =
-        final_reducer.init(kokkos_impl_hip_shared_memory<size_type>() +
-                           threadIdx.y * word_count.value);
-
+        reducer.init(kokkos_impl_hip_shared_memory<size_type>() +
+                     threadIdx.y * word_count.value);
     // Iterate this block through the league
     iterate_through_league(threadid, value);
 
@@ -690,9 +721,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     if (!do_final_reduce)
       do_final_reduce =
           hip_single_inter_block_reduce_scan<false, FunctorType, work_tag>(
-              reducer_conditional::select(m_functor, m_reducer), blockIdx.x,
-              gridDim.x, kokkos_impl_hip_shared_memory<size_type>(),
-              m_scratch_space, m_scratch_flags);
+              reducer, blockIdx.x, gridDim.x,
+              kokkos_impl_hip_shared_memory<size_type>(), m_scratch_space,
+              m_scratch_flags);
     if (do_final_reduce) {
       // This is the final block with the final result at the final threads'
       // location
@@ -704,7 +735,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
                                     : m_scratch_space;
 
       if (threadIdx.y == 0) {
-        final_reducer.final(reinterpret_cast<value_type*>(shared));
+        reducer.final(reinterpret_cast<value_type*>(shared));
       }
 
       if (HIPTraits::WarpSize < word_count.value) {
@@ -718,11 +749,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   }
 
   __device__ inline void run(ShflReductionTag, int const threadid) const {
-    typename analysis::Reducer final_reducer(
-        &reducer_conditional::select(m_functor, m_reducer));
+    const ReducerType& reducer = m_functor_reducer.get_reducer();
 
     value_type value;
-    final_reducer.init(&value);
+    reducer.init(&value);
 
     // Iterate this block through the league
     iterate_through_league(threadid, value);
@@ -733,40 +763,35 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
             : reinterpret_cast<pointer_type>(m_scratch_space);
 
     value_type init;
-    final_reducer.init(&init);
+    reducer.init(&init);
     if (m_league_size == 0) {
-      final_reducer.final(&value);
+      reducer.final(&value);
       *result = value;
     } else if (Impl::hip_inter_block_shuffle_reduction(
-                   value, init, final_reducer, m_scratch_space, result,
+                   value, init, reducer, m_scratch_space, result,
                    m_scratch_flags, blockDim.y)) {
       unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x;
       if (id == 0) {
-        final_reducer.final(&value);
+        reducer.final(&value);
         *result = value;
       }
     }
   }
 
   inline void execute() {
-    typename analysis::Reducer final_reducer(
-        &reducer_conditional::select(m_functor, m_reducer));
+    const ReducerType& reducer = m_functor_reducer.get_reducer();
 
     const bool is_empty_range  = m_league_size == 0 || m_team_size == 0;
-    const bool need_device_set = analysis::has_init_member_function ||
-                                 analysis::has_final_member_function ||
+    const bool need_device_set = ReducerType::has_init_member_function() ||
+                                 ReducerType::has_final_member_function() ||
                                  !m_result_ptr_host_accessible ||
+                                 Policy::is_graph_kernel::value ||
                                  !std::is_same<ReducerType, InvalidType>::value;
     if (!is_empty_range || need_device_set) {
-      const int block_count =
-          UseShflReduction
-              ? std::min(m_league_size, size_type(1024 * HIPTraits::WarpSize))
-              : std::min(static_cast<int>(m_league_size), m_team_size);
+      int const block_count = compute_block_count();
 
       m_scratch_space = hip_internal_scratch_space(
-          m_policy.space(), analysis::value_size(reducer_conditional::select(
-                                m_functor, m_reducer)) *
-                                block_count);
+          m_policy.space(), reducer.value_size() * block_count);
       m_scratch_flags =
           hip_internal_scratch_flags(m_policy.space(), sizeof(size_type));
 
@@ -778,10 +803,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
       }
       const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size;
 
-      using closure_type =
-          ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
-                         ReducerType, HIP>;
-      Impl::hip_parallel_launch<closure_type, launch_bounds>(
+      Impl::hip_parallel_launch<ParallelReduce, launch_bounds>(
           *this, grid, block, shmem_size_total,
           m_policy.space().impl_internal_space_instance(),
           true);  // copy to device and execute
@@ -790,26 +812,22 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
         m_policy.space().impl_internal_space_instance()->fence();
 
         if (m_result_ptr) {
-          const int size = analysis::value_size(
-              reducer_conditional::select(m_functor, m_reducer));
+          const int size = reducer.value_size();
           DeepCopy<HostSpace, HIPSpace>(m_result_ptr, m_scratch_space, size);
         }
       }
     } else {
       if (m_result_ptr) {
-        final_reducer.init(m_result_ptr);
+        reducer.init(m_result_ptr);
       }
     }
   }
 
   template <class ViewType>
-  ParallelReduce(
-      FunctorType const& arg_functor, Policy const& arg_policy,
-      ViewType const& arg_result,
-      std::enable_if_t<Kokkos::is_view<ViewType>::value, void*> = nullptr)
-      : m_functor(arg_functor),
+  ParallelReduce(CombinedFunctorReducerType const& arg_functor_reducer,
+                 Policy const& arg_policy, ViewType const& arg_result)
+      : m_functor_reducer(arg_functor_reducer),
         m_policy(arg_policy),
-        m_reducer(InvalidType()),
         m_result_ptr(arg_result.data()),
         m_result_ptr_device_accessible(
             MemorySpaceAccess<HIPSpace,
@@ -830,21 +848,24 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
         m_policy.space().impl_internal_space_instance();
     m_team_size = m_team_size >= 0 ? m_team_size
                                    : arg_policy.team_size_recommended(
-                                         arg_functor, ParallelReduceTag());
+                                         arg_functor_reducer.get_functor(),
+                                         arg_functor_reducer.get_reducer(),
+                                         ParallelReduceTag());
 
     m_team_begin =
         UseShflReduction
             ? 0
-            : hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
-                                                       work_tag>(arg_functor,
-                                                                 m_team_size);
+            : hip_single_inter_block_reduce_scan_shmem<false, work_tag,
+                                                       value_type>(
+                  arg_functor_reducer.get_functor(), m_team_size);
     m_shmem_begin = sizeof(double) * (m_team_size + 2);
-    m_shmem_size =
-        m_policy.scratch_size(0, m_team_size) +
-        FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
-    m_scratch_size[0] = m_shmem_size;
-    m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
-    m_scratch_locks   = internal_space_instance->m_scratch_locks;
+    m_shmem_size  = m_policy.scratch_size(0, m_team_size) +
+                   FunctorTeamShmemSize<FunctorType>::value(
+                       arg_functor_reducer.get_functor(), m_team_size);
+    m_scratch_size[0]   = m_shmem_size;
+    m_scratch_size[1]   = m_policy.scratch_size(1, m_team_size);
+    m_scratch_locks     = internal_space_instance->m_scratch_locks;
+    m_num_scratch_locks = internal_space_instance->m_num_scratch_locks;
     if (m_team_size <= 0) {
       m_scratch_ptr[1] = nullptr;
     } else {
@@ -889,96 +910,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
                       "L0 scratch memory"));
     }
 
-    size_t max_size =
-        arg_policy.team_size_max(arg_functor, ParallelReduceTag());
-    if (static_cast<int>(m_team_size) > static_cast<int>(max_size)) {
-      Kokkos::Impl::throw_runtime_exception(
-          std::string("Kokkos::Impl::ParallelReduce< HIP > requested too "
-                      "large team size."));
-    }
-  }
-
-  ParallelReduce(FunctorType const& arg_functor, Policy const& arg_policy,
-                 ReducerType const& reducer)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()),
-        m_result_ptr_device_accessible(
-            MemorySpaceAccess<HIPSpace, typename ReducerType::result_view_type::
-                                            memory_space>::accessible),
-        m_result_ptr_host_accessible(
-            MemorySpaceAccess<Kokkos::HostSpace,
-                              typename ReducerType::result_view_type::
-                                  memory_space>::accessible),
-        m_scratch_space(nullptr),
-        m_scratch_flags(nullptr),
-        m_team_begin(0),
-        m_shmem_begin(0),
-        m_shmem_size(0),
-        m_scratch_ptr{nullptr, nullptr},
-        m_league_size(arg_policy.league_size()),
-        m_team_size(arg_policy.team_size()),
-        m_vector_size(arg_policy.impl_vector_length()) {
-    auto internal_space_instance =
-        m_policy.space().impl_internal_space_instance();
-    m_team_size = m_team_size >= 0
-                      ? m_team_size
-                      : arg_policy.team_size_recommended(arg_functor, reducer,
-                                                         ParallelReduceTag());
-    m_team_begin =
-        UseShflReduction
-            ? 0
-            : hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
-                                                       work_tag>(arg_functor,
-                                                                 m_team_size);
-    m_shmem_begin = sizeof(double) * (m_team_size + 2);
-    m_shmem_size =
-        m_policy.scratch_size(0, m_team_size) +
-        FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
-    m_scratch_size[0] = m_shmem_size;
-    m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
-    m_scratch_locks   = internal_space_instance->m_scratch_locks;
-    if (m_team_size <= 0) {
-      m_scratch_ptr[1] = nullptr;
-    } else {
-      m_scratch_pool_id = internal_space_instance->acquire_team_scratch_space();
-      m_scratch_ptr[1]  = internal_space_instance->resize_team_scratch_space(
-          m_scratch_pool_id,
-          static_cast<std::int64_t>(m_scratch_size[1]) *
-              (std::min(
-                  static_cast<std::int64_t>(HIP().concurrency() /
-                                            (m_team_size * m_vector_size)),
-                  static_cast<std::int64_t>(m_league_size))));
-    }
-
-    // The global parallel_reduce does not support vector_length other than 1 at
-    // the moment
-    if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction)
-      Impl::throw_runtime_exception(
-          "Kokkos::parallel_reduce with a TeamPolicy using a vector length of "
-          "greater than 1 is not currently supported for HIP for dynamic "
-          "sized reduction types.");
-
-    if ((m_team_size < HIPTraits::WarpSize) && !UseShflReduction)
-      Impl::throw_runtime_exception(
-          "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller "
-          "than 64 is not currently supported with HIP for dynamic sized "
-          "reduction types.");
-
-    // Functor's reduce memory, team scan memory, and team shared memory depend
-    // upon team size.
-
-    const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size;
-    if ((!Kokkos::Impl::is_integral_power_of_two(m_team_size) &&
-         !UseShflReduction) ||
-        internal_space_instance->m_maxShmemPerBlock < shmem_size_total) {
-      Kokkos::Impl::throw_runtime_exception(
-          std::string("Kokkos::Impl::ParallelReduce< HIP > bad team size"));
-    }
-
-    size_t max_size =
-        arg_policy.team_size_max(arg_functor, reducer, ParallelReduceTag());
+    size_t max_size = arg_policy.team_size_max(
+        arg_functor_reducer.get_functor(), arg_functor_reducer.get_reducer(),
+        ParallelReduceTag());
     if (static_cast<int>(m_team_size) > static_cast<int>(max_size)) {
       Kokkos::Impl::throw_runtime_exception(
           std::string("Kokkos::Impl::ParallelReduce< HIP > requested too "
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_ReduceScan.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ReduceScan.hpp
index 9de26b63a7efd89437eadc87b5195e741fccfae3..b9226fc264f4de1bcab0c24ad3898c59e9b8ec5f 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_ReduceScan.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_ReduceScan.hpp
@@ -58,7 +58,7 @@ struct HIPReductionsFunctor<FunctorType, true> {
       FunctorType const& functor, Scalar value, bool const skip,
       Scalar* my_global_team_buffer_element, int const shared_elements,
       Scalar* shared_team_buffer_element) {
-    unsigned int constexpr warp_size = HIPTraits::WarpSize;
+    constexpr unsigned int warp_size = HIPTraits::WarpSize;
     int const warp_id                = (threadIdx.y * blockDim.x) / warp_size;
     Scalar* const my_shared_team_buffer_element =
         shared_team_buffer_element + warp_id % shared_elements;
@@ -90,6 +90,7 @@ struct HIPReductionsFunctor<FunctorType, true> {
       }
       scalar_intra_warp_reduction(functor, value, false, warp_size,
                                   *my_global_team_buffer_element);
+      __threadfence();
     }
   }
 
@@ -104,7 +105,7 @@ struct HIPReductionsFunctor<FunctorType, true> {
     Scalar* shared_team_buffer_elements =
         reinterpret_cast<Scalar*>(shared_data);
     Scalar value                     = shared_team_buffer_elements[threadIdx.y];
-    unsigned int constexpr warp_size = Impl::HIPTraits::WarpSize;
+    constexpr unsigned int warp_size = Impl::HIPTraits::WarpSize;
     int shared_elements              = blockDim.x * blockDim.y / warp_size;
     int global_elements              = block_count;
     __syncthreads();
@@ -116,16 +117,12 @@ struct HIPReductionsFunctor<FunctorType, true> {
 
     // Use the last block that is done to do the do the reduction across the
     // block
-    __shared__ unsigned int num_teams_done;
+    unsigned int num_teams_done = 0;
     if (threadIdx.x + threadIdx.y == 0) {
       num_teams_done = Kokkos::atomic_fetch_add(global_flags, 1) + 1;
     }
     bool is_last_block = false;
-    // FIXME_HIP HIP does not support syncthreads_or. That's why we need to make
-    // num_teams_done __shared__
-    // if (__syncthreads_or(num_teams_done == gridDim.x)) {*/
-    __syncthreads();
-    if (num_teams_done == gridDim.x) {
+    if (__syncthreads_or(num_teams_done == gridDim.x)) {
       is_last_block = true;
       *global_flags = 0;
       functor.init(&value);
@@ -157,7 +154,8 @@ struct HIPReductionsFunctor<FunctorType, false> {
     int const lane_id =
         (threadIdx.y * blockDim.x + threadIdx.x) % HIPTraits::WarpSize;
     for (int delta = skip_vector ? blockDim.x : 1; delta < width; delta *= 2) {
-      if (lane_id + delta < HIPTraits::WarpSize) {
+      if (lane_id + delta < HIPTraits::WarpSize &&
+          (lane_id % (delta * 2) == 0)) {
         functor.join(value, value + delta);
       }
     }
@@ -186,7 +184,10 @@ struct HIPReductionsFunctor<FunctorType, false> {
       scalar_intra_warp_reduction(
           functor, my_shared_team_buffer_element, false,
           blockDim.x * blockDim.y / HIPTraits::WarpSize);
-      if (threadIdx.x + threadIdx.y == 0) *result = *shared_team_buffer_element;
+      if (threadIdx.x + threadIdx.y == 0) {
+        *result = *shared_team_buffer_element;
+        if (skip) __threadfence();
+      }
     }
   }
 
@@ -214,16 +215,12 @@ struct HIPReductionsFunctor<FunctorType, false> {
 
     // Use the last block that is done to do the do the reduction across the
     // block
-    __shared__ unsigned int num_teams_done;
+    unsigned int num_teams_done = 0;
     if (threadIdx.x + threadIdx.y == 0) {
       num_teams_done = Kokkos::atomic_fetch_add(global_flags, 1) + 1;
     }
     bool is_last_block = false;
-    // FIXME_HIP HIP does not support syncthreads_or. That's why we need to make
-    // num_teams_done __shared__
-    // if (__syncthreads_or(num_teams_done == gridDim.x)) {*/
-    __syncthreads();
-    if (num_teams_done == gridDim.x) {
+    if (__syncthreads_or(num_teams_done == gridDim.x)) {
       is_last_block = true;
       *global_flags = 0;
       functor.init(&value);
@@ -390,25 +387,16 @@ __device__ bool hip_single_inter_block_reduce_scan_impl(
     for (size_t i = threadIdx.y; i < word_count.value; i += blockDim.y) {
       global[i] = shared[i];
     }
+    __threadfence();
   }
 
   // Contributing blocks note that their contribution has been completed via an
   // atomic-increment flag If this block is not the last block to contribute to
   // this group then the block is done.
-  // FIXME_HIP __syncthreads_or is not supported by HIP yet.
-  // const bool is_last_block = !__syncthreads_or(
-  //    threadIdx.y
-  //        ? 0
-  //        : (1 + atomicInc(global_flags, block_count - 1) < block_count));
-  __shared__ int n_done;
-  n_done = 0;
-  __syncthreads();
-  if (threadIdx.y == 0) {
-    n_done = 1 + atomicInc(global_flags, block_count - 1);
-  }
-  __syncthreads();
-  bool const is_last_block = (n_done == static_cast<int>(block_count));
-
+  const bool is_last_block = !__syncthreads_or(
+      threadIdx.y
+          ? 0
+          : (1 + atomicInc(global_flags, block_count - 1) < block_count));
   if (is_last_block) {
     size_type const b = (static_cast<long long int>(block_count) *
                          static_cast<long long int>(threadIdx.y)) >>
@@ -477,22 +465,24 @@ __device__ bool hip_single_inter_block_reduce_scan(
 }
 
 // Size in bytes required for inter block reduce or scan
-template <bool DoScan, class FunctorType, class ArgTag>
+template <bool DoScan, class ArgTag, class ValueType, class FunctorType>
 inline std::enable_if_t<DoScan, unsigned>
 hip_single_inter_block_reduce_scan_shmem(const FunctorType& functor,
                                          const unsigned BlockSize) {
-  using Analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
-                                         RangePolicy<HIP, ArgTag>, FunctorType>;
+  using Analysis =
+      Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
+                            RangePolicy<HIP, ArgTag>, FunctorType, ValueType>;
 
   return (BlockSize + 2) * Analysis::value_size(functor);
 }
 
-template <bool DoScan, class FunctorType, class ArgTag>
+template <bool DoScan, class ArgTag, class ValueType, class FunctorType>
 inline std::enable_if_t<!DoScan, unsigned>
 hip_single_inter_block_reduce_scan_shmem(const FunctorType& functor,
                                          const unsigned BlockSize) {
-  using Analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
-                                         RangePolicy<HIP, ArgTag>, FunctorType>;
+  using Analysis =
+      Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
+                            RangePolicy<HIP, ArgTag>, FunctorType, ValueType>;
 
   return (BlockSize + 2) * Analysis::value_size(functor);
 }
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp
index cca5e9ff26e4f6b05686d764affefdd0343dd932..4035bb012132798ed511eaccac823be0be218114 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp
@@ -40,7 +40,7 @@ __device__ inline void hip_intra_warp_shuffle_reduction(
   unsigned int shift = 1;
 
   // Reduce over values from threads with different threadIdx.y
-  unsigned int constexpr warp_size = HIPTraits::WarpSize;
+  constexpr unsigned int warp_size = HIPTraits::WarpSize;
   while (blockDim.x * shift < warp_size) {
     ValueType const tmp = shfl_down(result, blockDim.x * shift, warp_size);
     // Only join if upper thread is active (this allows non power of two for
@@ -59,8 +59,8 @@ template <typename ValueType, typename ReducerType>
 __device__ inline void hip_inter_warp_shuffle_reduction(
     ValueType& value, const ReducerType& reducer,
     const int max_active_thread = blockDim.y) {
-  unsigned int constexpr warp_size = HIPTraits::WarpSize;
-  int constexpr step_width         = 8;
+  constexpr unsigned int warp_size = HIPTraits::WarpSize;
+  constexpr int step_width         = 8;
   // Depending on the ValueType __shared__ memory must be aligned up to 8 byte
   // boundaries. The reason not to use ValueType directly is that for types with
   // constructors it could lead to race conditions.
@@ -118,13 +118,14 @@ __device__ inline bool hip_inter_block_shuffle_reduction(
     pointer_type global =
         reinterpret_cast<pointer_type>(m_scratch_space) + blockIdx.x;
     *global = value;
+    __threadfence();
   }
 
   // One warp of last block performs inter block reduction through loading the
   // block values from global scratch_memory
   bool last_block = false;
   __syncthreads();
-  int constexpr warp_size = HIPTraits::WarpSize;
+  constexpr int warp_size = HIPTraits::WarpSize;
   if (id < warp_size) {
     HIP::size_type count;
 
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp
index 07b9382ab7658d87ea1dcb226181164cf379c533..7f6aa0d8e82d7e392d97119b5ab1cac65db2916e 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp
@@ -45,14 +45,6 @@ namespace {
 
 static std::atomic<bool> is_first_hip_managed_allocation(true);
 
-bool hip_driver_check_page_migration(int deviceId) {
-  // check with driver if page migrating memory is available
-  // this driver query is copied from the hip documentation
-  int hasManagedMemory = 0;  // false by default
-  KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceGetAttribute(
-      &hasManagedMemory, hipDeviceAttributeManagedMemory, deviceId));
-  return static_cast<bool>(hasManagedMemory);
-}
 }  // namespace
 
 /*--------------------------------------------------------------------------*/
@@ -153,7 +145,7 @@ void* HIPManagedSpace::impl_allocate(
     if (is_first_hip_managed_allocation.exchange(false) &&
         Kokkos::show_warnings()) {
       do {  // hack to avoid spamming users with too many warnings
-        if (!hip_driver_check_page_migration(m_device)) {
+        if (!impl_hip_driver_check_page_migration()) {
           std::cerr << R"warning(
 Kokkos::HIP::allocation WARNING: The combination of device and system configuration
                                  does not support page migration between device and host.
@@ -205,6 +197,19 @@ Kokkos::HIP::runtime WARNING: Kokkos did not find an environment variable 'HSA_X
 
   return ptr;
 }
+bool HIPManagedSpace::impl_hip_driver_check_page_migration() const {
+  // check with driver if page migrating memory is available
+  // this driver query is copied from the hip documentation
+  int hasManagedMemory = 0;  // false by default
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceGetAttribute(
+      &hasManagedMemory, hipDeviceAttributeManagedMemory, m_device));
+  if (!static_cast<bool>(hasManagedMemory)) return false;
+  // next, check pageableMemoryAccess
+  int hasPageableMemory = 0;  // false by default
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceGetAttribute(
+      &hasPageableMemory, hipDeviceAttributePageableMemoryAccess, m_device));
+  return static_cast<bool>(hasPageableMemory);
+}
 
 void HIPSpace::deallocate(void* const arg_alloc_ptr,
                           const size_t arg_alloc_size) const {
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp
index 902b6f05527b1a10ae30da1a82ac1c2de41eb421..f3e5adf87e5cbf1f0e1ac83709de0f0478ab5fb2 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp
@@ -204,6 +204,9 @@ class HIPManagedSpace {
                   const size_t arg_alloc_size,
                   const size_t arg_logical_size = 0) const;
 
+  //  internal only method to determine whether page migration is supported
+  bool impl_hip_driver_check_page_migration() const;
+
  private:
   int m_device;  ///< Which HIP device
   template <class, class, class, class>
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp
index 5b9faba786f77e7fda2c92aa122d8bb3cc19884d..fb466d8a721f9a343fb9083a289a80e71f5ab2e8 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp
@@ -181,8 +181,8 @@ class HIPTeamMember {
               typename ReducerType::value_type& value) const noexcept {
 #ifdef __HIP_DEVICE_COMPILE__
     typename Kokkos::Impl::FunctorAnalysis<
-        FunctorPatternInterface::REDUCE, TeamPolicy<HIP>, ReducerType>::Reducer
-        wrapped_reducer(&reducer);
+        FunctorPatternInterface::REDUCE, TeamPolicy<HIP>, ReducerType,
+        typename ReducerType::value_type>::Reducer wrapped_reducer(reducer);
     hip_intra_block_shuffle_reduction(value, wrapped_reducer, blockDim.y);
     reducer.reference() = value;
 #else
@@ -219,7 +219,7 @@ class HIPTeamMember {
     Impl::HIPJoinFunctor<Type> hip_join_functor;
     typename Kokkos::Impl::FunctorAnalysis<
         FunctorPatternInterface::REDUCE, TeamPolicy<HIP>,
-        Impl::HIPJoinFunctor<Type>>::Reducer reducer(&hip_join_functor);
+        Impl::HIPJoinFunctor<Type>, Type>::Reducer reducer(hip_join_functor);
     Impl::hip_intra_block_reduce_scan<true>(reducer, base_data + 1);
 
     if (global_accum) {
@@ -367,18 +367,10 @@ struct ThreadVectorRangeBoundariesStruct<iType, HIPTeamMember> {
   ThreadVectorRangeBoundariesStruct(const HIPTeamMember, index_type count)
       : start(static_cast<index_type>(0)), end(count) {}
 
-  KOKKOS_INLINE_FUNCTION
-  ThreadVectorRangeBoundariesStruct(index_type count)
-      : start(static_cast<index_type>(0)), end(count) {}
-
   KOKKOS_INLINE_FUNCTION
   ThreadVectorRangeBoundariesStruct(const HIPTeamMember, index_type arg_begin,
                                     index_type arg_end)
       : start(arg_begin), end(arg_end) {}
-
-  KOKKOS_INLINE_FUNCTION
-  ThreadVectorRangeBoundariesStruct(index_type arg_begin, index_type arg_end)
-      : start(arg_begin), end(arg_end) {}
 };
 
 }  // namespace Impl
@@ -545,15 +537,17 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
  *  final == true.
  */
 // This is the same code as in CUDA and largely the same as in OpenMPTarget
-template <typename iType, typename FunctorType>
+template <typename iType, typename FunctorType, typename ValueType>
 KOKKOS_INLINE_FUNCTION void parallel_scan(
     const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HIPTeamMember>&
         loop_bounds,
-    const FunctorType& lambda) {
-  // Extract value_type from lambda
-  using value_type = typename Kokkos::Impl::FunctorAnalysis<
-      Kokkos::Impl::FunctorPatternInterface::SCAN, void,
-      FunctorType>::value_type;
+    const FunctorType& lambda, ValueType& return_val) {
+  // Extract ValueType from the Functor
+  using functor_value_type = typename Kokkos::Impl::FunctorAnalysis<
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void, FunctorType,
+      ValueType>::value_type;
+  static_assert(std::is_same_v<functor_value_type, ValueType>,
+                "Non-matching value types of functor and return type");
 
   const auto start     = loop_bounds.start;
   const auto end       = loop_bounds.end;
@@ -561,12 +555,12 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
   const auto team_size = member.team_size();
   const auto team_rank = member.team_rank();
   const auto nchunk    = (end - start + team_size - 1) / team_size;
-  value_type accum     = 0;
+  ValueType accum      = {};
   // each team has to process one or more chunks of the prefix scan
   for (iType i = 0; i < nchunk; ++i) {
     auto ii = start + i * team_size + team_rank;
     // local accumulation for this chunk
-    value_type local_accum = 0;
+    ValueType local_accum = 0;
     // user updates value with prefix value
     if (ii < loop_bounds.end) lambda(ii, local_accum, false);
     // perform team scan
@@ -580,6 +574,29 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
     // broadcast last value to rest of the team
     member.team_broadcast(accum, team_size - 1);
   }
+  return_val = accum;
+}
+
+/** \brief  Inter-thread parallel exclusive prefix sum.
+ *
+ *  Executes closure(iType i, ValueType & val, bool final) for each i=[0..N)
+ *
+ *  The range [0..N) is mapped to each rank in the team (whose global rank is
+ *  less than N) and a scan operation is performed. The last call to closure has
+ *  final == true.
+ */
+template <typename iType, typename FunctorType>
+KOKKOS_INLINE_FUNCTION void parallel_scan(
+    const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HIPTeamMember>&
+        loop_bounds,
+    const FunctorType& lambda) {
+  // Extract value_type from lambda
+  using value_type = typename Kokkos::Impl::FunctorAnalysis<
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void, FunctorType,
+      void>::value_type;
+
+  value_type scan_val;
+  parallel_scan(loop_bounds, lambda, scan_val);
 }
 
 template <typename iType, class Closure>
@@ -780,7 +797,8 @@ parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<
     // exclusive scan -- the final accumulation
     // of i's val will be included in the second
     // closure call later.
-    if (i < loop_boundaries.end && threadIdx.x > 0) closure(i - 1, val, false);
+    if (i - 1 < loop_boundaries.end && threadIdx.x > 0)
+      closure(i - 1, val, false);
 
     // Bottom up exclusive scan in triangular pattern
     // where each HIP thread is the root of a reduction tree
@@ -809,6 +827,7 @@ parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<
     if (i < loop_boundaries.end) closure(i, val, true);
     Impl::in_place_shfl(accum, val, blockDim.x - 1, blockDim.x);
   }
+  reducer.reference() = accum;
 #else
   (void)loop_boundaries;
   (void)closure;
@@ -832,11 +851,38 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
         loop_boundaries,
     const Closure& closure) {
   using value_type = typename Kokkos::Impl::FunctorAnalysis<
-      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure,
+      void>::value_type;
   value_type dummy;
   parallel_scan(loop_boundaries, closure, Kokkos::Sum<value_type>(dummy));
 }
 
+/** \brief  Intra-thread vector parallel exclusive prefix sum.
+ *
+ *  Executes closure(iType i, ValueType & val, bool final) for each i=[0..N)
+ *
+ *  The range [0..N) is mapped to all vector lanes in the
+ *  thread and a scan operation is performed.
+ *  The last call to closure has final == true.
+ */
+template <typename iType, class Closure, typename ValueType>
+KOKKOS_INLINE_FUNCTION void parallel_scan(
+    const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HIPTeamMember>&
+        loop_boundaries,
+    const Closure& closure, ValueType& return_val) {
+  // Extract ValueType from the Closure
+  using closure_value_type = typename Kokkos::Impl::FunctorAnalysis<
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure,
+      ValueType>::value_type;
+  static_assert(std::is_same_v<closure_value_type, ValueType>,
+                "Non-matching value types of closure and return type");
+
+  ValueType accum;
+  parallel_scan(loop_boundaries, closure, Kokkos::Sum<ValueType>(accum));
+
+  return_val = accum;
+}
+
 }  // namespace Kokkos
 
 namespace Kokkos {
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_UniqueToken.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_UniqueToken.hpp
index 13fc6216d6b06482026810fa2fd43507617bf602..313e5f5217296f152171e4bffe418b60c4a413c3 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_UniqueToken.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_UniqueToken.hpp
@@ -97,13 +97,9 @@ class UniqueToken<HIP, UniqueTokenScope::Global> {
       done_active = __ballot(done ? 1 : 0);
     }
 
-// Make sure that all writes in the previous lock owner are visible to me
-#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+    // Make sure that all writes in the previous lock owner are visible to me
     desul::atomic_thread_fence(desul::MemoryOrderAcquire(),
                                desul::MemoryScopeDevice());
-#else
-    Kokkos::memory_fence();
-#endif
     return idx;
   }
 
@@ -118,13 +114,9 @@ class UniqueToken<HIP, UniqueTokenScope::Global> {
   /// \brief release an acquired value
   KOKKOS_INLINE_FUNCTION
   void release(size_type idx) const noexcept {
-// Make sure my writes are visible to the next lock owner
-#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+    // Make sure my writes are visible to the next lock owner
     desul::atomic_thread_fence(desul::MemoryOrderRelease(),
                                desul::MemoryScopeDevice());
-#else
-    Kokkos::memory_fence();
-#endif
     (void)Kokkos::atomic_exchange(m_locks.data() + idx, 0);
   }
 };
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp
index e14b722f37378668b92fd04939770def45f0a4d8..30774c898b679e463fe5aff997186f3ffc6f9bc8 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp
@@ -85,7 +85,7 @@ struct in_place_shfl_op {
   operator()(Scalar& out, const Scalar& val, int lane_or_delta, int width) const
       noexcept {
     using shuffle_as_t = int;
-    int constexpr N    = sizeof(Scalar) / sizeof(shuffle_as_t);
+    constexpr int N    = sizeof(Scalar) / sizeof(shuffle_as_t);
 
     for (int i = 0; i < N; ++i) {
       reinterpret_cast<shuffle_as_t*>(&out)[i] = self().do_shfl_op(
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..5c40d0fbc8d079d7d64db4f0799624d8ebdc4b4d
--- /dev/null
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp
@@ -0,0 +1,46 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+#ifndef KOKKOS_HIP_ZEROMEMSET_HPP
+#define KOKKOS_HIP_ZEROMEMSET_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <HIP/Kokkos_HIP.hpp>
+#include <impl/Kokkos_ZeroMemset_fwd.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+template <class T, class... P>
+struct ZeroMemset<HIP, View<T, P...>> {
+  ZeroMemset(const HIP& exec_space, const View<T, P...>& dst,
+             typename View<T, P...>::const_value_type&) {
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipMemsetAsync(
+        dst.data(), 0, dst.size() * sizeof(typename View<T, P...>::value_type),
+        exec_space.hip_stream()));
+  }
+
+  ZeroMemset(const View<T, P...>& dst,
+             typename View<T, P...>::const_value_type&) {
+    KOKKOS_IMPL_HIP_SAFE_CALL(
+        hipMemset(dst.data(), 0,
+                  dst.size() * sizeof(typename View<T, P...>::value_type)));
+  }
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif  // !defined(KOKKOS_HIP_ZEROMEMSET_HPP)
diff --git a/packages/kokkos/core/src/HPX/Kokkos_HPX.cpp b/packages/kokkos/core/src/HPX/Kokkos_HPX.cpp
index 2074123a15fd4acbea86f4b8f753aa2628991d7d..4a40ffcaa4f680c86e7bfd6d832cf7ea5dae9b2a 100644
--- a/packages/kokkos/core/src/HPX/Kokkos_HPX.cpp
+++ b/packages/kokkos/core/src/HPX/Kokkos_HPX.cpp
@@ -21,38 +21,177 @@
 #include <Kokkos_Core.hpp>
 
 #ifdef KOKKOS_ENABLE_HPX
-#include <Kokkos_HPX.hpp>
+#include <HPX/Kokkos_HPX.hpp>
 
 #include <impl/Kokkos_ExecSpaceManager.hpp>
 
-#include <hpx/local/condition_variable.hpp>
-#include <hpx/local/init.hpp>
-#include <hpx/local/thread.hpp>
-#include <hpx/local/mutex.hpp>
+#include <hpx/condition_variable.hpp>
+#include <hpx/init.hpp>
+#include <hpx/mutex.hpp>
+#include <hpx/runtime.hpp>
+#include <hpx/thread.hpp>
+#include <hpx/version.hpp>
 
 #include <atomic>
 #include <chrono>
-#include <iostream>
 #include <memory>
+#include <ostream>
 #include <string>
 #include <type_traits>
 
 namespace Kokkos {
+namespace Impl {
+void hpx_thread_buffer::resize(const std::size_t num_threads,
+                               const std::size_t size_per_thread,
+                               const std::size_t extra_space) noexcept {
+  m_num_threads     = num_threads;
+  m_size_per_thread = size_per_thread;
+  m_extra_space     = extra_space;
+
+  pad_to_cache_line(m_size_per_thread);
+
+  std::size_t size_total_new =
+      m_num_threads * m_size_per_thread + m_extra_space;
+
+  if (m_size_total < size_total_new) {
+    // Don't use make_unique here as it value-initializes the elements of the
+    // array, which we have no use for, and can be very slow for large arrays.
+    m_data       = std::unique_ptr<char[]>(new char[size_total_new]);
+    m_size_total = size_total_new;
+  }
+}
+
+void *hpx_thread_buffer::get(std::size_t thread_num) const noexcept {
+  KOKKOS_EXPECTS(thread_num < m_num_threads);
+  if (!m_data) {
+    return nullptr;
+  }
+  return &m_data[thread_num * m_size_per_thread];
+}
+
+void *hpx_thread_buffer::get_extra_space() const noexcept {
+  KOKKOS_EXPECTS(m_extra_space > 0);
+  if (!m_data) {
+    return nullptr;
+  }
+  return &m_data[m_num_threads * m_size_per_thread];
+}
+}  // namespace Impl
+
 namespace Experimental {
 
 bool HPX::m_hpx_initialized = false;
-#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
 std::atomic<uint32_t> HPX::m_next_instance_id{HPX::impl_default_instance_id() +
                                               1};
 uint32_t HPX::m_active_parallel_region_count{0};
 hpx::spinlock HPX::m_active_parallel_region_count_mutex;
 hpx::condition_variable_any HPX::m_active_parallel_region_count_cond;
 HPX::instance_data HPX::m_default_instance_data;
+
+void HPX::print_configuration(std::ostream &os, const bool) const {
+  os << "Host Parallel Execution Space\n";
+  os << "  KOKKOS_ENABLE_HPX: yes\n";
+  os << "HPX Options:\n";
+#if defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH)
+  os << "  KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH: yes\n";
 #else
-Kokkos::Impl::thread_buffer HPX::m_default_buffer;
+  os << "  KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH: no\n";
 #endif
+  os << "\nHPX Runtime Configuration:\n";
+  os << "Worker threads: " << hpx::get_num_worker_threads() << '\n';
+  os << hpx::complete_version() << '\n';
+  os << hpx::configuration_string() << '\n';
+}
+
+bool &HPX::impl_get_in_parallel() noexcept {
+  static thread_local bool in_parallel = false;
+  return in_parallel;
+}
+
+HPX::impl_in_parallel_scope::impl_in_parallel_scope() noexcept {
+  KOKKOS_EXPECTS(!impl_get_in_parallel());
+  impl_get_in_parallel() = true;
+}
 
+HPX::impl_in_parallel_scope::~impl_in_parallel_scope() noexcept {
+  KOKKOS_EXPECTS(impl_get_in_parallel());
+  impl_get_in_parallel() = false;
+}
+
+HPX::impl_not_in_parallel_scope::impl_not_in_parallel_scope() noexcept {
+  KOKKOS_EXPECTS(impl_get_in_parallel());
+  impl_get_in_parallel() = false;
+}
+
+HPX::impl_not_in_parallel_scope::~impl_not_in_parallel_scope() noexcept {
+  KOKKOS_EXPECTS(!impl_get_in_parallel());
+  impl_get_in_parallel() = true;
+}
+
+void HPX::impl_decrement_active_parallel_region_count() {
+  std::unique_lock<hpx::spinlock> l(m_active_parallel_region_count_mutex);
+  if (--m_active_parallel_region_count == 0) {
+    l.unlock();
+    m_active_parallel_region_count_cond.notify_all();
+  };
+}
+
+void HPX::impl_increment_active_parallel_region_count() {
+  std::unique_lock<hpx::spinlock> l(m_active_parallel_region_count_mutex);
+  ++m_active_parallel_region_count;
+}
+
+void HPX::impl_instance_fence_locked(const std::string &name) const {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<
+      Kokkos::Experimental::HPX>(
+      name,
+      Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{
+          impl_instance_id()},
+      [&]() {
+        auto &s = impl_get_sender();
+
+        hpx::this_thread::experimental::sync_wait(std::move(s));
+        s = hpx::execution::experimental::unique_any_sender(
+            hpx::execution::experimental::just());
+      });
+}
+
+void HPX::impl_instance_fence(const std::string &name) const {
+  std::lock_guard<hpx::spinlock> l(impl_get_sender_mutex());
+  impl_instance_fence_locked(name);
+}
+
+void HPX::impl_static_fence(const std::string &name) {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<
+      Kokkos::Experimental::HPX>(
+      name,
+      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+          GlobalDeviceSynchronization,
+      [&]() {
+        auto &s = HPX().impl_get_sender();
+
+        std::unique_lock<hpx::spinlock> l(HPX().impl_get_sender_mutex());
+
+        // This is a loose fence. Any work scheduled before this will be waited
+        // for, but work scheduled while waiting may also be waited for.
+        {
+          std::unique_lock<hpx::spinlock> l_count(
+              m_active_parallel_region_count_mutex);
+          m_active_parallel_region_count_cond.wait(
+              l_count, [&]() { return m_active_parallel_region_count == 0; });
+        }
+
+        hpx::this_thread::experimental::sync_wait(std::move(s));
+        s = hpx::execution::experimental::unique_any_sender(
+            hpx::execution::experimental::just());
+      });
+}
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
 int HPX::concurrency() {
+#else
+int HPX::concurrency() const {
+#endif
   hpx::runtime *rt = hpx::get_runtime_ptr();
   if (rt == nullptr) {
     return hpx::threads::hardware_concurrency();
@@ -68,12 +207,7 @@ int HPX::concurrency() {
 void HPX::impl_initialize(InitializationSettings const &settings) {
   hpx::runtime *rt = hpx::get_runtime_ptr();
   if (rt == nullptr) {
-    hpx::local::init_params i;
-    i.cfg = {
-#ifdef KOKKOS_ENABLE_DEBUG
-        "--hpx:attach-debugger=exception",
-#endif
-    };
+    hpx::init_params i;
     if (settings.has_num_threads()) {
       i.cfg.emplace_back("hpx.os_threads=" +
                          std::to_string(settings.get_num_threads()));
@@ -81,7 +215,7 @@ void HPX::impl_initialize(InitializationSettings const &settings) {
     int argc_hpx     = 1;
     char name[]      = "kokkos_hpx";
     char *argv_hpx[] = {name, nullptr};
-    hpx::local::start(nullptr, argc_hpx, argv_hpx, i);
+    hpx::start(nullptr, argc_hpx, argv_hpx, i);
 
     m_hpx_initialized = true;
   }
@@ -96,8 +230,12 @@ void HPX::impl_finalize() {
   if (m_hpx_initialized) {
     hpx::runtime *rt = hpx::get_runtime_ptr();
     if (rt != nullptr) {
-      hpx::apply([]() { hpx::local::finalize(); });
-      hpx::local::stop();
+#if HPX_VERSION_FULL >= 0x010900
+      hpx::post([]() { hpx::finalize(); });
+#else
+      hpx::apply([]() { hpx::finalize(); });
+#endif
+      hpx::stop();
     } else {
       Kokkos::abort(
           "Kokkos::Experimental::HPX::impl_finalize: Kokkos started "
@@ -106,10 +244,78 @@ void HPX::impl_finalize() {
   }
 }
 
+int HPX::impl_thread_pool_size() noexcept {
+  hpx::runtime *rt = hpx::get_runtime_ptr();
+  if (rt == nullptr) {
+    return 0;
+  } else {
+    if (hpx::threads::get_self_ptr() == nullptr) {
+      return hpx::resource::get_thread_pool(0).get_os_thread_count();
+    } else {
+      return hpx::this_thread::get_pool()->get_os_thread_count();
+    }
+  }
+}
+
+int HPX::impl_thread_pool_rank() noexcept {
+  hpx::runtime *rt = hpx::get_runtime_ptr();
+  if (rt == nullptr) {
+    return 0;
+  } else {
+    if (hpx::threads::get_self_ptr() == nullptr) {
+      return 0;
+    } else {
+      return hpx::this_thread::get_pool()->get_pool_index();
+    }
+  }
+}
+
+int HPX::impl_thread_pool_size(int depth) {
+  if (depth == 0) {
+    return impl_thread_pool_size();
+  } else {
+    return 1;
+  }
+}
+
+template void HPX::impl_bulk_plain_erased<int>(
+    bool, bool, std::function<void(int)> &&, int const,
+    hpx::threads::thread_stacksize stacksize) const;
+
+template void HPX::impl_bulk_plain_erased<unsigned int>(
+    bool, bool, std::function<void(unsigned int)> &&, unsigned int const,
+    hpx::threads::thread_stacksize stacksize) const;
+
+template void HPX::impl_bulk_plain_erased<long>(
+    bool, bool, std::function<void(long)> &&, long const,
+    hpx::threads::thread_stacksize stacksize) const;
+
+template void HPX::impl_bulk_plain_erased<std::size_t>(
+    bool, bool, std::function<void(std::size_t)> &&, std::size_t const,
+    hpx::threads::thread_stacksize stacksize) const;
+
+template void HPX::impl_bulk_setup_finalize_erased<int>(
+    bool, bool, std::function<void(int)> &&, std::function<void()> &&,
+    std::function<void()> &&, int const,
+    hpx::threads::thread_stacksize stacksize) const;
+
+template void HPX::impl_bulk_setup_finalize_erased<unsigned int>(
+    bool, bool, std::function<void(unsigned int)> &&, std::function<void()> &&,
+    std::function<void()> &&, unsigned int const,
+    hpx::threads::thread_stacksize stacksize) const;
+
+template void HPX::impl_bulk_setup_finalize_erased<long>(
+    bool, bool, std::function<void(long)> &&, std::function<void()> &&,
+    std::function<void()> &&, long const,
+    hpx::threads::thread_stacksize stacksize) const;
+
+template void HPX::impl_bulk_setup_finalize_erased<std::size_t>(
+    bool, bool, std::function<void(std::size_t)> &&, std::function<void()> &&,
+    std::function<void()> &&, std::size_t const,
+    hpx::threads::thread_stacksize stacksize) const;
 }  // namespace Experimental
 
 namespace Impl {
-
 int g_hpx_space_factory_initialized =
     initialize_space_factory<Kokkos::Experimental::HPX>("060_HPX");
 
@@ -119,4 +325,4 @@ int g_hpx_space_factory_initialized =
 
 #else
 void KOKKOS_CORE_SRC_IMPL_HPX_PREVENT_LINK_ERROR() {}
-#endif  //#ifdef KOKKOS_ENABLE_HPX
+#endif  // #ifdef KOKKOS_ENABLE_HPX
diff --git a/packages/kokkos/core/src/HPX/Kokkos_HPX.hpp b/packages/kokkos/core/src/HPX/Kokkos_HPX.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1dfc5b406464192f90563996706b31522a569722
--- /dev/null
+++ b/packages/kokkos/core/src/HPX/Kokkos_HPX.hpp
@@ -0,0 +1,1980 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#endif
+#ifndef KOKKOS_HPX_HPP
+#define KOKKOS_HPX_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined(KOKKOS_ENABLE_HPX)
+
+#include <Kokkos_Core_fwd.hpp>
+
+#include <Kokkos_HostSpace.hpp>
+#include <cstddef>
+#include <iosfwd>
+
+#ifdef KOKKOS_ENABLE_HBWSPACE
+#include <Kokkos_HBWSpace.hpp>
+#endif
+
+#include <Kokkos_HostSpace.hpp>
+#include <Kokkos_Layout.hpp>
+#include <Kokkos_MemoryTraits.hpp>
+#include <Kokkos_Parallel.hpp>
+#include <Kokkos_ScratchSpace.hpp>
+#include <Kokkos_TaskScheduler.hpp>
+#include <impl/Kokkos_ConcurrentBitset.hpp>
+#include <impl/Kokkos_FunctorAnalysis.hpp>
+#include <impl/Kokkos_HostSharedPtr.hpp>
+#include <impl/Kokkos_Tools.hpp>
+#include <impl/Kokkos_TaskQueue.hpp>
+#include <impl/Kokkos_InitializationSettings.hpp>
+
+#include <KokkosExp_MDRangePolicy.hpp>
+
+#include <hpx/barrier.hpp>
+#include <hpx/condition_variable.hpp>
+#include <hpx/execution.hpp>
+#include <hpx/future.hpp>
+#include <hpx/mutex.hpp>
+#include <hpx/thread.hpp>
+
+#include <Kokkos_UniqueToken.hpp>
+
+#include <iosfwd>
+#include <functional>
+#include <memory>
+#include <type_traits>
+#include <vector>
+
+namespace Kokkos {
+namespace Impl {
+class hpx_thread_buffer {
+  static constexpr std::size_t m_cache_line_size = 64;
+
+  std::size_t m_num_threads      = 0;
+  std::size_t m_size_per_thread  = 0;
+  std::size_t m_extra_space      = 0;
+  std::size_t m_size_total       = 0;
+  std::unique_ptr<char[]> m_data = nullptr;
+
+  static constexpr void pad_to_cache_line(std::size_t &size) {
+    size = ((size + m_cache_line_size - 1) / m_cache_line_size) *
+           m_cache_line_size;
+  }
+
+ public:
+  hpx_thread_buffer()                          = default;
+  ~hpx_thread_buffer()                         = default;
+  hpx_thread_buffer(const hpx_thread_buffer &) = delete;
+  hpx_thread_buffer(hpx_thread_buffer &&)      = delete;
+  hpx_thread_buffer &operator=(const hpx_thread_buffer &) = delete;
+  hpx_thread_buffer &operator=(hpx_thread_buffer) = delete;
+
+  void resize(const std::size_t num_threads, const std::size_t size_per_thread,
+              const std::size_t extra_space = 0) noexcept;
+  void *get(std::size_t thread_num) const noexcept;
+  void *get_extra_space() const noexcept;
+};
+
+template <typename T>
+struct hpx_range {
+  T begin;
+  T end;
+};
+
+template <typename T>
+constexpr T get_num_chunks(const T offset, const T chunk_size, const T max) {
+  return (max - offset + chunk_size - 1) / chunk_size;
+}
+
+template <typename T>
+constexpr hpx_range<T> get_chunk_range(const T i_chunk, const T offset,
+                                       const T chunk_size, const T max) {
+  const T begin = offset + i_chunk * chunk_size;
+  const T end   = (std::min)(begin + chunk_size, max);
+  return {begin, end};
+}
+
+template <typename Policy>
+constexpr bool is_light_weight_policy() {
+  constexpr Kokkos::Experimental::WorkItemProperty::HintLightWeight_t
+      light_weight = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
+  return (typename Policy::work_item_property() & light_weight) == light_weight;
+}
+}  // namespace Impl
+
+namespace Experimental {
+class HPX {
+ public:
+  static constexpr uint32_t impl_default_instance_id() { return 1; }
+
+ private:
+  static bool m_hpx_initialized;
+  static std::atomic<uint32_t> m_next_instance_id;
+
+ public:
+  enum class instance_mode { default_, independent };
+
+ private:
+  static uint32_t m_active_parallel_region_count;
+  static hpx::spinlock m_active_parallel_region_count_mutex;
+  static hpx::condition_variable_any m_active_parallel_region_count_cond;
+
+  struct instance_data {
+    instance_data()  = default;
+    ~instance_data() = default;
+    instance_data(uint32_t instance_id) : m_instance_id(instance_id) {}
+    instance_data(uint32_t instance_id,
+                  hpx::execution::experimental::unique_any_sender<> &&sender)
+        : m_instance_id(instance_id), m_sender{std::move(sender)} {}
+
+    instance_data(const instance_data &) = delete;
+    instance_data(instance_data &&)      = delete;
+    instance_data &operator=(const instance_data &) = delete;
+    instance_data &operator=(instance_data) = delete;
+
+    uint32_t m_instance_id{HPX::impl_default_instance_id()};
+    hpx::execution::experimental::unique_any_sender<> m_sender{
+        hpx::execution::experimental::just()};
+    Kokkos::Impl::hpx_thread_buffer m_buffer;
+    hpx::spinlock m_sender_mutex;
+  };
+
+  static void default_instance_deleter(instance_data *) {}
+  static instance_data m_default_instance_data;
+  Kokkos::Impl::HostSharedPtr<instance_data> m_instance_data;
+
+ public:
+  using execution_space      = HPX;
+  using memory_space         = HostSpace;
+  using device_type          = Kokkos::Device<execution_space, memory_space>;
+  using array_layout         = LayoutRight;
+  using size_type            = memory_space::size_type;
+  using scratch_memory_space = ScratchMemorySpace<HPX>;
+
+  HPX()
+      : m_instance_data(Kokkos::Impl::HostSharedPtr<instance_data>(
+            &m_default_instance_data, &default_instance_deleter)) {}
+  ~HPX() = default;
+  HPX(instance_mode mode)
+      : m_instance_data(
+            mode == instance_mode::independent
+                ? (Kokkos::Impl::HostSharedPtr<instance_data>(
+                      new instance_data(m_next_instance_id++)))
+                : Kokkos::Impl::HostSharedPtr<instance_data>(
+                      &m_default_instance_data, &default_instance_deleter)) {}
+  HPX(hpx::execution::experimental::unique_any_sender<> &&sender)
+      : m_instance_data(Kokkos::Impl::HostSharedPtr<instance_data>(
+            new instance_data(m_next_instance_id++, std::move(sender)))) {}
+
+  HPX(HPX &&other)      = default;
+  HPX(const HPX &other) = default;
+
+  HPX &operator=(HPX &&) = default;
+  HPX &operator=(const HPX &) = default;
+
+  void print_configuration(std::ostream &os, bool /*verbose*/ = false) const;
+  instance_data &impl_get_instance_data() const noexcept {
+    KOKKOS_EXPECTS(m_instance_data.get());
+    return *m_instance_data.get();
+  }
+  uint32_t impl_instance_id() const noexcept {
+    return impl_get_instance_data().m_instance_id;
+  }
+
+  static bool &impl_get_in_parallel() noexcept;
+
+  struct impl_in_parallel_scope {
+    impl_in_parallel_scope() noexcept;
+    ~impl_in_parallel_scope() noexcept;
+    impl_in_parallel_scope(impl_in_parallel_scope &&)      = delete;
+    impl_in_parallel_scope(impl_in_parallel_scope const &) = delete;
+    impl_in_parallel_scope &operator=(impl_in_parallel_scope &&) = delete;
+    impl_in_parallel_scope &operator=(impl_in_parallel_scope const &) = delete;
+  };
+
+  struct impl_not_in_parallel_scope {
+    impl_not_in_parallel_scope() noexcept;
+    ~impl_not_in_parallel_scope() noexcept;
+    impl_not_in_parallel_scope(impl_not_in_parallel_scope &&)      = delete;
+    impl_not_in_parallel_scope(impl_not_in_parallel_scope const &) = delete;
+    impl_not_in_parallel_scope &operator=(impl_not_in_parallel_scope &&) =
+        delete;
+    impl_not_in_parallel_scope &operator=(impl_not_in_parallel_scope const &) =
+        delete;
+  };
+
+  static bool in_parallel(HPX const & = HPX()) noexcept {
+    return impl_get_in_parallel();
+  }
+
+  static void impl_decrement_active_parallel_region_count();
+  static void impl_increment_active_parallel_region_count();
+
+  void impl_instance_fence_locked(const std::string &name) const;
+  void impl_instance_fence(const std::string &name) const;
+  static void impl_static_fence(const std::string &name);
+
+  void fence(
+      const std::string &name =
+          "Kokkos::Experimental::HPX::fence: Unnamed Instance Fence") const {
+    impl_instance_fence(name);
+  }
+
+  static bool is_asynchronous(HPX const & = HPX()) noexcept {
+#if defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH)
+    return true;
+#else
+    return false;
+#endif
+  }
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+  template <typename F>
+  KOKKOS_DEPRECATED static void partition_master(
+      F const &, int requested_num_partitions = 0, int = 0) {
+    if (requested_num_partitions > 1) {
+      Kokkos::abort(
+          "Kokkos::Experimental::HPX::partition_master: can't partition an "
+          "HPX instance\n");
+    }
+  }
+#endif
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
+  static int concurrency();
+#else
+  int concurrency() const;
+#endif
+  static void impl_initialize(InitializationSettings const &);
+  static bool impl_is_initialized() noexcept;
+  static void impl_finalize();
+  static int impl_thread_pool_size() noexcept;
+  static int impl_thread_pool_rank() noexcept;
+  static int impl_thread_pool_size(int depth);
+
+  static int impl_max_hardware_threads() noexcept {
+    return hpx::threads::hardware_concurrency();
+  }
+
+  static int impl_hardware_thread_id() noexcept {
+    return hpx::get_worker_thread_num();
+  }
+
+  Kokkos::Impl::hpx_thread_buffer &impl_get_buffer() const noexcept {
+    return impl_get_instance_data().m_buffer;
+  }
+
+  hpx::execution::experimental::unique_any_sender<> &impl_get_sender() const
+      noexcept {
+    return impl_get_instance_data().m_sender;
+  }
+
+  hpx::execution::experimental::any_sender<> get_sender() const noexcept {
+    std::lock_guard l(impl_get_sender_mutex());
+    auto &s      = impl_get_sender();
+    auto split_s = hpx::execution::experimental::split(std::move(s));
+    s            = split_s;
+    return hpx::execution::experimental::any_sender<>{split_s};
+  }
+
+  hpx::future<void> impl_get_future() const noexcept {
+    return hpx::execution::experimental::make_future(get_sender());
+  }
+
+  hpx::spinlock &impl_get_sender_mutex() const noexcept {
+    return impl_get_instance_data().m_sender_mutex;
+  }
+
+  template <typename I>
+  void impl_bulk_plain_erased(
+      [[maybe_unused]] bool force_synchronous, bool is_light_weight_policy,
+      std::function<void(I)> &&f, I const n,
+      hpx::threads::thread_stacksize stacksize =
+          hpx::threads::thread_stacksize::default_) const {
+    Kokkos::Experimental::HPX::impl_increment_active_parallel_region_count();
+
+    namespace ex = hpx::execution::experimental;
+
+    auto &sen = impl_get_sender();
+    auto &mut = impl_get_sender_mutex();
+
+    std::lock_guard<hpx::spinlock> l(mut);
+    hpx::util::ignore_lock(&mut);
+
+    {
+      if (n == 1 && is_light_weight_policy &&
+          (hpx::threads::get_self_ptr() != nullptr)) {
+        sen = std::move(sen) | ex::then(hpx::bind_front(std::move(f), 0)) |
+              ex::then(Kokkos::Experimental::HPX::
+                           impl_decrement_active_parallel_region_count) |
+              ex::ensure_started();
+      } else {
+        sen = std::move(sen) |
+              ex::transfer(
+                  ex::with_stacksize(ex::thread_pool_scheduler{}, stacksize)) |
+              ex::bulk(n, std::move(f)) |
+              ex::then(Kokkos::Experimental::HPX::
+                           impl_decrement_active_parallel_region_count) |
+              ex::ensure_started();
+      }
+    }
+
+#if defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH)
+    if (force_synchronous)
+#endif
+    {
+      impl_instance_fence_locked(
+          "Kokkos::Experimental::HPX: fence due to forced synchronizations");
+    }
+  }
+
+  template <typename Functor, typename Index>
+  void impl_bulk_plain(bool force_synchronous, bool is_light_weight_policy,
+                       Functor const &functor, Index const n,
+                       hpx::threads::thread_stacksize stacksize =
+                           hpx::threads::thread_stacksize::default_) const {
+    impl_bulk_plain_erased(force_synchronous, is_light_weight_policy,
+                           {[functor](Index i) {
+                             impl_in_parallel_scope p;
+                             functor.execute_range(i);
+                           }},
+                           n, stacksize);
+  }
+
+  template <typename Index>
+  void impl_bulk_setup_finalize_erased(
+      [[maybe_unused]] bool force_synchronous, bool is_light_weight_policy,
+      std::function<void(Index)> &&f, std::function<void()> &&f_setup,
+      std::function<void()> &&f_finalize, Index const n,
+      hpx::threads::thread_stacksize stacksize =
+          hpx::threads::thread_stacksize::default_) const {
+    Kokkos::Experimental::HPX::impl_increment_active_parallel_region_count();
+
+    namespace ex = hpx::execution::experimental;
+    using hpx::threads::thread_stacksize;
+
+    auto &sen = impl_get_sender();
+    auto &mut = impl_get_sender_mutex();
+
+    std::lock_guard<hpx::spinlock> l(mut);
+    hpx::util::ignore_lock(&mut);
+
+    {
+      if (n == 1 && is_light_weight_policy &&
+          (hpx::threads::get_self_ptr() != nullptr)) {
+        sen = std::move(sen) | ex::then(std::move(f_setup)) |
+              ex::then(hpx::bind_front(std::move(f), 0)) |
+              ex::then(std::move(f_finalize)) |
+              ex::then(Kokkos::Experimental::HPX::
+                           impl_decrement_active_parallel_region_count) |
+              ex::ensure_started();
+      } else {
+        sen = std::move(sen) |
+              ex::transfer(
+                  ex::with_stacksize(ex::thread_pool_scheduler{}, stacksize)) |
+              ex::then(std::move(f_setup)) | ex::bulk(n, std::move(f)) |
+              ex::then(std::move(f_finalize)) |
+              ex::then(Kokkos::Experimental::HPX::
+                           impl_decrement_active_parallel_region_count) |
+              ex::ensure_started();
+      }
+    }
+
+#if defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH)
+    if (force_synchronous)
+#endif
+    {
+      impl_instance_fence_locked(
+          "Kokkos::Experimental::HPX: fence due to forced syncronizations");
+    }
+  }
+
+  template <typename Functor, typename Index>
+  void impl_bulk_setup_finalize(
+      bool force_synchronous, bool is_light_weight_policy,
+      Functor const &functor, Index const n,
+      hpx::threads::thread_stacksize stacksize =
+          hpx::threads::thread_stacksize::default_) const {
+    impl_bulk_setup_finalize_erased(force_synchronous, is_light_weight_policy,
+                                    {[functor](Index i) {
+                                      impl_in_parallel_scope p;
+                                      functor.execute_range(i);
+                                    }},
+                                    {[functor]() {
+                                      impl_in_parallel_scope p;
+                                      functor.setup();
+                                    }},
+                                    {[functor]() {
+                                      impl_in_parallel_scope p;
+                                      functor.finalize();
+                                    }},
+                                    n, stacksize);
+  }
+
+  static constexpr const char *name() noexcept { return "HPX"; }
+
+ private:
+  friend bool operator==(HPX const &lhs, HPX const &rhs) {
+    return lhs.impl_instance_id() == rhs.impl_instance_id();
+  }
+  friend bool operator!=(HPX const &lhs, HPX const &rhs) {
+    return !(lhs == rhs);
+  }
+};
+
+extern template void HPX::impl_bulk_plain_erased<int>(
+    bool, bool, std::function<void(int)> &&, int const,
+    hpx::threads::thread_stacksize stacksize) const;
+
+extern template void HPX::impl_bulk_plain_erased<unsigned int>(
+    bool, bool, std::function<void(unsigned int)> &&, unsigned int const,
+    hpx::threads::thread_stacksize stacksize) const;
+
+extern template void HPX::impl_bulk_plain_erased<long>(
+    bool, bool, std::function<void(long)> &&, long const,
+    hpx::threads::thread_stacksize stacksize) const;
+
+extern template void HPX::impl_bulk_plain_erased<std::size_t>(
+    bool, bool, std::function<void(std::size_t)> &&, std::size_t const,
+    hpx::threads::thread_stacksize stacksize) const;
+
+extern template void HPX::impl_bulk_setup_finalize_erased<int>(
+    bool, bool, std::function<void(int)> &&, std::function<void()> &&,
+    std::function<void()> &&, int const,
+    hpx::threads::thread_stacksize stacksize) const;
+
+extern template void HPX::impl_bulk_setup_finalize_erased<unsigned int>(
+    bool, bool, std::function<void(unsigned int)> &&, std::function<void()> &&,
+    std::function<void()> &&, unsigned int const,
+    hpx::threads::thread_stacksize stacksize) const;
+
+extern template void HPX::impl_bulk_setup_finalize_erased<long>(
+    bool, bool, std::function<void(long)> &&, std::function<void()> &&,
+    std::function<void()> &&, long const,
+    hpx::threads::thread_stacksize stacksize) const;
+
+extern template void HPX::impl_bulk_setup_finalize_erased<std::size_t>(
+    bool, bool, std::function<void(std::size_t)> &&, std::function<void()> &&,
+    std::function<void()> &&, std::size_t const,
+    hpx::threads::thread_stacksize stacksize) const;
+}  // namespace Experimental
+
+namespace Tools {
+namespace Experimental {
+template <>
+struct DeviceTypeTraits<Kokkos::Experimental::HPX> {
+  static constexpr DeviceType id = DeviceType::HPX;
+  static int device_id(const Kokkos::Experimental::HPX &) { return 0; }
+};
+}  // namespace Experimental
+}  // namespace Tools
+}  // namespace Kokkos
+
+namespace Kokkos {
+namespace Impl {
+template <>
+struct MemorySpaceAccess<Kokkos::Experimental::HPX::memory_space,
+                         Kokkos::Experimental::HPX::scratch_memory_space> {
+  enum : bool { assignable = false };
+  enum : bool { accessible = true };
+  enum : bool { deepcopy = false };
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+namespace Kokkos {
+namespace Experimental {
+template <>
+class UniqueToken<HPX, UniqueTokenScope::Instance> {
+ private:
+  using buffer_type = Kokkos::View<uint32_t *, Kokkos::HostSpace>;
+  int m_count;
+  buffer_type m_buffer_view;
+  uint32_t volatile *m_buffer;
+
+ public:
+  using execution_space = HPX;
+  using size_type       = int;
+
+  /// \brief create object size for concurrency on the given instance
+  ///
+  /// This object should not be shared between instances
+  UniqueToken(execution_space const & = execution_space()) noexcept
+      : m_count(execution_space::impl_max_hardware_threads()),
+        m_buffer_view(buffer_type()),
+        m_buffer(nullptr) {}
+
+  UniqueToken(size_type max_size, execution_space const & = execution_space())
+      : m_count(max_size > execution_space::impl_max_hardware_threads()
+                    ? execution_space::impl_max_hardware_threads()
+                    : max_size),
+        m_buffer_view(
+            max_size > execution_space::impl_max_hardware_threads()
+                ? buffer_type()
+                : buffer_type("UniqueToken::m_buffer_view",
+                              ::Kokkos::Impl::concurrent_bitset::buffer_bound(
+                                  m_count))),
+        m_buffer(m_buffer_view.data()) {}
+
+  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
+  KOKKOS_INLINE_FUNCTION
+  int size() const noexcept { return m_count; }
+
+  /// \brief acquire value such that 0 <= value < size()
+  KOKKOS_INLINE_FUNCTION
+  int acquire() const noexcept {
+    KOKKOS_IF_ON_HOST((
+        if (m_buffer == nullptr) {
+          return execution_space::impl_hardware_thread_id();
+        } else {
+          const ::Kokkos::pair<int, int> result =
+              ::Kokkos::Impl::concurrent_bitset::acquire_bounded(
+                  m_buffer, m_count, ::Kokkos::Impl::clock_tic() % m_count);
+
+          if (result.first < 0) {
+            ::Kokkos::abort(
+                "UniqueToken<HPX> failure to acquire tokens, no tokens "
+                "available");
+          }
+          return result.first;
+        }))
+
+    KOKKOS_IF_ON_DEVICE((return 0;))
+  }
+
+  /// \brief release a value acquired by generate
+  KOKKOS_INLINE_FUNCTION
+  void release(int i) const noexcept {
+    KOKKOS_IF_ON_HOST((if (m_buffer != nullptr) {
+      ::Kokkos::Impl::concurrent_bitset::release(m_buffer, i);
+    }))
+
+    KOKKOS_IF_ON_DEVICE(((void)i;))
+  }
+};
+
+template <>
+class UniqueToken<HPX, UniqueTokenScope::Global> {
+ public:
+  using execution_space = HPX;
+  using size_type       = int;
+  UniqueToken(execution_space const & = execution_space()) noexcept {}
+
+  // NOTE: Currently this assumes that there is no oversubscription.
+  // hpx::get_num_worker_threads can't be used directly because it may yield
+  // it's task (problematic if called after hpx::get_worker_thread_num).
+  int size() const noexcept { return HPX::impl_max_hardware_threads(); }
+  int acquire() const noexcept { return HPX::impl_hardware_thread_id(); }
+  void release(int) const noexcept {}
+};
+}  // namespace Experimental
+}  // namespace Kokkos
+
+namespace Kokkos {
+namespace Impl {
+
+struct HPXTeamMember {
+ public:
+  using execution_space = Kokkos::Experimental::HPX;
+  using scratch_memory_space =
+      Kokkos::ScratchMemorySpace<Kokkos::Experimental::HPX>;
+  using team_handle = HPXTeamMember;
+
+ private:
+  scratch_memory_space m_team_shared;
+
+  int m_league_size;
+  int m_league_rank;
+  int m_team_size;
+  int m_team_rank;
+
+ public:
+  KOKKOS_INLINE_FUNCTION
+  const scratch_memory_space &team_shmem() const {
+    return m_team_shared.set_team_thread_mode(0, 1, 0);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space &team_scratch(const int) const {
+    return m_team_shared.set_team_thread_mode(0, 1, 0);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space &thread_scratch(const int) const {
+    return m_team_shared.set_team_thread_mode(0, team_size(), team_rank());
+  }
+
+  KOKKOS_INLINE_FUNCTION int league_rank() const noexcept {
+    return m_league_rank;
+  }
+
+  KOKKOS_INLINE_FUNCTION int league_size() const noexcept {
+    return m_league_size;
+  }
+
+  KOKKOS_INLINE_FUNCTION int team_rank() const noexcept { return m_team_rank; }
+  KOKKOS_INLINE_FUNCTION int team_size() const noexcept { return m_team_size; }
+
+  template <class... Properties>
+  constexpr KOKKOS_INLINE_FUNCTION HPXTeamMember(
+      const TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...>
+          &policy,
+      const int team_rank, const int league_rank, void *scratch,
+      size_t scratch_size) noexcept
+      : m_team_shared(scratch, scratch_size, scratch, scratch_size),
+        m_league_size(policy.league_size()),
+        m_league_rank(league_rank),
+        m_team_size(policy.team_size()),
+        m_team_rank(team_rank) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void team_barrier() const {}
+
+  template <class ValueType>
+  KOKKOS_INLINE_FUNCTION void team_broadcast(ValueType &, const int &) const {}
+
+  template <class Closure, class ValueType>
+  KOKKOS_INLINE_FUNCTION void team_broadcast(const Closure &closure,
+                                             ValueType &value,
+                                             const int &) const {
+    closure(value);
+  }
+
+  template <class ValueType, class JoinOp>
+  KOKKOS_INLINE_FUNCTION ValueType team_reduce(const ValueType &value,
+                                               const JoinOp &) const {
+    return value;
+  }
+
+  template <class ReducerType>
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value>
+  team_reduce(const ReducerType &) const {}
+
+  template <typename Type>
+  KOKKOS_INLINE_FUNCTION Type
+  team_scan(const Type &value, Type *const global_accum = nullptr) const {
+    if (global_accum) {
+      Kokkos::atomic_fetch_add(global_accum, value);
+    }
+
+    return 0;
+  }
+};
+
+template <class... Properties>
+class TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...>
+    : public PolicyTraits<Properties...> {
+ public:
+  using traits = PolicyTraits<Properties...>;
+
+  //! Tag this class as a kokkos execution policy
+  using execution_policy = TeamPolicyInternal;
+
+  using member_type = HPXTeamMember;
+
+  //! Execution space of this execution policy:
+  using execution_space = Kokkos::Experimental::HPX;
+
+ private:
+  typename traits::execution_space m_space{};
+  int m_league_size;
+  int m_team_size;
+  std::size_t m_team_scratch_size[2];
+  std::size_t m_thread_scratch_size[2];
+  int m_chunk_size;
+
+ public:
+  // NOTE: Max size is 1 for simplicity. In most cases more than 1 is not
+  // necessary on CPU. Implement later if there is a need.
+  template <class FunctorType>
+  inline static int team_size_max(const FunctorType &) {
+    return 1;
+  }
+
+  template <class FunctorType>
+  inline static int team_size_recommended(const FunctorType &) {
+    return 1;
+  }
+
+  template <class FunctorType>
+  inline static int team_size_recommended(const FunctorType &, const int &) {
+    return 1;
+  }
+
+  template <class FunctorType>
+  int team_size_max(const FunctorType &, const ParallelForTag &) const {
+    return 1;
+  }
+
+  template <class FunctorType>
+  int team_size_max(const FunctorType &, const ParallelReduceTag &) const {
+    return 1;
+  }
+
+  template <class FunctorType, class ReducerType>
+  int team_size_max(const FunctorType &, const ReducerType &,
+                    const ParallelReduceTag &) const {
+    return 1;
+  }
+
+  template <class FunctorType>
+  int team_size_recommended(const FunctorType &, const ParallelForTag &) const {
+    return 1;
+  }
+
+  template <class FunctorType>
+  int team_size_recommended(const FunctorType &,
+                            const ParallelReduceTag &) const {
+    return 1;
+  }
+
+  template <class FunctorType, class ReducerType>
+  int team_size_recommended(const FunctorType &, const ReducerType &,
+                            const ParallelReduceTag &) const {
+    return 1;
+  }
+
+  static int vector_length_max() { return 1; }
+
+  inline int impl_vector_length() noexcept { return 1; }
+  inline bool impl_auto_team_size() noexcept { return false; }
+  inline bool impl_auto_vector_length() noexcept { return false; }
+  inline void impl_set_vector_length(int) noexcept {}
+  inline void impl_set_team_size(int) noexcept {}
+
+ private:
+  inline void init(const int league_size_request, const int team_size_request) {
+    m_league_size           = league_size_request;
+    const int max_team_size = 1;  // TODO: Can't use team_size_max(...) because
+                                  // it requires a functor as argument.
+    m_team_size =
+        team_size_request > max_team_size ? max_team_size : team_size_request;
+
+    if (m_chunk_size > 0) {
+      if (!Impl::is_integral_power_of_two(m_chunk_size))
+        Kokkos::abort("TeamPolicy blocking granularity must be power of two");
+    } else {
+      int new_chunk_size = 1;
+      while (new_chunk_size * 4 * m_space.concurrency() < m_league_size) {
+        new_chunk_size *= 2;
+      }
+
+      if (new_chunk_size < 128) {
+        new_chunk_size = 1;
+        while ((new_chunk_size * m_space.concurrency() < m_league_size) &&
+               (new_chunk_size < 128))
+          new_chunk_size *= 2;
+      }
+
+      m_chunk_size = new_chunk_size;
+    }
+  }
+
+ public:
+  inline int team_size() const { return m_team_size; }
+  inline int league_size() const { return m_league_size; }
+
+  size_t scratch_size(const int &level, int team_size_ = -1) const {
+    if (team_size_ < 0) {
+      team_size_ = m_team_size;
+    }
+    return m_team_scratch_size[level] +
+           team_size_ * m_thread_scratch_size[level];
+  }
+
+  inline static int scratch_size_max(int level) {
+    return (level == 0 ? 1024 * 32 :  // Roughly L1 size
+                20 * 1024 * 1024);    // Limit to keep compatibility with CUDA
+  }
+
+ public:
+  template <class ExecSpace, class... OtherProperties>
+  friend class TeamPolicyInternal;
+
+  const typename traits::execution_space &space() const { return m_space; }
+
+  template <class... OtherProperties>
+  TeamPolicyInternal(const TeamPolicyInternal<Kokkos::Experimental::HPX,
+                                              OtherProperties...> &p) {
+    m_space                  = p.m_space;
+    m_league_size            = p.m_league_size;
+    m_team_size              = p.m_team_size;
+    m_team_scratch_size[0]   = p.m_team_scratch_size[0];
+    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
+    m_team_scratch_size[1]   = p.m_team_scratch_size[1];
+    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
+    m_chunk_size             = p.m_chunk_size;
+  }
+
+  TeamPolicyInternal(const typename traits::execution_space &space,
+                     int league_size_request, int team_size_request,
+                     int /* vector_length_request */ = 1)
+      : m_space{space},
+        m_team_scratch_size{0, 0},
+        m_thread_scratch_size{0, 0},
+        m_chunk_size(0) {
+    init(league_size_request, team_size_request);
+  }
+
+  TeamPolicyInternal(const typename traits::execution_space &space,
+                     int league_size_request, const Kokkos::AUTO_t &,
+                     int /* vector_length_request */ = 1)
+      : m_space{space},
+        m_team_scratch_size{0, 0},
+        m_thread_scratch_size{0, 0},
+        m_chunk_size(0) {
+    init(league_size_request, 1);
+  }
+
+  TeamPolicyInternal(const typename traits::execution_space &space,
+                     int league_size_request,
+                     const Kokkos::AUTO_t &, /* team_size_request */
+                     const Kokkos::AUTO_t & /* vector_length_request */)
+      : m_space{space},
+        m_team_scratch_size{0, 0},
+        m_thread_scratch_size{0, 0},
+        m_chunk_size(0) {
+    init(league_size_request, 1);
+  }
+
+  TeamPolicyInternal(const typename traits::execution_space &space,
+                     int league_size_request, int team_size_request,
+                     const Kokkos::AUTO_t & /* vector_length_request */
+                     )
+      : m_space{space},
+        m_team_scratch_size{0, 0},
+        m_thread_scratch_size{0, 0},
+        m_chunk_size(0) {
+    init(league_size_request, team_size_request);
+  }
+
+  TeamPolicyInternal(int league_size_request,
+                     const Kokkos::AUTO_t &, /* team_size_request */
+                     const Kokkos::AUTO_t & /* vector_length_request */)
+      : m_team_scratch_size{0, 0},
+        m_thread_scratch_size{0, 0},
+        m_chunk_size(0) {
+    init(league_size_request, 1);
+  }
+
+  TeamPolicyInternal(int league_size_request, int team_size_request,
+                     const Kokkos::AUTO_t & /* vector_length_request */
+                     )
+      : m_team_scratch_size{0, 0},
+        m_thread_scratch_size{0, 0},
+        m_chunk_size(0) {
+    init(league_size_request, team_size_request);
+  }
+
+  TeamPolicyInternal(int league_size_request, int team_size_request,
+                     int /* vector_length_request */ = 1)
+      : m_team_scratch_size{0, 0},
+        m_thread_scratch_size{0, 0},
+        m_chunk_size(0) {
+    init(league_size_request, team_size_request);
+  }
+
+  TeamPolicyInternal(int league_size_request, const Kokkos::AUTO_t &,
+                     int /* vector_length_request */ = 1)
+      : m_team_scratch_size{0, 0},
+        m_thread_scratch_size{0, 0},
+        m_chunk_size(0) {
+    init(league_size_request, 1);
+  }
+
+  inline int chunk_size() const { return m_chunk_size; }
+
+  inline TeamPolicyInternal &set_chunk_size(
+      typename traits::index_type chunk_size_) {
+    m_chunk_size = chunk_size_;
+    return *this;
+  }
+
+  inline TeamPolicyInternal &set_scratch_size(const int &level,
+                                              const PerTeamValue &per_team) {
+    m_team_scratch_size[level] = per_team.value;
+    return *this;
+  }
+
+  inline TeamPolicyInternal &set_scratch_size(
+      const int &level, const PerThreadValue &per_thread) {
+    m_thread_scratch_size[level] = per_thread.value;
+    return *this;
+  }
+
+  inline TeamPolicyInternal &set_scratch_size(
+      const int &level, const PerTeamValue &per_team,
+      const PerThreadValue &per_thread) {
+    m_team_scratch_size[level]   = per_team.value;
+    m_thread_scratch_size[level] = per_thread.value;
+    return *this;
+  }
+};
+}  // namespace Impl
+}  // namespace Kokkos
+
+namespace Kokkos {
+namespace Impl {
+
+template <class FunctorType, class... Traits>
+class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
+                  Kokkos::Experimental::HPX> {
+ private:
+  using Policy  = Kokkos::RangePolicy<Traits...>;
+  using WorkTag = typename Policy::work_tag;
+  using Member  = typename Policy::member_type;
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+
+ public:
+  void execute_range(const Member i_chunk) const {
+    const auto r = get_chunk_range(i_chunk, m_policy.begin(),
+                                   m_policy.chunk_size(), m_policy.end());
+    for (Member i = r.begin; i < r.end; ++i) {
+      if constexpr (std::is_same_v<WorkTag, void>) {
+        m_functor(i);
+      } else {
+        m_functor(WorkTag{}, i);
+      }
+    }
+  }
+
+  void execute() const {
+    const Member num_chunks =
+        get_num_chunks(m_policy.begin(), m_policy.chunk_size(), m_policy.end());
+    m_policy.space().impl_bulk_plain(false, is_light_weight_policy<Policy>(),
+                                     *this, num_chunks,
+                                     hpx::threads::thread_stacksize::nostack);
+  }
+
+  inline ParallelFor(const FunctorType &arg_functor, Policy arg_policy)
+      : m_functor(arg_functor), m_policy(arg_policy) {}
+};
+
+template <class FunctorType, class... Traits>
+class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
+                  Kokkos::Experimental::HPX> {
+ private:
+  using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
+  using Policy        = typename MDRangePolicy::impl_range_policy;
+  using WorkTag       = typename MDRangePolicy::work_tag;
+  using Member        = typename Policy::member_type;
+  using iterate_type =
+      typename Kokkos::Impl::HostIterateTile<MDRangePolicy, FunctorType,
+                                             WorkTag, void>;
+
+  const iterate_type m_iter;
+  const Policy m_policy;
+
+ public:
+  void execute_range(const Member i_chunk) const {
+    const auto r = get_chunk_range(i_chunk, m_policy.begin(),
+                                   m_policy.chunk_size(), m_policy.end());
+    for (Member i = r.begin; i < r.end; ++i) {
+      m_iter(i);
+    }
+  }
+
+  void execute() const {
+    const Member num_chunks =
+        get_num_chunks(m_policy.begin(), m_policy.chunk_size(), m_policy.end());
+    m_iter.m_rp.space().impl_bulk_plain(
+        false, is_light_weight_policy<MDRangePolicy>(), *this, num_chunks,
+        hpx::threads::thread_stacksize::nostack);
+  }
+
+  inline ParallelFor(const FunctorType &arg_functor, MDRangePolicy arg_policy)
+      : m_iter(arg_policy, arg_functor),
+        m_policy(Policy(0, arg_policy.m_num_tiles).set_chunk_size(1)) {}
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy &, const Functor &) {
+    /**
+     * 1024 here is just our guess for a reasonable max tile size,
+     * it isn't a hardware constraint. If people see a use for larger
+     * tile size products, we're happy to change this.
+     */
+    return 1024;
+  }
+};
+}  // namespace Impl
+}  // namespace Kokkos
+
+namespace Kokkos {
+namespace Impl {
+template <class CombinedFunctorReducerType, class... Traits>
+class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
+                     Kokkos::Experimental::HPX> {
+ private:
+  using Policy      = Kokkos::RangePolicy<Traits...>;
+  using FunctorType = typename CombinedFunctorReducerType::functor_type;
+  using ReducerType = typename CombinedFunctorReducerType::reducer_type;
+
+  using WorkTag = typename Policy::work_tag;
+  using Member  = typename Policy::member_type;
+
+  using value_type     = typename ReducerType::value_type;
+  using pointer_type   = typename ReducerType::pointer_type;
+  using reference_type = typename ReducerType::reference_type;
+
+  const CombinedFunctorReducerType m_functor_reducer;
+  const Policy m_policy;
+  const pointer_type m_result_ptr;
+  const bool m_force_synchronous;
+
+ public:
+  void setup() const {
+    const ReducerType &reducer   = m_functor_reducer.get_reducer();
+    const std::size_t value_size = reducer.value_size();
+    const int num_worker_threads = m_policy.space().concurrency();
+
+    hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer();
+    buffer.resize(num_worker_threads, value_size);
+
+    for (int t = 0; t < num_worker_threads; ++t) {
+      reducer.init(reinterpret_cast<pointer_type>(buffer.get(t)));
+    }
+  }
+
+  void execute_range(const Member i_chunk) const {
+    hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer();
+    reference_type update =
+        ReducerType::reference(reinterpret_cast<pointer_type>(
+            buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id())));
+    const auto r = get_chunk_range(i_chunk, m_policy.begin(),
+                                   m_policy.chunk_size(), m_policy.end());
+    for (Member i = r.begin; i < r.end; ++i) {
+      if constexpr (std::is_same_v<WorkTag, void>) {
+        m_functor_reducer.get_functor()(i, update);
+      } else {
+        m_functor_reducer.get_functor()(WorkTag{}, i, update);
+      }
+    }
+  }
+
+  void finalize() const {
+    hpx_thread_buffer &buffer    = m_policy.space().impl_get_buffer();
+    const ReducerType &reducer   = m_functor_reducer.get_reducer();
+    const int num_worker_threads = m_policy.space().concurrency();
+    for (int i = 1; i < num_worker_threads; ++i) {
+      reducer.join(reinterpret_cast<pointer_type>(buffer.get(0)),
+                   reinterpret_cast<pointer_type>(buffer.get(i)));
+    }
+
+    pointer_type final_value_ptr =
+        reinterpret_cast<pointer_type>(buffer.get(0));
+
+    reducer.final(final_value_ptr);
+
+    if (m_result_ptr != nullptr) {
+      const int n = reducer.value_count();
+
+      for (int j = 0; j < n; ++j) {
+        m_result_ptr[j] = final_value_ptr[j];
+      }
+    }
+  }
+
+  void execute() const {
+    if (m_policy.end() <= m_policy.begin()) {
+      if (m_result_ptr) {
+        const ReducerType &reducer = m_functor_reducer.get_reducer();
+        reducer.init(m_result_ptr);
+        reducer.final(m_result_ptr);
+      }
+      return;
+    }
+
+    const Member num_chunks =
+        get_num_chunks(m_policy.begin(), m_policy.chunk_size(), m_policy.end());
+    m_policy.space().impl_bulk_setup_finalize(
+        m_force_synchronous, is_light_weight_policy<Policy>(), *this,
+        num_chunks, hpx::threads::thread_stacksize::nostack);
+  }
+
+  template <class ViewType>
+  inline ParallelReduce(const CombinedFunctorReducerType &arg_functor_reducer,
+                        Policy arg_policy, const ViewType &arg_view)
+      : m_functor_reducer(arg_functor_reducer),
+        m_policy(arg_policy),
+        m_result_ptr(arg_view.data()),
+        m_force_synchronous(!arg_view.impl_track().has_record()) {
+    static_assert(
+        Kokkos::Impl::MemorySpaceAccess<typename ViewType::memory_space,
+                                        Kokkos::HostSpace>::accessible,
+        "HPX reduce result must be a View accessible from HostSpace");
+  }
+};
+
+template <class CombinedFunctorReducerType, class... Traits>
+class ParallelReduce<CombinedFunctorReducerType,
+                     Kokkos::MDRangePolicy<Traits...>,
+                     Kokkos::Experimental::HPX> {
+ private:
+  using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
+  using FunctorType   = typename CombinedFunctorReducerType::functor_type;
+  using ReducerType   = typename CombinedFunctorReducerType::reducer_type;
+
+  using Policy  = typename MDRangePolicy::impl_range_policy;
+  using WorkTag = typename MDRangePolicy::work_tag;
+  using Member  = typename Policy::member_type;
+
+  using pointer_type   = typename ReducerType::pointer_type;
+  using value_type     = typename ReducerType::value_type;
+  using reference_type = typename ReducerType::reference_type;
+  using iterate_type   = typename Kokkos::Impl::HostIterateTile<
+      MDRangePolicy, CombinedFunctorReducerType, WorkTag, reference_type>;
+
+  const iterate_type m_iter;
+  const Policy m_policy;
+  const pointer_type m_result_ptr;
+  const bool m_force_synchronous;
+
+ public:
+  void setup() const {
+    const ReducerType &reducer   = m_iter.m_func.get_reducer();
+    const std::size_t value_size = reducer.value_size();
+    const int num_worker_threads = m_policy.space().concurrency();
+
+    hpx_thread_buffer &buffer = m_iter.m_rp.space().impl_get_buffer();
+    buffer.resize(num_worker_threads, value_size);
+
+    for (int t = 0; t < num_worker_threads; ++t) {
+      reducer.init(reinterpret_cast<pointer_type>(buffer.get(t)));
+    }
+  }
+
+  void execute_range(const Member i_chunk) const {
+    hpx_thread_buffer &buffer = m_iter.m_rp.space().impl_get_buffer();
+    reference_type update =
+        ReducerType::reference(reinterpret_cast<pointer_type>(
+            buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id())));
+    const auto r = get_chunk_range(i_chunk, m_policy.begin(),
+                                   m_policy.chunk_size(), m_policy.end());
+    for (Member i = r.begin; i < r.end; ++i) {
+      m_iter(i, update);
+    }
+  }
+
+  void finalize() const {
+    hpx_thread_buffer &buffer    = m_iter.m_rp.space().impl_get_buffer();
+    ReducerType reducer          = m_iter.m_func.get_reducer();
+    const int num_worker_threads = m_policy.space().concurrency();
+    for (int i = 1; i < num_worker_threads; ++i) {
+      reducer.join(reinterpret_cast<pointer_type>(buffer.get(0)),
+                   reinterpret_cast<pointer_type>(buffer.get(i)));
+    }
+
+    pointer_type final_value_ptr =
+        reinterpret_cast<pointer_type>(buffer.get(0));
+
+    reducer.final(final_value_ptr);
+
+    if (m_result_ptr != nullptr) {
+      const int n = reducer.value_count();
+
+      for (int j = 0; j < n; ++j) {
+        m_result_ptr[j] = final_value_ptr[j];
+      }
+    }
+  }
+
+  void execute() const {
+    const Member num_chunks =
+        get_num_chunks(m_policy.begin(), m_policy.chunk_size(), m_policy.end());
+    m_iter.m_rp.space().impl_bulk_setup_finalize(
+        m_force_synchronous, is_light_weight_policy<MDRangePolicy>(), *this,
+        num_chunks, hpx::threads::thread_stacksize::nostack);
+  }
+
+  template <class ViewType>
+  inline ParallelReduce(const CombinedFunctorReducerType &arg_functor_reducer,
+                        MDRangePolicy arg_policy, const ViewType &arg_view)
+      : m_iter(arg_policy, arg_functor_reducer),
+        m_policy(Policy(0, arg_policy.m_num_tiles).set_chunk_size(1)),
+        m_result_ptr(arg_view.data()),
+        m_force_synchronous(!arg_view.impl_track().has_record()) {
+    static_assert(
+        Kokkos::Impl::MemorySpaceAccess<typename ViewType::memory_space,
+                                        Kokkos::HostSpace>::accessible,
+        "HPX reduce result must be a View accessible from HostSpace");
+  }
+
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy &, const Functor &) {
+    /**
+     * 1024 here is just our guess for a reasonable max tile size,
+     * it isn't a hardware constraint. If people see a use for larger
+     * tile size products, we're happy to change this.
+     */
+    return 1024;
+  }
+};
+}  // namespace Impl
+}  // namespace Kokkos
+
+namespace Kokkos {
+namespace Impl {
+
+template <class FunctorType, class... Traits>
+class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
+                   Kokkos::Experimental::HPX> {
+ private:
+  using Policy    = Kokkos::RangePolicy<Traits...>;
+  using WorkTag   = typename Policy::work_tag;
+  using WorkRange = typename Policy::WorkRange;
+  using Member    = typename Policy::member_type;
+  using Analysis =
+      FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType, void>;
+  using pointer_type   = typename Analysis::pointer_type;
+  using reference_type = typename Analysis::reference_type;
+  using value_type     = typename Analysis::value_type;
+  using barrier_type   = hpx::barrier<>;
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+
+ public:
+  void setup() const {
+    const int num_worker_threads = m_policy.space().concurrency();
+    const std::size_t value_size = Analysis::value_size(m_functor);
+
+    hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer();
+    buffer.resize(num_worker_threads, 2 * value_size, sizeof(barrier_type));
+
+    new (buffer.get_extra_space()) barrier_type(num_worker_threads);
+  }
+
+  void execute_chunk(const Member i_begin, const Member i_end,
+                     reference_type update, const bool final) const {
+    for (Member i = i_begin; i < i_end; ++i) {
+      if constexpr (std::is_same_v<WorkTag, void>) {
+        m_functor(i, update, final);
+      } else {
+        m_functor(WorkTag{}, i, update, final);
+      }
+    }
+  }
+
+  void execute_range(int t) const {
+    const int num_worker_threads = m_policy.space().concurrency();
+    const int value_count        = Analysis::value_count(m_functor);
+    const std::size_t value_size = Analysis::value_size(m_functor);
+
+    hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer();
+    typename Analysis::Reducer final_reducer(m_functor);
+    barrier_type &barrier =
+        *static_cast<barrier_type *>(buffer.get_extra_space());
+    reference_type update_sum =
+        final_reducer.init(reinterpret_cast<pointer_type>(buffer.get(t)));
+
+    const WorkRange range(m_policy, t, num_worker_threads);
+    execute_chunk(range.begin(), range.end(), update_sum, false);
+
+    {
+      // Since arrive_and_wait may yield and resume on another worker thread we
+      // set in_parallel = false on the current thread before suspending and set
+      // it again to true when we resume.
+      Kokkos::Experimental::HPX::impl_not_in_parallel_scope p;
+      barrier.arrive_and_wait();
+    }
+
+    if (t == 0) {
+      final_reducer.init(reinterpret_cast<pointer_type>(
+          static_cast<char *>(buffer.get(0)) + value_size));
+
+      for (int i = 1; i < num_worker_threads; ++i) {
+        pointer_type ptr_1_prev =
+            reinterpret_cast<pointer_type>(buffer.get(i - 1));
+        pointer_type ptr_2_prev = reinterpret_cast<pointer_type>(
+            static_cast<char *>(buffer.get(i - 1)) + value_size);
+        pointer_type ptr_2 = reinterpret_cast<pointer_type>(
+            static_cast<char *>(buffer.get(i)) + value_size);
+
+        for (int j = 0; j < value_count; ++j) {
+          ptr_2[j] = ptr_2_prev[j];
+        }
+
+        final_reducer.join(ptr_2, ptr_1_prev);
+      }
+    }
+
+    {
+      // Since arrive_and_wait may yield and resume on another worker thread we
+      // set in_parallel = false on the current thread before suspending and set
+      // it again to true when we resume.
+      Kokkos::Experimental::HPX::impl_not_in_parallel_scope p;
+      barrier.arrive_and_wait();
+    }
+
+    reference_type update_base =
+        Analysis::Reducer::reference(reinterpret_cast<pointer_type>(
+            static_cast<char *>(buffer.get(t)) + value_size));
+
+    execute_chunk(range.begin(), range.end(), update_base, true);
+  }
+
+  void finalize() const {
+    hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer();
+    static_cast<barrier_type *>(buffer.get_extra_space())->~barrier_type();
+  }
+
+  void execute() const {
+    const int num_worker_threads = m_policy.space().concurrency();
+    m_policy.space().impl_bulk_setup_finalize(
+        false, is_light_weight_policy<Policy>(), *this, num_worker_threads,
+        hpx::threads::thread_stacksize::small_);
+  }
+
+  inline ParallelScan(const FunctorType &arg_functor, const Policy &arg_policy)
+      : m_functor(arg_functor), m_policy(arg_policy) {}
+};
+
+template <class FunctorType, class ReturnType, class... Traits>
+class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
+                            ReturnType, Kokkos::Experimental::HPX> {
+ private:
+  using Policy         = Kokkos::RangePolicy<Traits...>;
+  using WorkTag        = typename Policy::work_tag;
+  using WorkRange      = typename Policy::WorkRange;
+  using Member         = typename Policy::member_type;
+  using Analysis       = FunctorAnalysis<FunctorPatternInterface::SCAN, Policy,
+                                   FunctorType, ReturnType>;
+  using pointer_type   = typename Analysis::pointer_type;
+  using reference_type = typename Analysis::reference_type;
+  using value_type     = typename Analysis::value_type;
+  using barrier_type   = hpx::barrier<>;
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+  pointer_type m_result_ptr;
+
+ public:
+  void setup() const {
+    const int num_worker_threads = m_policy.space().concurrency();
+    const std::size_t value_size = Analysis::value_size(m_functor);
+
+    hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer();
+    buffer.resize(num_worker_threads, 2 * value_size, sizeof(barrier_type));
+
+    new (buffer.get_extra_space()) barrier_type(num_worker_threads);
+  }
+
+  void execute_chunk(const Member i_begin, const Member i_end,
+                     reference_type update, const bool final) const {
+    for (Member i = i_begin; i < i_end; ++i) {
+      if constexpr (std::is_same_v<WorkTag, void>) {
+        m_functor(i, update, final);
+      } else {
+        m_functor(WorkTag{}, i, update, final);
+      }
+    }
+  }
+
+  void execute_range(int t) const {
+    const int num_worker_threads = m_policy.space().concurrency();
+    const int value_count        = Analysis::value_count(m_functor);
+    const std::size_t value_size = Analysis::value_size(m_functor);
+
+    hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer();
+    typename Analysis::Reducer final_reducer(m_functor);
+    barrier_type &barrier =
+        *static_cast<barrier_type *>(buffer.get_extra_space());
+    reference_type update_sum =
+        final_reducer.init(reinterpret_cast<pointer_type>(buffer.get(t)));
+
+    const WorkRange range(m_policy, t, num_worker_threads);
+    execute_chunk(range.begin(), range.end(), update_sum, false);
+
+    {
+      // Since arrive_and_wait may yield and resume on another worker thread we
+      // set in_parallel = false on the current thread before suspending and set
+      // it again to true when we resume.
+      Kokkos::Experimental::HPX::impl_not_in_parallel_scope p;
+      barrier.arrive_and_wait();
+    }
+
+    if (t == 0) {
+      final_reducer.init(reinterpret_cast<pointer_type>(
+          static_cast<char *>(buffer.get(0)) + value_size));
+
+      for (int i = 1; i < num_worker_threads; ++i) {
+        pointer_type ptr_1_prev =
+            reinterpret_cast<pointer_type>(buffer.get(i - 1));
+        pointer_type ptr_2_prev = reinterpret_cast<pointer_type>(
+            static_cast<char *>(buffer.get(i - 1)) + value_size);
+        pointer_type ptr_2 = reinterpret_cast<pointer_type>(
+            static_cast<char *>(buffer.get(i)) + value_size);
+
+        for (int j = 0; j < value_count; ++j) {
+          ptr_2[j] = ptr_2_prev[j];
+        }
+
+        final_reducer.join(ptr_2, ptr_1_prev);
+      }
+    }
+
+    {
+      // Since arrive_and_wait may yield and resume on another worker thread we
+      // set in_parallel = false on the current thread before suspending and set
+      // it again to true when we resume.
+      Kokkos::Experimental::HPX::impl_not_in_parallel_scope p;
+      barrier.arrive_and_wait();
+    }
+
+    reference_type update_base =
+        Analysis::Reducer::reference(reinterpret_cast<pointer_type>(
+            static_cast<char *>(buffer.get(t)) + value_size));
+
+    execute_chunk(range.begin(), range.end(), update_base, true);
+
+    if (t == num_worker_threads - 1) {
+      *m_result_ptr = update_base;
+    }
+  }
+
+  void finalize() const {
+    hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer();
+    static_cast<barrier_type *>(buffer.get_extra_space())->~barrier_type();
+  }
+
+  void execute() const {
+    const int num_worker_threads = m_policy.space().concurrency();
+    m_policy.space().impl_bulk_setup_finalize(
+        false, is_light_weight_policy<Policy>(), *this, num_worker_threads,
+        hpx::threads::thread_stacksize::small_);
+  }
+
+  template <class ViewType>
+  ParallelScanWithTotal(const FunctorType &arg_functor,
+                        const Policy &arg_policy,
+                        const ViewType &arg_result_view)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_result_ptr(arg_result_view.data()) {
+    static_assert(
+        Kokkos::Impl::MemorySpaceAccess<typename ViewType::memory_space,
+                                        Kokkos::HostSpace>::accessible,
+        "Kokkos::HPX parallel_scan result must be host-accessible!");
+  }
+};
+}  // namespace Impl
+}  // namespace Kokkos
+
+namespace Kokkos {
+namespace Impl {
+template <class FunctorType, class... Properties>
+class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
+                  Kokkos::Experimental::HPX> {
+ private:
+  using Policy  = TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...>;
+  using WorkTag = typename Policy::work_tag;
+  using Member  = typename Policy::member_type;
+  using memory_space = Kokkos::HostSpace;
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+  const int m_league;
+  const std::size_t m_shared;
+
+ public:
+  void setup() const {
+    const int num_worker_threads = m_policy.space().concurrency();
+
+    hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer();
+    buffer.resize(num_worker_threads, m_shared);
+  }
+
+  void execute_range(const int i) const {
+    const int t = Kokkos::Experimental::HPX::impl_hardware_thread_id();
+    hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer();
+    const auto r =
+        get_chunk_range(i, 0, m_policy.chunk_size(), m_policy.league_size());
+    for (int league_rank = r.begin; league_rank < r.end; ++league_rank) {
+      if constexpr (std::is_same_v<WorkTag, void>) {
+        m_functor(Member(m_policy, 0, league_rank, buffer.get(t), m_shared));
+      } else {
+        m_functor(WorkTag{},
+                  Member(m_policy, 0, league_rank, buffer.get(t), m_shared));
+      }
+    }
+  }
+
+  void finalize() const {}
+
+  void execute() const {
+    const int num_chunks =
+        get_num_chunks(0, m_policy.chunk_size(), m_policy.league_size());
+    m_policy.space().impl_bulk_setup_finalize(
+        false, is_light_weight_policy<Policy>(), *this, num_chunks,
+        hpx::threads::thread_stacksize::nostack);
+  }
+
+  ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_league(arg_policy.league_size()),
+        m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
+                 FunctorTeamShmemSize<FunctorType>::value(
+                     arg_functor, arg_policy.team_size())) {}
+};
+
+template <class CombinedFunctorReducerType, class... Properties>
+class ParallelReduce<CombinedFunctorReducerType,
+                     Kokkos::TeamPolicy<Properties...>,
+                     Kokkos::Experimental::HPX> {
+ private:
+  using Policy = TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...>;
+  using FunctorType = typename CombinedFunctorReducerType::functor_type;
+  using ReducerType = typename CombinedFunctorReducerType::reducer_type;
+
+  using Member  = typename Policy::member_type;
+  using WorkTag = typename Policy::work_tag;
+
+  using pointer_type   = typename ReducerType::pointer_type;
+  using reference_type = typename ReducerType::reference_type;
+  using value_type     = typename ReducerType::value_type;
+
+  const CombinedFunctorReducerType m_functor_reducer;
+  const int m_league;
+  const Policy m_policy;
+  pointer_type m_result_ptr;
+  const std::size_t m_shared;
+  const bool m_force_synchronous;
+
+ public:
+  void setup() const {
+    const ReducerType &reducer   = m_functor_reducer.get_reducer();
+    const std::size_t value_size = reducer.value_size();
+    const int num_worker_threads = m_policy.space().concurrency();
+
+    hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer();
+    buffer.resize(num_worker_threads, value_size + m_shared);
+
+    for (int t = 0; t < num_worker_threads; ++t) {
+      reducer.init(reinterpret_cast<pointer_type>(buffer.get(t)));
+    }
+  }
+
+  void execute_range(const int i) const {
+    const ReducerType &reducer   = m_functor_reducer.get_reducer();
+    const std::size_t value_size = reducer.value_size();
+    std::size_t t = Kokkos::Experimental::HPX::impl_hardware_thread_id();
+    hpx_thread_buffer &buffer = m_policy.space().impl_get_buffer();
+    reference_type update =
+        ReducerType::reference(reinterpret_cast<pointer_type>(buffer.get(t)));
+    const auto r =
+        get_chunk_range(i, 0, m_policy.chunk_size(), m_policy.league_size());
+    char *local_buffer = static_cast<char *>(buffer.get(t)) + value_size;
+    for (int league_rank = r.begin; league_rank < r.end; ++league_rank) {
+      if constexpr (std::is_same_v<WorkTag, void>) {
+        m_functor_reducer.get_functor()(
+            Member(m_policy, 0, league_rank, local_buffer, m_shared), update);
+      } else {
+        m_functor_reducer.get_functor()(
+            WorkTag{}, Member(m_policy, 0, league_rank, local_buffer, m_shared),
+            update);
+      }
+    }
+  }
+
+  void finalize() const {
+    hpx_thread_buffer &buffer    = m_policy.space().impl_get_buffer();
+    const ReducerType &reducer   = m_functor_reducer.get_reducer();
+    const int num_worker_threads = m_policy.space().concurrency();
+    const pointer_type ptr = reinterpret_cast<pointer_type>(buffer.get(0));
+    for (int t = 1; t < num_worker_threads; ++t) {
+      reducer.join(ptr, reinterpret_cast<pointer_type>(buffer.get(t)));
+    }
+
+    reducer.final(ptr);
+
+    if (m_result_ptr) {
+      const int n = reducer.value_count();
+
+      for (int j = 0; j < n; ++j) {
+        m_result_ptr[j] = ptr[j];
+      }
+    }
+  }
+
+  void execute() const {
+    if (m_policy.league_size() * m_policy.team_size() == 0) {
+      if (m_result_ptr) {
+        const ReducerType &reducer = m_functor_reducer.get_reducer();
+        reducer.init(m_result_ptr);
+        reducer.final(m_result_ptr);
+      }
+      return;
+    }
+
+    const int num_chunks =
+        get_num_chunks(0, m_policy.chunk_size(), m_policy.league_size());
+    m_policy.space().impl_bulk_setup_finalize(
+        m_force_synchronous, is_light_weight_policy<Policy>(), *this,
+        num_chunks, hpx::threads::thread_stacksize::nostack);
+  }
+
+  template <class ViewType>
+  ParallelReduce(const CombinedFunctorReducerType &arg_functor_reducer,
+                 const Policy &arg_policy, const ViewType &arg_result)
+      : m_functor_reducer(arg_functor_reducer),
+        m_league(arg_policy.league_size()),
+        m_policy(arg_policy),
+        m_result_ptr(arg_result.data()),
+        m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
+                 FunctorTeamShmemSize<FunctorType>::value(
+                     m_functor_reducer.get_functor(), arg_policy.team_size())),
+        m_force_synchronous(!arg_result.impl_track().has_record()) {
+    static_assert(
+        Kokkos::Impl::MemorySpaceAccess<typename ViewType::memory_space,
+                                        Kokkos::HostSpace>::accessible,
+        "HPX reduce result must be a View accessible from HostSpace");
+  }
+};
+}  // namespace Impl
+}  // namespace Kokkos
+
+namespace Kokkos {
+
+template <typename iType>
+KOKKOS_INLINE_FUNCTION
+    Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
+    TeamThreadRange(const Impl::HPXTeamMember &thread, const iType &count) {
+  return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
+      thread, count);
+}
+
+template <typename iType1, typename iType2>
+KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
+    std::common_type_t<iType1, iType2>, Impl::HPXTeamMember>
+TeamThreadRange(const Impl::HPXTeamMember &thread, const iType1 &i_begin,
+                const iType2 &i_end) {
+  using iType = std::common_type_t<iType1, iType2>;
+  return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
+      thread, iType(i_begin), iType(i_end));
+}
+
+template <typename iType>
+KOKKOS_INLINE_FUNCTION
+    Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
+    TeamVectorRange(const Impl::HPXTeamMember &thread, const iType &count) {
+  return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
+      thread, count);
+}
+
+template <typename iType1, typename iType2>
+KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
+    std::common_type_t<iType1, iType2>, Impl::HPXTeamMember>
+TeamVectorRange(const Impl::HPXTeamMember &thread, const iType1 &i_begin,
+                const iType2 &i_end) {
+  using iType = std::common_type_t<iType1, iType2>;
+  return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
+      thread, iType(i_begin), iType(i_end));
+}
+
+template <typename iType>
+KOKKOS_INLINE_FUNCTION
+    Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
+    ThreadVectorRange(const Impl::HPXTeamMember &thread, const iType &count) {
+  return Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
+      thread, count);
+}
+
+template <typename iType1, typename iType2>
+KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
+    std::common_type_t<iType1, iType2>, Impl::HPXTeamMember>
+ThreadVectorRange(const Impl::HPXTeamMember &thread, const iType1 &i_begin,
+                  const iType2 &i_end) {
+  using iType = std::common_type_t<iType1, iType2>;
+  return Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
+      thread, iType(i_begin), iType(i_end));
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadSingleStruct<Impl::HPXTeamMember> PerTeam(
+    const Impl::HPXTeamMember &thread) {
+  return Impl::ThreadSingleStruct<Impl::HPXTeamMember>(thread);
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::VectorSingleStruct<Impl::HPXTeamMember> PerThread(
+    const Impl::HPXTeamMember &thread) {
+  return Impl::VectorSingleStruct<Impl::HPXTeamMember>(thread);
+}
+
+/** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each
+ * i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the the calling thread team.
+ */
+template <typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION void parallel_for(
+    const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
+        &loop_boundaries,
+    const Lambda &lambda) {
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment)
+    lambda(i);
+}
+
+/** \brief  Inter-thread vector parallel_reduce. Executes lambda(iType i,
+ * ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the the calling thread team
+ * and a summation of val is performed and put into result.
+ */
+template <typename iType, class Lambda, typename ValueType,
+          typename = std::enable_if_t<!Kokkos::is_reducer<ValueType>::value>>
+KOKKOS_INLINE_FUNCTION void parallel_reduce(
+    const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
+        &loop_boundaries,
+    const Lambda &lambda, ValueType &result) {
+  result = ValueType();
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    lambda(i, result);
+  }
+}
+
+/** \brief  Intra-thread vector parallel_for. Executes lambda(iType i) for each
+ * i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the the calling thread.
+ */
+template <typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION void parallel_for(
+    const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
+        &loop_boundaries,
+    const Lambda &lambda) {
+#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    lambda(i);
+  }
+}
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i,
+ * ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the the calling thread
+ * and a summation of val is performed and put into result.
+ */
+template <typename iType, class Lambda, typename ValueType,
+          typename = std::enable_if_t<!Kokkos::is_reducer<ValueType>::value>>
+KOKKOS_INLINE_FUNCTION void parallel_reduce(
+    const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
+        &loop_boundaries,
+    const Lambda &lambda, ValueType &result) {
+  result = ValueType();
+#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    lambda(i, result);
+  }
+}
+
+template <typename iType, class Lambda, typename ReducerType,
+          typename = std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>>
+KOKKOS_INLINE_FUNCTION void parallel_reduce(
+    const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
+        &loop_boundaries,
+    const Lambda &lambda, const ReducerType &reducer) {
+  reducer.init(reducer.reference());
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    lambda(i, reducer.reference());
+  }
+}
+
+template <typename iType, class Lambda, typename ReducerType,
+          typename = std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>>
+KOKKOS_INLINE_FUNCTION void parallel_reduce(
+    const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
+        &loop_boundaries,
+    const Lambda &lambda, const ReducerType &reducer) {
+  reducer.init(reducer.reference());
+#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    lambda(i, reducer.reference());
+  }
+}
+
+template <typename iType, class FunctorType, typename ValueType>
+KOKKOS_INLINE_FUNCTION void parallel_scan(
+    Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember> const
+        &loop_boundaries,
+    const FunctorType &lambda, ValueType &return_val) {
+  using functor_value_type = typename Kokkos::Impl::FunctorAnalysis<
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void, FunctorType,
+      void>::value_type;
+  static_assert(std::is_same_v<functor_value_type, ValueType>,
+                "Non-matching value types of functor and return type");
+
+  ValueType scan_val{};
+
+  // Intra-member scan
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    lambda(i, scan_val, false);
+  }
+
+  // 'scan_val' output is the exclusive prefix sum
+  scan_val = loop_boundaries.thread.team_scan(scan_val);
+
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    lambda(i, scan_val, true);
+  }
+
+  return_val = scan_val;
+}
+
+template <typename iType, typename FunctorType>
+KOKKOS_INLINE_FUNCTION void parallel_scan(
+    const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
+        &loop_bounds,
+    const FunctorType &lambda) {
+  // Extract value_type from lambda
+  using value_type = typename Kokkos::Impl::FunctorAnalysis<
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void, FunctorType,
+      void>::value_type;
+
+  value_type scan_val;
+  parallel_scan(loop_bounds, lambda, scan_val);
+}
+
+/** \brief  Intra-thread vector parallel exclusive prefix sum. Executes
+ * lambda(iType i, ValueType & val, bool final) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan
+ * operation is performed. Depending on the target execution space the operator
+ * might be called twice: once with final=false and once with final=true. When
+ * final==true val contains the prefix sum value. The contribution of this "i"
+ * needs to be added to val no matter whether final==true or not. In a serial
+ * execution (i.e. team_size==1) the operator is only called once with
+ * final==true. Scan_val will be set to the final sum value over all vector
+ */
+template <typename iType, class FunctorType>
+KOKKOS_INLINE_FUNCTION void parallel_scan(
+    const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
+        &loop_boundaries,
+    const FunctorType &lambda) {
+  using value_type =
+      typename Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
+                                     TeamPolicy<Experimental::HPX>, FunctorType,
+                                     void>::value_type;
+
+  value_type scan_val = value_type();
+
+#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    lambda(i, scan_val, true);
+  }
+}
+
+/** \brief  Intra-thread vector parallel scan with reducer
+ *
+ */
+template <typename iType, class FunctorType, typename ReducerType>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
+parallel_scan(
+    const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
+        &loop_boundaries,
+    const FunctorType &lambda, const ReducerType &reducer) {
+  typename ReducerType::value_type scan_val;
+  reducer.init(scan_val);
+
+#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    lambda(i, scan_val, true);
+  }
+  reducer.reference() = scan_val;
+}
+
+template <typename iType, class FunctorType, typename ValueType>
+KOKKOS_INLINE_FUNCTION void parallel_scan(
+    const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
+        &loop_boundaries,
+    const FunctorType &lambda, ValueType &return_val) {
+  // Extract ValueType from FunctorType
+  using closure_value_type = typename Kokkos::Impl::FunctorAnalysis<
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void, FunctorType,
+      void>::value_type;
+  static_assert(std::is_same<closure_value_type, ValueType>::value,
+                "Non-matching value types of closure and return type");
+
+  ValueType accum;
+  parallel_scan(loop_boundaries, lambda, Kokkos::Sum<ValueType>(accum));
+
+  return_val = accum;
+}
+
+template <class FunctorType>
+KOKKOS_INLINE_FUNCTION void single(
+    const Impl::VectorSingleStruct<Impl::HPXTeamMember> &,
+    const FunctorType &lambda) {
+  lambda();
+}
+
+template <class FunctorType>
+KOKKOS_INLINE_FUNCTION void single(
+    const Impl::ThreadSingleStruct<Impl::HPXTeamMember> &,
+    const FunctorType &lambda) {
+  lambda();
+}
+
+template <class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION void single(
+    const Impl::VectorSingleStruct<Impl::HPXTeamMember> &,
+    const FunctorType &lambda, ValueType &val) {
+  lambda(val);
+}
+
+template <class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION void single(
+    const Impl::ThreadSingleStruct<Impl::HPXTeamMember> &,
+    const FunctorType &lambda, ValueType &val) {
+  lambda(val);
+}
+
+}  // namespace Kokkos
+
+#include <HPX/Kokkos_HPX_Task.hpp>
+
+#endif /* #if defined( KOKKOS_ENABLE_HPX ) */
+#endif /* #ifndef KOKKOS_HPX_HPP */
diff --git a/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp b/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp
index e75b7be49dc28f64f3fa818b6c51a2a41e3a5154..28c75b2515ae45ea4239439ba713468ab5ac2d2b 100644
--- a/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp
+++ b/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp
@@ -20,12 +20,13 @@
 #include <Kokkos_Macros.hpp>
 #if defined(KOKKOS_ENABLE_HPX) && defined(KOKKOS_ENABLE_TASKDAG)
 
+#include <Kokkos_Atomic.hpp>
 #include <Kokkos_TaskScheduler_fwd.hpp>
 
-#include <Kokkos_HPX.hpp>
+#include <HPX/Kokkos_HPX.hpp>
 
-#include <hpx/local/execution.hpp>
-#include <hpx/local/future.hpp>
+#include <hpx/execution.hpp>
+#include <hpx/future.hpp>
 
 #include <type_traits>
 
@@ -39,6 +40,51 @@ template <class QueueType>
 class TaskQueueSpecialization<
     SimpleTaskScheduler<Kokkos::Experimental::HPX, QueueType>> {
  public:
+  void setup() const {
+    const int num_worker_threads = Kokkos::Experimental::HPX().concurrency();
+
+    hpx_thread_buffer &buffer = Kokkos::Experimental::HPX().impl_get_buffer();
+    buffer.resize(num_worker_threads, 512);
+  }
+
+  void execute_range(int t) const {
+    // NOTE: This implementation has been simplified based on the
+    // assumption that team_size = 1. The HPX backend currently only
+    // supports a team size of 1.
+    const int num_worker_threads = Kokkos::Experimental::HPX().concurrency();
+
+    hpx_thread_buffer &buffer = Kokkos::Experimental::HPX().impl_get_buffer();
+
+    buffer.get(t);
+    HPXTeamMember member(
+        TeamPolicyInternal<Kokkos::Experimental::HPX>(
+            Kokkos::Experimental::HPX(), num_worker_threads, 1),
+        0, t, buffer.get(t), 512);
+
+    member_type single_exec(*scheduler, member);
+    member_type &team_exec = single_exec;
+
+    auto &queue          = scheduler->queue();
+    auto &team_scheduler = team_exec.scheduler();
+
+    using task_base_type = typename scheduler_type::task_base_type;
+    auto current_task    = OptionalRef<task_base_type>(nullptr);
+
+    while (!queue.is_done()) {
+      current_task = queue.pop_ready_task(team_scheduler.team_scheduler_info());
+
+      if (current_task) {
+        KOKKOS_EXPECTS(current_task->is_single_runnable() ||
+                       current_task->is_team_runnable());
+        current_task->as_runnable_task().run(single_exec);
+        queue.complete((*std::move(current_task)).as_runnable_task(),
+                       team_scheduler.team_scheduler_info());
+      }
+    }
+  }
+
+  void finalize() const {}
+
   using execution_space = Kokkos::Experimental::HPX;
   using scheduler_type =
       SimpleTaskScheduler<Kokkos::Experimental::HPX, QueueType>;
@@ -47,69 +93,14 @@ class TaskQueueSpecialization<
   using memory_space = Kokkos::HostSpace;
 
   static void execute(scheduler_type const &scheduler) {
-    // NOTE: We create an instance so that we can use dispatch_execute_task.
+    // NOTE: We create an instance so that we can use impl_bulk_setup_finalize.
     // This is not necessarily the most efficient, but can be improved later.
     TaskQueueSpecialization<scheduler_type> task_queue;
-    task_queue.scheduler = &scheduler;
-    Kokkos::Impl::dispatch_execute_task(&task_queue,
-                                        Kokkos::Experimental::HPX());
-    Kokkos::Experimental::HPX().fence(
-        "Kokkos::Impl::TaskQueueSpecialization<SimpleTask>::execute: fence "
-        "after task execution");
-  }
-
-  // Must provide task queue execution function
-  void execute_task() const {
-    // See [note 1] in Kokkos_HPX.hpp for an explanation. The work graph policy
-    // does not store an execution space instance, so we only need to reset the
-    // parallel region count here.
-    Kokkos::Experimental::HPX::reset_count_on_exit_parallel reset_count_on_exit;
-
-    using hpx::for_loop;
-    using hpx::execution::par;
-    using hpx::execution::static_chunk_size;
-    using task_base_type = typename scheduler_type::task_base_type;
-
-    const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
-
-    thread_buffer &buffer = Kokkos::Experimental::HPX().impl_get_buffer();
-    buffer.resize(num_worker_threads, 512);
-
-    auto &queue = scheduler->queue();
-
-    for_loop(par.with(static_chunk_size(1)), 0, num_worker_threads,
-             [this, &queue, &buffer, num_worker_threads](int) {
-               // NOTE: This implementation has been simplified based on the
-               // assumption that team_size = 1. The HPX backend currently only
-               // supports a team size of 1.
-               std::size_t t =
-                   Kokkos::Experimental::HPX::impl_hardware_thread_id();
-
-               buffer.get(t);
-               HPXTeamMember member(
-                   TeamPolicyInternal<Kokkos::Experimental::HPX>(
-                       Kokkos::Experimental::HPX(), num_worker_threads, 1),
-                   0, t, buffer.get(t), 512);
-
-               member_type single_exec(*scheduler, member);
-               member_type &team_exec = single_exec;
-
-               auto &team_scheduler = team_exec.scheduler();
-               auto current_task    = OptionalRef<task_base_type>(nullptr);
-
-               while (!queue.is_done()) {
-                 current_task =
-                     queue.pop_ready_task(team_scheduler.team_scheduler_info());
-
-                 if (current_task) {
-                   KOKKOS_ASSERT(current_task->is_single_runnable() ||
-                                 current_task->is_team_runnable());
-                   current_task->as_runnable_task().run(single_exec);
-                   queue.complete((*std::move(current_task)).as_runnable_task(),
-                                  team_scheduler.team_scheduler_info());
-                 }
-               }
-             });
+    task_queue.scheduler         = &scheduler;
+    const int num_worker_threads = Kokkos::Experimental::HPX().concurrency();
+    Kokkos::Experimental::HPX().impl_bulk_setup_finalize(
+        true, false, task_queue, num_worker_threads,
+        hpx::threads::thread_stacksize::nostack);
   }
 
   static uint32_t get_max_team_count(execution_space const &espace) {
@@ -133,6 +124,68 @@ class TaskQueueSpecializationConstrained<
     std::enable_if_t<std::is_same<typename Scheduler::execution_space,
                                   Kokkos::Experimental::HPX>::value>> {
  public:
+  void setup() const {
+    const int num_worker_threads = Kokkos::Experimental::HPX().concurrency();
+
+    hpx_thread_buffer &buffer = Kokkos::Experimental::HPX().impl_get_buffer();
+    buffer.resize(num_worker_threads, 512);
+
+    auto &queue = scheduler->queue();
+    queue.initialize_team_queues(num_worker_threads);
+  }
+
+  void execute_range(int t) const {
+    // NOTE: This implementation has been simplified based on the
+    // assumption that team_size = 1. The HPX backend currently only
+    // supports a team size of 1.
+    const int num_worker_threads = Kokkos::Experimental::HPX().concurrency();
+
+    hpx_thread_buffer &buffer = Kokkos::Experimental::HPX().impl_get_buffer();
+
+    buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id());
+    HPXTeamMember member(
+        TeamPolicyInternal<Kokkos::Experimental::HPX>(
+            Kokkos::Experimental::HPX(), num_worker_threads, 1),
+        0, t, buffer.get(t), 512);
+
+    using task_base_type = typename scheduler_type::task_base;
+    using queue_type     = typename scheduler_type::queue_type;
+
+    static task_base_type *const end = (task_base_type *)task_base_type::EndTag;
+    constexpr task_base_type *no_more_tasks_sentinel = nullptr;
+
+    member_type single_exec(*scheduler, member);
+    member_type &team_exec = single_exec;
+
+    auto &team_queue     = team_exec.scheduler().queue();
+    task_base_type *task = no_more_tasks_sentinel;
+
+    do {
+      if (task != no_more_tasks_sentinel && task != end) {
+        team_queue.complete(task);
+      }
+
+      if (desul::atomic_load(&team_queue.m_ready_count,
+                             desul::MemoryOrderAcquire(),
+                             desul::MemoryScopeDevice()) > 0) {
+        task = end;
+        for (int i = 0; i < queue_type::NumQueue && end == task; ++i) {
+          for (int j = 0; j < 2 && end == task; ++j) {
+            task = queue_type::pop_ready_task(&team_queue.m_ready[i][j]);
+          }
+        }
+      } else {
+        task = team_queue.attempt_to_steal_task();
+      }
+
+      if (task != no_more_tasks_sentinel && task != end) {
+        (*task->m_apply)(task, &single_exec);
+      }
+    } while (task != no_more_tasks_sentinel);
+  }
+
+  void finalize() const {}
+
   using execution_space = Kokkos::Experimental::HPX;
   using scheduler_type  = Scheduler;
   using member_type =
@@ -144,7 +197,7 @@ class TaskQueueSpecializationConstrained<
     using task_base_type = typename scheduler_type::task_base;
     using queue_type     = typename scheduler_type::queue_type;
 
-    if (1 == Kokkos::Experimental::HPX::concurrency()) {
+    if (1 == Kokkos::Experimental::HPX().concurrency()) {
       task_base_type *const end = (task_base_type *)task_base_type::EndTag;
       task_base_type *task      = end;
 
@@ -175,82 +228,14 @@ class TaskQueueSpecializationConstrained<
   }
 
   static void execute(scheduler_type const &scheduler) {
-    // NOTE: We create an instance so that we can use dispatch_execute_task.
+    // NOTE: We create an instance so that we can use impl_bulk_setup_finalize.
     // This is not necessarily the most efficient, but can be improved later.
     TaskQueueSpecializationConstrained<scheduler_type> task_queue;
-    task_queue.scheduler = &scheduler;
-    Kokkos::Impl::dispatch_execute_task(&task_queue,
-                                        Kokkos::Experimental::HPX());
-    Kokkos::Experimental::HPX().fence(
-        "Kokkos::Impl::TaskQueueSpecialization<SimpleTask>::execute: fence "
-        "after task execution");
-  }
-
-  // Must provide task queue execution function
-  void execute_task() const {
-    // See [note 1] in Kokkos_HPX.hpp for an explanation. The work graph policy
-    // does not store an execution space instance, so we only need to reset the
-    // parallel region count here.
-    Kokkos::Experimental::HPX::reset_count_on_exit_parallel reset_count_on_exit;
-
-    using hpx::for_loop;
-    using hpx::execution::par;
-    using hpx::execution::static_chunk_size;
-
-    using task_base_type = typename scheduler_type::task_base;
-    using queue_type     = typename scheduler_type::queue_type;
-
-    const int num_worker_threads     = Kokkos::Experimental::HPX::concurrency();
-    static task_base_type *const end = (task_base_type *)task_base_type::EndTag;
-    constexpr task_base_type *no_more_tasks_sentinel = nullptr;
-
-    thread_buffer &buffer = Kokkos::Experimental::HPX().impl_get_buffer();
-    buffer.resize(num_worker_threads, 512);
-
-    auto &queue = scheduler->queue();
-    queue.initialize_team_queues(num_worker_threads);
-
-    auto exec = Kokkos::Experimental::HPX::impl_get_executor();
-
-    for_loop(
-        par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads,
-        [this, &buffer, num_worker_threads](int t) {
-          // NOTE: This implementation has been simplified based on the
-          // assumption that team_size = 1. The HPX backend currently only
-          // supports a team size of 1.
-          buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id());
-          HPXTeamMember member(
-              TeamPolicyInternal<Kokkos::Experimental::HPX>(
-                  Kokkos::Experimental::HPX(), num_worker_threads, 1),
-              0, t, buffer.get(t), 512);
-
-          member_type single_exec(*scheduler, member);
-          member_type &team_exec = single_exec;
-
-          auto &team_queue     = team_exec.scheduler().queue();
-          task_base_type *task = no_more_tasks_sentinel;
-
-          do {
-            if (task != no_more_tasks_sentinel && task != end) {
-              team_queue.complete(task);
-            }
-
-            if (*((volatile int *)&team_queue.m_ready_count) > 0) {
-              task = end;
-              for (int i = 0; i < queue_type::NumQueue && end == task; ++i) {
-                for (int j = 0; j < 2 && end == task; ++j) {
-                  task = queue_type::pop_ready_task(&team_queue.m_ready[i][j]);
-                }
-              }
-            } else {
-              task = team_queue.attempt_to_steal_task();
-            }
-
-            if (task != no_more_tasks_sentinel && task != end) {
-              (*task->m_apply)(task, &single_exec);
-            }
-          } while (task != no_more_tasks_sentinel);
-        });
+    task_queue.scheduler         = &scheduler;
+    const int num_worker_threads = Kokkos::Experimental::HPX().concurrency();
+    Kokkos::Experimental::HPX().impl_bulk_setup_finalize(
+        true, false, task_queue, num_worker_threads,
+        hpx::threads::thread_stacksize::nostack);
   }
 
   template <typename TaskType>
diff --git a/packages/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp b/packages/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp
index 72a80199355b09d742168882939d344831bd6fb8..92eafa4250dce7efba91eb4acaf659150ff41712 100644
--- a/packages/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp
+++ b/packages/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp
@@ -17,10 +17,9 @@
 #ifndef KOKKOS_HPX_WORKGRAPHPOLICY_HPP
 #define KOKKOS_HPX_WORKGRAPHPOLICY_HPP
 
-#include <Kokkos_HPX.hpp>
+#include <HPX/Kokkos_HPX.hpp>
 
-#include <hpx/local/algorithm.hpp>
-#include <hpx/local/execution.hpp>
+#include <hpx/execution.hpp>
 
 namespace Kokkos {
 namespace Impl {
@@ -35,53 +34,28 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
   Policy m_policy;
   FunctorType m_functor;
 
-  template <class TagType>
-  std::enable_if_t<std::is_void<TagType>::value> execute_functor(
-      const std::int32_t w) const noexcept {
-    m_functor(w);
-  }
-
-  template <class TagType>
-  std::enable_if_t<!std::is_void<TagType>::value> execute_functor(
-      const std::int32_t w) const noexcept {
-    const TagType t{};
-    m_functor(t, w);
-  }
-
  public:
-  void execute() const {
-    dispatch_execute_task(this, m_policy.space());
-    m_policy.space().fence(
-        "Kokkos::Experimental::Impl::HPX::ParallelFor<WorkGraphPolicy>: fence "
-        "after kernel execution");
+  void execute_range(int) const {
+    std::int32_t w = m_policy.pop_work();
+    while (w != Policy::COMPLETED_TOKEN) {
+      if (w != Policy::END_TOKEN) {
+        if constexpr (std::is_same_v<WorkTag, void>) {
+          m_functor(w);
+        } else {
+          m_functor(WorkTag{}, w);
+        }
+        m_policy.completed_work(w);
+      }
+
+      w = m_policy.pop_work();
+    }
   }
 
-  void execute_task() const {
-    // See [note 1] in Kokkos_HPX.hpp for an explanation. The work graph policy
-    // does not store an execution space instance, so we only need to reset the
-    // parallel region count here.
-    Kokkos::Experimental::HPX::reset_count_on_exit_parallel reset_count_on_exit;
-
-    const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
-
-    using hpx::for_loop;
-    using hpx::execution::par;
-    using hpx::execution::static_chunk_size;
-
-    auto exec = Kokkos::Experimental::HPX::impl_get_executor();
-
-    for_loop(par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads,
-             [this](int) {
-               std::int32_t w = m_policy.pop_work();
-               while (w != Policy::COMPLETED_TOKEN) {
-                 if (w != Policy::END_TOKEN) {
-                   execute_functor<WorkTag>(w);
-                   m_policy.completed_work(w);
-                 }
-
-                 w = m_policy.pop_work();
-               }
-             });
+  void execute() const {
+    const int num_worker_threads = Kokkos::Experimental::HPX().concurrency();
+    Kokkos::Experimental::HPX().impl_bulk_plain(
+        true, is_light_weight_policy<Policy>(), *this, num_worker_threads,
+        hpx::threads::thread_stacksize::nostack);
   }
 
   inline ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy)
diff --git a/packages/kokkos/core/src/Kokkos_Abort.hpp b/packages/kokkos/core/src/Kokkos_Abort.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a8f38837ea04a02697731e00f819ce21326f1dad
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_Abort.hpp
@@ -0,0 +1,105 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_ABORT_HPP
+#define KOKKOS_ABORT_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_Printf.hpp>
+#ifdef KOKKOS_ENABLE_CUDA
+#include <Cuda/Kokkos_Cuda_abort.hpp>
+#endif
+#ifdef KOKKOS_ENABLE_HIP
+#include <HIP/Kokkos_HIP_Abort.hpp>
+#endif
+#ifdef KOKKOS_ENABLE_SYCL
+#include <SYCL/Kokkos_SYCL_Abort.hpp>
+#endif
+
+namespace Kokkos {
+namespace Impl {
+
+[[noreturn]] void host_abort(const char *const);
+
+#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
+
+#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
+// required to workaround failures in random number generator unit tests with
+// pre-volta architectures
+#define KOKKOS_IMPL_ABORT_NORETURN
+#else
+// cuda_abort aborts when building for other platforms than macOS
+#define KOKKOS_IMPL_ABORT_NORETURN [[noreturn]]
+#endif
+
+#elif defined(KOKKOS_COMPILER_NVHPC)
+
+#define KOKKOS_IMPL_ABORT_NORETURN
+
+#elif defined(KOKKOS_ENABLE_HIP) && defined(__HIP_DEVICE_COMPILE__)
+// HIP aborts
+#define KOKKOS_IMPL_ABORT_NORETURN [[noreturn]]
+#elif defined(KOKKOS_ENABLE_SYCL) && defined(__SYCL_DEVICE_ONLY__)
+// FIXME_SYCL SYCL doesn't abort
+#define KOKKOS_IMPL_ABORT_NORETURN
+#elif !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ENABLE_OPENACC)
+// Host aborts
+#define KOKKOS_IMPL_ABORT_NORETURN [[noreturn]]
+#else
+// Everything else does not abort
+#define KOKKOS_IMPL_ABORT_NORETURN
+#endif
+
+// FIXME_SYCL
+// Accomodate host pass for device functions that are not [[noreturn]]
+#if defined(KOKKOS_ENABLE_SYCL) || \
+    (defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK))
+#define KOKKOS_IMPL_ABORT_NORETURN_DEVICE
+#else
+#define KOKKOS_IMPL_ABORT_NORETURN_DEVICE KOKKOS_IMPL_ABORT_NORETURN
+#endif
+
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) ||          \
+    defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMPTARGET) || \
+    defined(KOKKOS_ENABLE_OPENACC)
+KOKKOS_IMPL_ABORT_NORETURN_DEVICE inline KOKKOS_IMPL_DEVICE_FUNCTION void
+device_abort(const char *const msg) {
+#if defined(KOKKOS_ENABLE_CUDA)
+  ::Kokkos::Impl::cuda_abort(msg);
+#elif defined(KOKKOS_ENABLE_HIP)
+  ::Kokkos::Impl::hip_abort(msg);
+#elif defined(KOKKOS_ENABLE_SYCL)
+  ::Kokkos::Impl::sycl_abort(msg);
+#elif defined(KOKKOS_ENABLE_OPENMPTARGET) || defined(KOKKOS_ENABLE_OPENACC)
+  printf("%s", msg);  // FIXME_OPENMPTARGET FIXME_OPENACC
+#else
+#error faulty logic
+#endif
+}
+#endif
+}  // namespace Impl
+
+KOKKOS_IMPL_ABORT_NORETURN KOKKOS_INLINE_FUNCTION void abort(
+    const char *const message) {
+  KOKKOS_IF_ON_HOST(::Kokkos::Impl::host_abort(message);)
+  KOKKOS_IF_ON_DEVICE(::Kokkos::Impl::device_abort(message);)
+}
+
+#undef KOKKOS_IMPL_ABORT_NORETURN
+
+}  // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_ABORT_HPP */
diff --git a/packages/kokkos/core/src/Kokkos_Array.hpp b/packages/kokkos/core/src/Kokkos_Array.hpp
index 1e3ab4741cd45601c5730cad45519ddb84fb6105..82ceaaec21833dc0a320aa4e7e1c3d64261008a2 100644
--- a/packages/kokkos/core/src/Kokkos_Array.hpp
+++ b/packages/kokkos/core/src/Kokkos_Array.hpp
@@ -163,8 +163,8 @@ struct Array<T, 0, Proxy> {
     return *reinterpret_cast<const_pointer>(-1);
   }
 
-  KOKKOS_INLINE_FUNCTION pointer data() { return pointer(0); }
-  KOKKOS_INLINE_FUNCTION const_pointer data() const { return const_pointer(0); }
+  KOKKOS_INLINE_FUNCTION pointer data() { return nullptr; }
+  KOKKOS_INLINE_FUNCTION const_pointer data() const { return nullptr; }
 
   KOKKOS_DEFAULTED_FUNCTION ~Array()            = default;
   KOKKOS_DEFAULTED_FUNCTION Array()             = default;
@@ -199,7 +199,7 @@ struct Array<T, KOKKOS_INVALID_INDEX, Array<>::contiguous> {
   using const_pointer   = std::add_const_t<T>*;
 
   KOKKOS_INLINE_FUNCTION constexpr size_type size() const { return m_size; }
-  KOKKOS_INLINE_FUNCTION constexpr bool empty() const { return 0 != m_size; }
+  KOKKOS_INLINE_FUNCTION constexpr bool empty() const { return 0 == m_size; }
   KOKKOS_INLINE_FUNCTION constexpr size_type max_size() const { return m_size; }
 
   template <typename iType>
@@ -234,14 +234,14 @@ struct Array<T, KOKKOS_INVALID_INDEX, Array<>::contiguous> {
 
   KOKKOS_INLINE_FUNCTION
   Array& operator=(const Array& rhs) {
-    const size_t n = std::min(m_size, rhs.size());
+    const size_t n = size() < rhs.size() ? size() : rhs.size();
     for (size_t i = 0; i < n; ++i) m_elem[i] = rhs[i];
     return *this;
   }
 
   template <size_t N, class P>
   KOKKOS_INLINE_FUNCTION Array& operator=(const Array<T, N, P>& rhs) {
-    const size_t n = std::min(m_size, rhs.size());
+    const size_t n = size() < rhs.size() ? size() : rhs.size();
     for (size_t i = 0; i < n; ++i) m_elem[i] = rhs[i];
     return *this;
   }
@@ -268,7 +268,7 @@ struct Array<T, KOKKOS_INVALID_INDEX, Array<>::strided> {
   using const_pointer   = std::add_const_t<T>*;
 
   KOKKOS_INLINE_FUNCTION constexpr size_type size() const { return m_size; }
-  KOKKOS_INLINE_FUNCTION constexpr bool empty() const { return 0 != m_size; }
+  KOKKOS_INLINE_FUNCTION constexpr bool empty() const { return 0 == m_size; }
   KOKKOS_INLINE_FUNCTION constexpr size_type max_size() const { return m_size; }
 
   template <typename iType>
@@ -303,15 +303,15 @@ struct Array<T, KOKKOS_INVALID_INDEX, Array<>::strided> {
 
   KOKKOS_INLINE_FUNCTION
   Array& operator=(const Array& rhs) {
-    const size_t n = std::min(m_size, rhs.size());
-    for (size_t i = 0; i < n; ++i) m_elem[i] = rhs[i];
+    const size_t n = size() < rhs.size() ? size() : rhs.size();
+    for (size_t i = 0; i < n; ++i) m_elem[i * m_stride] = rhs[i];
     return *this;
   }
 
   template <size_t N, class P>
   KOKKOS_INLINE_FUNCTION Array& operator=(const Array<T, N, P>& rhs) {
-    const size_t n = std::min(m_size, rhs.size());
-    for (size_t i = 0; i < n; ++i) m_elem[i] = rhs[i];
+    const size_t n = size() < rhs.size() ? size() : rhs.size();
+    for (size_t i = 0; i < n; ++i) m_elem[i * m_stride] = rhs[i];
     return *this;
   }
 
diff --git a/packages/kokkos/core/src/Kokkos_Assert.hpp b/packages/kokkos/core/src/Kokkos_Assert.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c3b9004734a16f4cea233b0497df108e43e213e8
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_Assert.hpp
@@ -0,0 +1,70 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_ASSERT_HPP
+#define KOKKOS_ASSERT_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_Abort.hpp>
+
+#if !defined(NDEBUG) || defined(KOKKOS_ENFORCE_CONTRACTS) || \
+    defined(KOKKOS_ENABLE_DEBUG)
+#define KOKKOS_EXPECTS(...)                                                    \
+  {                                                                            \
+    if (!bool(__VA_ARGS__)) {                                                  \
+      ::Kokkos::abort(                                                         \
+          "Kokkos contract violation:\n  "                                     \
+          "  Expected precondition `" #__VA_ARGS__                             \
+          "` evaluated false.\n"                                               \
+          "Error at " KOKKOS_IMPL_TOSTRING(__FILE__) ":" KOKKOS_IMPL_TOSTRING( \
+              __LINE__) " \n");                                                \
+    }                                                                          \
+  }
+#define KOKKOS_ENSURES(...)                                                    \
+  {                                                                            \
+    if (!bool(__VA_ARGS__)) {                                                  \
+      ::Kokkos::abort(                                                         \
+          "Kokkos contract violation:\n  "                                     \
+          "  Ensured postcondition `" #__VA_ARGS__                             \
+          "` evaluated false.\n"                                               \
+          "Error at " KOKKOS_IMPL_TOSTRING(__FILE__) ":" KOKKOS_IMPL_TOSTRING( \
+              __LINE__) " \n");                                                \
+    }                                                                          \
+  }
+// some projects already define this for themselves, so don't mess
+// them up
+#ifndef KOKKOS_ASSERT
+#define KOKKOS_ASSERT(...)                                                     \
+  {                                                                            \
+    if (!bool(__VA_ARGS__)) {                                                  \
+      ::Kokkos::abort(                                                         \
+          "Kokkos contract violation:\n  "                                     \
+          "  Asserted condition `" #__VA_ARGS__                                \
+          "` evaluated false.\n"                                               \
+          "Error at " KOKKOS_IMPL_TOSTRING(__FILE__) ":" KOKKOS_IMPL_TOSTRING( \
+              __LINE__) " \n");                                                \
+    }                                                                          \
+  }
+#endif  // ifndef KOKKOS_ASSERT
+#else   // not debug mode
+#define KOKKOS_EXPECTS(...)
+#define KOKKOS_ENSURES(...)
+#ifndef KOKKOS_ASSERT
+#define KOKKOS_ASSERT(...)
+#endif  // ifndef KOKKOS_ASSERT
+#endif  // end debug mode ifdefs
+
+#endif /* #ifndef KOKKOS_ASSERT_HPP */
diff --git a/packages/kokkos/core/src/Kokkos_Atomic.hpp b/packages/kokkos/core/src/Kokkos_Atomic.hpp
index 1347e09ebd6a488278ac59c364aeff6309f0517e..6fc903f2743454174813317eb2160a9163398d8b 100644
--- a/packages/kokkos/core/src/Kokkos_Atomic.hpp
+++ b/packages/kokkos/core/src/Kokkos_Atomic.hpp
@@ -46,350 +46,9 @@
 
 #include <Kokkos_Macros.hpp>
 
-#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
 #include <Kokkos_Atomics_Desul_Wrapper.hpp>
 #include <Kokkos_Atomics_Desul_Volatile_Wrapper.hpp>
-#include <impl/Kokkos_Utilities.hpp>
 
-// Helper functions for places where we really should have called SeqCst atomics
-// anyway These can go away when we call desul unconditionally Non-Desul
-// versions are below
-namespace Kokkos {
-namespace Impl {
-using desul::MemoryOrderSeqCst;
-using desul::MemoryScopeDevice;
-
-template <class T>
-KOKKOS_INLINE_FUNCTION void desul_atomic_dec(T* dest, MemoryOrderSeqCst,
-                                             MemoryScopeDevice) {
-  return desul::atomic_dec(const_cast<T*>(dest), desul::MemoryOrderSeqCst(),
-                           desul::MemoryScopeDevice());
-}
-
-template <class T>
-KOKKOS_INLINE_FUNCTION void desul_atomic_inc(T* dest, MemoryOrderSeqCst,
-                                             MemoryScopeDevice) {
-  return desul::atomic_inc(const_cast<T*>(dest), desul::MemoryOrderSeqCst(),
-                           desul::MemoryScopeDevice());
-}
-
-template <class T>
-KOKKOS_INLINE_FUNCTION T
-desul_atomic_exchange(T* dest, const Kokkos::Impl::type_identity_t<T> val,
-                      MemoryOrderSeqCst, MemoryScopeDevice) {
-  return desul::atomic_exchange(const_cast<T*>(dest), val,
-                                desul::MemoryOrderSeqCst(),
-                                desul::MemoryScopeDevice());
-}
-
-template <class T>
-KOKKOS_INLINE_FUNCTION T desul_atomic_compare_exchange(
-    T* dest, Kokkos::Impl::type_identity_t<const T> compare,
-    Kokkos::Impl::type_identity_t<const T> val, MemoryOrderSeqCst,
-    MemoryScopeDevice) {
-  return desul::atomic_compare_exchange(dest, compare, val,
-                                        desul::MemoryOrderSeqCst(),
-                                        desul::MemoryScopeDevice());
-}
-
-}  // namespace Impl
-}  // namespace Kokkos
-#else
-
-#include <Kokkos_HostSpace.hpp>
-#include <impl/Kokkos_Traits.hpp>
-
-//----------------------------------------------------------------------------
-
-// Need to fix this for pure clang on windows
-#if defined(_WIN32)
-#define KOKKOS_ENABLE_WINDOWS_ATOMICS
-
-#if defined(KOKKOS_ENABLE_CUDA)
-#define KOKKOS_ENABLE_CUDA_ATOMICS
-#if defined(KOKKOS_COMPILER_CLANG)
-#define KOKKOS_ENABLE_GNU_ATOMICS
-#endif
-#endif
-
-#else  // _WIN32
-#if defined(KOKKOS_ENABLE_CUDA)
-
-// Compiling NVIDIA device code, must use Cuda atomics:
-
-#define KOKKOS_ENABLE_CUDA_ATOMICS
-
-#elif defined(KOKKOS_ENABLE_HIP)
-
-#define KOKKOS_ENABLE_HIP_ATOMICS
-
-#endif
-
-#if !defined(KOKKOS_ENABLE_GNU_ATOMICS) &&    \
-    !defined(KOKKOS_ENABLE_INTEL_ATOMICS) &&  \
-    !defined(KOKKOS_ENABLE_OPENMP_ATOMICS) && \
-    !defined(KOKKOS_ENABLE_STD_ATOMICS) &&    \
-    !defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
-
-// Compiling for non-Cuda atomic implementation has not been pre-selected.
-// Choose the best implementation for the detected compiler.
-// Preference: GCC, INTEL, OMP31
-
-#if defined(KOKKOS_INTERNAL_NOT_PARALLEL)
-
-#define KOKKOS_ENABLE_SERIAL_ATOMICS
-
-#elif defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) || \
-    defined(KOKKOS_COMPILER_NVCC)
-
-#define KOKKOS_ENABLE_GNU_ATOMICS
-
-#elif defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_CRAYC)
-
-#define KOKKOS_ENABLE_INTEL_ATOMICS
-
-#elif defined(_OPENMP) && (201107 <= _OPENMP)
-
-#define KOKKOS_ENABLE_OPENMP_ATOMICS
-
-#else
-
-#error "KOKKOS_ATOMICS_USE : Unsupported compiler"
-
-#endif
-
-#endif /* Not pre-selected atomic implementation */
-#endif
-
-#ifdef KOKKOS_ENABLE_CUDA
-#include <Cuda/Kokkos_Cuda_Locks.hpp>
-#endif
-
-namespace Kokkos {
-template <typename T>
-KOKKOS_INLINE_FUNCTION void atomic_add(volatile T* const dest, const T src);
-
-// Atomic increment
-template <typename T>
-KOKKOS_INLINE_FUNCTION void atomic_increment(volatile T* a);
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION void atomic_decrement(volatile T* a);
-}  // namespace Kokkos
-
-namespace Kokkos {
-
-inline const char* atomic_query_version() {
-#if defined(KOKKOS_ENABLE_CUDA_ATOMICS)
-  return "KOKKOS_ENABLE_CUDA_ATOMICS";
-#elif defined(KOKKOS_ENABLE_GNU_ATOMICS)
-  return "KOKKOS_ENABLE_GNU_ATOMICS";
-#elif defined(KOKKOS_ENABLE_INTEL_ATOMICS)
-  return "KOKKOS_ENABLE_INTEL_ATOMICS";
-#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS)
-  return "KOKKOS_ENABLE_OPENMP_ATOMICS";
-#elif defined(KOKKOS_ENABLE_WINDOWS_ATOMICS)
-  return "KOKKOS_ENABLE_WINDOWS_ATOMICS";
-#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
-  return "KOKKOS_ENABLE_SERIAL_ATOMICS";
-#else
-#error "No valid response for atomic_query_version!"
-#endif
-}
-
-}  // namespace Kokkos
-
-//----------------------------------------------------------------------------
-// Atomic Memory Orders
-//
-// Implements Strongly-typed analogs of C++ standard memory orders
-#include "impl/Kokkos_Atomic_Memory_Order.hpp"
-
-#if defined(KOKKOS_ENABLE_HIP)
-#include <HIP/Kokkos_HIP_Atomic.hpp>
-#endif
-
-#if defined(KOKKOS_ENABLE_WINDOWS_ATOMICS)
-#include "impl/Kokkos_Atomic_Windows.hpp"
-#endif
-//----------------------------------------------------------------------------
-// Atomic Assembly
-//
-// Implements CAS128-bit in assembly
-
-#include "impl/Kokkos_Atomic_Assembly.hpp"
-
-//----------------------------------------------------------------------------
-// Memory fence
-//
-// All loads and stores from this thread will be globally consistent before
-// continuing
-//
-// void memory_fence() {...};
-#include "impl/Kokkos_Memory_Fence.hpp"
-
-//----------------------------------------------------------------------------
-// Atomic exchange
-//
-// template< typename T >
-// T atomic_exchange( volatile T* const dest , const T val )
-// { T tmp = *dest ; *dest = val ; return tmp ; }
-
-#include "impl/Kokkos_Atomic_Exchange.hpp"
-
-//----------------------------------------------------------------------------
-// Atomic compare-and-exchange
-//
-// template<class T>
-// bool atomic_compare_exchange_strong(volatile T* const dest, const T compare,
-// const T val) { bool equal = compare == *dest ; if ( equal ) { *dest = val ; }
-// return equal ; }
-
-#include "impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp"
-
-#include "impl/Kokkos_Atomic_Generic.hpp"
-
-//----------------------------------------------------------------------------
-// Atomic fetch and add
-//
-// template<class T>
-// T atomic_fetch_add(volatile T* const dest, const T val)
-// { T tmp = *dest ; *dest += val ; return tmp ; }
-
-#include "impl/Kokkos_Atomic_Fetch_Add.hpp"
-
-//----------------------------------------------------------------------------
-// Atomic increment
-//
-// template<class T>
-// T atomic_increment(volatile T* const dest)
-// { dest++; }
-
-#include "impl/Kokkos_Atomic_Increment.hpp"
-
-//----------------------------------------------------------------------------
-// Atomic Decrement
-//
-// template<class T>
-// T atomic_decrement(volatile T* const dest)
-// { dest--; }
-
-#include "impl/Kokkos_Atomic_Decrement.hpp"
-
-//----------------------------------------------------------------------------
-// Atomic fetch and sub
-//
-// template<class T>
-// T atomic_fetch_sub(volatile T* const dest, const T val)
-// { T tmp = *dest ; *dest -= val ; return tmp ; }
-
-#include "impl/Kokkos_Atomic_Fetch_Sub.hpp"
-
-//----------------------------------------------------------------------------
-// Atomic fetch and or
-//
-// template<class T>
-// T atomic_fetch_or(volatile T* const dest, const T val)
-// { T tmp = *dest ; *dest = tmp | val ; return tmp ; }
-
-#include "impl/Kokkos_Atomic_Fetch_Or.hpp"
-
-//----------------------------------------------------------------------------
-// Atomic fetch and and
-//
-// template<class T>
-// T atomic_fetch_and(volatile T* const dest, const T val)
-// { T tmp = *dest ; *dest = tmp & val ; return tmp ; }
-
-#include "impl/Kokkos_Atomic_Fetch_And.hpp"
-
-//----------------------------------------------------------------------------
-// Atomic MinMax
-//
-// template<class T>
-// T atomic_min(volatile T* const dest, const T val)
-// { T tmp = *dest ; *dest = min(*dest, val); return tmp ; }
-// template<class T>
-// T atomic_max(volatile T* const dest, const T val)
-// { T tmp = *dest ; *dest = max(*dest, val); return tmp ; }
-
-#include "impl/Kokkos_Atomic_MinMax.hpp"
-
-//----------------------------------------------------------------------------
-// Provide volatile_load and safe_load
-//
-// T volatile_load(T const volatile * const ptr);
-//
-// T const& safe_load(T const * const ptr);
-// XEON PHI
-// T safe_load(T const * const ptr
-
-#include "impl/Kokkos_Volatile_Load.hpp"
-
-//----------------------------------------------------------------------------
-// Provide atomic loads and stores with memory order semantics
-
-#include "impl/Kokkos_Atomic_Load.hpp"
-#include "impl/Kokkos_Atomic_Store.hpp"
-
-// Generic functions using the above defined functions
-#include "impl/Kokkos_Atomic_Generic_Secondary.hpp"
-//----------------------------------------------------------------------------
-// This atomic-style macro should be an inlined function, not a macro
-
-#if defined(KOKKOS_COMPILER_GNU) && !defined(__PGIC__) && \
-    !defined(__CUDA_ARCH__)
-
-#define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) __builtin_prefetch(addr, 0, 0)
-#define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) __builtin_prefetch(addr, 1, 0)
-
-#else
-
-#define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) ((void)0)
-#define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) ((void)0)
-
-#endif
-
-//----------------------------------------------------------------------------
-
-// Helper functions for places where we really should have called SeqCst atomics
-// anyway These can go away when we call desul unconditionally
-namespace Kokkos {
-namespace Impl {
-struct MemoryOrderSeqCst {};
-struct MemoryScopeDevice {};
-
-template <class T>
-KOKKOS_INLINE_FUNCTION void desul_atomic_dec(T* dest, MemoryOrderSeqCst,
-                                             MemoryScopeDevice) {
-  return Kokkos::atomic_decrement(dest);
-}
-
-template <class T>
-KOKKOS_INLINE_FUNCTION void desul_atomic_inc(T* dest, MemoryOrderSeqCst,
-                                             MemoryScopeDevice) {
-  return Kokkos::atomic_increment(dest);
-}
-
-template <class T>
-KOKKOS_INLINE_FUNCTION T
-desul_atomic_exchange(T* dest, Kokkos::Impl::type_identity_t<const T> val,
-                      MemoryOrderSeqCst, MemoryScopeDevice) {
-  return Kokkos::atomic_exchange(dest, val);
-}
-
-template <class T>
-KOKKOS_INLINE_FUNCTION T desul_atomic_compare_exchange(
-    T* dest, Kokkos::Impl::type_identity_t<const T> compare,
-    Kokkos::Impl::type_identity_t<const T> val, MemoryOrderSeqCst,
-    MemoryScopeDevice) {
-  return Kokkos::atomic_compare_exchange(dest, compare, val);
-}
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-#endif /* !KOKKOS_ENABLE_IMPL_DESUL_ATOMICS */
 #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ATOMIC
 #undef KOKKOS_IMPL_PUBLIC_INCLUDE
 #undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ATOMIC
diff --git a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp
index 9da4b061102132f8613352975d3cebf7e190161f..1c4347463219d95f07cc68deccf46890e912e7f1 100644
--- a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp
+++ b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp
@@ -22,7 +22,6 @@ static_assert(false,
 #ifndef KOKKOS_DESUL_ATOMICS_VOLATILE_WRAPPER_HPP_
 #define KOKKOS_DESUL_ATOMICS_VOLATILE_WRAPPER_HPP_
 #include <Kokkos_Macros.hpp>
-#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
 #include <Kokkos_Atomics_Desul_Config.hpp>
 #include <desul/atomics.hpp>
 
@@ -195,5 +194,4 @@ T atomic_compare_exchange(volatile T* const dest, const T compare, const T desir
 #undef KOKKOS_DESUL_MEM_SCOPE
 
 // clang-format on
-#endif  // KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
 #endif
diff --git a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp
index fdc5e123f687f1a8444c6196a5ba8260c1689212..bda37839805c75e0ee58f4f926f52b7083d339a4 100644
--- a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp
+++ b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp
@@ -23,11 +23,9 @@ static_assert(false,
 #define KOKKOS_DESUL_ATOMICS_WRAPPER_HPP_
 #include <Kokkos_Macros.hpp>
 
-#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
 #include <Kokkos_Atomics_Desul_Config.hpp>
 #include <desul/atomics.hpp>
 
-#include <impl/Kokkos_Atomic_Memory_Order.hpp>
 #include <impl/Kokkos_Volatile_Load.hpp>
 
 // clang-format off
@@ -229,54 +227,25 @@ T atomic_compare_exchange(T* const dest, desul::Impl::dont_deduce_this_parameter
 }
 
 namespace Impl {
-
-  template<class MemoryOrder>
-  struct KokkosToDesulMemoryOrder;
-
-  template<>
-  struct KokkosToDesulMemoryOrder<memory_order_seq_cst_t> {
-    using type = desul::MemoryOrderSeqCst;
-  };
-  template<>
-  struct KokkosToDesulMemoryOrder<memory_order_acquire_t> {
-    using type = desul::MemoryOrderAcquire;
-  };
-  template<>
-  struct KokkosToDesulMemoryOrder<memory_order_release_t> {
-    using type = desul::MemoryOrderRelease;
-  };
-  template<>
-  struct KokkosToDesulMemoryOrder<memory_order_acq_rel_t> {
-    using type = desul::MemoryOrderAcqRel;
-  };
-  template<>
-  struct KokkosToDesulMemoryOrder<memory_order_relaxed_t> {
-    using type = desul::MemoryOrderRelaxed;
-  };
   template<class T, class MemOrderSuccess, class MemOrderFailure> KOKKOS_INLINE_FUNCTION
-  bool atomic_compare_exchange_strong(T* const dest, T& expected, const T desired, MemOrderSuccess, MemOrderFailure) {
-    return desul::atomic_compare_exchange_strong(dest, expected, desired,
-                  typename KokkosToDesulMemoryOrder<MemOrderSuccess>::type(),
-                  typename KokkosToDesulMemoryOrder<MemOrderFailure>::type(),
-                  KOKKOS_DESUL_MEM_SCOPE);
-
+  bool atomic_compare_exchange_strong(T* const dest, T& expected, const T desired, MemOrderSuccess succ, MemOrderFailure fail) {
+    return desul::atomic_compare_exchange_strong(dest, expected, desired, succ, fail, KOKKOS_DESUL_MEM_SCOPE);
   }
   template<class T, class MemoryOrder>
   KOKKOS_INLINE_FUNCTION
-  T atomic_load(const T* const src, MemoryOrder) {
-    return desul::atomic_load(src, typename KokkosToDesulMemoryOrder<MemoryOrder>::type(), KOKKOS_DESUL_MEM_SCOPE);
+  T atomic_load(const T* const src, MemoryOrder order) {
+    return desul::atomic_load(src, order, KOKKOS_DESUL_MEM_SCOPE);
   }
   template<class T, class MemoryOrder>
   KOKKOS_INLINE_FUNCTION
-  void atomic_store(T* const src, const T val, MemoryOrder) {
-    return desul::atomic_store(src, val, typename KokkosToDesulMemoryOrder<MemoryOrder>::type(), KOKKOS_DESUL_MEM_SCOPE);
+  void atomic_store(T* const src, const T val, MemoryOrder order) {
+    return desul::atomic_store(src, val, order, KOKKOS_DESUL_MEM_SCOPE);
   }
-}
+}  // namespace Impl
 
-}
+}  // namespace Kokkos
 
 #undef KOKKOS_DESUL_MEM_SCOPE
 
 // clang-format on
-#endif  // KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
 #endif
diff --git a/packages/kokkos/core/src/Kokkos_BitManipulation.hpp b/packages/kokkos/core/src/Kokkos_BitManipulation.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f5653aaba343a1ec9622f68d70e954165bd0e90b
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_BitManipulation.hpp
@@ -0,0 +1,503 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_BIT_MANIPULATION_HPP
+#define KOKKOS_BIT_MANIPULATION_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_NumericTraits.hpp>
+#include <climits>  // CHAR_BIT
+#include <cstring>  //memcpy
+#include <type_traits>
+
+namespace Kokkos::Impl {
+
+template <class T>
+KOKKOS_FUNCTION constexpr T byteswap_fallback(T x) {
+  if constexpr (sizeof(T) > 1) {
+    using U = std::make_unsigned_t<T>;
+
+    size_t shift = CHAR_BIT * (sizeof(T) - 1);
+
+    U lo_mask = static_cast<unsigned char>(~0);
+    U hi_mask = lo_mask << shift;
+
+    U val = x;
+
+    for (size_t i = 0; i < sizeof(T) / 2; ++i) {
+      U lo_val = val & lo_mask;
+      U hi_val = val & hi_mask;
+
+      val = (val & ~lo_mask) | (hi_val >> shift);
+      val = (val & ~hi_mask) | (lo_val << shift);
+
+      lo_mask <<= CHAR_BIT;
+      hi_mask >>= CHAR_BIT;
+
+      shift -= 2 * CHAR_BIT;
+    }
+    return val;
+  }
+  // sizeof(T) == 1
+  return x;
+}
+
+template <class T>
+KOKKOS_FUNCTION constexpr int countl_zero_fallback(T x) {
+  // From Hacker's Delight (2nd edition) section 5-3
+  unsigned int y = 0;
+  using ::Kokkos::Experimental::digits_v;
+  int n = digits_v<T>;
+  int c = digits_v<T> / 2;
+  do {
+    y = x >> c;
+    if (y != 0) {
+      n -= c;
+      x = y;
+    }
+    c >>= 1;
+  } while (c != 0);
+  return n - static_cast<int>(x);
+}
+
+template <class T>
+KOKKOS_FUNCTION constexpr int countr_zero_fallback(T x) {
+  using ::Kokkos::Experimental::digits_v;
+  return digits_v<T> - countl_zero_fallback(static_cast<T>(
+                           static_cast<T>(~x) & static_cast<T>(x - 1)));
+}
+
+template <class T>
+KOKKOS_FUNCTION constexpr int popcount_fallback(T x) {
+  int c = 0;
+  for (; x != 0; x &= x - 1) {
+    ++c;
+  }
+  return c;
+}
+
+template <class T>
+inline constexpr bool is_standard_unsigned_integer_type_v =
+    std::is_same_v<T, unsigned char> || std::is_same_v<T, unsigned short> ||
+    std::is_same_v<T, unsigned int> || std::is_same_v<T, unsigned long> ||
+    std::is_same_v<T, unsigned long long>;
+
+}  // namespace Kokkos::Impl
+
+namespace Kokkos {
+
+//<editor-fold desc="[bit.cast], bit_cast">
+#if defined(KOKKOS_ENABLE_SYCL) && defined(__INTEL_LLVM_COMPILER) && \
+    __INTEL_LLVM_COMPILER < 20240000
+using sycl::detail::bit_cast;
+#else
+template <class To, class From>
+KOKKOS_FUNCTION std::enable_if_t<sizeof(To) == sizeof(From) &&
+                                     std::is_trivially_copyable_v<To> &&
+                                     std::is_trivially_copyable_v<From>,
+                                 To>
+bit_cast(From const& from) noexcept {
+#if defined(KOKKOS_ENABLE_SYCL) && defined(__INTEL_LLVM_COMPILER) && \
+    __INTEL_LLVM_COMPILER >= 20240000
+  return sycl::bit_cast<To>(from);
+#else
+  To to;
+  memcpy(static_cast<void*>(&to), static_cast<const void*>(&from), sizeof(To));
+  return to;
+#endif
+}
+#endif
+//</editor-fold>
+
+//<editor-fold desc="[bit.byteswap], byteswap">
+template <class T>
+KOKKOS_FUNCTION constexpr std::enable_if_t<std::is_integral_v<T>, T> byteswap(
+    T value) noexcept {
+  return Impl::byteswap_fallback(value);
+}
+//</editor-fold>
+
+//<editor-fold desc="[bit.count], counting">
+template <class T>
+KOKKOS_FUNCTION constexpr std::enable_if_t<
+    Impl::is_standard_unsigned_integer_type_v<T>, int>
+countl_zero(T x) noexcept {
+  using ::Kokkos::Experimental::digits_v;
+  if (x == 0) return digits_v<T>;
+  // TODO use compiler intrinsics when available
+  return Impl::countl_zero_fallback(x);
+}
+
+template <class T>
+KOKKOS_FUNCTION constexpr std::enable_if_t<
+    Impl::is_standard_unsigned_integer_type_v<T>, int>
+countl_one(T x) noexcept {
+  using ::Kokkos::Experimental::digits_v;
+  using ::Kokkos::Experimental::finite_max_v;
+  if (x == finite_max_v<T>) return digits_v<T>;
+  return countl_zero(static_cast<T>(~x));
+}
+
+template <class T>
+KOKKOS_FUNCTION constexpr std::enable_if_t<
+    Impl::is_standard_unsigned_integer_type_v<T>, int>
+countr_zero(T x) noexcept {
+  using ::Kokkos::Experimental::digits_v;
+  if (x == 0) return digits_v<T>;
+  // TODO use compiler intrinsics when available
+  return Impl::countr_zero_fallback(x);
+}
+
+template <class T>
+KOKKOS_FUNCTION constexpr std::enable_if_t<
+    Impl::is_standard_unsigned_integer_type_v<T>, int>
+countr_one(T x) noexcept {
+  using ::Kokkos::Experimental::digits_v;
+  using ::Kokkos::Experimental::finite_max_v;
+  if (x == finite_max_v<T>) return digits_v<T>;
+  return countr_zero(static_cast<T>(~x));
+}
+
+template <class T>
+KOKKOS_FUNCTION constexpr std::enable_if_t<
+    Impl::is_standard_unsigned_integer_type_v<T>, int>
+popcount(T x) noexcept {
+  if (x == 0) return 0;
+  // TODO use compiler intrinsics when available
+  return Impl::popcount_fallback(x);
+}
+//</editor-fold>
+
+//<editor-fold desc="[bit.pow.two], integral powers of 2">
+template <class T>
+KOKKOS_FUNCTION constexpr std::enable_if_t<
+    Impl::is_standard_unsigned_integer_type_v<T>, bool>
+has_single_bit(T x) noexcept {
+  return x != 0 && (((x & (x - 1)) == 0));
+}
+
+template <class T>
+KOKKOS_FUNCTION constexpr std::enable_if_t<
+    Impl::is_standard_unsigned_integer_type_v<T>, T>
+bit_ceil(T x) noexcept {
+  if (x <= 1) return 1;
+  using ::Kokkos::Experimental::digits_v;
+  return T{1} << (digits_v<T> - countl_zero(static_cast<T>(x - 1)));
+}
+
+template <class T>
+KOKKOS_FUNCTION constexpr std::enable_if_t<
+    Impl::is_standard_unsigned_integer_type_v<T>, T>
+bit_floor(T x) noexcept {
+  if (x == 0) return 0;
+  using ::Kokkos::Experimental::digits_v;
+  return T{1} << (digits_v<T> - 1 - countl_zero(x));
+}
+
+template <class T>
+KOKKOS_FUNCTION constexpr std::enable_if_t<
+    Impl::is_standard_unsigned_integer_type_v<T>, T>
+bit_width(T x) noexcept {
+  if (x == 0) return 0;
+  using ::Kokkos::Experimental::digits_v;
+  return digits_v<T> - countl_zero(x);
+}
+//</editor-fold>
+
+//<editor-fold desc="[bit.rotate], rotating">
+template <class T>
+[[nodiscard]] KOKKOS_FUNCTION constexpr std::enable_if_t<
+    Impl::is_standard_unsigned_integer_type_v<T>, T>
+rotl(T x, int s) noexcept {
+  using Experimental::digits_v;
+  constexpr auto dig = digits_v<T>;
+  int const rem      = s % dig;
+  if (rem == 0) return x;
+  if (rem > 0) return (x << rem) | (x >> ((dig - rem) % dig));
+  return (x >> -rem) | (x << ((dig + rem) % dig));  // rotr(x, -rem)
+}
+
+template <class T>
+[[nodiscard]] KOKKOS_FUNCTION constexpr std::enable_if_t<
+    Impl::is_standard_unsigned_integer_type_v<T>, T>
+rotr(T x, int s) noexcept {
+  using Experimental::digits_v;
+  constexpr auto dig = digits_v<T>;
+  int const rem      = s % dig;
+  if (rem == 0) return x;
+  if (rem > 0) return (x >> rem) | (x << ((dig - rem) % dig));
+  return (x << -rem) | (x >> ((dig + rem) % dig));  // rotl(x, -rem)
+}
+//</editor-fold>
+
+}  // namespace Kokkos
+
+namespace Kokkos::Impl {
+
+#if defined(KOKKOS_COMPILER_CLANG) || defined(KOKKOS_COMPILER_INTEL_LLVM) || \
+    defined(KOKKOS_COMPILER_GNU)
+#define KOKKOS_IMPL_USE_GCC_BUILT_IN_FUNCTIONS
+#endif
+
+template <class T>
+KOKKOS_IMPL_DEVICE_FUNCTION T byteswap_builtin_device(T x) noexcept {
+  return byteswap_fallback(x);
+}
+
+template <class T>
+KOKKOS_IMPL_HOST_FUNCTION T byteswap_builtin_host(T x) noexcept {
+#ifdef KOKKOS_IMPL_USE_GCC_BUILT_IN_FUNCTIONS
+  if constexpr (sizeof(T) == 1) {
+    return x;
+  } else if constexpr (sizeof(T) == 2) {
+    return __builtin_bswap16(x);
+  } else if constexpr (sizeof(T) == 4) {
+    return __builtin_bswap32(x);
+  } else if constexpr (sizeof(T) == 8) {
+    return __builtin_bswap64(x);
+  } else if constexpr (sizeof(T) == 16) {
+#if defined(__has_builtin)
+#if __has_builtin(__builtin_bswap128)
+    return __builtin_bswap128(x);
+#endif
+#endif
+    return (__builtin_bswap64(x >> 64) |
+            (static_cast<T>(__builtin_bswap64(x)) << 64));
+  }
+#endif
+
+  return byteswap_fallback(x);
+}
+
+template <class T>
+KOKKOS_IMPL_DEVICE_FUNCTION
+    std::enable_if_t<is_standard_unsigned_integer_type_v<T>, int>
+    countl_zero_builtin_device(T x) noexcept {
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
+  if constexpr (sizeof(T) == sizeof(long long int))
+    return __clzll(reinterpret_cast<long long int&>(x));
+  if constexpr (sizeof(T) == sizeof(int))
+    return __clz(reinterpret_cast<int&>(x));
+  using ::Kokkos::Experimental::digits_v;
+  constexpr int shift = digits_v<unsigned int> - digits_v<T>;
+  return __clz(x) - shift;
+#elif defined(KOKKOS_ENABLE_SYCL)
+  return sycl::clz(x);
+#else
+  return countl_zero_fallback(x);
+#endif
+}
+
+template <class T>
+KOKKOS_IMPL_HOST_FUNCTION
+    std::enable_if_t<is_standard_unsigned_integer_type_v<T>, int>
+    countl_zero_builtin_host(T x) noexcept {
+  using ::Kokkos::Experimental::digits_v;
+  if (x == 0) return digits_v<T>;
+#ifdef KOKKOS_IMPL_USE_GCC_BUILT_IN_FUNCTIONS
+  if constexpr (std::is_same_v<T, unsigned long long>) {
+    return __builtin_clzll(x);
+  } else if constexpr (std::is_same_v<T, unsigned long>) {
+    return __builtin_clzl(x);
+  } else if constexpr (std::is_same_v<T, unsigned int>) {
+    return __builtin_clz(x);
+  } else {
+    constexpr int shift = digits_v<unsigned int> - digits_v<T>;
+    return __builtin_clz(x) - shift;
+  }
+#else
+  return countl_zero_fallback(x);
+#endif
+}
+
+template <class T>
+KOKKOS_IMPL_DEVICE_FUNCTION
+    std::enable_if_t<is_standard_unsigned_integer_type_v<T>, int>
+    countr_zero_builtin_device(T x) noexcept {
+  using ::Kokkos::Experimental::digits_v;
+  if (x == 0) return digits_v<T>;
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
+  if constexpr (sizeof(T) == sizeof(long long int))
+    return __ffsll(reinterpret_cast<long long int&>(x)) - 1;
+  return __ffs(reinterpret_cast<int&>(x)) - 1;
+#elif defined(KOKKOS_ENABLE_SYCL)
+  return sycl::ctz(x);
+#else
+  return countr_zero_fallback(x);
+#endif
+}
+
+template <class T>
+KOKKOS_IMPL_HOST_FUNCTION
+    std::enable_if_t<is_standard_unsigned_integer_type_v<T>, int>
+    countr_zero_builtin_host(T x) noexcept {
+  using ::Kokkos::Experimental::digits_v;
+  if (x == 0) return digits_v<T>;
+#ifdef KOKKOS_IMPL_USE_GCC_BUILT_IN_FUNCTIONS
+  if constexpr (std::is_same_v<T, unsigned long long>) {
+    return __builtin_ctzll(x);
+  } else if constexpr (std::is_same_v<T, unsigned long>) {
+    return __builtin_ctzl(x);
+  } else {
+    return __builtin_ctz(x);
+  }
+#else
+  return countr_zero_fallback(x);
+#endif
+}
+
+template <class T>
+KOKKOS_IMPL_DEVICE_FUNCTION
+    std::enable_if_t<is_standard_unsigned_integer_type_v<T>, int>
+    popcount_builtin_device(T x) noexcept {
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
+  if constexpr (sizeof(T) == sizeof(long long int)) return __popcll(x);
+  return __popc(x);
+#elif defined(KOKKOS_ENABLE_SYCL)
+  return sycl::popcount(x);
+#else
+  return popcount_fallback(x);
+#endif
+}
+
+template <class T>
+KOKKOS_IMPL_HOST_FUNCTION
+    std::enable_if_t<is_standard_unsigned_integer_type_v<T>, int>
+    popcount_builtin_host(T x) noexcept {
+#ifdef KOKKOS_IMPL_USE_GCC_BUILT_IN_FUNCTIONS
+  if constexpr (std::is_same_v<T, unsigned long long>) {
+    return __builtin_popcountll(x);
+  } else if constexpr (std::is_same_v<T, unsigned long>) {
+    return __builtin_popcountl(x);
+  } else {
+    return __builtin_popcount(x);
+  }
+#else
+  return popcount_fallback(x);
+#endif
+}
+
+#undef KOKKOS_IMPL_USE_GCC_BUILT_IN_FUNCTIONS
+
+}  // namespace Kokkos::Impl
+
+namespace Kokkos::Experimental {
+
+template <class To, class From>
+KOKKOS_FUNCTION std::enable_if_t<sizeof(To) == sizeof(From) &&
+                                     std::is_trivially_copyable_v<To> &&
+                                     std::is_trivially_copyable_v<From>,
+                                 To>
+bit_cast_builtin(From const& from) noexcept {
+  // qualify the call to avoid ADL
+  return Kokkos::bit_cast<To>(from);  // no benefit to call the _builtin variant
+}
+
+template <class T>
+KOKKOS_FUNCTION std::enable_if_t<std::is_integral_v<T>, T> byteswap_builtin(
+    T x) noexcept {
+  KOKKOS_IF_ON_DEVICE((return ::Kokkos::Impl::byteswap_builtin_device(x);))
+  KOKKOS_IF_ON_HOST((return ::Kokkos::Impl::byteswap_builtin_host(x);))
+}
+
+template <class T>
+KOKKOS_FUNCTION std::enable_if_t<
+    ::Kokkos::Impl::is_standard_unsigned_integer_type_v<T>, int>
+countl_zero_builtin(T x) noexcept {
+  KOKKOS_IF_ON_DEVICE((return ::Kokkos::Impl::countl_zero_builtin_device(x);))
+  KOKKOS_IF_ON_HOST((return ::Kokkos::Impl::countl_zero_builtin_host(x);))
+}
+
+template <class T>
+KOKKOS_FUNCTION std::enable_if_t<
+    ::Kokkos::Impl::is_standard_unsigned_integer_type_v<T>, int>
+countl_one_builtin(T x) noexcept {
+  if (x == finite_max_v<T>) return digits_v<T>;
+  return countl_zero_builtin(static_cast<T>(~x));
+}
+
+template <class T>
+KOKKOS_FUNCTION std::enable_if_t<
+    ::Kokkos::Impl::is_standard_unsigned_integer_type_v<T>, int>
+countr_zero_builtin(T x) noexcept {
+  KOKKOS_IF_ON_DEVICE((return ::Kokkos::Impl::countr_zero_builtin_device(x);))
+  KOKKOS_IF_ON_HOST((return ::Kokkos::Impl::countr_zero_builtin_host(x);))
+}
+
+template <class T>
+KOKKOS_FUNCTION std::enable_if_t<
+    ::Kokkos::Impl::is_standard_unsigned_integer_type_v<T>, int>
+countr_one_builtin(T x) noexcept {
+  if (x == finite_max_v<T>) return digits_v<T>;
+  return countr_zero_builtin(static_cast<T>(~x));
+}
+
+template <class T>
+KOKKOS_FUNCTION std::enable_if_t<
+    ::Kokkos::Impl::is_standard_unsigned_integer_type_v<T>, int>
+popcount_builtin(T x) noexcept {
+  KOKKOS_IF_ON_DEVICE((return ::Kokkos::Impl::popcount_builtin_device(x);))
+  KOKKOS_IF_ON_HOST((return ::Kokkos::Impl::popcount_builtin_host(x);))
+}
+
+template <class T>
+KOKKOS_FUNCTION std::enable_if_t<
+    ::Kokkos::Impl::is_standard_unsigned_integer_type_v<T>, bool>
+has_single_bit_builtin(T x) noexcept {
+  return has_single_bit(x);  // no benefit to call the _builtin variant
+}
+
+template <class T>
+KOKKOS_FUNCTION
+    std::enable_if_t<::Kokkos::Impl::is_standard_unsigned_integer_type_v<T>, T>
+    bit_ceil_builtin(T x) noexcept {
+  if (x <= 1) return 1;
+  return T{1} << (digits_v<T> - countl_zero_builtin(static_cast<T>(x - 1)));
+}
+
+template <class T>
+KOKKOS_FUNCTION
+    std::enable_if_t<::Kokkos::Impl::is_standard_unsigned_integer_type_v<T>, T>
+    bit_floor_builtin(T x) noexcept {
+  if (x == 0) return 0;
+  return T{1} << (digits_v<T> - 1 - countl_zero_builtin(x));
+}
+
+template <class T>
+KOKKOS_FUNCTION
+    std::enable_if_t<::Kokkos::Impl::is_standard_unsigned_integer_type_v<T>, T>
+    bit_width_builtin(T x) noexcept {
+  if (x == 0) return 0;
+  return digits_v<T> - countl_zero_builtin(x);
+}
+
+template <class T>
+[[nodiscard]] KOKKOS_FUNCTION
+    std::enable_if_t<::Kokkos::Impl::is_standard_unsigned_integer_type_v<T>, T>
+    rotl_builtin(T x, int s) noexcept {
+  return rotl(x, s);  // no benefit to call the _builtin variant
+}
+
+template <class T>
+[[nodiscard]] KOKKOS_FUNCTION
+    std::enable_if_t<::Kokkos::Impl::is_standard_unsigned_integer_type_v<T>, T>
+    rotr_builtin(T x, int s) noexcept {
+  return rotr(x, s);  // no benefit to call the _builtin variant
+}
+
+}  // namespace Kokkos::Experimental
+
+#endif
diff --git a/packages/kokkos/core/src/Kokkos_Complex.hpp b/packages/kokkos/core/src/Kokkos_Complex.hpp
index 4cecbe4cfe59655c7b90205c279d689e2833043e..4d405116ccff5772513209cd300ef57dfb439b9c 100644
--- a/packages/kokkos/core/src/Kokkos_Complex.hpp
+++ b/packages/kokkos/core/src/Kokkos_Complex.hpp
@@ -44,6 +44,11 @@ class
     alignas(2 * sizeof(RealType))
 #endif
         complex {
+  static_assert(std::is_floating_point_v<RealType> &&
+                    std::is_same_v<RealType, std::remove_cv_t<RealType>>,
+                "Kokkos::complex can only be instantiated for a cv-unqualified "
+                "floating point type");
+
  private:
   RealType re_{};
   RealType im_{};
diff --git a/packages/kokkos/core/src/Kokkos_Concepts.hpp b/packages/kokkos/core/src/Kokkos_Concepts.hpp
index 2448efab046457e4ce9784418aeefaa62a2958b4..df78a644a034deb6689b640f35035d36f9378349 100644
--- a/packages/kokkos/core/src/Kokkos_Concepts.hpp
+++ b/packages/kokkos/core/src/Kokkos_Concepts.hpp
@@ -117,8 +117,8 @@ template <unsigned int maxT = 0 /* Max threads per block */
 struct LaunchBounds {
   using launch_bounds = LaunchBounds;
   using type          = LaunchBounds<maxT, minB>;
-  static unsigned int constexpr maxTperB{maxT};
-  static unsigned int constexpr minBperSM{minB};
+  static constexpr unsigned int maxTperB{maxT};
+  static constexpr unsigned int minBperSM{minB};
 };
 
 }  // namespace Kokkos
diff --git a/packages/kokkos/core/src/Kokkos_CopyViews.hpp b/packages/kokkos/core/src/Kokkos_CopyViews.hpp
index 6d5d9548c7998a51969f09d9c2d2f17d523017f3..a0ca55be7043e8157d5cb44580e9be5f97149f16 100644
--- a/packages/kokkos/core/src/Kokkos_CopyViews.hpp
+++ b/packages/kokkos/core/src/Kokkos_CopyViews.hpp
@@ -25,6 +25,7 @@ static_assert(false,
 #include <Kokkos_Parallel.hpp>
 #include <KokkosExp_MDRangePolicy.hpp>
 #include <Kokkos_Layout.hpp>
+#include <impl/Kokkos_HostSpace_ZeroMemset.hpp>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -534,7 +535,7 @@ void view_copy(const ExecutionSpace& space, const DstType& dst,
         "Kokkos::Impl::view_copy called with invalid execution space");
   } else {
     // Figure out iteration order in case we need it
-    int64_t strides[DstType::Rank + 1];
+    int64_t strides[DstType::rank + 1];
     dst.stride(strides);
     Kokkos::Iterate iterate;
     if (Kokkos::is_layouttiled<typename DstType::array_layout>::value) {
@@ -548,7 +549,7 @@ void view_copy(const ExecutionSpace& space, const DstType& dst,
       iterate = Kokkos::Iterate::Left;
     } else if (std::is_same<typename DstType::array_layout,
                             Kokkos::LayoutStride>::value) {
-      if (strides[0] > strides[DstType::Rank - 1])
+      if (strides[0] > strides[DstType::rank - 1])
         iterate = Kokkos::Iterate::Right;
       else
         iterate = Kokkos::Iterate::Left;
@@ -566,26 +567,26 @@ void view_copy(const ExecutionSpace& space, const DstType& dst,
         Kokkos::Impl::ViewCopy<
             typename DstType::uniform_runtime_nomemspace_type,
             typename SrcType::uniform_runtime_const_nomemspace_type,
-            Kokkos::LayoutRight, ExecutionSpace, DstType::Rank, int64_t>(
+            Kokkos::LayoutRight, ExecutionSpace, DstType::rank, int64_t>(
             dst, src, space);
       else
         Kokkos::Impl::ViewCopy<
             typename DstType::uniform_runtime_nomemspace_type,
             typename SrcType::uniform_runtime_const_nomemspace_type,
-            Kokkos::LayoutLeft, ExecutionSpace, DstType::Rank, int64_t>(
+            Kokkos::LayoutLeft, ExecutionSpace, DstType::rank, int64_t>(
             dst, src, space);
     } else {
       if (iterate == Kokkos::Iterate::Right)
         Kokkos::Impl::ViewCopy<
             typename DstType::uniform_runtime_nomemspace_type,
             typename SrcType::uniform_runtime_const_nomemspace_type,
-            Kokkos::LayoutRight, ExecutionSpace, DstType::Rank, int>(dst, src,
+            Kokkos::LayoutRight, ExecutionSpace, DstType::rank, int>(dst, src,
                                                                      space);
       else
         Kokkos::Impl::ViewCopy<
             typename DstType::uniform_runtime_nomemspace_type,
             typename SrcType::uniform_runtime_const_nomemspace_type,
-            Kokkos::LayoutLeft, ExecutionSpace, DstType::Rank, int>(dst, src,
+            Kokkos::LayoutLeft, ExecutionSpace, DstType::rank, int>(dst, src,
                                                                     space);
     }
   }
@@ -620,7 +621,7 @@ void view_copy(const DstType& dst, const SrcType& src) {
   }
 
   // Figure out iteration order in case we need it
-  int64_t strides[DstType::Rank + 1];
+  int64_t strides[DstType::rank + 1];
   dst.stride(strides);
   Kokkos::Iterate iterate;
   if (Kokkos::is_layouttiled<typename DstType::array_layout>::value) {
@@ -634,7 +635,7 @@ void view_copy(const DstType& dst, const SrcType& src) {
     iterate = Kokkos::Iterate::Left;
   } else if (std::is_same<typename DstType::array_layout,
                           Kokkos::LayoutStride>::value) {
-    if (strides[0] > strides[DstType::Rank - 1])
+    if (strides[0] > strides[DstType::rank - 1])
       iterate = Kokkos::Iterate::Right;
     else
       iterate = Kokkos::Iterate::Left;
@@ -653,26 +654,26 @@ void view_copy(const DstType& dst, const SrcType& src) {
         Kokkos::Impl::ViewCopy<
             typename DstType::uniform_runtime_nomemspace_type,
             typename SrcType::uniform_runtime_const_nomemspace_type,
-            Kokkos::LayoutRight, dst_execution_space, DstType::Rank, int64_t>(
+            Kokkos::LayoutRight, dst_execution_space, DstType::rank, int64_t>(
             dst, src);
       else
         Kokkos::Impl::ViewCopy<
             typename DstType::uniform_runtime_nomemspace_type,
             typename SrcType::uniform_runtime_const_nomemspace_type,
-            Kokkos::LayoutLeft, dst_execution_space, DstType::Rank, int64_t>(
+            Kokkos::LayoutLeft, dst_execution_space, DstType::rank, int64_t>(
             dst, src);
     } else {
       if (iterate == Kokkos::Iterate::Right)
         Kokkos::Impl::ViewCopy<
             typename DstType::uniform_runtime_nomemspace_type,
             typename SrcType::uniform_runtime_const_nomemspace_type,
-            Kokkos::LayoutRight, src_execution_space, DstType::Rank, int64_t>(
+            Kokkos::LayoutRight, src_execution_space, DstType::rank, int64_t>(
             dst, src);
       else
         Kokkos::Impl::ViewCopy<
             typename DstType::uniform_runtime_nomemspace_type,
             typename SrcType::uniform_runtime_const_nomemspace_type,
-            Kokkos::LayoutLeft, src_execution_space, DstType::Rank, int64_t>(
+            Kokkos::LayoutLeft, src_execution_space, DstType::rank, int64_t>(
             dst, src);
     }
   } else {
@@ -681,26 +682,26 @@ void view_copy(const DstType& dst, const SrcType& src) {
         Kokkos::Impl::ViewCopy<
             typename DstType::uniform_runtime_nomemspace_type,
             typename SrcType::uniform_runtime_const_nomemspace_type,
-            Kokkos::LayoutRight, dst_execution_space, DstType::Rank, int>(dst,
+            Kokkos::LayoutRight, dst_execution_space, DstType::rank, int>(dst,
                                                                           src);
       else
         Kokkos::Impl::ViewCopy<
             typename DstType::uniform_runtime_nomemspace_type,
             typename SrcType::uniform_runtime_const_nomemspace_type,
-            Kokkos::LayoutLeft, dst_execution_space, DstType::Rank, int>(dst,
+            Kokkos::LayoutLeft, dst_execution_space, DstType::rank, int>(dst,
                                                                          src);
     } else {
       if (iterate == Kokkos::Iterate::Right)
         Kokkos::Impl::ViewCopy<
             typename DstType::uniform_runtime_nomemspace_type,
             typename SrcType::uniform_runtime_const_nomemspace_type,
-            Kokkos::LayoutRight, src_execution_space, DstType::Rank, int>(dst,
+            Kokkos::LayoutRight, src_execution_space, DstType::rank, int>(dst,
                                                                           src);
       else
         Kokkos::Impl::ViewCopy<
             typename DstType::uniform_runtime_nomemspace_type,
             typename SrcType::uniform_runtime_const_nomemspace_type,
-            Kokkos::LayoutLeft, src_execution_space, DstType::Rank, int>(dst,
+            Kokkos::LayoutLeft, src_execution_space, DstType::rank, int>(dst,
                                                                          src);
     }
   }
@@ -832,7 +833,7 @@ struct CommonSubview<DstType, SrcType, 8, Arg0, Arg1, Arg2, Arg3, Arg4, Arg5,
 
 template <class DstType, class SrcType,
           class ExecSpace = typename DstType::execution_space,
-          int Rank        = DstType::Rank>
+          int Rank        = DstType::rank>
 struct ViewRemap;
 
 template <class DstType, class SrcType, class ExecSpace>
@@ -874,7 +875,7 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 2> {
       } else {
         p_type ext1(0, std::min(dst.extent(1), src.extent(1)));
         using sv_adapter_type =
-            CommonSubview<DstType, SrcType, 2, Kokkos::Impl::ALL_t, p_type>;
+            CommonSubview<DstType, SrcType, 2, Kokkos::ALL_t, p_type>;
         sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1);
         view_copy(exec_space..., common_subview.dst_sub,
                   common_subview.src_sub);
@@ -883,7 +884,7 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 2> {
       if (dst.extent(1) == src.extent(1)) {
         p_type ext0(0, std::min(dst.extent(0), src.extent(0)));
         using sv_adapter_type =
-            CommonSubview<DstType, SrcType, 2, p_type, Kokkos::Impl::ALL_t>;
+            CommonSubview<DstType, SrcType, 2, p_type, Kokkos::ALL_t>;
         sv_adapter_type common_subview(dst, src, ext0, Kokkos::ALL);
         view_copy(exec_space..., common_subview.dst_sub,
                   common_subview.src_sub);
@@ -915,8 +916,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 3> {
       if (dst.extent(2) == src.extent(2)) {
         p_type ext1(0, std::min(dst.extent(1), src.extent(1)));
         using sv_adapter_type =
-            CommonSubview<DstType, SrcType, 3, Kokkos::Impl::ALL_t, p_type,
-                          Kokkos::Impl::ALL_t>;
+            CommonSubview<DstType, SrcType, 3, Kokkos::ALL_t, p_type,
+                          Kokkos::ALL_t>;
         sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1,
                                        Kokkos::ALL);
         view_copy(exec_space..., common_subview.dst_sub,
@@ -925,8 +926,7 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 3> {
         p_type ext1(0, std::min(dst.extent(1), src.extent(1)));
         p_type ext2(0, std::min(dst.extent(2), src.extent(2)));
         using sv_adapter_type =
-            CommonSubview<DstType, SrcType, 3, Kokkos::Impl::ALL_t, p_type,
-                          p_type>;
+            CommonSubview<DstType, SrcType, 3, Kokkos::ALL_t, p_type, p_type>;
         sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2);
         view_copy(exec_space..., common_subview.dst_sub,
                   common_subview.src_sub);
@@ -935,8 +935,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 3> {
       if (dst.extent(2) == src.extent(2)) {
         p_type ext0(0, std::min(dst.extent(0), src.extent(0)));
         p_type ext1(0, std::min(dst.extent(1), src.extent(1)));
-        using sv_adapter_type = CommonSubview<DstType, SrcType, 3, p_type,
-                                              p_type, Kokkos::Impl::ALL_t>;
+        using sv_adapter_type =
+            CommonSubview<DstType, SrcType, 3, p_type, p_type, Kokkos::ALL_t>;
         sv_adapter_type common_subview(dst, src, ext0, ext1, Kokkos::ALL);
         view_copy(exec_space..., common_subview.dst_sub,
                   common_subview.src_sub);
@@ -970,8 +970,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 4> {
         p_type ext1(0, std::min(dst.extent(1), src.extent(1)));
         p_type ext2(0, std::min(dst.extent(2), src.extent(2)));
         using sv_adapter_type =
-            CommonSubview<DstType, SrcType, 4, Kokkos::Impl::ALL_t, p_type,
-                          p_type, Kokkos::Impl::ALL_t>;
+            CommonSubview<DstType, SrcType, 4, Kokkos::ALL_t, p_type, p_type,
+                          Kokkos::ALL_t>;
         sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2,
                                        Kokkos::ALL);
         view_copy(exec_space..., common_subview.dst_sub,
@@ -981,8 +981,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 4> {
         p_type ext2(0, std::min(dst.extent(2), src.extent(2)));
         p_type ext3(0, std::min(dst.extent(3), src.extent(3)));
         using sv_adapter_type =
-            CommonSubview<DstType, SrcType, 4, Kokkos::Impl::ALL_t, p_type,
-                          p_type, p_type>;
+            CommonSubview<DstType, SrcType, 4, Kokkos::ALL_t, p_type, p_type,
+                          p_type>;
         sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3);
         view_copy(exec_space..., common_subview.dst_sub,
                   common_subview.src_sub);
@@ -992,9 +992,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 4> {
         p_type ext0(0, std::min(dst.extent(0), src.extent(0)));
         p_type ext1(0, std::min(dst.extent(1), src.extent(1)));
         p_type ext2(0, std::min(dst.extent(2), src.extent(2)));
-        using sv_adapter_type =
-            CommonSubview<DstType, SrcType, 4, p_type, p_type, p_type,
-                          Kokkos::Impl::ALL_t>;
+        using sv_adapter_type = CommonSubview<DstType, SrcType, 4, p_type,
+                                              p_type, p_type, Kokkos::ALL_t>;
         sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, Kokkos::ALL);
         view_copy(exec_space..., common_subview.dst_sub,
                   common_subview.src_sub);
@@ -1030,8 +1029,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 5> {
         p_type ext2(0, std::min(dst.extent(2), src.extent(2)));
         p_type ext3(0, std::min(dst.extent(3), src.extent(3)));
         using sv_adapter_type =
-            CommonSubview<DstType, SrcType, 5, Kokkos::Impl::ALL_t, p_type,
-                          p_type, p_type, Kokkos::Impl::ALL_t>;
+            CommonSubview<DstType, SrcType, 5, Kokkos::ALL_t, p_type, p_type,
+                          p_type, Kokkos::ALL_t>;
         sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3,
                                        Kokkos::ALL);
         view_copy(exec_space..., common_subview.dst_sub,
@@ -1042,8 +1041,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 5> {
         p_type ext3(0, std::min(dst.extent(3), src.extent(3)));
         p_type ext4(0, std::min(dst.extent(4), src.extent(4)));
         using sv_adapter_type =
-            CommonSubview<DstType, SrcType, 5, Kokkos::Impl::ALL_t, p_type,
-                          p_type, p_type, p_type>;
+            CommonSubview<DstType, SrcType, 5, Kokkos::ALL_t, p_type, p_type,
+                          p_type, p_type>;
         sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3,
                                        ext4);
         view_copy(exec_space..., common_subview.dst_sub,
@@ -1057,7 +1056,7 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 5> {
         p_type ext3(0, std::min(dst.extent(3), src.extent(3)));
         using sv_adapter_type =
             CommonSubview<DstType, SrcType, 5, p_type, p_type, p_type, p_type,
-                          Kokkos::Impl::ALL_t>;
+                          Kokkos::ALL_t>;
         sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, ext3,
                                        Kokkos::ALL);
         view_copy(exec_space..., common_subview.dst_sub,
@@ -1095,8 +1094,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 6> {
         p_type ext3(0, std::min(dst.extent(3), src.extent(3)));
         p_type ext4(0, std::min(dst.extent(4), src.extent(4)));
         using sv_adapter_type =
-            CommonSubview<DstType, SrcType, 6, Kokkos::Impl::ALL_t, p_type,
-                          p_type, p_type, p_type, Kokkos::Impl::ALL_t>;
+            CommonSubview<DstType, SrcType, 6, Kokkos::ALL_t, p_type, p_type,
+                          p_type, p_type, Kokkos::ALL_t>;
         sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3,
                                        ext4, Kokkos::ALL);
         view_copy(exec_space..., common_subview.dst_sub,
@@ -1108,8 +1107,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 6> {
         p_type ext4(0, std::min(dst.extent(4), src.extent(4)));
         p_type ext5(0, std::min(dst.extent(5), src.extent(5)));
         using sv_adapter_type =
-            CommonSubview<DstType, SrcType, 6, Kokkos::Impl::ALL_t, p_type,
-                          p_type, p_type, p_type, p_type>;
+            CommonSubview<DstType, SrcType, 6, Kokkos::ALL_t, p_type, p_type,
+                          p_type, p_type, p_type>;
         sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3,
                                        ext4, ext5);
         view_copy(exec_space..., common_subview.dst_sub,
@@ -1125,7 +1124,7 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 6> {
 
         using sv_adapter_type =
             CommonSubview<DstType, SrcType, 6, p_type, p_type, p_type, p_type,
-                          p_type, Kokkos::Impl::ALL_t>;
+                          p_type, Kokkos::ALL_t>;
         sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, ext3, ext4,
                                        Kokkos::ALL);
         view_copy(exec_space..., common_subview.dst_sub,
@@ -1169,8 +1168,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 7> {
         p_type ext4(0, std::min(dst.extent(4), src.extent(4)));
         p_type ext5(0, std::min(dst.extent(5), src.extent(5)));
         using sv_adapter_type =
-            CommonSubview<DstType, SrcType, 7, Kokkos::Impl::ALL_t, p_type,
-                          p_type, p_type, p_type, p_type, Kokkos::Impl::ALL_t>;
+            CommonSubview<DstType, SrcType, 7, Kokkos::ALL_t, p_type, p_type,
+                          p_type, p_type, p_type, Kokkos::ALL_t>;
         sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3,
                                        ext4, ext5, Kokkos::ALL);
         view_copy(exec_space..., common_subview.dst_sub,
@@ -1183,8 +1182,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 7> {
         p_type ext5(0, std::min(dst.extent(5), src.extent(5)));
         p_type ext6(0, std::min(dst.extent(6), src.extent(6)));
         using sv_adapter_type =
-            CommonSubview<DstType, SrcType, 7, Kokkos::Impl::ALL_t, p_type,
-                          p_type, p_type, p_type, p_type, p_type>;
+            CommonSubview<DstType, SrcType, 7, Kokkos::ALL_t, p_type, p_type,
+                          p_type, p_type, p_type, p_type>;
         sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3,
                                        ext4, ext5, ext6);
         view_copy(exec_space..., common_subview.dst_sub,
@@ -1200,7 +1199,7 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 7> {
         p_type ext5(0, std::min(dst.extent(5), src.extent(5)));
         using sv_adapter_type =
             CommonSubview<DstType, SrcType, 7, p_type, p_type, p_type, p_type,
-                          p_type, p_type, Kokkos::Impl::ALL_t>;
+                          p_type, p_type, Kokkos::ALL_t>;
         sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, ext3, ext4,
                                        ext5, Kokkos::ALL);
         view_copy(exec_space..., common_subview.dst_sub,
@@ -1245,9 +1244,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 8> {
         p_type ext5(0, std::min(dst.extent(5), src.extent(5)));
         p_type ext6(0, std::min(dst.extent(6), src.extent(6)));
         using sv_adapter_type =
-            CommonSubview<DstType, SrcType, 8, Kokkos::Impl::ALL_t, p_type,
-                          p_type, p_type, p_type, p_type, p_type,
-                          Kokkos::Impl::ALL_t>;
+            CommonSubview<DstType, SrcType, 8, Kokkos::ALL_t, p_type, p_type,
+                          p_type, p_type, p_type, p_type, Kokkos::ALL_t>;
         sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3,
                                        ext4, ext5, ext6, Kokkos::ALL);
         view_copy(exec_space..., common_subview.dst_sub,
@@ -1261,8 +1259,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 8> {
         p_type ext6(0, std::min(dst.extent(6), src.extent(6)));
         p_type ext7(0, std::min(dst.extent(7), src.extent(7)));
         using sv_adapter_type =
-            CommonSubview<DstType, SrcType, 8, Kokkos::Impl::ALL_t, p_type,
-                          p_type, p_type, p_type, p_type, p_type, p_type>;
+            CommonSubview<DstType, SrcType, 8, Kokkos::ALL_t, p_type, p_type,
+                          p_type, p_type, p_type, p_type, p_type>;
         sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3,
                                        ext4, ext5, ext6, ext7);
         view_copy(exec_space..., common_subview.dst_sub,
@@ -1279,7 +1277,7 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 8> {
         p_type ext6(0, std::min(dst.extent(6), src.extent(6)));
         using sv_adapter_type =
             CommonSubview<DstType, SrcType, 8, p_type, p_type, p_type, p_type,
-                          p_type, p_type, p_type, Kokkos::Impl::ALL_t>;
+                          p_type, p_type, p_type, Kokkos::ALL_t>;
         sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, ext3, ext4,
                                        ext5, ext6, Kokkos::ALL);
         view_copy(exec_space..., common_subview.dst_sub,
@@ -1313,7 +1311,7 @@ inline void contiguous_fill(
   using ViewTypeFlat = Kokkos::View<
       typename ViewType::value_type*, Kokkos::LayoutRight,
       Kokkos::Device<typename ViewType::execution_space,
-                     std::conditional_t<ViewType::Rank == 0,
+                     std::conditional_t<ViewType::rank == 0,
                                         typename ViewType::memory_space,
                                         Kokkos::AnonymousSpace>>,
       Kokkos::MemoryTraits<0>>;
@@ -1321,23 +1319,23 @@ inline void contiguous_fill(
   ViewTypeFlat dst_flat(dst.data(), dst.size());
   if (dst.span() < static_cast<size_t>(std::numeric_limits<int>::max())) {
     Kokkos::Impl::ViewFill<ViewTypeFlat, Kokkos::LayoutRight, ExecutionSpace,
-                           ViewTypeFlat::Rank, int>(dst_flat, value,
+                           ViewTypeFlat::rank, int>(dst_flat, value,
                                                     exec_space);
   } else
     Kokkos::Impl::ViewFill<ViewTypeFlat, Kokkos::LayoutRight, ExecutionSpace,
-                           ViewTypeFlat::Rank, int64_t>(dst_flat, value,
+                           ViewTypeFlat::rank, int64_t>(dst_flat, value,
                                                         exec_space);
 }
 
-template <typename ExecutionSpace, class DT, class... DP>
+// Default implementation for execution spaces that don't provide a definition
+template <typename ExecutionSpace, class ViewType>
 struct ZeroMemset {
-  ZeroMemset(const ExecutionSpace& exec_space, const View<DT, DP...>& dst,
-             typename ViewTraits<DT, DP...>::const_value_type& value) {
+  ZeroMemset(const ExecutionSpace& exec_space, const ViewType& dst,
+             typename ViewType::const_value_type& value) {
     contiguous_fill(exec_space, dst, value);
   }
 
-  ZeroMemset(const View<DT, DP...>& dst,
-             typename ViewTraits<DT, DP...>::const_value_type& value) {
+  ZeroMemset(const ViewType& dst, typename ViewType::const_value_type& value) {
     contiguous_fill(ExecutionSpace(), dst, value);
   }
 };
@@ -1354,7 +1352,7 @@ contiguous_fill_or_memset(
 // leading to the significant performance issues
 #ifndef KOKKOS_ARCH_A64FX
   if (Impl::is_zero_byte(value))
-    ZeroMemset<ExecutionSpace, DT, DP...>(exec_space, dst, value);
+    ZeroMemset<ExecutionSpace, View<DT, DP...>>(exec_space, dst, value);
   else
 #endif
     contiguous_fill(exec_space, dst, value);
@@ -1386,7 +1384,7 @@ contiguous_fill_or_memset(
 // leading to the significant performance issues
 #ifndef KOKKOS_ARCH_A64FX
   if (Impl::is_zero_byte(value))
-    ZeroMemset<exec_space_type, DT, DP...>(dst, value);
+    ZeroMemset<exec_space_type, View<DT, DP...>>(dst, value);
   else
 #endif
     contiguous_fill(exec_space_type(), dst, value);
@@ -1450,7 +1448,7 @@ inline void deep_copy(
   }
 
   // Figure out iteration order to do the ViewFill
-  int64_t strides[ViewType::Rank + 1];
+  int64_t strides[ViewType::rank + 1];
   dst.stride(strides);
   Kokkos::Iterate iterate;
   if (std::is_same<typename ViewType::array_layout,
@@ -1461,7 +1459,7 @@ inline void deep_copy(
     iterate = Kokkos::Iterate::Left;
   } else if (std::is_same<typename ViewType::array_layout,
                           Kokkos::LayoutStride>::value) {
-    if (strides[0] > strides[ViewType::Rank > 0 ? ViewType::Rank - 1 : 0])
+    if (strides[0] > strides[ViewType::rank > 0 ? ViewType::rank - 1 : 0])
       iterate = Kokkos::Iterate::Right;
     else
       iterate = Kokkos::Iterate::Left;
@@ -1476,26 +1474,26 @@ inline void deep_copy(
   // Lets call the right ViewFill functor based on integer space needed and
   // iteration type
   using ViewTypeUniform =
-      std::conditional_t<ViewType::Rank == 0,
+      std::conditional_t<ViewType::rank == 0,
                          typename ViewType::uniform_runtime_type,
                          typename ViewType::uniform_runtime_nomemspace_type>;
   if (dst.span() > static_cast<size_t>(std::numeric_limits<int>::max())) {
     if (iterate == Kokkos::Iterate::Right)
       Kokkos::Impl::ViewFill<ViewTypeUniform, Kokkos::LayoutRight,
-                             exec_space_type, ViewType::Rank, int64_t>(
+                             exec_space_type, ViewType::rank, int64_t>(
           dst, value, exec_space_type());
     else
       Kokkos::Impl::ViewFill<ViewTypeUniform, Kokkos::LayoutLeft,
-                             exec_space_type, ViewType::Rank, int64_t>(
+                             exec_space_type, ViewType::rank, int64_t>(
           dst, value, exec_space_type());
   } else {
     if (iterate == Kokkos::Iterate::Right)
       Kokkos::Impl::ViewFill<ViewTypeUniform, Kokkos::LayoutRight,
-                             exec_space_type, ViewType::Rank, int>(
+                             exec_space_type, ViewType::rank, int>(
           dst, value, exec_space_type());
     else
       Kokkos::Impl::ViewFill<ViewTypeUniform, Kokkos::LayoutLeft,
-                             exec_space_type, ViewType::Rank, int>(
+                             exec_space_type, ViewType::rank, int>(
           dst, value, exec_space_type());
   }
   Kokkos::fence("Kokkos::deep_copy: scalar copy, post copy fence");
@@ -1639,19 +1637,19 @@ inline void deep_copy(
           "match: ");
       message += dst.label();
       message += "(";
-      for (int r = 0; r < dst_type::Rank - 1; r++) {
-        message += std::to_string(dst.extent(r));
+      message += std::to_string(dst.extent(0));
+      for (size_t r = 1; r < dst_type::rank; r++) {
         message += ",";
+        message += std::to_string(dst.extent(r));
       }
-      message += std::to_string(dst.extent(dst_type::Rank - 1));
       message += ") ";
       message += src.label();
       message += "(";
-      for (int r = 0; r < src_type::Rank - 1; r++) {
-        message += std::to_string(src.extent(r));
+      message += std::to_string(src.extent(0));
+      for (size_t r = 1; r < src_type::rank; r++) {
         message += ",";
+        message += std::to_string(src.extent(r));
       }
-      message += std::to_string(src.extent(src_type::Rank - 1));
       message += ") ";
 
       Kokkos::Impl::throw_runtime_exception(message);
@@ -1722,19 +1720,19 @@ inline void deep_copy(
         "Deprecation Error: Kokkos::deep_copy extents of views don't match: ");
     message += dst.label();
     message += "(";
-    for (int r = 0; r < dst_type::Rank - 1; r++) {
-      message += std::to_string(dst.extent(r));
+    message += std::to_string(dst.extent(0));
+    for (size_t r = 1; r < dst_type::rank; r++) {
       message += ",";
+      message += std::to_string(dst.extent(r));
     }
-    message += std::to_string(dst.extent(dst_type::Rank - 1));
     message += ") ";
     message += src.label();
     message += "(";
-    for (int r = 0; r < src_type::Rank - 1; r++) {
-      message += std::to_string(src.extent(r));
+    message += std::to_string(src.extent(0));
+    for (size_t r = 1; r < src_type::rank; r++) {
       message += ",";
+      message += std::to_string(src.extent(r));
     }
-    message += std::to_string(src.extent(src_type::Rank - 1));
     message += ") ";
 
     Kokkos::Impl::throw_runtime_exception(message);
@@ -1761,7 +1759,7 @@ inline void deep_copy(
     Kokkos::fence(
         "Kokkos::deep_copy: copy between contiguous views, pre view equality "
         "check");
-    if ((void*)dst.data() != (void*)src.data()) {
+    if ((void*)dst.data() != (void*)src.data() && 0 < nbytes) {
       Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space>(
           dst.data(), src.data(), nbytes);
       Kokkos::fence(
@@ -2562,7 +2560,7 @@ inline void deep_copy(
   } else {
     using ViewType = View<DT, DP...>;
     // Figure out iteration order to do the ViewFill
-    int64_t strides[ViewType::Rank + 1];
+    int64_t strides[ViewType::rank + 1];
     dst.stride(strides);
     Kokkos::Iterate iterate;
     if (std::is_same<typename ViewType::array_layout,
@@ -2573,7 +2571,7 @@ inline void deep_copy(
       iterate = Kokkos::Iterate::Left;
     } else if (std::is_same<typename ViewType::array_layout,
                             Kokkos::LayoutStride>::value) {
-      if (strides[0] > strides[ViewType::Rank > 0 ? ViewType::Rank - 1 : 0])
+      if (strides[0] > strides[ViewType::rank > 0 ? ViewType::rank - 1 : 0])
         iterate = Kokkos::Iterate::Right;
       else
         iterate = Kokkos::Iterate::Left;
@@ -2588,23 +2586,23 @@ inline void deep_copy(
     // Lets call the right ViewFill functor based on integer space needed and
     // iteration type
     using ViewTypeUniform =
-        std::conditional_t<ViewType::Rank == 0,
+        std::conditional_t<ViewType::rank == 0,
                            typename ViewType::uniform_runtime_type,
                            typename ViewType::uniform_runtime_nomemspace_type>;
     if (dst.span() > static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
       if (iterate == Kokkos::Iterate::Right)
         Kokkos::Impl::ViewFill<ViewTypeUniform, Kokkos::LayoutRight, ExecSpace,
-                               ViewType::Rank, int64_t>(dst, value, space);
+                               ViewType::rank, int64_t>(dst, value, space);
       else
         Kokkos::Impl::ViewFill<ViewTypeUniform, Kokkos::LayoutLeft, ExecSpace,
-                               ViewType::Rank, int64_t>(dst, value, space);
+                               ViewType::rank, int64_t>(dst, value, space);
     } else {
       if (iterate == Kokkos::Iterate::Right)
         Kokkos::Impl::ViewFill<ViewTypeUniform, Kokkos::LayoutRight, ExecSpace,
-                               ViewType::Rank, int32_t>(dst, value, space);
+                               ViewType::rank, int32_t>(dst, value, space);
       else
         Kokkos::Impl::ViewFill<ViewTypeUniform, Kokkos::LayoutLeft, ExecSpace,
-                               ViewType::Rank, int32_t>(dst, value, space);
+                               ViewType::rank, int32_t>(dst, value, space);
     }
   }
   if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
@@ -2646,7 +2644,7 @@ inline void deep_copy(
       Impl::contiguous_fill_or_memset(fill_exec_space(), dst, value);
     } else {
       using ViewTypeUniform = std::conditional_t<
-          View<DT, DP...>::Rank == 0,
+          View<DT, DP...>::rank == 0,
           typename View<DT, DP...>::uniform_runtime_type,
           typename View<DT, DP...>::uniform_runtime_nomemspace_type>;
       Kokkos::Impl::ViewFill<ViewTypeUniform, typename dst_traits::array_layout,
@@ -2803,19 +2801,19 @@ inline void deep_copy(
           "match: ");
       message += dst.label();
       message += "(";
-      for (int r = 0; r < dst_type::Rank - 1; r++) {
-        message += std::to_string(dst.extent(r));
+      message += std::to_string(dst.extent(0));
+      for (size_t r = 1; r < dst_type::rank; r++) {
         message += ",";
+        message += std::to_string(dst.extent(r));
       }
-      message += std::to_string(dst.extent(dst_type::Rank - 1));
       message += ") ";
       message += src.label();
       message += "(";
-      for (int r = 0; r < src_type::Rank - 1; r++) {
-        message += std::to_string(src.extent(r));
+      message += std::to_string(src.extent(0));
+      for (size_t r = 1; r < src_type::rank; r++) {
         message += ",";
+        message += std::to_string(src.extent(r));
       }
-      message += std::to_string(src.extent(src_type::Rank - 1));
       message += ") ";
 
       Kokkos::Impl::throw_runtime_exception(message);
@@ -2872,19 +2870,19 @@ inline void deep_copy(
         "Deprecation Error: Kokkos::deep_copy extents of views don't match: ");
     message += dst.label();
     message += "(";
-    for (int r = 0; r < dst_type::Rank - 1; r++) {
-      message += std::to_string(dst.extent(r));
+    message += std::to_string(dst.extent(0));
+    for (size_t r = 1; r < dst_type::rank; r++) {
       message += ",";
+      message += std::to_string(dst.extent(r));
     }
-    message += std::to_string(dst.extent(dst_type::Rank - 1));
     message += ") ";
     message += src.label();
     message += "(";
-    for (int r = 0; r < src_type::Rank - 1; r++) {
-      message += std::to_string(src.extent(r));
+    message += std::to_string(src.extent(0));
+    for (size_t r = 1; r < src_type::rank; r++) {
       message += ",";
+      message += std::to_string(src.extent(r));
     }
-    message += std::to_string(src.extent(src_type::Rank - 1));
     message += ") ";
 
     Kokkos::Impl::throw_runtime_exception(message);
@@ -2908,7 +2906,7 @@ inline void deep_copy(
       ((dst_type::rank < 7) || (dst.stride_6() == src.stride_6())) &&
       ((dst_type::rank < 8) || (dst.stride_7() == src.stride_7()))) {
     const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
-    if ((void*)dst.data() != (void*)src.data()) {
+    if ((void*)dst.data() != (void*)src.data() && 0 < nbytes) {
       Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space, ExecSpace>(
           exec_space, dst.data(), src.data(), nbytes);
     }
@@ -3439,27 +3437,33 @@ struct MirrorType {
   using view_type = Kokkos::View<data_type, array_layout, Space>;
 };
 
+template <class... ViewCtorArgs>
+void check_view_ctor_args_create_mirror() {
+  using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
+
+  static_assert(
+      !alloc_prop_input::has_label,
+      "The view constructor arguments passed to Kokkos::create_mirror[_view] "
+      "must not include a label!");
+  static_assert(!alloc_prop_input::has_pointer,
+                "The view constructor arguments passed to "
+                "Kokkos::create_mirror[_view] must "
+                "not include a pointer!");
+  static_assert(!alloc_prop_input::allow_padding,
+                "The view constructor arguments passed to "
+                "Kokkos::create_mirror[_view] must "
+                "not explicitly allow padding!");
+}
+
 template <class T, class... P, class... ViewCtorArgs>
 inline std::enable_if_t<!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space,
                         typename Kokkos::View<T, P...>::HostMirror>
 create_mirror(const Kokkos::View<T, P...>& src,
               const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
-  using src_type         = View<T, P...>;
-  using dst_type         = typename src_type::HostMirror;
-  using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
+  using src_type = View<T, P...>;
+  using dst_type = typename src_type::HostMirror;
 
-  static_assert(
-      !alloc_prop_input::has_label,
-      "The view constructor arguments passed to Kokkos::create_mirror "
-      "must not include a label!");
-  static_assert(
-      !alloc_prop_input::has_pointer,
-      "The view constructor arguments passed to Kokkos::create_mirror must "
-      "not include a pointer!");
-  static_assert(
-      !alloc_prop_input::allow_padding,
-      "The view constructor arguments passed to Kokkos::create_mirror must "
-      "not explicitly allow padding!");
+  check_view_ctor_args_create_mirror<ViewCtorArgs...>();
 
   auto prop_copy = Impl::with_properties_if_unset(
       arg_prop, std::string(src.label()).append("_mirror"));
@@ -3473,20 +3477,7 @@ template <class T, class... P, class... ViewCtorArgs,
               Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
 auto create_mirror(const Kokkos::View<T, P...>& src,
                    const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
-  using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
-
-  static_assert(
-      !alloc_prop_input::has_label,
-      "The view constructor arguments passed to Kokkos::create_mirror "
-      "must not include a label!");
-  static_assert(
-      !alloc_prop_input::has_pointer,
-      "The view constructor arguments passed to Kokkos::create_mirror must "
-      "not include a pointer!");
-  static_assert(
-      !alloc_prop_input::allow_padding,
-      "The view constructor arguments passed to Kokkos::create_mirror must "
-      "not explicitly allow padding!");
+  check_view_ctor_args_create_mirror<ViewCtorArgs...>();
 
   auto prop_copy = Impl::with_properties_if_unset(
       arg_prop, std::string(src.label()).append("_mirror"));
@@ -3562,6 +3553,7 @@ inline std::enable_if_t<
     typename Kokkos::View<T, P...>::HostMirror>
 create_mirror_view(const Kokkos::View<T, P...>& src,
                    const Impl::ViewCtorProp<ViewCtorArgs...>&) {
+  check_view_ctor_args_create_mirror<ViewCtorArgs...>();
   return src;
 }
 
@@ -3592,6 +3584,7 @@ std::enable_if_t<Impl::MirrorViewType<
                      T, P...>::view_type>
 create_mirror_view(const Kokkos::View<T, P...>& src,
                    const Impl::ViewCtorProp<ViewCtorArgs...>&) {
+  check_view_ctor_args_create_mirror<ViewCtorArgs...>();
   return src;
 }
 
diff --git a/packages/kokkos/core/src/Kokkos_Core.hpp b/packages/kokkos/core/src/Kokkos_Core.hpp
index cf898a71e7ad2ebf2154f08d11d3d31d1a494968..805411a699ec28854971dee696f8cd430884f04c 100644
--- a/packages/kokkos/core/src/Kokkos_Core.hpp
+++ b/packages/kokkos/core/src/Kokkos_Core.hpp
@@ -53,6 +53,7 @@
 #include <Kokkos_MathematicalFunctions.hpp>
 #include <Kokkos_MathematicalSpecialFunctions.hpp>
 #include <Kokkos_NumericTraits.hpp>
+#include <Kokkos_BitManipulation.hpp>
 #include <Kokkos_MemoryPool.hpp>
 #include <Kokkos_Array.hpp>
 #include <Kokkos_View.hpp>
@@ -99,6 +100,9 @@ void declare_configuration_metadata(const std::string& category,
 [[nodiscard]] bool is_initialized() noexcept;
 [[nodiscard]] bool is_finalized() noexcept;
 
+[[nodiscard]] int device_id() noexcept;
+[[nodiscard]] int num_threads() noexcept;
+
 bool show_warnings() noexcept;
 bool tune_internals() noexcept;
 
@@ -270,7 +274,7 @@ std::vector<ExecSpace> partition_space(ExecSpace const& space, Args...) {
 
 template <class ExecSpace, class T>
 std::vector<ExecSpace> partition_space(ExecSpace const& space,
-                                       std::vector<T>& weights) {
+                                       std::vector<T> const& weights) {
   static_assert(is_execution_space<ExecSpace>::value,
                 "Kokkos Error: partition_space expects an Execution Space as "
                 "first argument");
diff --git a/packages/kokkos/core/src/Kokkos_Core_fwd.hpp b/packages/kokkos/core/src/Kokkos_Core_fwd.hpp
index a0b4df0c4bd09cf6726f5badf087f061573eb31a..44f1c5b42f4d48a50909ba66c8e9fab02d249ae0 100644
--- a/packages/kokkos/core/src/Kokkos_Core_fwd.hpp
+++ b/packages/kokkos/core/src/Kokkos_Core_fwd.hpp
@@ -26,6 +26,7 @@
 // and compiler environment then sets a collection of #define macros.
 
 #include <Kokkos_Macros.hpp>
+#include <Kokkos_Printf.hpp>
 #include <impl/Kokkos_Error.hpp>
 #include <impl/Kokkos_Utilities.hpp>
 
@@ -34,11 +35,15 @@
 #endif
 
 //----------------------------------------------------------------------------
-// Have assumed a 64bit build (8byte pointers) throughout the code base.
-
+// Have assumed a 64-bit build (8-byte pointers) throughout the code base.
+// 32-bit build allowed but unsupported.
+#ifdef KOKKOS_IMPL_32BIT
+static_assert(sizeof(void *) == 4,
+              "Kokkos assumes 64-bit build; i.e., 4-byte pointers");
+#else
 static_assert(sizeof(void *) == 8,
               "Kokkos assumes 64-bit build; i.e., 8-byte pointers");
-
+#endif
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
@@ -293,12 +298,9 @@ template <class DstSpace, class SrcSpace,
           class Enable         = void>
 struct DeepCopy;
 
-template <typename ExecutionSpace, class DT, class... DP>
-struct ZeroMemset;
-
 template <class ViewType, class Layout = typename ViewType::array_layout,
           class ExecSpace = typename ViewType::execution_space,
-          int Rank = ViewType::Rank, typename iType = int64_t>
+          int Rank = ViewType::rank, typename iType = int64_t>
 struct ViewFill;
 
 template <class ViewTypeA, class ViewTypeB, class Layout, class ExecSpace,
@@ -325,11 +327,14 @@ class ParallelFor;
 ///
 /// This is an implementation detail of parallel_reduce.  Users should
 /// skip this and go directly to the nonmember function parallel_reduce.
-template <class FunctorType, class ExecPolicy, class ReducerType = InvalidType,
-          class ExecutionSpace = typename Impl::FunctorPolicyExecutionSpace<
-              FunctorType, ExecPolicy>::execution_space>
+template <typename CombinedFunctorReducerType, typename PolicyType,
+          typename ExecutionSpaceType>
 class ParallelReduce;
 
+template <typename FunctorType, typename FunctorAnalysisReducerType,
+          typename Enable = void>
+class CombinedFunctorReducer;
+
 /// \class ParallelScan
 /// \brief Implementation detail of parallel_scan.
 ///
diff --git a/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp b/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp
index 357c2572a5c66880e068cdc72bae3a61fa45dcf9..ae1585a4989f2db19816587893a88218f10e2553 100644
--- a/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp
+++ b/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp
@@ -738,20 +738,11 @@ struct ThreadVectorRangeBoundariesStruct {
                                               const index_type& count) noexcept
       : start(static_cast<index_type>(0)), end(count) {}
 
-  KOKKOS_INLINE_FUNCTION
-  constexpr ThreadVectorRangeBoundariesStruct(const index_type& count) noexcept
-      : start(static_cast<index_type>(0)), end(count) {}
-
   KOKKOS_INLINE_FUNCTION
   constexpr ThreadVectorRangeBoundariesStruct(
       const TeamMemberType, const index_type& arg_begin,
       const index_type& arg_end) noexcept
       : start(static_cast<index_type>(arg_begin)), end(arg_end) {}
-
-  KOKKOS_INLINE_FUNCTION
-  constexpr ThreadVectorRangeBoundariesStruct(
-      const index_type& arg_begin, const index_type& arg_end) noexcept
-      : start(static_cast<index_type>(arg_begin)), end(arg_end) {}
 };
 
 template <class TeamMemberType>
diff --git a/packages/kokkos/core/src/Kokkos_Graph.hpp b/packages/kokkos/core/src/Kokkos_Graph.hpp
index 7f77c00b2d7b8f03c5d3abfa26628b51e314f884..643bdcc02ccc904c80ae3fe0b7e1712de18af819 100644
--- a/packages/kokkos/core/src/Kokkos_Graph.hpp
+++ b/packages/kokkos/core/src/Kokkos_Graph.hpp
@@ -161,6 +161,12 @@ Graph<ExecutionSpace> create_graph(Closure&& arg_closure) {
 #include <impl/Kokkos_GraphNodeImpl.hpp>
 #include <impl/Kokkos_Default_Graph_Impl.hpp>
 #include <Cuda/Kokkos_Cuda_Graph_Impl.hpp>
+#if defined(KOKKOS_ENABLE_HIP)
+// The implementation of hipGraph in ROCm 5.2 is bugged, so we cannot use it.
+#if !((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 2))
+#include <HIP/Kokkos_HIP_Graph_Impl.hpp>
+#endif
+#endif
 #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_GRAPH
 #undef KOKKOS_IMPL_PUBLIC_INCLUDE
 #undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_GRAPH
diff --git a/packages/kokkos/core/src/Kokkos_GraphNode.hpp b/packages/kokkos/core/src/Kokkos_GraphNode.hpp
index 1cfd2b382b2e7b409e39ce3b921f66c99d9be106..2a4e2cf6414a55f8283641dcc10ca8a71f16e2dd 100644
--- a/packages/kokkos/core/src/Kokkos_GraphNode.hpp
+++ b/packages/kokkos/core/src/Kokkos_GraphNode.hpp
@@ -376,14 +376,30 @@ class GraphNodeRef {
     auto policy = Experimental::require((Policy &&) arg_policy,
                                         Kokkos::Impl::KernelInGraphProperty{});
 
+    using passed_reducer_type = typename return_value_adapter::reducer_type;
+
+    using reducer_selector = Kokkos::Impl::if_c<
+        std::is_same<InvalidType, passed_reducer_type>::value, functor_type,
+        passed_reducer_type>;
+    using analysis = Kokkos::Impl::FunctorAnalysis<
+        Kokkos::Impl::FunctorPatternInterface::REDUCE, Policy,
+        typename reducer_selector::type,
+        typename return_value_adapter::value_type>;
+    typename analysis::Reducer final_reducer(
+        reducer_selector::select(functor, return_value));
+    Kokkos::Impl::CombinedFunctorReducer<functor_type,
+                                         typename analysis::Reducer>
+        functor_reducer(functor, final_reducer);
+
     using next_policy_t = decltype(policy);
-    using next_kernel_t = Kokkos::Impl::GraphNodeKernelImpl<
-        ExecutionSpace, next_policy_t, functor_type, Kokkos::ParallelReduceTag,
-        typename return_value_adapter::reducer_type>;
+    using next_kernel_t =
+        Kokkos::Impl::GraphNodeKernelImpl<ExecutionSpace, next_policy_t,
+                                          decltype(functor_reducer),
+                                          Kokkos::ParallelReduceTag>;
 
     return this->_then_kernel(next_kernel_t{
         std::move(arg_name), graph_impl_ptr->get_execution_space(),
-        (Functor &&) functor, (Policy &&) policy,
+        functor_reducer, (Policy &&) policy,
         return_value_adapter::return_value(return_value, functor)});
   }
 
diff --git a/packages/kokkos/core/src/Kokkos_HBWSpace.hpp b/packages/kokkos/core/src/Kokkos_HBWSpace.hpp
index d9064a298339e6e611c90dd5c9b60336cc7e2527..369b7bafb7b8f54253625b0290502e45f42ded58 100644
--- a/packages/kokkos/core/src/Kokkos_HBWSpace.hpp
+++ b/packages/kokkos/core/src/Kokkos_HBWSpace.hpp
@@ -31,41 +31,6 @@ namespace Kokkos {
 
 namespace Experimental {
 
-namespace Impl {
-
-/// \brief Initialize lock array for arbitrary size atomics.
-///
-/// Arbitrary atomics are implemented using a hash table of locks
-/// where the hash value is derived from the address of the
-/// object for which an atomic operation is performed.
-/// This function initializes the locks to zero (unset).
-void init_lock_array_hbw_space();
-
-/// \brief Acquire a lock for the address
-///
-/// This function tries to acquire the lock for the hash value derived
-/// from the provided ptr. If the lock is successfully acquired the
-/// function returns true. Otherwise it returns false.
-bool lock_address_hbw_space(void* ptr);
-
-/// \brief Release lock for the address
-///
-/// This function releases the lock for the hash value derived
-/// from the provided ptr. This function should only be called
-/// after previously successfully acquiring a lock with
-/// lock_address.
-void unlock_address_hbw_space(void* ptr);
-
-}  // namespace Impl
-
-}  // namespace Experimental
-
-}  // namespace Kokkos
-
-namespace Kokkos {
-
-namespace Experimental {
-
 /// \class HBWSpace
 /// \brief Memory management for host memory.
 ///
diff --git a/packages/kokkos/core/src/Kokkos_HPX.hpp b/packages/kokkos/core/src/Kokkos_HPX.hpp
deleted file mode 100644
index 06693852622df02fb747228f7e1a4aa1ad0625a3..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/Kokkos_HPX.hpp
+++ /dev/null
@@ -1,2365 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
-#include <Kokkos_Macros.hpp>
-static_assert(false,
-              "Including non-public Kokkos header files is not allowed.");
-#endif
-#ifndef KOKKOS_HPX_HPP
-#define KOKKOS_HPX_HPP
-
-#include <Kokkos_Macros.hpp>
-#if defined(KOKKOS_ENABLE_HPX)
-
-#include <Kokkos_Core_fwd.hpp>
-
-#include <Kokkos_HostSpace.hpp>
-#include <cstddef>
-#include <iosfwd>
-
-#ifdef KOKKOS_ENABLE_HBWSPACE
-#include <Kokkos_HBWSpace.hpp>
-#endif
-
-#include <Kokkos_HostSpace.hpp>
-#include <Kokkos_Layout.hpp>
-#include <Kokkos_MemoryTraits.hpp>
-#include <Kokkos_Parallel.hpp>
-#include <Kokkos_ScratchSpace.hpp>
-#include <Kokkos_TaskScheduler.hpp>
-#include <impl/Kokkos_ConcurrentBitset.hpp>
-#include <impl/Kokkos_FunctorAnalysis.hpp>
-#include <impl/Kokkos_Tools.hpp>
-#include <impl/Kokkos_TaskQueue.hpp>
-#include <impl/Kokkos_InitializationSettings.hpp>
-
-#include <KokkosExp_MDRangePolicy.hpp>
-
-#include <hpx/local/algorithm.hpp>
-#include <hpx/local/barrier.hpp>
-#include <hpx/local/condition_variable.hpp>
-#include <hpx/local/execution.hpp>
-#include <hpx/local/future.hpp>
-#include <hpx/local/init.hpp>
-#include <hpx/local/mutex.hpp>
-#include <hpx/local/runtime.hpp>
-#include <hpx/local/thread.hpp>
-
-#include <Kokkos_UniqueToken.hpp>
-
-#include <functional>
-#include <iostream>
-#include <memory>
-#include <sstream>
-#include <type_traits>
-#include <vector>
-
-// There are currently two different implementations for the parallel dispatch
-// functions:
-//
-// - 0: The HPX way. Unfortunately, this comes with unnecessary
-//      overheads at the moment, so there is
-// - 1: The manual way. This uses for_loop, but only spawns one task per worker
-//      thread. This is significantly faster in most cases.
-//
-// In the long run 0 should be the preferred implementation, but until HPX is
-// improved 1 will be the default.
-#ifndef KOKKOS_HPX_IMPLEMENTATION
-#define KOKKOS_HPX_IMPLEMENTATION 1
-#endif
-
-#if (KOKKOS_HPX_IMPLEMENTATION < 0) || (KOKKOS_HPX_IMPLEMENTATION > 1)
-#error "You have chosen an invalid value for KOKKOS_HPX_IMPLEMENTATION"
-#endif
-
-// [note 1]
-//
-// When using the asynchronous backend and independent instances, we explicitly
-// reset the shared data at the end of a parallel task (execute_task). We do
-// this to avoid circular references with shared pointers that would otherwise
-// never be released.
-//
-// The HPX instance holds shared data for the instance in a shared_ptr. One of
-// the pieces of shared data is the future that we use to sequence parallel
-// dispatches. When a parallel task is launched, a copy of the closure
-// (ParallelFor, ParallelReduce, etc.) is captured in the task. The closure
-// also holds the policy, the policy holds the HPX instance, the instance holds
-// the shared data (for use of buffers in the parallel task). When attaching a
-// continuation to a future, the continuation is stored in the future (shared
-// state). This means that there is a cycle future -> continuation -> closure
-// -> policy -> HPX -> shared data -> future. We break this by releasing the
-// shared data early, as (the pointer to) the shared data will not be used
-// anymore by the closure at the end of execute_task.
-//
-// We also mark the shared instance data as mutable so that we can reset it
-// from the const execute_task member function.
-
-namespace Kokkos {
-namespace Impl {
-class thread_buffer {
-  static constexpr std::size_t m_cache_line_size = 64;
-
-  std::size_t m_num_threads;
-  std::size_t m_size_per_thread;
-  std::size_t m_size_total;
-  char *m_data;
-
-  void pad_to_cache_line(std::size_t &size) {
-    size = ((size + m_cache_line_size - 1) / m_cache_line_size) *
-           m_cache_line_size;
-  }
-
- public:
-  thread_buffer()
-      : m_num_threads(0),
-        m_size_per_thread(0),
-        m_size_total(0),
-        m_data(nullptr) {}
-  thread_buffer(const std::size_t num_threads,
-                const std::size_t size_per_thread) {
-    resize(num_threads, size_per_thread);
-  }
-  ~thread_buffer() { delete[] m_data; }
-
-  thread_buffer(const thread_buffer &) = delete;
-  thread_buffer(thread_buffer &&)      = delete;
-  thread_buffer &operator=(const thread_buffer &) = delete;
-  thread_buffer &operator=(thread_buffer) = delete;
-
-  void resize(const std::size_t num_threads,
-              const std::size_t size_per_thread) {
-    m_num_threads     = num_threads;
-    m_size_per_thread = size_per_thread;
-
-    pad_to_cache_line(m_size_per_thread);
-
-    std::size_t size_total_new = m_num_threads * m_size_per_thread;
-
-    if (m_size_total < size_total_new) {
-      delete[] m_data;
-      m_data       = new char[size_total_new];
-      m_size_total = size_total_new;
-    }
-  }
-
-  char *get(std::size_t thread_num) {
-    assert(thread_num < m_num_threads);
-    if (m_data == nullptr) {
-      return nullptr;
-    }
-    return &m_data[thread_num * m_size_per_thread];
-  }
-
-  std::size_t size_per_thread() const noexcept { return m_size_per_thread; }
-  std::size_t size_total() const noexcept { return m_size_total; }
-};
-}  // namespace Impl
-
-namespace Experimental {
-class HPX {
- public:
-  static constexpr uint32_t impl_default_instance_id() { return 1; }
-
- private:
-  static bool m_hpx_initialized;
-  uint32_t m_instance_id = impl_default_instance_id();
-
-#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
-  static std::atomic<uint32_t> m_next_instance_id;
-
- public:
-  enum class instance_mode { default_, independent };
-
- private:
-  static uint32_t m_active_parallel_region_count;
-  static hpx::spinlock m_active_parallel_region_count_mutex;
-  static hpx::condition_variable_any m_active_parallel_region_count_cond;
-
-  struct instance_data {
-    instance_data() = default;
-    instance_data(hpx::shared_future<void> future) : m_future(future) {}
-    Kokkos::Impl::thread_buffer m_buffer;
-    hpx::shared_future<void> m_future = hpx::make_ready_future<void>();
-    hpx::spinlock m_future_mutex;
-  };
-
-  mutable std::shared_ptr<instance_data> m_independent_instance_data;
-  static instance_data m_default_instance_data;
-
-  std::reference_wrapper<Kokkos::Impl::thread_buffer> m_buffer;
-  std::reference_wrapper<hpx::shared_future<void>> m_future;
-  std::reference_wrapper<hpx::spinlock> m_future_mutex;
-#else
-  static Kokkos::Impl::thread_buffer m_default_buffer;
-#endif
-
- public:
-  using execution_space      = HPX;
-  using memory_space         = HostSpace;
-  using device_type          = Kokkos::Device<execution_space, memory_space>;
-  using array_layout         = LayoutRight;
-  using size_type            = memory_space::size_type;
-  using scratch_memory_space = ScratchMemorySpace<HPX>;
-
-#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
-  HPX()
-  noexcept
-      : m_instance_id(impl_default_instance_id()),
-        m_buffer(m_default_instance_data.m_buffer),
-        m_future(m_default_instance_data.m_future),
-        m_future_mutex(m_default_instance_data.m_future_mutex) {}
-
-  HPX(instance_mode mode)
-      : m_instance_id(mode == instance_mode::independent
-                          ? m_next_instance_id++
-                          : impl_default_instance_id()),
-        m_independent_instance_data(mode == instance_mode::independent
-                                        ? (new instance_data())
-                                        : nullptr),
-        m_buffer(mode == instance_mode::independent
-                     ? m_independent_instance_data->m_buffer
-                     : m_default_instance_data.m_buffer),
-        m_future(mode == instance_mode::independent
-                     ? m_independent_instance_data->m_future
-                     : m_default_instance_data.m_future),
-        m_future_mutex(mode == instance_mode::independent
-                           ? m_independent_instance_data->m_future_mutex
-                           : m_default_instance_data.m_future_mutex) {}
-
-  HPX(hpx::shared_future<void> future)
-      : m_instance_id(m_next_instance_id++),
-
-        m_independent_instance_data(new instance_data(future)),
-        m_buffer(m_independent_instance_data->m_buffer),
-        m_future(m_independent_instance_data->m_future),
-        m_future_mutex(m_independent_instance_data->m_future_mutex) {}
-
-  HPX(HPX &&other) = default;
-  HPX &operator=(HPX &&other) = default;
-  HPX(const HPX &other)       = default;
-  HPX &operator=(const HPX &other) = default;
-#else
-  HPX() noexcept {}
-#endif
-
-  void print_configuration(std::ostream &os, bool /*verbose*/ = false) const {
-    os << "HPX backend\n";
-    os << "HPX Execution Space:\n";
-    os << "  KOKKOS_ENABLE_HPX: yes\n";
-    os << "\nHPX Runtime Configuration:\n";
-  }
-  uint32_t impl_instance_id() const noexcept { return m_instance_id; }
-
-#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
-  static bool in_parallel(HPX const &instance = HPX()) noexcept {
-    return !instance.impl_get_future().is_ready();
-  }
-#else
-  static bool in_parallel(HPX const & = HPX()) noexcept { return false; }
-#endif
-
-#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
-  static void impl_decrement_active_parallel_region_count() {
-    std::unique_lock<hpx::spinlock> l(m_active_parallel_region_count_mutex);
-    if (--m_active_parallel_region_count == 0) {
-      l.unlock();
-      m_active_parallel_region_count_cond.notify_all();
-    };
-  }
-
-  static void impl_increment_active_parallel_region_count() {
-    std::unique_lock<hpx::spinlock> l(m_active_parallel_region_count_mutex);
-    ++m_active_parallel_region_count;
-  }
-#endif
-
-  void fence(
-      const std::string &name =
-          "Kokkos::Experimental::HPX::fence: Unnamed Instance Fence") const {
-    Kokkos::Tools::Experimental::Impl::profile_fence_event<
-        Kokkos::Experimental::HPX>(
-        name,
-        Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{
-            impl_instance_id()},
-        [&]() {
-#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
-          impl_get_future().wait();
-          // Reset the future to free variables that may have been captured in
-          // parallel regions.
-          impl_get_future() = hpx::make_ready_future<void>();
-#endif
-        });
-  }
-
-  static void impl_static_fence(const std::string &name) {
-    Kokkos::Tools::Experimental::Impl::profile_fence_event<
-        Kokkos::Experimental::HPX>(
-        name,
-        Kokkos::Tools::Experimental::SpecialSynchronizationCases::
-            GlobalDeviceSynchronization,
-        [&]() {
-#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
-          std::unique_lock<hpx::spinlock> l(
-              m_active_parallel_region_count_mutex);
-          m_active_parallel_region_count_cond.wait(
-              l, [&]() { return m_active_parallel_region_count == 0; });
-          // Reset the future to free variables that may have been captured in
-          // parallel regions (however, we don't have access to futures from
-          // instances other than the default instances, they will only be
-          // released by fence).
-          HPX().impl_get_future() = hpx::make_ready_future<void>();
-#endif
-        });
-  }
-
-  static hpx::execution::parallel_executor impl_get_executor() {
-    return hpx::execution::parallel_executor();
-  }
-
-  static bool is_asynchronous(HPX const & = HPX()) noexcept {
-#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
-    return true;
-#else
-    return false;
-#endif
-  }
-
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
-  template <typename F>
-  KOKKOS_DEPRECATED static void partition_master(
-      F const &, int requested_num_partitions = 0, int = 0) {
-    if (requested_num_partitions > 1) {
-      Kokkos::abort(
-          "Kokkos::Experimental::HPX::partition_master: can't partition an "
-          "HPX instance\n");
-    }
-  }
-#endif
-
-  static int concurrency();
-  static void impl_initialize(InitializationSettings const &);
-  static bool impl_is_initialized() noexcept;
-  static void impl_finalize();
-
-  static int impl_thread_pool_size() noexcept {
-    hpx::runtime *rt = hpx::get_runtime_ptr();
-    if (rt == nullptr) {
-      return 0;
-    } else {
-      if (hpx::threads::get_self_ptr() == nullptr) {
-        return hpx::resource::get_thread_pool(0).get_os_thread_count();
-      } else {
-        return hpx::this_thread::get_pool()->get_os_thread_count();
-      }
-    }
-  }
-
-  static int impl_thread_pool_rank() noexcept {
-    hpx::runtime *rt = hpx::get_runtime_ptr();
-    if (rt == nullptr) {
-      return 0;
-    } else {
-      if (hpx::threads::get_self_ptr() == nullptr) {
-        return 0;
-      } else {
-        return hpx::this_thread::get_pool()->get_pool_index();
-      }
-    }
-  }
-
-  static int impl_thread_pool_size(int depth) {
-    if (depth == 0) {
-      return impl_thread_pool_size();
-    } else {
-      return 1;
-    }
-  }
-
-  static int impl_max_hardware_threads() noexcept {
-    return hpx::threads::hardware_concurrency();
-  }
-
-  static int impl_hardware_thread_id() noexcept {
-    return hpx::get_worker_thread_num();
-  }
-
-  Kokkos::Impl::thread_buffer &impl_get_buffer() const noexcept {
-#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
-    return m_buffer.get();
-#else
-    return m_default_buffer;
-#endif
-  }
-
-#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
-  hpx::shared_future<void> &impl_get_future() const noexcept {
-    return m_future;
-  }
-
-  hpx::spinlock &impl_get_future_mutex() const noexcept {
-    return m_future_mutex;
-  }
-#endif
-
-#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
-  struct [[nodiscard]] reset_on_exit_parallel {
-    HPX const &m_space;
-    reset_on_exit_parallel(HPX const &space) : m_space(space) {}
-    ~reset_on_exit_parallel() {
-      // See [note 1] for an explanation. m_independent_instance_data is
-      // marked mutable.
-      m_space.m_independent_instance_data.reset();
-
-      HPX::impl_decrement_active_parallel_region_count();
-    }
-  };
-
-  // This struct is identical to the above except it does not reset the shared
-  // data. It does, however, still decrement the parallel region count. It is
-  // meant for use in parallel regions which do not capture the execution space
-  // instance.
-  struct [[nodiscard]] reset_count_on_exit_parallel {
-    reset_count_on_exit_parallel() = default;
-    ~reset_count_on_exit_parallel() {
-      HPX::impl_decrement_active_parallel_region_count();
-    }
-  };
-#else
-  struct [[nodiscard]] reset_on_exit_parallel {
-    reset_on_exit_parallel(HPX const &) = default;
-    ~reset_on_exit_parallel()           = default;
-  };
-
-  struct [[nodiscard]] reset_count_on_exit_parallel {
-    reset_count_on_exit_parallel()  = default;
-    ~reset_count_on_exit_parallel() = default;
-  };
-#endif
-
-  static constexpr const char *name() noexcept { return "HPX"; }
-
- private:
-  friend bool operator==(HPX const &lhs, HPX const &rhs) {
-    return lhs.m_instance_id == rhs.m_instance_id;
-  }
-  friend bool operator!=(HPX const &lhs, HPX const &rhs) {
-    return !(lhs == rhs);
-  }
-};
-}  // namespace Experimental
-
-namespace Tools {
-namespace Experimental {
-template <>
-struct DeviceTypeTraits<Kokkos::Experimental::HPX> {
-  static constexpr DeviceType id = DeviceType::HPX;
-  static int device_id(const Kokkos::Experimental::HPX &) { return 0; }
-};
-}  // namespace Experimental
-}  // namespace Tools
-
-namespace Impl {
-
-#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
-template <typename Closure>
-inline void dispatch_execute_task(Closure *closure,
-                                  Kokkos::Experimental::HPX const &instance,
-                                  bool force_synchronous = false) {
-  Kokkos::Experimental::HPX::impl_increment_active_parallel_region_count();
-
-  Closure closure_copy = *closure;
-
-  {
-    std::unique_lock<hpx::spinlock> l(instance.impl_get_future_mutex());
-    hpx::util::ignore_lock(&instance.impl_get_future_mutex());
-    hpx::shared_future<void> &fut = instance.impl_get_future();
-
-    fut = fut.then(hpx::execution::parallel_executor(
-                       hpx::threads::thread_schedule_hint(0)),
-                   [closure_copy](hpx::shared_future<void> &&) {
-                     return closure_copy.execute_task();
-                   });
-  }
-
-  if (force_synchronous) {
-    instance.fence(
-        "Kokkos::Experimental::Impl::HPX::dispatch_execute_task: fence due to "
-        "forced syncronizations");
-  }
-}
-#else
-template <typename Closure>
-inline void dispatch_execute_task(Closure *closure,
-                                  Kokkos::Experimental::HPX const &,
-                                  bool = false) {
-  closure->execute_task();
-}
-#endif
-}  // namespace Impl
-}  // namespace Kokkos
-
-namespace Kokkos {
-namespace Impl {
-template <>
-struct MemorySpaceAccess<Kokkos::Experimental::HPX::memory_space,
-                         Kokkos::Experimental::HPX::scratch_memory_space> {
-  enum : bool { assignable = false };
-  enum : bool { accessible = true };
-  enum : bool { deepcopy = false };
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-namespace Kokkos {
-namespace Experimental {
-template <>
-class UniqueToken<HPX, UniqueTokenScope::Instance> {
- private:
-  using buffer_type = Kokkos::View<uint32_t *, Kokkos::HostSpace>;
-  int m_count;
-  buffer_type m_buffer_view;
-  uint32_t volatile *m_buffer;
-
- public:
-  using execution_space = HPX;
-  using size_type       = int;
-
-  /// \brief create object size for concurrency on the given instance
-  ///
-  /// This object should not be shared between instances
-  UniqueToken(execution_space const & = execution_space()) noexcept
-      : m_count(execution_space::impl_max_hardware_threads()),
-        m_buffer_view(buffer_type()),
-        m_buffer(nullptr) {}
-
-  UniqueToken(size_type max_size, execution_space const & = execution_space())
-      : m_count(max_size > execution_space::impl_max_hardware_threads()
-                    ? execution_space::impl_max_hardware_threads()
-                    : max_size),
-        m_buffer_view(
-            max_size > execution_space::impl_max_hardware_threads()
-                ? buffer_type()
-                : buffer_type("UniqueToken::m_buffer_view",
-                              ::Kokkos::Impl::concurrent_bitset::buffer_bound(
-                                  m_count))),
-        m_buffer(m_buffer_view.data()) {}
-
-  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
-  KOKKOS_INLINE_FUNCTION
-  int size() const noexcept { return m_count; }
-
-  /// \brief acquire value such that 0 <= value < size()
-  KOKKOS_INLINE_FUNCTION
-  int acquire() const noexcept {
-    KOKKOS_IF_ON_HOST((
-        if (m_buffer == nullptr) {
-          return execution_space::impl_hardware_thread_id();
-        } else {
-          const ::Kokkos::pair<int, int> result =
-              ::Kokkos::Impl::concurrent_bitset::acquire_bounded(
-                  m_buffer, m_count, ::Kokkos::Impl::clock_tic() % m_count);
-
-          if (result.first < 0) {
-            ::Kokkos::abort(
-                "UniqueToken<HPX> failure to acquire tokens, no tokens "
-                "available");
-          }
-          return result.first;
-        }))
-
-    KOKKOS_IF_ON_DEVICE((return 0;))
-  }
-
-  /// \brief release a value acquired by generate
-  KOKKOS_INLINE_FUNCTION
-  void release(int i) const noexcept {
-    KOKKOS_IF_ON_HOST((if (m_buffer != nullptr) {
-      ::Kokkos::Impl::concurrent_bitset::release(m_buffer, i);
-    }))
-
-    KOKKOS_IF_ON_DEVICE(((void)i;))
-  }
-};
-
-template <>
-class UniqueToken<HPX, UniqueTokenScope::Global> {
- public:
-  using execution_space = HPX;
-  using size_type       = int;
-  UniqueToken(execution_space const & = execution_space()) noexcept {}
-
-  // NOTE: Currently this assumes that there is no oversubscription.
-  // hpx::get_num_worker_threads can't be used directly because it may yield
-  // it's task (problematic if called after hpx::get_worker_thread_num).
-  int size() const noexcept { return HPX::impl_max_hardware_threads(); }
-  int acquire() const noexcept { return HPX::impl_hardware_thread_id(); }
-  void release(int) const noexcept {}
-};
-}  // namespace Experimental
-}  // namespace Kokkos
-
-namespace Kokkos {
-namespace Impl {
-
-struct HPXTeamMember {
- public:
-  using execution_space = Kokkos::Experimental::HPX;
-  using scratch_memory_space =
-      Kokkos::ScratchMemorySpace<Kokkos::Experimental::HPX>;
-  using team_handle = HPXTeamMember;
-
- private:
-  scratch_memory_space m_team_shared;
-
-  int m_league_size;
-  int m_league_rank;
-  int m_team_size;
-  int m_team_rank;
-
- public:
-  KOKKOS_INLINE_FUNCTION
-  const scratch_memory_space &team_shmem() const {
-    return m_team_shared.set_team_thread_mode(0, 1, 0);
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  const execution_space::scratch_memory_space &team_scratch(const int) const {
-    return m_team_shared.set_team_thread_mode(0, 1, 0);
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  const execution_space::scratch_memory_space &thread_scratch(const int) const {
-    return m_team_shared.set_team_thread_mode(0, team_size(), team_rank());
-  }
-
-  KOKKOS_INLINE_FUNCTION int league_rank() const noexcept {
-    return m_league_rank;
-  }
-
-  KOKKOS_INLINE_FUNCTION int league_size() const noexcept {
-    return m_league_size;
-  }
-
-  KOKKOS_INLINE_FUNCTION int team_rank() const noexcept { return m_team_rank; }
-  KOKKOS_INLINE_FUNCTION int team_size() const noexcept { return m_team_size; }
-
-  template <class... Properties>
-  constexpr KOKKOS_INLINE_FUNCTION HPXTeamMember(
-      const TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...>
-          &policy,
-      const int team_rank, const int league_rank, void *scratch,
-      size_t scratch_size) noexcept
-      : m_team_shared(scratch, scratch_size, scratch, scratch_size),
-        m_league_size(policy.league_size()),
-        m_league_rank(league_rank),
-        m_team_size(policy.team_size()),
-        m_team_rank(team_rank) {}
-
-  KOKKOS_INLINE_FUNCTION
-  void team_barrier() const {}
-
-  template <class ValueType>
-  KOKKOS_INLINE_FUNCTION void team_broadcast(ValueType &, const int &) const {}
-
-  template <class Closure, class ValueType>
-  KOKKOS_INLINE_FUNCTION void team_broadcast(const Closure &closure,
-                                             ValueType &value,
-                                             const int &) const {
-    closure(value);
-  }
-
-  template <class ValueType, class JoinOp>
-  KOKKOS_INLINE_FUNCTION ValueType team_reduce(const ValueType &value,
-                                               const JoinOp &) const {
-    return value;
-  }
-
-  template <class ReducerType>
-  KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value>
-  team_reduce(const ReducerType &) const {}
-
-  template <typename Type>
-  KOKKOS_INLINE_FUNCTION Type
-  team_scan(const Type &value, Type *const global_accum = nullptr) const {
-    if (global_accum) {
-      Kokkos::atomic_fetch_add(global_accum, value);
-    }
-
-    return 0;
-  }
-};
-
-template <class... Properties>
-class TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...>
-    : public PolicyTraits<Properties...> {
-  int m_league_size;
-  int m_team_size;
-  std::size_t m_team_scratch_size[2];
-  std::size_t m_thread_scratch_size[2];
-  int m_chunk_size;
-
- public:
-  using traits = PolicyTraits<Properties...>;
-
-  //! Tag this class as a kokkos execution policy
-  using execution_policy = TeamPolicyInternal;
-
-  using member_type = HPXTeamMember;
-
-  //! Execution space of this execution policy:
-  using execution_space = Kokkos::Experimental::HPX;
-
-  // NOTE: Max size is 1 for simplicity. In most cases more than 1 is not
-  // necessary on CPU. Implement later if there is a need.
-  template <class FunctorType>
-  inline static int team_size_max(const FunctorType &) {
-    return 1;
-  }
-
-  template <class FunctorType>
-  inline static int team_size_recommended(const FunctorType &) {
-    return 1;
-  }
-
-  template <class FunctorType>
-  inline static int team_size_recommended(const FunctorType &, const int &) {
-    return 1;
-  }
-
-  template <class FunctorType>
-  int team_size_max(const FunctorType &, const ParallelForTag &) const {
-    return 1;
-  }
-
-  template <class FunctorType>
-  int team_size_max(const FunctorType &, const ParallelReduceTag &) const {
-    return 1;
-  }
-
-  template <class FunctorType, class ReducerType>
-  int team_size_max(const FunctorType &, const ReducerType &,
-                    const ParallelReduceTag &) const {
-    return 1;
-  }
-
-  template <class FunctorType>
-  int team_size_recommended(const FunctorType &, const ParallelForTag &) const {
-    return 1;
-  }
-
-  template <class FunctorType>
-  int team_size_recommended(const FunctorType &,
-                            const ParallelReduceTag &) const {
-    return 1;
-  }
-
-  template <class FunctorType, class ReducerType>
-  int team_size_recommended(const FunctorType &, const ReducerType &,
-                            const ParallelReduceTag &) const {
-    return 1;
-  }
-
-  static int vector_length_max() { return 1; }
-
-  inline int impl_vector_length() noexcept { return 1; }
-  inline bool impl_auto_team_size() noexcept { return false; }
-  inline bool impl_auto_vector_length() noexcept { return false; }
-  inline void impl_set_vector_length(int) noexcept {}
-  inline void impl_set_team_size(int) noexcept {}
-
- private:
-  inline void init(const int league_size_request, const int team_size_request) {
-    m_league_size           = league_size_request;
-    const int max_team_size = 1;  // TODO: Can't use team_size_max(...) because
-                                  // it requires a functor as argument.
-    m_team_size =
-        team_size_request > max_team_size ? max_team_size : team_size_request;
-
-    if (m_chunk_size > 0) {
-      if (!Impl::is_integral_power_of_two(m_chunk_size))
-        Kokkos::abort("TeamPolicy blocking granularity must be power of two");
-    } else {
-      int new_chunk_size = 1;
-      while (new_chunk_size * 4 * Kokkos::Experimental::HPX::concurrency() <
-             m_league_size) {
-        new_chunk_size *= 2;
-      }
-
-      if (new_chunk_size < 128) {
-        new_chunk_size = 1;
-        while ((new_chunk_size * Kokkos::Experimental::HPX::concurrency() <
-                m_league_size) &&
-               (new_chunk_size < 128))
-          new_chunk_size *= 2;
-      }
-
-      m_chunk_size = new_chunk_size;
-    }
-  }
-
- public:
-  inline int team_size() const { return m_team_size; }
-  inline int league_size() const { return m_league_size; }
-
-  size_t scratch_size(const int &level, int team_size_ = -1) const {
-    if (team_size_ < 0) {
-      team_size_ = m_team_size;
-    }
-    return m_team_scratch_size[level] +
-           team_size_ * m_thread_scratch_size[level];
-  }
-
-  inline static int scratch_size_max(int level) {
-    return (level == 0 ? 1024 * 32 :  // Roughly L1 size
-                20 * 1024 * 1024);    // Limit to keep compatibility with CUDA
-  }
-
- public:
-  template <class ExecSpace, class... OtherProperties>
-  friend class TeamPolicyInternal;
-
-  const typename traits::execution_space &space() const {
-    static typename traits::execution_space m_space;
-    return m_space;
-  }
-
-  template <class... OtherProperties>
-  TeamPolicyInternal(const TeamPolicyInternal<Kokkos::Experimental::HPX,
-                                              OtherProperties...> &p) {
-    m_league_size            = p.m_league_size;
-    m_team_size              = p.m_team_size;
-    m_team_scratch_size[0]   = p.m_team_scratch_size[0];
-    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
-    m_team_scratch_size[1]   = p.m_team_scratch_size[1];
-    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
-    m_chunk_size             = p.m_chunk_size;
-  }
-
-  TeamPolicyInternal(const typename traits::execution_space &,
-                     int league_size_request, int team_size_request,
-                     int /* vector_length_request */ = 1)
-      : m_team_scratch_size{0, 0},
-        m_thread_scratch_size{0, 0},
-        m_chunk_size(0) {
-    init(league_size_request, team_size_request);
-  }
-
-  TeamPolicyInternal(const typename traits::execution_space &,
-                     int league_size_request, const Kokkos::AUTO_t &,
-                     int /* vector_length_request */ = 1)
-      : m_team_scratch_size{0, 0},
-        m_thread_scratch_size{0, 0},
-        m_chunk_size(0) {
-    init(league_size_request, 1);
-  }
-
-  TeamPolicyInternal(const typename traits::execution_space &,
-                     int league_size_request,
-                     const Kokkos::AUTO_t &, /* team_size_request */
-                     const Kokkos::AUTO_t & /* vector_length_request */)
-      : m_team_scratch_size{0, 0},
-        m_thread_scratch_size{0, 0},
-        m_chunk_size(0) {
-    init(league_size_request, 1);
-  }
-
-  TeamPolicyInternal(const typename traits::execution_space &,
-                     int league_size_request, int team_size_request,
-                     const Kokkos::AUTO_t & /* vector_length_request */
-                     )
-      : m_team_scratch_size{0, 0},
-        m_thread_scratch_size{0, 0},
-        m_chunk_size(0) {
-    init(league_size_request, team_size_request);
-  }
-
-  TeamPolicyInternal(int league_size_request,
-                     const Kokkos::AUTO_t &, /* team_size_request */
-                     const Kokkos::AUTO_t & /* vector_length_request */)
-      : m_team_scratch_size{0, 0},
-        m_thread_scratch_size{0, 0},
-        m_chunk_size(0) {
-    init(league_size_request, 1);
-  }
-
-  TeamPolicyInternal(int league_size_request, int team_size_request,
-                     const Kokkos::AUTO_t & /* vector_length_request */
-                     )
-      : m_team_scratch_size{0, 0},
-        m_thread_scratch_size{0, 0},
-        m_chunk_size(0) {
-    init(league_size_request, team_size_request);
-  }
-
-  TeamPolicyInternal(int league_size_request, int team_size_request,
-                     int /* vector_length_request */ = 1)
-      : m_team_scratch_size{0, 0},
-        m_thread_scratch_size{0, 0},
-        m_chunk_size(0) {
-    init(league_size_request, team_size_request);
-  }
-
-  TeamPolicyInternal(int league_size_request, const Kokkos::AUTO_t &,
-                     int /* vector_length_request */ = 1)
-      : m_team_scratch_size{0, 0},
-        m_thread_scratch_size{0, 0},
-        m_chunk_size(0) {
-    init(league_size_request, 1);
-  }
-
-  inline int chunk_size() const { return m_chunk_size; }
-
-  inline TeamPolicyInternal &set_chunk_size(
-      typename traits::index_type chunk_size_) {
-    m_chunk_size = chunk_size_;
-    return *this;
-  }
-
-  inline TeamPolicyInternal &set_scratch_size(const int &level,
-                                              const PerTeamValue &per_team) {
-    m_team_scratch_size[level] = per_team.value;
-    return *this;
-  }
-
-  inline TeamPolicyInternal &set_scratch_size(
-      const int &level, const PerThreadValue &per_thread) {
-    m_thread_scratch_size[level] = per_thread.value;
-    return *this;
-  }
-
-  inline TeamPolicyInternal &set_scratch_size(
-      const int &level, const PerTeamValue &per_team,
-      const PerThreadValue &per_thread) {
-    m_team_scratch_size[level]   = per_team.value;
-    m_thread_scratch_size[level] = per_thread.value;
-    return *this;
-  }
-};
-}  // namespace Impl
-}  // namespace Kokkos
-
-namespace Kokkos {
-namespace Impl {
-
-template <typename Policy>
-typename Policy::member_type get_hpx_adjusted_chunk_size(Policy const &policy) {
-  const int concurrency = Kokkos::Experimental::HPX::concurrency();
-  const typename Policy::member_type n        = policy.end() - policy.begin();
-  typename Policy::member_type new_chunk_size = policy.chunk_size();
-
-  while (n >= 4 * concurrency * new_chunk_size) {
-    new_chunk_size *= 2;
-  }
-
-  return new_chunk_size;
-}
-
-template <class FunctorType, class... Traits>
-class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
-                  Kokkos::Experimental::HPX> {
- private:
-  using Policy    = Kokkos::RangePolicy<Traits...>;
-  using WorkTag   = typename Policy::work_tag;
-  using WorkRange = typename Policy::WorkRange;
-  using Member    = typename Policy::member_type;
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-
-  template <class TagType>
-  static std::enable_if_t<std::is_void<TagType>::value> execute_functor(
-      const FunctorType &functor, const Member i) {
-    functor(i);
-  }
-
-  template <class TagType>
-  static std::enable_if_t<!std::is_void<TagType>::value> execute_functor(
-      const FunctorType &functor, const Member i) {
-    const TagType t{};
-    functor(t, i);
-  }
-
-  template <class TagType>
-  static std::enable_if_t<std::is_void<TagType>::value> execute_functor_range(
-      const FunctorType &functor, const Member i_begin, const Member i_end) {
-    for (Member i = i_begin; i < i_end; ++i) {
-      functor(i);
-    }
-  }
-
-  template <class TagType>
-  static std::enable_if_t<!std::is_void<TagType>::value> execute_functor_range(
-      const FunctorType &functor, const Member i_begin, const Member i_end) {
-    const TagType t{};
-    for (Member i = i_begin; i < i_end; ++i) {
-      functor(t, i);
-    }
-  }
-
- public:
-  void execute() const {
-    Kokkos::Impl::dispatch_execute_task(this, m_policy.space());
-  }
-
-  void execute_task() const {
-    // See [note 1] for an explanation.
-    Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit(
-        m_policy.space());
-
-    auto exec = Kokkos::Experimental::HPX::impl_get_executor();
-
-    using hpx::execution::par;
-    using hpx::execution::static_chunk_size;
-
-#if KOKKOS_HPX_IMPLEMENTATION == 0
-    using hpx::for_loop;
-
-    for_loop(par.on(exec).with(static_chunk_size(m_policy.chunk_size())),
-             m_policy.begin(), m_policy.end(), [this](const Member i) {
-               execute_functor<WorkTag>(m_functor, i);
-             });
-
-#elif KOKKOS_HPX_IMPLEMENTATION == 1
-    using hpx::for_loop_strided;
-
-    const Member chunk_size = get_hpx_adjusted_chunk_size(m_policy);
-
-    for_loop_strided(
-        par.on(exec), m_policy.begin(), m_policy.end(), chunk_size,
-        [this, chunk_size](const Member i_begin) {
-          const Member i_end = (std::min)(i_begin + chunk_size, m_policy.end());
-          execute_functor_range<WorkTag>(m_functor, i_begin, i_end);
-        });
-#endif
-  }
-
-  inline ParallelFor(const FunctorType &arg_functor, Policy arg_policy)
-      : m_functor(arg_functor), m_policy(arg_policy) {}
-};
-
-template <class FunctorType, class... Traits>
-class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
-                  Kokkos::Experimental::HPX> {
- private:
-  using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
-  using Policy        = typename MDRangePolicy::impl_range_policy;
-  using WorkTag       = typename MDRangePolicy::work_tag;
-  using WorkRange     = typename Policy::WorkRange;
-  using Member        = typename Policy::member_type;
-  using iterate_type =
-      typename Kokkos::Impl::HostIterateTile<MDRangePolicy, FunctorType,
-                                             WorkTag, void>;
-
-  const iterate_type m_iter;
-  const Policy m_policy;
-
- public:
-  void execute() const { dispatch_execute_task(this, m_iter.m_rp.space()); }
-
-  inline void execute_task() const {
-    // See [note 1] for an explanation.
-    Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit(
-        m_iter.m_rp.space());
-
-    auto exec = Kokkos::Experimental::HPX::impl_get_executor();
-
-    using hpx::execution::par;
-    using hpx::execution::static_chunk_size;
-
-#if KOKKOS_HPX_IMPLEMENTATION == 0
-    using hpx::for_loop;
-
-    for_loop(par.on(exec).with(
-                 static_chunk_size(get_hpx_adjusted_chunk_size(m_policy))),
-             m_policy.begin(), m_policy.end(),
-             [this](const Member i) { iterate_type(i); });
-
-#elif KOKKOS_HPX_IMPLEMENTATION == 1
-    using hpx::for_loop_strided;
-
-    const Member chunk_size = get_hpx_adjusted_chunk_size(m_policy);
-
-    for_loop_strided(par.on(exec), m_policy.begin(), m_policy.end(), chunk_size,
-                     [this, chunk_size](const Member i_begin) {
-                       const Member i_end =
-                           (std::min)(i_begin + chunk_size, m_policy.end());
-                       for (Member i = i_begin; i < i_end; ++i) {
-                         m_iter(i);
-                       }
-                     });
-#endif
-  }
-
-  inline ParallelFor(const FunctorType &arg_functor, MDRangePolicy arg_policy)
-      : m_iter(arg_policy, arg_functor),
-        m_policy(Policy(0, arg_policy.m_num_tiles).set_chunk_size(1)) {}
-  template <typename Policy, typename Functor>
-  static int max_tile_size_product(const Policy &, const Functor &) {
-    /**
-     * 1024 here is just our guess for a reasonable max tile size,
-     * it isn't a hardware constraint. If people see a use for larger
-     * tile size products, we're happy to change this.
-     */
-    return 1024;
-  }
-};
-}  // namespace Impl
-}  // namespace Kokkos
-
-namespace Kokkos {
-namespace Impl {
-template <class FunctorType, class ReducerType, class... Traits>
-class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
-                     Kokkos::Experimental::HPX> {
- private:
-  using Policy    = Kokkos::RangePolicy<Traits...>;
-  using WorkTag   = typename Policy::work_tag;
-  using WorkRange = typename Policy::WorkRange;
-  using Member    = typename Policy::member_type;
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using Analysis =
-      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, ReducerTypeFwd>;
-  using value_type     = typename Analysis::value_type;
-  using pointer_type   = typename Analysis::pointer_type;
-  using reference_type = typename Analysis::reference_type;
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-  const ReducerType m_reducer;
-  const pointer_type m_result_ptr;
-
-  bool m_force_synchronous;
-
-  template <class TagType>
-  inline static std::enable_if_t<std::is_void<TagType>::value> execute_functor(
-      const FunctorType &functor, const Member i, reference_type update) {
-    functor(i, update);
-  }
-
-  template <class TagType>
-  inline static std::enable_if_t<!std::is_void<TagType>::value> execute_functor(
-      const FunctorType &functor, const Member i, reference_type update) {
-    const TagType t{};
-    functor(t, i, update);
-  }
-
-  template <class TagType>
-  inline std::enable_if_t<std::is_void<TagType>::value> execute_functor_range(
-      reference_type update, const Member i_begin, const Member i_end) const {
-    for (Member i = i_begin; i < i_end; ++i) {
-      m_functor(i, update);
-    }
-  }
-
-  template <class TagType>
-  inline std::enable_if_t<!std::is_void<TagType>::value> execute_functor_range(
-      reference_type update, const Member i_begin, const Member i_end) const {
-    const TagType t{};
-
-    for (Member i = i_begin; i < i_end; ++i) {
-      m_functor(t, i, update);
-    }
-  }
-
-  class value_type_wrapper {
-   private:
-    std::size_t m_value_size;
-    char *m_value_buffer;
-
-   public:
-    value_type_wrapper() : m_value_size(0), m_value_buffer(nullptr) {}
-
-    value_type_wrapper(const std::size_t value_size)
-        : m_value_size(value_size), m_value_buffer(new char[m_value_size]) {}
-
-    value_type_wrapper(const value_type_wrapper &other)
-        : m_value_size(0), m_value_buffer(nullptr) {
-      if (this != &other) {
-        m_value_buffer = new char[other.m_value_size];
-        m_value_size   = other.m_value_size;
-
-        std::copy(other.m_value_buffer, other.m_value_buffer + m_value_size,
-                  m_value_buffer);
-      }
-    }
-
-    ~value_type_wrapper() { delete[] m_value_buffer; }
-
-    value_type_wrapper(value_type_wrapper &&other)
-        : m_value_size(0), m_value_buffer(nullptr) {
-      if (this != &other) {
-        m_value_buffer = other.m_value_buffer;
-        m_value_size   = other.m_value_size;
-
-        other.m_value_buffer = nullptr;
-        other.m_value_size   = 0;
-      }
-    }
-
-    value_type_wrapper &operator=(const value_type_wrapper &other) {
-      if (this != &other) {
-        delete[] m_value_buffer;
-        m_value_buffer = new char[other.m_value_size];
-        m_value_size   = other.m_value_size;
-
-        std::copy(other.m_value_buffer, other.m_value_buffer + m_value_size,
-                  m_value_buffer);
-      }
-
-      return *this;
-    }
-
-    value_type_wrapper &operator=(value_type_wrapper &&other) {
-      if (this != &other) {
-        delete[] m_value_buffer;
-        m_value_buffer = other.m_value_buffer;
-        m_value_size   = other.m_value_size;
-
-        other.m_value_buffer = nullptr;
-        other.m_value_size   = 0;
-      }
-
-      return *this;
-    }
-
-    pointer_type pointer() const {
-      return reinterpret_cast<pointer_type>(m_value_buffer);
-    }
-
-    reference_type reference() const {
-      return Analysis::Reducer::reference(
-          reinterpret_cast<pointer_type>(m_value_buffer));
-    }
-  };
-
- public:
-  void execute() const {
-    if (m_policy.end() <= m_policy.begin()) {
-      if (m_result_ptr) {
-        typename Analysis::Reducer final_reducer(
-            &ReducerConditional::select(m_functor, m_reducer));
-
-        final_reducer.init(m_result_ptr);
-        final_reducer.final(m_result_ptr);
-      }
-      return;
-    }
-    dispatch_execute_task(this, m_policy.space(), m_force_synchronous);
-  }
-
-  inline void execute_task() const {
-    // See [note 1] for an explanation.
-    Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit(
-        m_policy.space());
-
-    typename Analysis::Reducer final_reducer(
-        &ReducerConditional::select(m_functor, m_reducer));
-
-    const std::size_t value_size =
-        Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));
-
-    auto exec = Kokkos::Experimental::HPX::impl_get_executor();
-
-    using hpx::for_loop;
-    using hpx::execution::par;
-    using hpx::execution::static_chunk_size;
-
-#if KOKKOS_HPX_IMPLEMENTATION == 0
-    // NOTE: This version makes the most use of HPX functionality, but
-    // requires the struct value_type_wrapper to handle different
-    // reference_types. It is also significantly slower than the version
-    // below due to not reusing the buffer used by other functions.
-    using hpx::parallel::reduction;
-
-    value_type_wrapper final_value(value_size);
-    value_type_wrapper identity(value_size);
-
-    final_reducer.init(final_value.pointer());
-    final_reducer.init(identity.pointer());
-
-    for_loop(par.on(exec).with(
-                 static_chunk_size(get_hpx_adjusted_chunk_size(m_policy))),
-             m_policy.begin(), m_policy.end(),
-             reduction(final_value, identity,
-                       [final_reducer](
-                           value_type_wrapper &a,
-                           value_type_wrapper &b) -> value_type_wrapper & {
-                         final_reducer.join(a.pointer(), b.pointer());
-                         return a;
-                       }),
-             [this](Member i, value_type_wrapper &update) {
-               execute_functor<WorkTag>(m_functor, i, update.reference());
-             });
-
-    pointer_type final_value_ptr = final_value.pointer();
-
-#elif KOKKOS_HPX_IMPLEMENTATION == 1
-    using hpx::for_loop_strided;
-
-    const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
-
-    thread_buffer &buffer = m_policy.space().impl_get_buffer();
-    buffer.resize(num_worker_threads, value_size);
-
-    for_loop(
-        par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads,
-        [&buffer, final_reducer ](const int t) noexcept {
-          final_reducer.init(reinterpret_cast<pointer_type>(buffer.get(t)));
-        });
-
-    const Member chunk_size = get_hpx_adjusted_chunk_size(m_policy);
-
-    for_loop_strided(
-        par.on(exec), m_policy.begin(), m_policy.end(), chunk_size,
-        [this, &buffer, chunk_size](const Member i_begin) {
-          reference_type update = Analysis::Reducer::reference(
-              reinterpret_cast<pointer_type>(buffer.get(
-                  Kokkos::Experimental::HPX::impl_hardware_thread_id())));
-          const Member i_end = (std::min)(i_begin + chunk_size, m_policy.end());
-          execute_functor_range<WorkTag>(update, i_begin, i_end);
-        });
-
-    for (int i = 1; i < num_worker_threads; ++i) {
-      final_reducer.join(reinterpret_cast<pointer_type>(buffer.get(0)),
-                         reinterpret_cast<pointer_type>(buffer.get(i)));
-    }
-
-    pointer_type final_value_ptr =
-        reinterpret_cast<pointer_type>(buffer.get(0));
-#endif
-
-    final_reducer.final(final_value_ptr);
-
-    if (m_result_ptr != nullptr) {
-      const int n = Analysis::value_count(
-          ReducerConditional::select(m_functor, m_reducer));
-
-      for (int j = 0; j < n; ++j) {
-        m_result_ptr[j] = final_value_ptr[j];
-      }
-    }
-  }
-
-  template <class ViewType>
-  inline ParallelReduce(
-      const FunctorType &arg_functor, Policy arg_policy,
-      const ViewType &arg_view,
-      std::enable_if_t<Kokkos::is_view<ViewType>::value &&
-                           !Kokkos::is_reducer<ReducerType>::value,
-                       void *> = nullptr)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(InvalidType()),
-        m_result_ptr(arg_view.data()),
-        m_force_synchronous(!arg_view.impl_track().has_record()) {}
-
-  inline ParallelReduce(const FunctorType &arg_functor, Policy arg_policy,
-                        const ReducerType &reducer)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()),
-        m_force_synchronous(!reducer.view().impl_track().has_record()) {}
-};
-
-template <class FunctorType, class ReducerType, class... Traits>
-class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
-                     Kokkos::Experimental::HPX> {
- private:
-  using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
-  using Policy        = typename MDRangePolicy::impl_range_policy;
-  using WorkTag       = typename MDRangePolicy::work_tag;
-  using WorkRange     = typename Policy::WorkRange;
-  using Member        = typename Policy::member_type;
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using Analysis       = FunctorAnalysis<FunctorPatternInterface::REDUCE,
-                                   MDRangePolicy, ReducerTypeFwd>;
-
-  using pointer_type   = typename Analysis::pointer_type;
-  using value_type     = typename Analysis::value_type;
-  using reference_type = typename Analysis::reference_type;
-  using iterate_type =
-      typename Kokkos::Impl::HostIterateTile<MDRangePolicy, FunctorType,
-                                             WorkTag, reference_type>;
-
-  const iterate_type m_iter;
-  const Policy m_policy;
-  const ReducerType m_reducer;
-  const pointer_type m_result_ptr;
-
-  bool m_force_synchronous;
-
- public:
-  void execute() const {
-    dispatch_execute_task(this, m_iter.m_rp.space(), m_force_synchronous);
-  }
-
-  inline void execute_task() const {
-    // See [note 1] for an explanation.
-    Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit(
-        m_iter.m_rp.space());
-
-    const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
-    const std::size_t value_size = Analysis::value_size(
-        ReducerConditional::select(m_iter.m_func, m_reducer));
-
-    thread_buffer &buffer = m_iter.m_rp.space().impl_get_buffer();
-    buffer.resize(num_worker_threads, value_size);
-
-    using hpx::for_loop;
-    using hpx::execution::par;
-    using hpx::execution::static_chunk_size;
-
-    auto exec = Kokkos::Experimental::HPX::impl_get_executor();
-
-    typename Analysis::Reducer final_reducer(
-        &ReducerConditional::select(m_iter.m_func, m_reducer));
-
-#if KOKKOS_HPX_IMPLEMENTATION == 0
-
-    for_loop(
-        par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads,
-        [&buffer, final_reducer](std::size_t t) {
-          final_reducer.init(reinterpret_cast<pointer_type>(buffer.get(t)));
-        });
-
-    for_loop(par.on(exec).with(
-                 static_chunk_size(get_hpx_adjusted_chunk_size(m_policy))),
-             m_policy.begin(), m_policy.end(), [this, &buffer](const Member i) {
-               reference_type update = Analysis::Reducer::reference(
-                   reinterpret_cast<pointer_type>(buffer.get(
-                       Kokkos::Experimental::HPX::impl_hardware_thread_id())));
-               m_iter(i, update);
-             });
-
-#elif KOKKOS_HPX_IMPLEMENTATION == 1
-    using hpx::for_loop_strided;
-
-    for_loop(
-        par.on(exec).with(static_chunk_size(1)), std::size_t(0),
-        num_worker_threads, [&buffer, final_reducer](const std::size_t t) {
-          final_reducer.init(reinterpret_cast<pointer_type>(buffer.get(t)));
-        });
-
-    const Member chunk_size = get_hpx_adjusted_chunk_size(m_policy);
-
-    for_loop_strided(
-        par.on(exec), m_policy.begin(), m_policy.end(), chunk_size,
-        [this, &buffer, chunk_size](const Member i_begin) {
-          reference_type update = Analysis::Reducer::reference(
-              reinterpret_cast<pointer_type>(buffer.get(
-                  Kokkos::Experimental::HPX::impl_hardware_thread_id())));
-          const Member i_end = (std::min)(i_begin + chunk_size, m_policy.end());
-
-          for (Member i = i_begin; i < i_end; ++i) {
-            m_iter(i, update);
-          }
-        });
-#endif
-
-    for (int i = 1; i < num_worker_threads; ++i) {
-      final_reducer.join(reinterpret_cast<pointer_type>(buffer.get(0)),
-                         reinterpret_cast<pointer_type>(buffer.get(i)));
-    }
-
-    final_reducer.final(reinterpret_cast<pointer_type>(buffer.get(0)));
-
-    if (m_result_ptr != nullptr) {
-      const int n = Analysis::value_count(
-          ReducerConditional::select(m_iter.m_func, m_reducer));
-
-      for (int j = 0; j < n; ++j) {
-        m_result_ptr[j] = reinterpret_cast<pointer_type>(buffer.get(0))[j];
-      }
-    }
-  }
-
-  template <class ViewType>
-  inline ParallelReduce(
-      const FunctorType &arg_functor, MDRangePolicy arg_policy,
-      const ViewType &arg_view,
-      std::enable_if_t<Kokkos::is_view<ViewType>::value &&
-                           !Kokkos::is_reducer<ReducerType>::value,
-                       void *> = nullptr)
-      : m_iter(arg_policy, arg_functor),
-        m_policy(Policy(0, arg_policy.m_num_tiles).set_chunk_size(1)),
-        m_reducer(InvalidType()),
-        m_result_ptr(arg_view.data()),
-        m_force_synchronous(!arg_view.impl_track().has_record()) {}
-
-  inline ParallelReduce(const FunctorType &arg_functor,
-                        MDRangePolicy arg_policy, const ReducerType &reducer)
-      : m_iter(arg_policy, arg_functor),
-        m_policy(Policy(0, arg_policy.m_num_tiles).set_chunk_size(1)),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()),
-        m_force_synchronous(!reducer.view().impl_track().has_record()) {}
-  template <typename Policy, typename Functor>
-  static int max_tile_size_product(const Policy &, const Functor &) {
-    /**
-     * 1024 here is just our guess for a reasonable max tile size,
-     * it isn't a hardware constraint. If people see a use for larger
-     * tile size products, we're happy to change this.
-     */
-    return 1024;
-  }
-};
-}  // namespace Impl
-}  // namespace Kokkos
-
-namespace Kokkos {
-namespace Impl {
-
-template <class FunctorType, class... Traits>
-class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
-                   Kokkos::Experimental::HPX> {
- private:
-  using Policy    = Kokkos::RangePolicy<Traits...>;
-  using WorkTag   = typename Policy::work_tag;
-  using WorkRange = typename Policy::WorkRange;
-  using Member    = typename Policy::member_type;
-  using Analysis =
-      FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType>;
-  using pointer_type   = typename Analysis::pointer_type;
-  using reference_type = typename Analysis::reference_type;
-  using value_type     = typename Analysis::value_type;
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-
-  template <class TagType>
-  inline static std::enable_if_t<std::is_void<TagType>::value>
-  execute_functor_range(const FunctorType &functor, const Member i_begin,
-                        const Member i_end, reference_type update,
-                        const bool final) {
-    for (Member i = i_begin; i < i_end; ++i) {
-      functor(i, update, final);
-    }
-  }
-
-  template <class TagType>
-  inline static std::enable_if_t<!std::is_void<TagType>::value>
-  execute_functor_range(const FunctorType &functor, const Member i_begin,
-                        const Member i_end, reference_type update,
-                        const bool final) {
-    const TagType t{};
-    for (Member i = i_begin; i < i_end; ++i) {
-      functor(t, i, update, final);
-    }
-  }
-
- public:
-  void execute() const { dispatch_execute_task(this, m_policy.space()); }
-
-  inline void execute_task() const {
-    // See [note 1] for an explanation.
-    Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit(
-        m_policy.space());
-
-    const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
-    const int value_count        = Analysis::value_count(m_functor);
-    const std::size_t value_size = Analysis::value_size(m_functor);
-
-    thread_buffer &buffer = m_policy.space().impl_get_buffer();
-    buffer.resize(num_worker_threads, 2 * value_size);
-
-    using hpx::barrier;
-    using hpx::for_loop;
-    using hpx::execution::par;
-    using hpx::execution::static_chunk_size;
-
-    barrier<> bar(num_worker_threads);
-    auto exec = Kokkos::Experimental::HPX::impl_get_executor();
-
-    typename Analysis::Reducer final_reducer(&m_functor);
-
-    for_loop(
-        par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads,
-        [this, &bar, &buffer, num_worker_threads, value_count, value_size,
-         final_reducer](int t) {
-          reference_type update_sum =
-              final_reducer.init(reinterpret_cast<pointer_type>(buffer.get(t)));
-
-          const WorkRange range(m_policy, t, num_worker_threads);
-          execute_functor_range<WorkTag>(m_functor, range.begin(), range.end(),
-                                         update_sum, false);
-
-          bar.arrive_and_wait();
-
-          if (t == 0) {
-            final_reducer.init(
-                reinterpret_cast<pointer_type>(buffer.get(0) + value_size));
-
-            for (int i = 1; i < num_worker_threads; ++i) {
-              pointer_type ptr_1_prev =
-                  reinterpret_cast<pointer_type>(buffer.get(i - 1));
-              pointer_type ptr_2_prev = reinterpret_cast<pointer_type>(
-                  buffer.get(i - 1) + value_size);
-              pointer_type ptr_2 =
-                  reinterpret_cast<pointer_type>(buffer.get(i) + value_size);
-
-              for (int j = 0; j < value_count; ++j) {
-                ptr_2[j] = ptr_2_prev[j];
-              }
-
-              final_reducer.join(ptr_2, ptr_1_prev);
-            }
-          }
-
-          bar.arrive_and_wait();
-
-          reference_type update_base = Analysis::Reducer::reference(
-              reinterpret_cast<pointer_type>(buffer.get(t) + value_size));
-
-          execute_functor_range<WorkTag>(m_functor, range.begin(), range.end(),
-                                         update_base, true);
-        });
-  }
-
-  inline ParallelScan(const FunctorType &arg_functor, const Policy &arg_policy)
-      : m_functor(arg_functor), m_policy(arg_policy) {}
-};
-
-template <class FunctorType, class ReturnType, class... Traits>
-class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
-                            ReturnType, Kokkos::Experimental::HPX> {
- private:
-  using Policy    = Kokkos::RangePolicy<Traits...>;
-  using WorkTag   = typename Policy::work_tag;
-  using WorkRange = typename Policy::WorkRange;
-  using Member    = typename Policy::member_type;
-  using Analysis =
-      FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType>;
-  using pointer_type   = typename Analysis::pointer_type;
-  using reference_type = typename Analysis::reference_type;
-  using value_type     = typename Analysis::value_type;
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-  pointer_type m_result_ptr;
-
-  template <class TagType>
-  inline static std::enable_if_t<std::is_void<TagType>::value>
-  execute_functor_range(const FunctorType &functor, const Member i_begin,
-                        const Member i_end, reference_type update,
-                        const bool final) {
-    for (Member i = i_begin; i < i_end; ++i) {
-      functor(i, update, final);
-    }
-  }
-
-  template <class TagType>
-  inline static std::enable_if_t<!std::is_void<TagType>::value>
-  execute_functor_range(const FunctorType &functor, const Member i_begin,
-                        const Member i_end, reference_type update,
-                        const bool final) {
-    const TagType t{};
-    for (Member i = i_begin; i < i_end; ++i) {
-      functor(t, i, update, final);
-    }
-  }
-
- public:
-  void execute() const { dispatch_execute_task(this, m_policy.space()); }
-
-  inline void execute_task() const {
-    // See [note 1] for an explanation.
-    Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit(
-        m_policy.space());
-
-    const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
-    const int value_count        = Analysis::value_count(m_functor);
-    const std::size_t value_size = Analysis::value_size(m_functor);
-
-    thread_buffer &buffer = m_policy.space().impl_get_buffer();
-    buffer.resize(num_worker_threads, 2 * value_size);
-
-    using hpx::barrier;
-    using hpx::for_loop;
-    using hpx::execution::par;
-    using hpx::execution::static_chunk_size;
-
-    barrier<> bar(num_worker_threads);
-    auto exec = Kokkos::Experimental::HPX::impl_get_executor();
-
-    typename Analysis::Reducer final_reducer(&m_functor);
-
-    for_loop(
-        par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads,
-        [this, &bar, &buffer, num_worker_threads, value_count, value_size,
-         final_reducer](int t) {
-          reference_type update_sum =
-              final_reducer.init(reinterpret_cast<pointer_type>(buffer.get(t)));
-
-          const WorkRange range(m_policy, t, num_worker_threads);
-          execute_functor_range<WorkTag>(m_functor, range.begin(), range.end(),
-                                         update_sum, false);
-
-          bar.arrive_and_wait();
-
-          if (t == 0) {
-            final_reducer.init(
-                reinterpret_cast<pointer_type>(buffer.get(0) + value_size));
-
-            for (int i = 1; i < num_worker_threads; ++i) {
-              pointer_type ptr_1_prev =
-                  reinterpret_cast<pointer_type>(buffer.get(i - 1));
-              pointer_type ptr_2_prev = reinterpret_cast<pointer_type>(
-                  buffer.get(i - 1) + value_size);
-              pointer_type ptr_2 =
-                  reinterpret_cast<pointer_type>(buffer.get(i) + value_size);
-
-              for (int j = 0; j < value_count; ++j) {
-                ptr_2[j] = ptr_2_prev[j];
-              }
-
-              final_reducer.join(ptr_2, ptr_1_prev);
-            }
-          }
-
-          bar.arrive_and_wait();
-
-          reference_type update_base = Analysis::Reducer::reference(
-              reinterpret_cast<pointer_type>(buffer.get(t) + value_size));
-
-          execute_functor_range<WorkTag>(m_functor, range.begin(), range.end(),
-                                         update_base, true);
-
-          if (t == num_worker_threads - 1) {
-            *m_result_ptr = update_base;
-          }
-        });
-  }
-
-  template <class ViewType>
-  ParallelScanWithTotal(const FunctorType &arg_functor,
-                        const Policy &arg_policy,
-                        const ViewType &arg_result_view)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_result_ptr(arg_result_view.data()) {
-    static_assert(
-        Kokkos::Impl::MemorySpaceAccess<typename ViewType::memory_space,
-                                        Kokkos::HostSpace>::accessible,
-        "Kokkos::HPX parallel_scan result must be host-accessible!");
-  }
-};
-}  // namespace Impl
-}  // namespace Kokkos
-
-namespace Kokkos {
-namespace Impl {
-template <class FunctorType, class... Properties>
-class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
-                  Kokkos::Experimental::HPX> {
- private:
-  using Policy  = TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...>;
-  using WorkTag = typename Policy::work_tag;
-  using Member  = typename Policy::member_type;
-  using memory_space = Kokkos::HostSpace;
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-  const int m_league;
-  const std::size_t m_shared;
-
-  template <class TagType>
-  inline static std::enable_if_t<std::is_void<TagType>::value> execute_functor(
-      const FunctorType &functor, const Policy &policy, const int league_rank,
-      char *local_buffer, const std::size_t local_buffer_size) {
-    functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size));
-  }
-
-  template <class TagType>
-  inline static std::enable_if_t<!std::is_void<TagType>::value> execute_functor(
-      const FunctorType &functor, const Policy &policy, const int league_rank,
-      char *local_buffer, const std::size_t local_buffer_size) {
-    const TagType t{};
-    functor(t, Member(policy, 0, league_rank, local_buffer, local_buffer_size));
-  }
-
-  template <class TagType>
-  inline static std::enable_if_t<std::is_void<TagType>::value>
-  execute_functor_range(const FunctorType &functor, const Policy &policy,
-                        const int league_rank_begin, const int league_rank_end,
-                        char *local_buffer,
-                        const std::size_t local_buffer_size) {
-    for (int league_rank = league_rank_begin; league_rank < league_rank_end;
-         ++league_rank) {
-      functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size));
-    }
-  }
-
-  template <class TagType>
-  inline static std::enable_if_t<!std::is_void<TagType>::value>
-  execute_functor_range(const FunctorType &functor, const Policy &policy,
-                        const int league_rank_begin, const int league_rank_end,
-                        char *local_buffer,
-                        const std::size_t local_buffer_size) {
-    const TagType t{};
-    for (int league_rank = league_rank_begin; league_rank < league_rank_end;
-         ++league_rank) {
-      functor(t,
-              Member(policy, 0, league_rank, local_buffer, local_buffer_size));
-    }
-  }
-
- public:
-  void execute() const { dispatch_execute_task(this, m_policy.space()); }
-
-  inline void execute_task() const {
-    // See [note 1] for an explanation.
-    Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit(
-        m_policy.space());
-
-    const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
-
-    thread_buffer &buffer = m_policy.space().impl_get_buffer();
-    buffer.resize(num_worker_threads, m_shared);
-
-    auto exec = Kokkos::Experimental::HPX::impl_get_executor();
-
-    using hpx::execution::par;
-    using hpx::execution::static_chunk_size;
-
-#if KOKKOS_HPX_IMPLEMENTATION == 0
-    using hpx::for_loop;
-
-    for_loop(
-        par.on(exec).with(static_chunk_size(m_policy.chunk_size())), 0,
-        m_policy.league_size(), [this, &buffer](const int league_rank) {
-          execute_functor<WorkTag>(
-              m_functor, m_policy, league_rank,
-              buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id()),
-              m_shared);
-        });
-
-#elif KOKKOS_HPX_IMPLEMENTATION == 1
-    using hpx::for_loop_strided;
-
-    for_loop_strided(
-        par.on(exec), 0, m_policy.league_size(), m_policy.chunk_size(),
-        [this, &buffer](const int league_rank_begin) {
-          const int league_rank_end =
-              (std::min)(league_rank_begin + m_policy.chunk_size(),
-                         m_policy.league_size());
-          execute_functor_range<WorkTag>(
-              m_functor, m_policy, league_rank_begin, league_rank_end,
-              buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id()),
-              m_shared);
-        });
-#endif
-  }
-
-  ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_league(arg_policy.league_size()),
-        m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
-                 FunctorTeamShmemSize<FunctorType>::value(
-                     arg_functor, arg_policy.team_size())) {}
-};
-
-template <class FunctorType, class ReducerType, class... Properties>
-class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
-                     ReducerType, Kokkos::Experimental::HPX> {
- private:
-  using Policy  = TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...>;
-  using Member  = typename Policy::member_type;
-  using WorkTag = typename Policy::work_tag;
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using Analysis =
-      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, ReducerTypeFwd>;
-  using pointer_type   = typename Analysis::pointer_type;
-  using reference_type = typename Analysis::reference_type;
-  using value_type     = typename Analysis::value_type;
-
-  const FunctorType m_functor;
-  const int m_league;
-  const Policy m_policy;
-  const ReducerType m_reducer;
-  pointer_type m_result_ptr;
-  const std::size_t m_shared;
-
-  bool m_force_synchronous;
-
-  template <class TagType>
-  inline static std::enable_if_t<std::is_void<TagType>::value> execute_functor(
-      const FunctorType &functor, const Policy &policy, const int league_rank,
-      char *local_buffer, const std::size_t local_buffer_size,
-      reference_type update) {
-    functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size),
-            update);
-  }
-
-  template <class TagType>
-  inline static std::enable_if_t<!std::is_void<TagType>::value> execute_functor(
-      const FunctorType &functor, const Policy &policy, const int league_rank,
-      char *local_buffer, const std::size_t local_buffer_size,
-      reference_type update) {
-    const TagType t{};
-    functor(t, Member(policy, 0, league_rank, local_buffer, local_buffer_size),
-            update);
-  }
-
-  template <class TagType>
-  inline static std::enable_if_t<std::is_void<TagType>::value>
-  execute_functor_range(const FunctorType &functor, const Policy &policy,
-                        const int league_rank_begin, const int league_rank_end,
-                        char *local_buffer, const std::size_t local_buffer_size,
-                        reference_type update) {
-    for (int league_rank = league_rank_begin; league_rank < league_rank_end;
-         ++league_rank) {
-      functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size),
-              update);
-    }
-  }
-
-  template <class TagType>
-  inline static std::enable_if_t<!std::is_void<TagType>::value>
-  execute_functor_range(const FunctorType &functor, const Policy &policy,
-                        const int league_rank_begin, const int league_rank_end,
-                        char *local_buffer, const std::size_t local_buffer_size,
-                        reference_type update) {
-    const TagType t{};
-    for (int league_rank = league_rank_begin; league_rank < league_rank_end;
-         ++league_rank) {
-      functor(t,
-              Member(policy, 0, league_rank, local_buffer, local_buffer_size),
-              update);
-    }
-  }
-
- public:
-  void execute() const {
-    if (m_policy.league_size() * m_policy.team_size() == 0) {
-      if (m_result_ptr) {
-        typename Analysis::Reducer final_reducer(
-            &ReducerConditional::select(m_functor, m_reducer));
-        final_reducer.init(m_result_ptr);
-        final_reducer.final(m_result_ptr);
-      }
-      return;
-    }
-    dispatch_execute_task(this, m_policy.space());
-  }
-
-  inline void execute_task() const {
-    // See [note 1] for an explanation.
-    Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit(
-        m_policy.space());
-
-    const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
-    const std::size_t value_size =
-        Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));
-
-    thread_buffer &buffer = m_policy.space().impl_get_buffer();
-    buffer.resize(num_worker_threads, value_size + m_shared);
-
-    auto exec = Kokkos::Experimental::HPX::impl_get_executor();
-
-    using hpx::for_loop;
-    using hpx::execution::par;
-    using hpx::execution::static_chunk_size;
-
-    typename Analysis::Reducer final_reducer(
-        &ReducerConditional::select(m_functor, m_reducer));
-
-#if KOKKOS_HPX_IMPLEMENTATION == 0
-
-    for_loop(
-        par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads,
-        [&buffer, final_reducer](const std::size_t t) {
-          final_reducer.init(reinterpret_cast<pointer_type>(buffer.get(t)));
-        });
-
-    for_loop(par.on(exec).with(static_chunk_size(m_policy.chunk_size())), 0,
-             m_policy.league_size(),
-             [this, &buffer, value_size](const int league_rank) {
-               std::size_t t =
-                   Kokkos::Experimental::HPX::impl_hardware_thread_id();
-               reference_type update = Analysis::Reducer::reference(
-                   reinterpret_cast<pointer_type>(buffer.get(t)));
-
-               execute_functor<WorkTag>(m_functor, m_policy, league_rank,
-                                        buffer.get(t) + value_size, m_shared,
-                                        update);
-             });
-
-#elif KOKKOS_HPX_IMPLEMENTATION == 1
-    using hpx::for_loop_strided;
-
-    for_loop(
-        par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads,
-        [&buffer, final_reducer](std::size_t const t) {
-          final_reducer.init(reinterpret_cast<pointer_type>(buffer.get(t)));
-        });
-
-    for_loop_strided(
-        par.on(exec), 0, m_policy.league_size(), m_policy.chunk_size(),
-        [this, &buffer, value_size](int const league_rank_begin) {
-          std::size_t t = Kokkos::Experimental::HPX::impl_hardware_thread_id();
-          reference_type update = Analysis::Reducer::reference(
-              reinterpret_cast<pointer_type>(buffer.get(t)));
-          const int league_rank_end =
-              (std::min)(league_rank_begin + m_policy.chunk_size(),
-                         m_policy.league_size());
-          execute_functor_range<WorkTag>(
-              m_functor, m_policy, league_rank_begin, league_rank_end,
-              buffer.get(t) + value_size, m_shared, update);
-        });
-#endif
-
-    const pointer_type ptr = reinterpret_cast<pointer_type>(buffer.get(0));
-    for (int t = 1; t < num_worker_threads; ++t) {
-      final_reducer.join(ptr, reinterpret_cast<pointer_type>(buffer.get(t)));
-    }
-
-    final_reducer.final(ptr);
-
-    if (m_result_ptr) {
-      const int n = Analysis::value_count(
-          ReducerConditional::select(m_functor, m_reducer));
-
-      for (int j = 0; j < n; ++j) {
-        m_result_ptr[j] = ptr[j];
-      }
-    }
-  }
-
-  template <class ViewType>
-  ParallelReduce(const FunctorType &arg_functor, const Policy &arg_policy,
-                 const ViewType &arg_result,
-                 std::enable_if_t<Kokkos::is_view<ViewType>::value &&
-                                      !Kokkos::is_reducer<ReducerType>::value,
-                                  void *> = nullptr)
-      : m_functor(arg_functor),
-        m_league(arg_policy.league_size()),
-        m_policy(arg_policy),
-        m_reducer(InvalidType()),
-        m_result_ptr(arg_result.data()),
-        m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
-                 FunctorTeamShmemSize<FunctorType>::value(
-                     m_functor, arg_policy.team_size())),
-        m_force_synchronous(!arg_result.impl_track().has_record()) {}
-
-  inline ParallelReduce(const FunctorType &arg_functor, Policy arg_policy,
-                        const ReducerType &reducer)
-      : m_functor(arg_functor),
-        m_league(arg_policy.league_size()),
-        m_policy(arg_policy),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()),
-        m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
-                 FunctorTeamShmemSize<FunctorType>::value(
-                     arg_functor, arg_policy.team_size())),
-        m_force_synchronous(!reducer.view().impl_track().has_record()) {}
-};
-}  // namespace Impl
-}  // namespace Kokkos
-
-namespace Kokkos {
-
-template <typename iType>
-KOKKOS_INLINE_FUNCTION
-    Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
-    TeamThreadRange(const Impl::HPXTeamMember &thread, const iType &count) {
-  return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
-      thread, count);
-}
-
-template <typename iType1, typename iType2>
-KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
-    std::common_type_t<iType1, iType2>, Impl::HPXTeamMember>
-TeamThreadRange(const Impl::HPXTeamMember &thread, const iType1 &i_begin,
-                const iType2 &i_end) {
-  using iType = std::common_type_t<iType1, iType2>;
-  return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
-      thread, iType(i_begin), iType(i_end));
-}
-
-template <typename iType>
-KOKKOS_INLINE_FUNCTION
-    Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
-    TeamVectorRange(const Impl::HPXTeamMember &thread, const iType &count) {
-  return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
-      thread, count);
-}
-
-template <typename iType1, typename iType2>
-KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
-    std::common_type_t<iType1, iType2>, Impl::HPXTeamMember>
-TeamVectorRange(const Impl::HPXTeamMember &thread, const iType1 &i_begin,
-                const iType2 &i_end) {
-  using iType = std::common_type_t<iType1, iType2>;
-  return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
-      thread, iType(i_begin), iType(i_end));
-}
-
-template <typename iType>
-KOKKOS_INLINE_FUNCTION
-    Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
-    ThreadVectorRange(const Impl::HPXTeamMember &thread, const iType &count) {
-  return Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
-      thread, count);
-}
-
-template <typename iType1, typename iType2>
-KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
-    std::common_type_t<iType1, iType2>, Impl::HPXTeamMember>
-ThreadVectorRange(const Impl::HPXTeamMember &thread, const iType1 &i_begin,
-                  const iType2 &i_end) {
-  using iType = std::common_type_t<iType1, iType2>;
-  return Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
-      thread, iType(i_begin), iType(i_end));
-}
-
-KOKKOS_INLINE_FUNCTION
-Impl::ThreadSingleStruct<Impl::HPXTeamMember> PerTeam(
-    const Impl::HPXTeamMember &thread) {
-  return Impl::ThreadSingleStruct<Impl::HPXTeamMember>(thread);
-}
-
-KOKKOS_INLINE_FUNCTION
-Impl::VectorSingleStruct<Impl::HPXTeamMember> PerThread(
-    const Impl::HPXTeamMember &thread) {
-  return Impl::VectorSingleStruct<Impl::HPXTeamMember>(thread);
-}
-
-/** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each
- * i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all threads of the the calling thread team.
- */
-template <typename iType, class Lambda>
-KOKKOS_INLINE_FUNCTION void parallel_for(
-    const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
-        &loop_boundaries,
-    const Lambda &lambda) {
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
-       i += loop_boundaries.increment)
-    lambda(i);
-}
-
-/** \brief  Inter-thread vector parallel_reduce. Executes lambda(iType i,
- * ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all threads of the the calling thread team
- * and a summation of val is performed and put into result.
- */
-template <typename iType, class Lambda, typename ValueType>
-KOKKOS_INLINE_FUNCTION void parallel_reduce(
-    const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
-        &loop_boundaries,
-    const Lambda &lambda, ValueType &result) {
-  result = ValueType();
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
-       i += loop_boundaries.increment) {
-    lambda(i, result);
-  }
-}
-
-/** \brief  Intra-thread vector parallel_for. Executes lambda(iType i) for each
- * i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the the calling thread.
- */
-template <typename iType, class Lambda>
-KOKKOS_INLINE_FUNCTION void parallel_for(
-    const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
-        &loop_boundaries,
-    const Lambda &lambda) {
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
-       i += loop_boundaries.increment) {
-    lambda(i);
-  }
-}
-
-/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i,
- * ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the the calling thread
- * and a summation of val is performed and put into result.
- */
-template <typename iType, class Lambda, typename ValueType>
-KOKKOS_INLINE_FUNCTION void parallel_reduce(
-    const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
-        &loop_boundaries,
-    const Lambda &lambda, ValueType &result) {
-  result = ValueType();
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
-       i += loop_boundaries.increment) {
-    lambda(i, result);
-  }
-}
-
-template <typename iType, class Lambda, typename ReducerType>
-KOKKOS_INLINE_FUNCTION void parallel_reduce(
-    const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
-        &loop_boundaries,
-    const Lambda &lambda, const ReducerType &reducer) {
-  reducer.init(reducer.reference());
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
-       i += loop_boundaries.increment) {
-    lambda(i, reducer.reference());
-  }
-}
-
-template <typename iType, class Lambda, typename ReducerType>
-KOKKOS_INLINE_FUNCTION void parallel_reduce(
-    const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
-        &loop_boundaries,
-    const Lambda &lambda, const ReducerType &reducer) {
-  reducer.init(reducer.reference());
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
-       i += loop_boundaries.increment) {
-    lambda(i, reducer.reference());
-  }
-}
-
-template <typename iType, class FunctorType>
-KOKKOS_INLINE_FUNCTION void parallel_scan(
-    Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember> const
-        &loop_boundaries,
-    const FunctorType &lambda) {
-  using value_type = typename Kokkos::Impl::FunctorAnalysis<
-      Kokkos::Impl::FunctorPatternInterface::SCAN, void,
-      FunctorType>::value_type;
-
-  value_type scan_val = value_type();
-
-  // Intra-member scan
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
-       i += loop_boundaries.increment) {
-    lambda(i, scan_val, false);
-  }
-
-  // 'scan_val' output is the exclusive prefix sum
-  scan_val = loop_boundaries.thread.team_scan(scan_val);
-
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
-       i += loop_boundaries.increment) {
-    lambda(i, scan_val, true);
-  }
-}
-
-/** \brief  Intra-thread vector parallel exclusive prefix sum. Executes
- * lambda(iType i, ValueType & val, bool final) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan
- * operation is performed. Depending on the target execution space the operator
- * might be called twice: once with final=false and once with final=true. When
- * final==true val contains the prefix sum value. The contribution of this "i"
- * needs to be added to val no matter whether final==true or not. In a serial
- * execution (i.e. team_size==1) the operator is only called once with
- * final==true. Scan_val will be set to the final sum value over all vector
- */
-template <typename iType, class FunctorType>
-KOKKOS_INLINE_FUNCTION void parallel_scan(
-    const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
-        &loop_boundaries,
-    const FunctorType &lambda) {
-  using value_type =
-      typename Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
-                                     TeamPolicy<Experimental::HPX>,
-                                     FunctorType>::value_type;
-
-  value_type scan_val = value_type();
-
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
-       i += loop_boundaries.increment) {
-    lambda(i, scan_val, true);
-  }
-}
-
-/** \brief  Intra-thread vector parallel scan with reducer
- *
- */
-template <typename iType, class FunctorType, typename ReducerType>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
-parallel_scan(
-    const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
-        &loop_boundaries,
-    const FunctorType &lambda, const ReducerType &reducer) {
-  typename ReducerType::value_type scan_val;
-  reducer.init(scan_val);
-
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
-       i += loop_boundaries.increment) {
-    lambda(i, scan_val, true);
-  }
-}
-
-template <class FunctorType>
-KOKKOS_INLINE_FUNCTION void single(
-    const Impl::VectorSingleStruct<Impl::HPXTeamMember> &,
-    const FunctorType &lambda) {
-  lambda();
-}
-
-template <class FunctorType>
-KOKKOS_INLINE_FUNCTION void single(
-    const Impl::ThreadSingleStruct<Impl::HPXTeamMember> &,
-    const FunctorType &lambda) {
-  lambda();
-}
-
-template <class FunctorType, class ValueType>
-KOKKOS_INLINE_FUNCTION void single(
-    const Impl::VectorSingleStruct<Impl::HPXTeamMember> &,
-    const FunctorType &lambda, ValueType &val) {
-  lambda(val);
-}
-
-template <class FunctorType, class ValueType>
-KOKKOS_INLINE_FUNCTION void single(
-    const Impl::ThreadSingleStruct<Impl::HPXTeamMember> &,
-    const FunctorType &lambda, ValueType &val) {
-  lambda(val);
-}
-
-}  // namespace Kokkos
-
-#include <HPX/Kokkos_HPX_Task.hpp>
-
-#endif /* #if defined( KOKKOS_ENABLE_HPX ) */
-#endif /* #ifndef KOKKOS_HPX_HPP */
diff --git a/packages/kokkos/core/src/Kokkos_Half.hpp b/packages/kokkos/core/src/Kokkos_Half.hpp
index 9231fac5ff72f740f7a8362d8938b3d7c3d0efa5..33217efe80bbb919688fd321fafe7d6994269e52 100644
--- a/packages/kokkos/core/src/Kokkos_Half.hpp
+++ b/packages/kokkos/core/src/Kokkos_Half.hpp
@@ -21,990 +21,10 @@
 #define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_HALF
 #endif
 
-#include <type_traits>
-#include <Kokkos_Macros.hpp>
-#include <iosfwd>  // istream & ostream for extraction and insertion ops
-#include <string>
+#include <impl/Kokkos_Half_FloatingPointWrapper.hpp>
+#include <impl/Kokkos_Half_NumericTraits.hpp>
+#include <impl/Kokkos_Half_MathematicalFunctions.hpp>
 
-#ifdef KOKKOS_IMPL_HALF_TYPE_DEFINED
-
-// KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH: A macro to select which
-// floating_pointer_wrapper operator paths should be used. For CUDA, let the
-// compiler conditionally select when device ops are used For SYCL, we have a
-// full half type on both host and device
-#if defined(__CUDA_ARCH__) || defined(KOKKOS_ENABLE_SYCL)
-#define KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
-#endif
-
-/************************* BEGIN forward declarations *************************/
-namespace Kokkos {
-namespace Experimental {
-namespace Impl {
-template <class FloatType>
-class floating_point_wrapper;
-}
-
-// Declare half_t (binary16)
-using half_t = Kokkos::Experimental::Impl::floating_point_wrapper<
-    Kokkos::Impl::half_impl_t ::type>;
-KOKKOS_INLINE_FUNCTION
-half_t cast_to_half(float val);
-KOKKOS_INLINE_FUNCTION
-half_t cast_to_half(bool val);
-KOKKOS_INLINE_FUNCTION
-half_t cast_to_half(double val);
-KOKKOS_INLINE_FUNCTION
-half_t cast_to_half(short val);
-KOKKOS_INLINE_FUNCTION
-half_t cast_to_half(int val);
-KOKKOS_INLINE_FUNCTION
-half_t cast_to_half(long val);
-KOKKOS_INLINE_FUNCTION
-half_t cast_to_half(long long val);
-KOKKOS_INLINE_FUNCTION
-half_t cast_to_half(unsigned short val);
-KOKKOS_INLINE_FUNCTION
-half_t cast_to_half(unsigned int val);
-KOKKOS_INLINE_FUNCTION
-half_t cast_to_half(unsigned long val);
-KOKKOS_INLINE_FUNCTION
-half_t cast_to_half(unsigned long long val);
-KOKKOS_INLINE_FUNCTION
-half_t cast_to_half(half_t);
-
-template <class T>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, float>::value, T>
-    cast_from_half(half_t);
-template <class T>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, bool>::value, T>
-    cast_from_half(half_t);
-template <class T>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, double>::value, T>
-    cast_from_half(half_t);
-template <class T>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, short>::value, T>
-    cast_from_half(half_t);
-template <class T>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, int>::value, T>
-    cast_from_half(half_t);
-template <class T>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, long>::value, T>
-    cast_from_half(half_t);
-template <class T>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, long long>::value, T>
-    cast_from_half(half_t);
-template <class T>
-KOKKOS_INLINE_FUNCTION
-    std::enable_if_t<std::is_same<T, unsigned short>::value, T>
-        cast_from_half(half_t);
-template <class T>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, unsigned int>::value, T>
-    cast_from_half(half_t);
-template <class T>
-KOKKOS_INLINE_FUNCTION
-    std::enable_if_t<std::is_same<T, unsigned long>::value, T>
-        cast_from_half(half_t);
-template <class T>
-KOKKOS_INLINE_FUNCTION
-    std::enable_if_t<std::is_same<T, unsigned long long>::value, T>
-        cast_from_half(half_t);
-
-// declare bhalf_t
-#ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED
-using bhalf_t = Kokkos::Experimental::Impl::floating_point_wrapper<
-    Kokkos::Impl ::bhalf_impl_t ::type>;
-
-KOKKOS_INLINE_FUNCTION
-bhalf_t cast_to_bhalf(float val);
-KOKKOS_INLINE_FUNCTION
-bhalf_t cast_to_bhalf(bool val);
-KOKKOS_INLINE_FUNCTION
-bhalf_t cast_to_bhalf(double val);
-KOKKOS_INLINE_FUNCTION
-bhalf_t cast_to_bhalf(short val);
-KOKKOS_INLINE_FUNCTION
-bhalf_t cast_to_bhalf(int val);
-KOKKOS_INLINE_FUNCTION
-bhalf_t cast_to_bhalf(long val);
-KOKKOS_INLINE_FUNCTION
-bhalf_t cast_to_bhalf(long long val);
-KOKKOS_INLINE_FUNCTION
-bhalf_t cast_to_bhalf(unsigned short val);
-KOKKOS_INLINE_FUNCTION
-bhalf_t cast_to_bhalf(unsigned int val);
-KOKKOS_INLINE_FUNCTION
-bhalf_t cast_to_bhalf(unsigned long val);
-KOKKOS_INLINE_FUNCTION
-bhalf_t cast_to_bhalf(unsigned long long val);
-KOKKOS_INLINE_FUNCTION
-bhalf_t cast_to_bhalf(bhalf_t val);
-
-template <class T>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, float>::value, T>
-    cast_from_bhalf(bhalf_t);
-template <class T>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, bool>::value, T>
-    cast_from_bhalf(bhalf_t);
-template <class T>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, double>::value, T>
-    cast_from_bhalf(bhalf_t);
-template <class T>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, short>::value, T>
-    cast_from_bhalf(bhalf_t);
-template <class T>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, int>::value, T>
-    cast_from_bhalf(bhalf_t);
-template <class T>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, long>::value, T>
-    cast_from_bhalf(bhalf_t);
-template <class T>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, long long>::value, T>
-    cast_from_bhalf(bhalf_t);
-template <class T>
-KOKKOS_INLINE_FUNCTION
-    std::enable_if_t<std::is_same<T, unsigned short>::value, T>
-        cast_from_bhalf(bhalf_t);
-template <class T>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, unsigned int>::value, T>
-    cast_from_bhalf(bhalf_t);
-template <class T>
-KOKKOS_INLINE_FUNCTION
-    std::enable_if_t<std::is_same<T, unsigned long>::value, T>
-        cast_from_bhalf(bhalf_t);
-template <class T>
-KOKKOS_INLINE_FUNCTION
-    std::enable_if_t<std::is_same<T, unsigned long long>::value, T>
-        cast_from_bhalf(bhalf_t);
-#endif  // KOKKOS_IMPL_BHALF_TYPE_DEFINED
-
-template <class T>
-static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::half_t cast_to_wrapper(
-    T x, const volatile Kokkos::Impl::half_impl_t::type&);
-
-#ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED
-template <class T>
-static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::bhalf_t cast_to_wrapper(
-    T x, const volatile Kokkos::Impl::bhalf_impl_t::type&);
-#endif  // KOKKOS_IMPL_BHALF_TYPE_DEFINED
-
-template <class T>
-static KOKKOS_INLINE_FUNCTION T
-cast_from_wrapper(const Kokkos::Experimental::half_t& x);
-
-#ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED
-template <class T>
-static KOKKOS_INLINE_FUNCTION T
-cast_from_wrapper(const Kokkos::Experimental::bhalf_t& x);
-#endif  // KOKKOS_IMPL_BHALF_TYPE_DEFINED
-/************************** END forward declarations **************************/
-
-namespace Impl {
-template <class FloatType>
-class alignas(FloatType) floating_point_wrapper {
- public:
-  using impl_type = FloatType;
-
- private:
-  impl_type val;
-  using fixed_width_integer_type = std::conditional_t<
-      sizeof(impl_type) == 2, uint16_t,
-      std::conditional_t<
-          sizeof(impl_type) == 4, uint32_t,
-          std::conditional_t<sizeof(impl_type) == 8, uint64_t, void>>>;
-  static_assert(!std::is_void<fixed_width_integer_type>::value,
-                "Invalid impl_type");
-
- public:
-  // In-class initialization and defaulted default constructors not used
-  // since Cuda supports half precision initialization via the below constructor
-  KOKKOS_FUNCTION
-  floating_point_wrapper() : val(0.0F) {}
-
-// Copy constructors
-// Getting "C2580: multiple versions of a defaulted special
-// member function are not allowed" with VS 16.11.3 and CUDA 11.4.2
-#if defined(_WIN32) && defined(KOKKOS_ENABLE_CUDA)
-  KOKKOS_FUNCTION
-  floating_point_wrapper(const floating_point_wrapper& rhs) : val(rhs.val) {}
-#else
-  KOKKOS_DEFAULTED_FUNCTION
-  floating_point_wrapper(const floating_point_wrapper&) noexcept = default;
-#endif
-
-  KOKKOS_INLINE_FUNCTION
-  floating_point_wrapper(const volatile floating_point_wrapper& rhs) {
-#if defined(KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH) && !defined(KOKKOS_ENABLE_SYCL)
-    val = rhs.val;
-#else
-    const volatile fixed_width_integer_type* rv_ptr =
-        reinterpret_cast<const volatile fixed_width_integer_type*>(&rhs.val);
-    const fixed_width_integer_type rv_val = *rv_ptr;
-    val       = reinterpret_cast<const impl_type&>(rv_val);
-#endif  // KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
-  }
-
-  // Don't support implicit conversion back to impl_type.
-  // impl_type is a storage only type on host.
-  KOKKOS_FUNCTION
-  explicit operator impl_type() const { return val; }
-  KOKKOS_FUNCTION
-  explicit operator float() const { return cast_from_wrapper<float>(*this); }
-  KOKKOS_FUNCTION
-  explicit operator bool() const { return cast_from_wrapper<bool>(*this); }
-  KOKKOS_FUNCTION
-  explicit operator double() const { return cast_from_wrapper<double>(*this); }
-  KOKKOS_FUNCTION
-  explicit operator short() const { return cast_from_wrapper<short>(*this); }
-  KOKKOS_FUNCTION
-  explicit operator int() const { return cast_from_wrapper<int>(*this); }
-  KOKKOS_FUNCTION
-  explicit operator long() const { return cast_from_wrapper<long>(*this); }
-  KOKKOS_FUNCTION
-  explicit operator long long() const {
-    return cast_from_wrapper<long long>(*this);
-  }
-  KOKKOS_FUNCTION
-  explicit operator unsigned short() const {
-    return cast_from_wrapper<unsigned short>(*this);
-  }
-  KOKKOS_FUNCTION
-  explicit operator unsigned int() const {
-    return cast_from_wrapper<unsigned int>(*this);
-  }
-  KOKKOS_FUNCTION
-  explicit operator unsigned long() const {
-    return cast_from_wrapper<unsigned long>(*this);
-  }
-  KOKKOS_FUNCTION
-  explicit operator unsigned long long() const {
-    return cast_from_wrapper<unsigned long long>(*this);
-  }
-
-  /**
-   * Conversion constructors.
-   *
-   * Support implicit conversions from impl_type, float, double ->
-   * floating_point_wrapper. Mixed precision expressions require upcasting which
-   * is done in the
-   * "// Binary Arithmetic" operator overloads below.
-   *
-   * Support implicit conversions from integral types -> floating_point_wrapper.
-   * Expressions involving floating_point_wrapper with integral types require
-   * downcasting the integral types to floating_point_wrapper. Existing operator
-   * overloads can handle this with the addition of the below implicit
-   * conversion constructors.
-   */
-  KOKKOS_FUNCTION
-  constexpr floating_point_wrapper(impl_type rhs) : val(rhs) {}
-  KOKKOS_FUNCTION
-  floating_point_wrapper(float rhs) : val(cast_to_wrapper(rhs, val).val) {}
-  KOKKOS_FUNCTION
-  floating_point_wrapper(double rhs) : val(cast_to_wrapper(rhs, val).val) {}
-  KOKKOS_FUNCTION
-  explicit floating_point_wrapper(bool rhs)
-      : val(cast_to_wrapper(rhs, val).val) {}
-  KOKKOS_FUNCTION
-  floating_point_wrapper(short rhs) : val(cast_to_wrapper(rhs, val).val) {}
-  KOKKOS_FUNCTION
-  floating_point_wrapper(int rhs) : val(cast_to_wrapper(rhs, val).val) {}
-  KOKKOS_FUNCTION
-  floating_point_wrapper(long rhs) : val(cast_to_wrapper(rhs, val).val) {}
-  KOKKOS_FUNCTION
-  floating_point_wrapper(long long rhs) : val(cast_to_wrapper(rhs, val).val) {}
-  KOKKOS_FUNCTION
-  floating_point_wrapper(unsigned short rhs)
-      : val(cast_to_wrapper(rhs, val).val) {}
-  KOKKOS_FUNCTION
-  floating_point_wrapper(unsigned int rhs)
-      : val(cast_to_wrapper(rhs, val).val) {}
-  KOKKOS_FUNCTION
-  floating_point_wrapper(unsigned long rhs)
-      : val(cast_to_wrapper(rhs, val).val) {}
-  KOKKOS_FUNCTION
-  floating_point_wrapper(unsigned long long rhs)
-      : val(cast_to_wrapper(rhs, val).val) {}
-
-  // Unary operators
-  KOKKOS_FUNCTION
-  floating_point_wrapper operator+() const {
-    floating_point_wrapper tmp = *this;
-#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
-    tmp.val = +tmp.val;
-#else
-    tmp.val   = cast_to_wrapper(+cast_from_wrapper<float>(tmp), val).val;
-#endif
-    return tmp;
-  }
-
-  KOKKOS_FUNCTION
-  floating_point_wrapper operator-() const {
-    floating_point_wrapper tmp = *this;
-#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
-    tmp.val = -tmp.val;
-#else
-    tmp.val   = cast_to_wrapper(-cast_from_wrapper<float>(tmp), val).val;
-#endif
-    return tmp;
-  }
-
-  // Prefix operators
-  KOKKOS_FUNCTION
-  floating_point_wrapper& operator++() {
-#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
-    val = val + impl_type(1.0F);  // cuda has no operator++ for __nv_bfloat
-#else
-    float tmp = cast_from_wrapper<float>(*this);
-    ++tmp;
-    val       = cast_to_wrapper(tmp, val).val;
-#endif
-    return *this;
-  }
-
-  KOKKOS_FUNCTION
-  floating_point_wrapper& operator--() {
-#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
-    val = val - impl_type(1.0F);  // cuda has no operator-- for __nv_bfloat
-#else
-    float tmp = cast_from_wrapper<float>(*this);
-    --tmp;
-    val = cast_to_wrapper(tmp, val).val;
-#endif
-    return *this;
-  }
-
-  // Postfix operators
-  KOKKOS_FUNCTION
-  floating_point_wrapper operator++(int) {
-    floating_point_wrapper tmp = *this;
-    operator++();
-    return tmp;
-  }
-
-  KOKKOS_FUNCTION
-  floating_point_wrapper operator--(int) {
-    floating_point_wrapper tmp = *this;
-    operator--();
-    return tmp;
-  }
-
-  // Binary operators
-  KOKKOS_FUNCTION
-  floating_point_wrapper& operator=(impl_type rhs) {
-    val = rhs;
-    return *this;
-  }
-
-  template <class T>
-  KOKKOS_FUNCTION floating_point_wrapper& operator=(T rhs) {
-    val = cast_to_wrapper(rhs, val).val;
-    return *this;
-  }
-
-  template <class T>
-  KOKKOS_FUNCTION void operator=(T rhs) volatile {
-    impl_type new_val = cast_to_wrapper(rhs, val).val;
-    volatile fixed_width_integer_type* val_ptr =
-        reinterpret_cast<volatile fixed_width_integer_type*>(
-            const_cast<impl_type*>(&val));
-    *val_ptr = reinterpret_cast<fixed_width_integer_type&>(new_val);
-  }
-
-  // Compound operators
-  KOKKOS_FUNCTION
-  floating_point_wrapper& operator+=(floating_point_wrapper rhs) {
-#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
-    val = val + rhs.val;  // cuda has no operator+= for __nv_bfloat
-#else
-    val = cast_to_wrapper(
-              cast_from_wrapper<float>(*this) + cast_from_wrapper<float>(rhs),
-              val)
-              .val;
-#endif
-    return *this;
-  }
-
-  KOKKOS_FUNCTION
-  void operator+=(const volatile floating_point_wrapper& rhs) volatile {
-    floating_point_wrapper tmp_rhs = rhs;
-    floating_point_wrapper tmp_lhs = *this;
-
-    tmp_lhs += tmp_rhs;
-    *this = tmp_lhs;
-  }
-
-  // Compound operators: upcast overloads for +=
-  template <class T>
-  KOKKOS_FUNCTION friend std::enable_if_t<
-      std::is_same<T, float>::value || std::is_same<T, double>::value, T>
-  operator+=(T& lhs, floating_point_wrapper rhs) {
-    lhs += static_cast<T>(rhs);
-    return lhs;
-  }
-
-  KOKKOS_FUNCTION
-  floating_point_wrapper& operator+=(float rhs) {
-    float result = static_cast<float>(val) + rhs;
-    val          = static_cast<impl_type>(result);
-    return *this;
-  }
-
-  KOKKOS_FUNCTION
-  floating_point_wrapper& operator+=(double rhs) {
-    double result = static_cast<double>(val) + rhs;
-    val           = static_cast<impl_type>(result);
-    return *this;
-  }
-
-  KOKKOS_FUNCTION
-  floating_point_wrapper& operator-=(floating_point_wrapper rhs) {
-#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
-    val = val - rhs.val;  // cuda has no operator-= for __nv_bfloat
-#else
-    val = cast_to_wrapper(
-              cast_from_wrapper<float>(*this) - cast_from_wrapper<float>(rhs),
-              val)
-              .val;
-#endif
-    return *this;
-  }
-
-  KOKKOS_FUNCTION
-  void operator-=(const volatile floating_point_wrapper& rhs) volatile {
-    floating_point_wrapper tmp_rhs = rhs;
-    floating_point_wrapper tmp_lhs = *this;
-
-    tmp_lhs -= tmp_rhs;
-    *this = tmp_lhs;
-  }
-
-  // Compund operators: upcast overloads for -=
-  template <class T>
-  KOKKOS_FUNCTION friend std::enable_if_t<
-      std::is_same<T, float>::value || std::is_same<T, double>::value, T>
-  operator-=(T& lhs, floating_point_wrapper rhs) {
-    lhs -= static_cast<T>(rhs);
-    return lhs;
-  }
-
-  KOKKOS_FUNCTION
-  floating_point_wrapper& operator-=(float rhs) {
-    float result = static_cast<float>(val) - rhs;
-    val          = static_cast<impl_type>(result);
-    return *this;
-  }
-
-  KOKKOS_FUNCTION
-  floating_point_wrapper& operator-=(double rhs) {
-    double result = static_cast<double>(val) - rhs;
-    val           = static_cast<impl_type>(result);
-    return *this;
-  }
-
-  KOKKOS_FUNCTION
-  floating_point_wrapper& operator*=(floating_point_wrapper rhs) {
-#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
-    val = val * rhs.val;  // cuda has no operator*= for __nv_bfloat
-#else
-    val = cast_to_wrapper(
-              cast_from_wrapper<float>(*this) * cast_from_wrapper<float>(rhs),
-              val)
-              .val;
-#endif
-    return *this;
-  }
-
-  KOKKOS_FUNCTION
-  void operator*=(const volatile floating_point_wrapper& rhs) volatile {
-    floating_point_wrapper tmp_rhs = rhs;
-    floating_point_wrapper tmp_lhs = *this;
-
-    tmp_lhs *= tmp_rhs;
-    *this = tmp_lhs;
-  }
-
-  // Compund operators: upcast overloads for *=
-  template <class T>
-  KOKKOS_FUNCTION friend std::enable_if_t<
-      std::is_same<T, float>::value || std::is_same<T, double>::value, T>
-  operator*=(T& lhs, floating_point_wrapper rhs) {
-    lhs *= static_cast<T>(rhs);
-    return lhs;
-  }
-
-  KOKKOS_FUNCTION
-  floating_point_wrapper& operator*=(float rhs) {
-    float result = static_cast<float>(val) * rhs;
-    val          = static_cast<impl_type>(result);
-    return *this;
-  }
-
-  KOKKOS_FUNCTION
-  floating_point_wrapper& operator*=(double rhs) {
-    double result = static_cast<double>(val) * rhs;
-    val           = static_cast<impl_type>(result);
-    return *this;
-  }
-
-  KOKKOS_FUNCTION
-  floating_point_wrapper& operator/=(floating_point_wrapper rhs) {
-#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
-    val = val / rhs.val;  // cuda has no operator/= for __nv_bfloat
-#else
-    val = cast_to_wrapper(
-              cast_from_wrapper<float>(*this) / cast_from_wrapper<float>(rhs),
-              val)
-              .val;
-#endif
-    return *this;
-  }
-
-  KOKKOS_FUNCTION
-  void operator/=(const volatile floating_point_wrapper& rhs) volatile {
-    floating_point_wrapper tmp_rhs = rhs;
-    floating_point_wrapper tmp_lhs = *this;
-
-    tmp_lhs /= tmp_rhs;
-    *this = tmp_lhs;
-  }
-
-  // Compund operators: upcast overloads for /=
-  template <class T>
-  KOKKOS_FUNCTION friend std::enable_if_t<
-      std::is_same<T, float>::value || std::is_same<T, double>::value, T>
-  operator/=(T& lhs, floating_point_wrapper rhs) {
-    lhs /= static_cast<T>(rhs);
-    return lhs;
-  }
-
-  KOKKOS_FUNCTION
-  floating_point_wrapper& operator/=(float rhs) {
-    float result = static_cast<float>(val) / rhs;
-    val          = static_cast<impl_type>(result);
-    return *this;
-  }
-
-  KOKKOS_FUNCTION
-  floating_point_wrapper& operator/=(double rhs) {
-    double result = static_cast<double>(val) / rhs;
-    val           = static_cast<impl_type>(result);
-    return *this;
-  }
-
-  // Binary Arithmetic
-  KOKKOS_FUNCTION
-  friend floating_point_wrapper operator+(floating_point_wrapper lhs,
-                                          floating_point_wrapper rhs) {
-#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
-    lhs += rhs;
-#else
-    lhs.val = cast_to_wrapper(
-                  cast_from_wrapper<float>(lhs) + cast_from_wrapper<float>(rhs),
-                  lhs.val)
-                  .val;
-#endif
-    return lhs;
-  }
-
-  // Binary Arithmetic upcast operators for +
-  template <class T>
-  KOKKOS_FUNCTION friend std::enable_if_t<
-      std::is_same<T, float>::value || std::is_same<T, double>::value, T>
-  operator+(floating_point_wrapper lhs, T rhs) {
-    return T(lhs) + rhs;
-  }
-
-  template <class T>
-  KOKKOS_FUNCTION friend std::enable_if_t<
-      std::is_same<T, float>::value || std::is_same<T, double>::value, T>
-  operator+(T lhs, floating_point_wrapper rhs) {
-    return lhs + T(rhs);
-  }
-
-  KOKKOS_FUNCTION
-  friend floating_point_wrapper operator-(floating_point_wrapper lhs,
-                                          floating_point_wrapper rhs) {
-#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
-    lhs -= rhs;
-#else
-    lhs.val = cast_to_wrapper(
-                  cast_from_wrapper<float>(lhs) - cast_from_wrapper<float>(rhs),
-                  lhs.val)
-                  .val;
-#endif
-    return lhs;
-  }
-
-  // Binary Arithmetic upcast operators for -
-  template <class T>
-  KOKKOS_FUNCTION friend std::enable_if_t<
-      std::is_same<T, float>::value || std::is_same<T, double>::value, T>
-  operator-(floating_point_wrapper lhs, T rhs) {
-    return T(lhs) - rhs;
-  }
-
-  template <class T>
-  KOKKOS_FUNCTION friend std::enable_if_t<
-      std::is_same<T, float>::value || std::is_same<T, double>::value, T>
-  operator-(T lhs, floating_point_wrapper rhs) {
-    return lhs - T(rhs);
-  }
-
-  KOKKOS_FUNCTION
-  friend floating_point_wrapper operator*(floating_point_wrapper lhs,
-                                          floating_point_wrapper rhs) {
-#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
-    lhs *= rhs;
-#else
-    lhs.val = cast_to_wrapper(
-                  cast_from_wrapper<float>(lhs) * cast_from_wrapper<float>(rhs),
-                  lhs.val)
-                  .val;
-#endif
-    return lhs;
-  }
-
-  // Binary Arithmetic upcast operators for *
-  template <class T>
-  KOKKOS_FUNCTION friend std::enable_if_t<
-      std::is_same<T, float>::value || std::is_same<T, double>::value, T>
-  operator*(floating_point_wrapper lhs, T rhs) {
-    return T(lhs) * rhs;
-  }
-
-  template <class T>
-  KOKKOS_FUNCTION friend std::enable_if_t<
-      std::is_same<T, float>::value || std::is_same<T, double>::value, T>
-  operator*(T lhs, floating_point_wrapper rhs) {
-    return lhs * T(rhs);
-  }
-
-  KOKKOS_FUNCTION
-  friend floating_point_wrapper operator/(floating_point_wrapper lhs,
-                                          floating_point_wrapper rhs) {
-#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
-    lhs /= rhs;
-#else
-    lhs.val = cast_to_wrapper(
-                  cast_from_wrapper<float>(lhs) / cast_from_wrapper<float>(rhs),
-                  lhs.val)
-                  .val;
-#endif
-    return lhs;
-  }
-
-  // Binary Arithmetic upcast operators for /
-  template <class T>
-  KOKKOS_FUNCTION friend std::enable_if_t<
-      std::is_same<T, float>::value || std::is_same<T, double>::value, T>
-  operator/(floating_point_wrapper lhs, T rhs) {
-    return T(lhs) / rhs;
-  }
-
-  template <class T>
-  KOKKOS_FUNCTION friend std::enable_if_t<
-      std::is_same<T, float>::value || std::is_same<T, double>::value, T>
-  operator/(T lhs, floating_point_wrapper rhs) {
-    return lhs / T(rhs);
-  }
-
-  // Logical operators
-  KOKKOS_FUNCTION
-  bool operator!() const {
-#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
-    return static_cast<bool>(!val);
-#else
-    return !cast_from_wrapper<float>(*this);
-#endif
-  }
-
-  // NOTE: Loses short-circuit evaluation
-  KOKKOS_FUNCTION
-  bool operator&&(floating_point_wrapper rhs) const {
-#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
-    return static_cast<bool>(val && rhs.val);
-#else
-    return cast_from_wrapper<float>(*this) && cast_from_wrapper<float>(rhs);
-#endif
-  }
-
-  // NOTE: Loses short-circuit evaluation
-  KOKKOS_FUNCTION
-  bool operator||(floating_point_wrapper rhs) const {
-#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
-    return static_cast<bool>(val || rhs.val);
-#else
-    return cast_from_wrapper<float>(*this) || cast_from_wrapper<float>(rhs);
-#endif
-  }
-
-  // Comparison operators
-  KOKKOS_FUNCTION
-  bool operator==(floating_point_wrapper rhs) const {
-#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
-    return static_cast<bool>(val == rhs.val);
-#else
-    return cast_from_wrapper<float>(*this) == cast_from_wrapper<float>(rhs);
-#endif
-  }
-
-  KOKKOS_FUNCTION
-  bool operator!=(floating_point_wrapper rhs) const {
-#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
-    return static_cast<bool>(val != rhs.val);
-#else
-    return cast_from_wrapper<float>(*this) != cast_from_wrapper<float>(rhs);
-#endif
-  }
-
-  KOKKOS_FUNCTION
-  bool operator<(floating_point_wrapper rhs) const {
-#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
-    return static_cast<bool>(val < rhs.val);
-#else
-    return cast_from_wrapper<float>(*this) < cast_from_wrapper<float>(rhs);
-#endif
-  }
-
-  KOKKOS_FUNCTION
-  bool operator>(floating_point_wrapper rhs) const {
-#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
-    return static_cast<bool>(val > rhs.val);
-#else
-    return cast_from_wrapper<float>(*this) > cast_from_wrapper<float>(rhs);
-#endif
-  }
-
-  KOKKOS_FUNCTION
-  bool operator<=(floating_point_wrapper rhs) const {
-#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
-    return static_cast<bool>(val <= rhs.val);
-#else
-    return cast_from_wrapper<float>(*this) <= cast_from_wrapper<float>(rhs);
-#endif
-  }
-
-  KOKKOS_FUNCTION
-  bool operator>=(floating_point_wrapper rhs) const {
-#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
-    return static_cast<bool>(val >= rhs.val);
-#else
-    return cast_from_wrapper<float>(*this) >= cast_from_wrapper<float>(rhs);
-#endif
-  }
-
-  KOKKOS_FUNCTION
-  friend bool operator==(const volatile floating_point_wrapper& lhs,
-                         const volatile floating_point_wrapper& rhs) {
-    floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs;
-    return tmp_lhs == tmp_rhs;
-  }
-
-  KOKKOS_FUNCTION
-  friend bool operator!=(const volatile floating_point_wrapper& lhs,
-                         const volatile floating_point_wrapper& rhs) {
-    floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs;
-    return tmp_lhs != tmp_rhs;
-  }
-
-  KOKKOS_FUNCTION
-  friend bool operator<(const volatile floating_point_wrapper& lhs,
-                        const volatile floating_point_wrapper& rhs) {
-    floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs;
-    return tmp_lhs < tmp_rhs;
-  }
-
-  KOKKOS_FUNCTION
-  friend bool operator>(const volatile floating_point_wrapper& lhs,
-                        const volatile floating_point_wrapper& rhs) {
-    floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs;
-    return tmp_lhs > tmp_rhs;
-  }
-
-  KOKKOS_FUNCTION
-  friend bool operator<=(const volatile floating_point_wrapper& lhs,
-                         const volatile floating_point_wrapper& rhs) {
-    floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs;
-    return tmp_lhs <= tmp_rhs;
-  }
-
-  KOKKOS_FUNCTION
-  friend bool operator>=(const volatile floating_point_wrapper& lhs,
-                         const volatile floating_point_wrapper& rhs) {
-    floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs;
-    return tmp_lhs >= tmp_rhs;
-  }
-
-  // Insertion and extraction operators
-  friend std::ostream& operator<<(std::ostream& os,
-                                  const floating_point_wrapper& x) {
-    const std::string out = std::to_string(static_cast<double>(x));
-    os << out;
-    return os;
-  }
-
-  friend std::istream& operator>>(std::istream& is, floating_point_wrapper& x) {
-    std::string in;
-    is >> in;
-    x = std::stod(in);
-    return is;
-  }
-};
-}  // namespace Impl
-
-// Declare wrapper overloads now that floating_point_wrapper is declared
-template <class T>
-static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::half_t cast_to_wrapper(
-    T x, const volatile Kokkos::Impl::half_impl_t::type&) {
-  return Kokkos::Experimental::cast_to_half(x);
-}
-
-#ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED
-template <class T>
-static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::bhalf_t cast_to_wrapper(
-    T x, const volatile Kokkos::Impl::bhalf_impl_t::type&) {
-  return Kokkos::Experimental::cast_to_bhalf(x);
-}
-#endif  // KOKKOS_IMPL_BHALF_TYPE_DEFINED
-
-template <class T>
-static KOKKOS_INLINE_FUNCTION T
-cast_from_wrapper(const Kokkos::Experimental::half_t& x) {
-  return Kokkos::Experimental::cast_from_half<T>(x);
-}
-
-#ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED
-template <class T>
-static KOKKOS_INLINE_FUNCTION T
-cast_from_wrapper(const Kokkos::Experimental::bhalf_t& x) {
-  return Kokkos::Experimental::cast_from_bhalf<T>(x);
-}
-#endif  // KOKKOS_IMPL_BHALF_TYPE_DEFINED
-
-}  // namespace Experimental
-}  // namespace Kokkos
-
-#endif  // KOKKOS_IMPL_HALF_TYPE_DEFINED
-
-// If none of the above actually did anything and defined a half precision type
-// define a fallback implementation here using float
-#ifndef KOKKOS_IMPL_HALF_TYPE_DEFINED
-#define KOKKOS_IMPL_HALF_TYPE_DEFINED
-#define KOKKOS_HALF_T_IS_FLOAT true
-namespace Kokkos {
-namespace Impl {
-struct half_impl_t {
-  using type = float;
-};
-}  // namespace Impl
-namespace Experimental {
-
-using half_t = Kokkos::Impl::half_impl_t::type;
-
-// cast_to_half
-KOKKOS_INLINE_FUNCTION
-half_t cast_to_half(float val) { return half_t(val); }
-KOKKOS_INLINE_FUNCTION
-half_t cast_to_half(bool val) { return half_t(val); }
-KOKKOS_INLINE_FUNCTION
-half_t cast_to_half(double val) { return half_t(val); }
-KOKKOS_INLINE_FUNCTION
-half_t cast_to_half(short val) { return half_t(val); }
-KOKKOS_INLINE_FUNCTION
-half_t cast_to_half(unsigned short val) { return half_t(val); }
-KOKKOS_INLINE_FUNCTION
-half_t cast_to_half(int val) { return half_t(val); }
-KOKKOS_INLINE_FUNCTION
-half_t cast_to_half(unsigned int val) { return half_t(val); }
-KOKKOS_INLINE_FUNCTION
-half_t cast_to_half(long val) { return half_t(val); }
-KOKKOS_INLINE_FUNCTION
-half_t cast_to_half(unsigned long val) { return half_t(val); }
-KOKKOS_INLINE_FUNCTION
-half_t cast_to_half(long long val) { return half_t(val); }
-KOKKOS_INLINE_FUNCTION
-half_t cast_to_half(unsigned long long val) { return half_t(val); }
-
-// cast_from_half
-// Using an explicit list here too, since the other ones are explicit and for
-// example don't include char
-template <class T>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<
-    std::is_same<T, float>::value || std::is_same<T, bool>::value ||
-        std::is_same<T, double>::value || std::is_same<T, short>::value ||
-        std::is_same<T, unsigned short>::value || std::is_same<T, int>::value ||
-        std::is_same<T, unsigned int>::value || std::is_same<T, long>::value ||
-        std::is_same<T, unsigned long>::value ||
-        std::is_same<T, long long>::value ||
-        std::is_same<T, unsigned long long>::value,
-    T>
-cast_from_half(half_t val) {
-  return T(val);
-}
-
-}  // namespace Experimental
-}  // namespace Kokkos
-
-#else
-#define KOKKOS_HALF_T_IS_FLOAT false
-#endif  // KOKKOS_IMPL_HALF_TYPE_DEFINED
-
-#ifndef KOKKOS_IMPL_BHALF_TYPE_DEFINED
-#define KOKKOS_IMPL_BHALF_TYPE_DEFINED
-#define KOKKOS_BHALF_T_IS_FLOAT true
-namespace Kokkos {
-namespace Impl {
-struct bhalf_impl_t {
-  using type = float;
-};
-}  // namespace Impl
-
-namespace Experimental {
-
-using bhalf_t = Kokkos::Impl::bhalf_impl_t::type;
-
-// cast_to_bhalf
-KOKKOS_INLINE_FUNCTION
-bhalf_t cast_to_bhalf(float val) { return bhalf_t(val); }
-KOKKOS_INLINE_FUNCTION
-bhalf_t cast_to_bhalf(bool val) { return bhalf_t(val); }
-KOKKOS_INLINE_FUNCTION
-bhalf_t cast_to_bhalf(double val) { return bhalf_t(val); }
-KOKKOS_INLINE_FUNCTION
-bhalf_t cast_to_bhalf(short val) { return bhalf_t(val); }
-KOKKOS_INLINE_FUNCTION
-bhalf_t cast_to_bhalf(unsigned short val) { return bhalf_t(val); }
-KOKKOS_INLINE_FUNCTION
-bhalf_t cast_to_bhalf(int val) { return bhalf_t(val); }
-KOKKOS_INLINE_FUNCTION
-bhalf_t cast_to_bhalf(unsigned int val) { return bhalf_t(val); }
-KOKKOS_INLINE_FUNCTION
-bhalf_t cast_to_bhalf(long val) { return bhalf_t(val); }
-KOKKOS_INLINE_FUNCTION
-bhalf_t cast_to_bhalf(unsigned long val) { return bhalf_t(val); }
-KOKKOS_INLINE_FUNCTION
-bhalf_t cast_to_bhalf(long long val) { return bhalf_t(val); }
-KOKKOS_INLINE_FUNCTION
-bhalf_t cast_to_bhalf(unsigned long long val) { return bhalf_t(val); }
-
-// cast_from_bhalf
-template <class T>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<
-    std::is_same<T, float>::value || std::is_same<T, bool>::value ||
-        std::is_same<T, double>::value || std::is_same<T, short>::value ||
-        std::is_same<T, unsigned short>::value || std::is_same<T, int>::value ||
-        std::is_same<T, unsigned int>::value || std::is_same<T, long>::value ||
-        std::is_same<T, unsigned long>::value ||
-        std::is_same<T, long long>::value ||
-        std::is_same<T, unsigned long long>::value,
-    T>
-cast_from_bhalf(bhalf_t val) {
-  return T(val);
-}
-}  // namespace Experimental
-}  // namespace Kokkos
-#else
-#define KOKKOS_BHALF_T_IS_FLOAT false
-#endif  // KOKKOS_IMPL_BHALF_TYPE_DEFINED
 #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_HALF
 #undef KOKKOS_IMPL_PUBLIC_INCLUDE
 #undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_HALF
diff --git a/packages/kokkos/core/src/Kokkos_HostSpace.hpp b/packages/kokkos/core/src/Kokkos_HostSpace.hpp
index 4b839aca0568931f8d1fafc705ea7fad8de67c4b..90d140406374662bd005bf8959b61d7232e15d1a 100644
--- a/packages/kokkos/core/src/Kokkos_HostSpace.hpp
+++ b/packages/kokkos/core/src/Kokkos_HostSpace.hpp
@@ -41,37 +41,6 @@ static_assert(false,
 
 /*--------------------------------------------------------------------------*/
 
-namespace Kokkos {
-
-namespace Impl {
-
-/// \brief Initialize lock array for arbitrary size atomics.
-///
-/// Arbitrary atomics are implemented using a hash table of locks
-/// where the hash value is derived from the address of the
-/// object for which an atomic operation is performed.
-/// This function initializes the locks to zero (unset).
-void init_lock_array_host_space();
-
-/// \brief Acquire a lock for the address
-///
-/// This function tries to acquire the lock for the hash value derived
-/// from the provided ptr. If the lock is successfully acquired the
-/// function returns true. Otherwise it returns false.
-bool lock_address_host_space(void* ptr);
-
-/// \brief Release lock for the address
-///
-/// This function releases the lock for the hash value derived
-/// from the provided ptr. This function should only be called
-/// after previously successfully acquiring a lock with
-/// lock_address.
-void unlock_address_host_space(void* ptr);
-
-}  // namespace Impl
-
-}  // namespace Kokkos
-
 namespace Kokkos {
 /// \class HostSpace
 /// \brief Memory management for host memory.
@@ -95,25 +64,27 @@ class HostSpace {
   //! This memory space preferred device_type
   using device_type = Kokkos::Device<execution_space, memory_space>;
 
-  /**\brief  Default memory space instance */
-  HostSpace();
+  HostSpace()                     = default;
   HostSpace(HostSpace&& rhs)      = default;
   HostSpace(const HostSpace& rhs) = default;
   HostSpace& operator=(HostSpace&&) = default;
   HostSpace& operator=(const HostSpace&) = default;
   ~HostSpace()                           = default;
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
   /**\brief  Non-default memory space instance to choose allocation mechansim,
    * if available */
 
-  enum AllocationMechanism {
+  enum KOKKOS_DEPRECATED AllocationMechanism {
     STD_MALLOC,
     POSIX_MEMALIGN,
     POSIX_MMAP,
     INTEL_MM_ALLOC
   };
 
+  KOKKOS_DEPRECATED
   explicit HostSpace(const AllocationMechanism&);
+#endif
 
   /**\brief  Allocate untracked memory in the space */
   void* allocate(const size_t arg_alloc_size) const;
@@ -145,7 +116,6 @@ class HostSpace {
   static constexpr const char* name() { return m_name; }
 
  private:
-  AllocationMechanism m_alloc_mech;
   static constexpr const char* m_name = "Host";
   friend class Kokkos::Impl::SharedAllocationRecord<Kokkos::HostSpace, void>;
 };
@@ -218,7 +188,7 @@ class SharedAllocationRecord<Kokkos::HostSpace, void>
   static RecordBase s_root_record;
 #endif
 
-  const Kokkos::HostSpace m_space;
+  Kokkos::HostSpace m_space;
 
  protected:
   ~SharedAllocationRecord();
@@ -272,27 +242,6 @@ namespace Kokkos {
 
 namespace Impl {
 
-template <class DT, class... DP>
-struct ZeroMemset<typename HostSpace::execution_space, DT, DP...> {
-  ZeroMemset(const typename HostSpace::execution_space& exec,
-             const View<DT, DP...>& dst,
-             typename View<DT, DP...>::const_value_type&) {
-    // Host spaces, except for HPX, are synchronous and we need to fence for HPX
-    // since we can't properly enqueue a std::memset otherwise.
-    // We can't use exec.fence() directly since we don't have a full definition
-    // of HostSpace here.
-    hostspace_fence(exec);
-    using ValueType = typename View<DT, DP...>::value_type;
-    std::memset(dst.data(), 0, sizeof(ValueType) * dst.size());
-  }
-
-  ZeroMemset(const View<DT, DP...>& dst,
-             typename View<DT, DP...>::const_value_type&) {
-    using ValueType = typename View<DT, DP...>::value_type;
-    std::memset(dst.data(), 0, sizeof(ValueType) * dst.size());
-  }
-};
-
 template <>
 struct DeepCopy<HostSpace, HostSpace, DefaultHostExecutionSpace> {
   DeepCopy(void* dst, const void* src, size_t n) {
diff --git a/packages/kokkos/core/src/Kokkos_Macros.hpp b/packages/kokkos/core/src/Kokkos_Macros.hpp
index 289bfd7ddaebe9aa246d49f1db15e55b3a0e0a69..3cf7ac4fa24b666ca983e5e8aa716007fd7efba4 100644
--- a/packages/kokkos/core/src/Kokkos_Macros.hpp
+++ b/packages/kokkos/core/src/Kokkos_Macros.hpp
@@ -55,6 +55,7 @@
 
 #ifndef KOKKOS_DONT_INCLUDE_CORE_CONFIG_H
 #include <KokkosCore_config.h>
+#include <impl/Kokkos_NvidiaGpuArchitectures.hpp>
 #endif
 
 //----------------------------------------------------------------------------
@@ -65,19 +66,13 @@
  *  KOKKOS_COMPILER_NVCC
  *  KOKKOS_COMPILER_GNU
  *  KOKKOS_COMPILER_INTEL
+ *  KOKKOS_COMPILER_INTEL_LLVM
  *  KOKKOS_COMPILER_CRAYC
  *  KOKKOS_COMPILER_APPLECC
  *  KOKKOS_COMPILER_CLANG
  *  KOKKOS_COMPILER_NVHPC
  *  KOKKOS_COMPILER_MSVC
  *
- *  Macros for which compiler extension to use for atomics on intrinsic types
- *
- *  KOKKOS_ENABLE_CUDA_ATOMICS
- *  KOKKOS_ENABLE_GNU_ATOMICS
- *  KOKKOS_ENABLE_INTEL_ATOMICS
- *  KOKKOS_ENABLE_OPENMP_ATOMICS
- *
  *  A suite of 'KOKKOS_ENABLE_PRAGMA_...' are defined for internal use.
  *
  *  Macros for marking functions to run in an execution space:
@@ -129,46 +124,46 @@
 
 #if defined(__INTEL_COMPILER)
 #define KOKKOS_COMPILER_INTEL __INTEL_COMPILER
+
 #elif defined(__INTEL_LLVM_COMPILER)
-#define KOKKOS_COMPILER_INTEL __INTEL_LLVM_COMPILER
-#elif defined(__ICC)
-// Old define
-#define KOKKOS_COMPILER_INTEL __ICC
-#elif defined(__ECC)
-// Very old define
-#define KOKKOS_COMPILER_INTEL __ECC
-#endif
+#define KOKKOS_COMPILER_INTEL_LLVM __INTEL_LLVM_COMPILER
+
+// Cray compiler for device offload code
+#elif defined(__cray__) && defined(__clang__)
+#define KOKKOS_COMPILER_CRAY_LLVM \
+  __cray_major__ * 100 + __cray_minor__ * 10 + __cray_patchlevel__
 
+#elif defined(_CRAYC)
 // CRAY compiler for host code
-#if defined(_CRAYC)
 #define KOKKOS_COMPILER_CRAYC _CRAYC
-#endif
 
-#if defined(__APPLE_CC__)
+#elif defined(__APPLE_CC__)
 #define KOKKOS_COMPILER_APPLECC __APPLE_CC__
-#endif
 
-#if defined(__clang__) && !defined(KOKKOS_COMPILER_INTEL)
+#elif defined(__NVCOMPILER)
+#define KOKKOS_COMPILER_NVHPC                                 \
+  __NVCOMPILER_MAJOR__ * 10000 + __NVCOMPILER_MINOR__ * 100 + \
+      __NVCOMPILER_PATCHLEVEL__
+
+#elif defined(__clang__)
+// Check this after the Clang-based proprietary compilers which will also define
+// __clang__
 #define KOKKOS_COMPILER_CLANG \
   __clang_major__ * 100 + __clang_minor__ * 10 + __clang_patchlevel__
-#endif
 
-#if !defined(__clang__) && !defined(KOKKOS_COMPILER_INTEL) && defined(__GNUC__)
+#elif defined(__GNUC__)
+// Check this here because many compilers (at least Clang variants and Intel
+// classic) define `__GNUC__` for compatibility
 #define KOKKOS_COMPILER_GNU \
   __GNUC__ * 100 + __GNUC_MINOR__ * 10 + __GNUC_PATCHLEVEL__
 
-#if (530 > KOKKOS_COMPILER_GNU)
-#error "Compiling with GCC version earlier than 5.3.0 is not supported."
-#endif
+#if (820 > KOKKOS_COMPILER_GNU)
+#error "Compiling with GCC version earlier than 8.2.0 is not supported."
 #endif
 
-#if defined(__NVCOMPILER)
-#define KOKKOS_COMPILER_NVHPC                              \
-  __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ * 10 + \
-      __NVCOMPILER_PATCHLEVEL__
-#endif
-
-#if defined(_MSC_VER) && !defined(KOKKOS_COMPILER_INTEL)
+#elif defined(_MSC_VER)
+// Check this after Intel and Clang because those define _MSC_VER for
+// compatibility
 #define KOKKOS_COMPILER_MSVC _MSC_VER
 #endif
 
@@ -182,16 +177,13 @@
 //----------------------------------------------------------------------------
 // Intel compiler macros
 
-#if defined(KOKKOS_COMPILER_INTEL)
-// FIXME_SYCL
-#if !defined(KOKKOS_ENABLE_SYCL)
+#if defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM)
+#if defined(KOKKOS_COMPILER_INTEL_LLVM) && \
+    KOKKOS_COMPILER_INTEL_LLVM >= 20230100
 #define KOKKOS_ENABLE_PRAGMA_UNROLL 1
 #define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
 #define KOKKOS_ENABLE_PRAGMA_VECTOR 1
-#endif
 
-// FIXME_SYCL
-#if !defined(KOKKOS_ENABLE_SYCL)
 #define KOKKOS_ENABLE_PRAGMA_IVDEP 1
 #endif
 
@@ -213,7 +205,7 @@
 #endif
 #endif
 
-#if (1900 > KOKKOS_COMPILER_INTEL)
+#if defined(KOKKOS_COMPILER_INTEL) && (1900 > KOKKOS_COMPILER_INTEL)
 #error "Compiling with Intel version earlier than 19.0.5 is not supported."
 #endif
 
@@ -231,10 +223,6 @@
 #endif
 #endif
 
-#if defined(KOKKOS_ARCH_AVX512MIC)
-#define KOKKOS_ENABLE_RFO_PREFETCH 1
-#endif
-
 #if defined(__MIC__)
 // Compiling for Xeon Phi
 #endif
@@ -276,10 +264,6 @@
 //#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
 //#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
 
-#if defined(KOKKOS_ARCH_AVX512MIC)
-#define KOKKOS_ENABLE_RFO_PREFETCH 1
-#endif
-
 #if !defined(KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION)
 #define KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION \
   inline __attribute__((always_inline))
@@ -297,7 +281,7 @@
 
 //----------------------------------------------------------------------------
 
-#if defined(KOKKOS_COMPILER_PGI)
+#if defined(KOKKOS_COMPILER_NVHPC)
 #define KOKKOS_ENABLE_PRAGMA_UNROLL 1
 #define KOKKOS_ENABLE_PRAGMA_IVDEP 1
 //#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
@@ -358,7 +342,7 @@
 // Temporary solution for SYCL not supporting printf in kernels.
 // Might disappear at any point once we have found another solution.
 #if !defined(KOKKOS_IMPL_DO_NOT_USE_PRINTF)
-#define KOKKOS_IMPL_DO_NOT_USE_PRINTF(...) printf(__VA_ARGS__)
+#define KOKKOS_IMPL_DO_NOT_USE_PRINTF(...) ::printf(__VA_ARGS__)
 #endif
 
 //----------------------------------------------------------------------------
@@ -524,6 +508,7 @@ static constexpr bool kokkos_omp_on_host() { return false; }
     KOKKOS_IMPL_STRIP_PARENS(CODE)   \
   }
 #else
+#include <openacc.h>
 // FIXME_OPENACC acc_on_device is a non-constexpr function
 #define KOKKOS_IF_ON_DEVICE(CODE)                     \
   if constexpr (acc_on_device(acc_device_not_host)) { \
@@ -600,8 +585,9 @@ static constexpr bool kokkos_omp_on_host() { return false; }
 
 #define KOKKOS_ATTRIBUTE_NODISCARD [[nodiscard]]
 
-#if (defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) ||  \
-     defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_PGI)) && \
+#if (defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) ||        \
+     defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM) || \
+     defined(KOKKOS_COMPILER_NVHPC)) &&                                       \
     !defined(_WIN32) && !defined(__ANDROID__)
 #if __has_include(<execinfo.h>)
 #define KOKKOS_IMPL_ENABLE_STACKTRACE
diff --git a/packages/kokkos/core/src/Kokkos_MathematicalFunctions.hpp b/packages/kokkos/core/src/Kokkos_MathematicalFunctions.hpp
index 5016249edcca8b0952e01a82f55e376773e11ab7..ee64c67b93bd7e61f684a755028fc94bcd836644 100644
--- a/packages/kokkos/core/src/Kokkos_MathematicalFunctions.hpp
+++ b/packages/kokkos/core/src/Kokkos_MathematicalFunctions.hpp
@@ -286,7 +286,7 @@ KOKKOS_INLINE_FUNCTION int abs(int n) {
 }
 KOKKOS_INLINE_FUNCTION long abs(long n) {
 // FIXME_NVHPC ptxas fatal   : unresolved extern function 'labs'
-#ifdef KOKKOS_COMPILER_NVHPC
+#if defined(KOKKOS_COMPILER_NVHPC) && KOKKOS_COMPILER_NVHPC < 230700
   return n > 0 ? n : -n;
 #else
   using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs;
@@ -295,7 +295,7 @@ KOKKOS_INLINE_FUNCTION long abs(long n) {
 }
 KOKKOS_INLINE_FUNCTION long long abs(long long n) {
 // FIXME_NVHPC ptxas fatal   : unresolved extern function 'labs'
-#ifdef KOKKOS_COMPILER_NVHPC
+#if defined(KOKKOS_COMPILER_NVHPC) && KOKKOS_COMPILER_NVHPC < 230700
   return n > 0 ? n : -n;
 #else
   using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs;
@@ -345,9 +345,7 @@ KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED(
 // Exponential functions
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(exp)
 // FIXME_NVHPC nvc++ has issues with exp2
-#ifndef KOKKOS_COMPILER_NVHPC
-KOKKOS_IMPL_MATH_UNARY_FUNCTION(exp2)
-#else
+#if defined(KOKKOS_COMPILER_NVHPC) && KOKKOS_COMPILER_NVHPC < 230700
 KOKKOS_INLINE_FUNCTION float exp2(float val) {
   constexpr float ln2 = 0.693147180559945309417232121458176568L;
   return exp(ln2 * val);
@@ -365,6 +363,8 @@ KOKKOS_INLINE_FUNCTION double exp2(T val) {
   constexpr double ln2 = 0.693147180559945309417232121458176568L;
   return exp(ln2 * static_cast<double>(val));
 }
+#else
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(exp2)
 #endif
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(expm1)
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(log)
@@ -485,6 +485,38 @@ KOKKOS_IMPL_MATH_UNARY_PREDICATE(signbit)
 #undef KOKKOS_IMPL_MATH_BINARY_FUNCTION
 #undef KOKKOS_IMPL_MATH_TERNARY_FUNCTION
 
+// non-standard math functions provided by CUDA/HIP/SYCL
+KOKKOS_INLINE_FUNCTION float rsqrt(float val) {
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
+  KOKKOS_IF_ON_DEVICE(return ::rsqrtf(val);)
+  KOKKOS_IF_ON_HOST(return 1.0f / Kokkos::sqrt(val);)
+#elif defined(KOKKOS_ENABLE_SYCL)
+  KOKKOS_IF_ON_DEVICE(return sycl::rsqrt(val);)
+  KOKKOS_IF_ON_HOST(return 1.0f / Kokkos::sqrt(val);)
+#else
+  return 1.0f / Kokkos::sqrt(val);
+#endif
+}
+KOKKOS_INLINE_FUNCTION double rsqrt(double val) {
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
+  KOKKOS_IF_ON_DEVICE(return ::rsqrt(val);)
+  KOKKOS_IF_ON_HOST(return 1.0 / Kokkos::sqrt(val);)
+#elif defined(KOKKOS_ENABLE_SYCL)
+  KOKKOS_IF_ON_DEVICE(return sycl::rsqrt(val);)
+  KOKKOS_IF_ON_HOST(return 1.0 / Kokkos::sqrt(val);)
+#else
+  return 1.0 / Kokkos::sqrt(val);
+#endif
+}
+inline long double rsqrt(long double val) { return 1.0l / Kokkos::sqrt(val); }
+KOKKOS_INLINE_FUNCTION float rsqrtf(float x) { return Kokkos::rsqrt(x); }
+inline long double rsqrtl(long double x) { return Kokkos::rsqrt(x); }
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_integral_v<T>, double> rsqrt(
+    T x) {
+  return Kokkos::rsqrt(static_cast<double>(x));
+}
+
 }  // namespace Kokkos
 
 #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_MATHFUNCTIONS
diff --git a/packages/kokkos/core/src/Kokkos_MathematicalSpecialFunctions.hpp b/packages/kokkos/core/src/Kokkos_MathematicalSpecialFunctions.hpp
index 63c2b58ef59ad00031be9d69c12cab303ec0ad17..2118a0ad9319225fbe5b7a10b66e6703447f2b35 100644
--- a/packages/kokkos/core/src/Kokkos_MathematicalSpecialFunctions.hpp
+++ b/packages/kokkos/core/src/Kokkos_MathematicalSpecialFunctions.hpp
@@ -846,69 +846,52 @@ KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_y1(const CmplxType& z,
 //! for a complex argument
 template <class CmplxType, class RealType, class IntType>
 KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_i0(const CmplxType& z,
-                                               const RealType& joint_val = 25,
-                                               const IntType& bw_start   = 70) {
+                                               const RealType& joint_val = 18,
+                                               const IntType& n_terms    = 50) {
   // This function is converted and modified from the corresponding Fortran
-  // programs CIKNB and CIK01 in S. Zhang & J. Jin "Computation of Special
+  // programs CIK01 in S. Zhang & J. Jin "Computation of Special
   // Functions" (Wiley, 1996).
   //    Input :  z         --- Complex argument
   //             joint_val --- Joint point of abs(z) separating small and large
   //                           argument regions
-  //             bw_start  --- Starting point for backward recurrence
+  //             n_terms   --- Numbers of terms used in the power series
   //    Output:  cbi0      --- I0(z)
-  using Kokkos::numbers::pi_v;
-
-  CmplxType cbi0;
-  constexpr auto pi    = pi_v<RealType>;
-  const RealType a[12] = {0.125,
-                          7.03125e-2,
-                          7.32421875e-2,
-                          1.1215209960938e-1,
-                          2.2710800170898e-1,
-                          5.7250142097473e-1,
-                          1.7277275025845e0,
-                          6.0740420012735e0,
-                          2.4380529699556e1,
-                          1.1001714026925e2,
-                          5.5133589612202e2,
-                          3.0380905109224e3};
 
+  CmplxType cbi0(1.0, 0.0);
   RealType a0  = Kokkos::abs(z);
   CmplxType z1 = z;
 
-  if (a0 < 1e-100) {  // Treat z=0 as a special case
-    cbi0 = CmplxType(1.0, 0.0);
-  } else {
+  if (a0 > 1e-100) {
     if (z.real() < 0.0) z1 = -z;
-    if (a0 <= joint_val) {  // Using backward recurrence for |z|<=joint_val
-                            // (default:25)
-      CmplxType cbs = CmplxType(0.0, 0.0);
-      // CmplxType csk0 = CmplxType(0.0,0.0);
-      CmplxType cf0 = CmplxType(0.0, 0.0);
-      CmplxType cf1 = CmplxType(1e-100, 0.0);
-      CmplxType cf, cs0;
-      for (int k = bw_start; k >= 0; k--) {  // Backward recurrence (default:
-                                             // 70)
-        cf = 2.0 * (k + 1.0) * cf1 / z1 + cf0;
-        if (k == 0) cbi0 = cf;
-        // if ((k == 2*(k/2)) && (k != 0)) {
-        //  csk0 = csk0+4.0*cf/static_cast<RealType>(k);
-        //}
-        cbs = cbs + 2.0 * cf;
-        cf0 = cf1;
-        cf1 = cf;
+    if (a0 <= joint_val) {
+      // Using power series definition for |z|<=joint_val (default:18)
+      CmplxType cr = CmplxType(1.0e+00, 0.0e+00);
+      CmplxType z2 = z * z;
+      for (int k = 1; k < n_terms; ++k) {
+        cr = RealType(.25) * cr * z2 / CmplxType(k * k);
+        cbi0 += cr;
+        if (Kokkos::abs(cr / cbi0) < RealType(1.e-15)) continue;
       }
-      cs0  = Kokkos::exp(z1) / (cbs - cf);
-      cbi0 = cbi0 * cs0;
-    } else {  // Using asymptotic expansion (6.2.1) for |z|>joint_val
-              // (default:25)
-      CmplxType ca = Kokkos::exp(z1) / Kokkos::sqrt(2.0 * pi * z1);
-      cbi0         = CmplxType(1.0, 0.0);
-      CmplxType zr = 1.0 / z1;
+    } else {
+      // Using asymptotic expansion (6.2.1) for |z|>joint_val (default:18)
+      const RealType a[12] = {0.125,
+                              7.03125e-2,
+                              7.32421875e-2,
+                              1.1215209960938e-1,
+                              2.2710800170898e-1,
+                              5.7250142097473e-1,
+                              1.7277275025845e0,
+                              6.0740420012735e0,
+                              2.4380529699556e1,
+                              1.1001714026925e2,
+                              5.5133589612202e2,
+                              3.0380905109224e3};
+
       for (int k = 1; k <= 12; k++) {
-        cbi0 = cbi0 + a[k - 1] * Kokkos::pow(zr, 1.0 * k);
+        cbi0 += a[k - 1] * Kokkos::pow(z1, -k);
       }
-      cbi0 = ca * cbi0;
+      cbi0 *= Kokkos::exp(z1) /
+              Kokkos::sqrt(2.0 * Kokkos::numbers::pi_v<RealType> * z1);
     }
   }
   return cbi0;
diff --git a/packages/kokkos/core/src/Kokkos_MemoryPool.hpp b/packages/kokkos/core/src/Kokkos_MemoryPool.hpp
index 6b47c43727675783d1ee22c54e143dac5abdc5f6..ce8c9e152fa3cf6633dd231b1de3e3648e7dcbf7 100644
--- a/packages/kokkos/core/src/Kokkos_MemoryPool.hpp
+++ b/packages/kokkos/core/src/Kokkos_MemoryPool.hpp
@@ -29,8 +29,6 @@ static_assert(false,
 #include <impl/Kokkos_Error.hpp>
 #include <impl/Kokkos_SharedAlloc.hpp>
 
-#include <iostream>
-
 namespace Kokkos {
 namespace Impl {
 /* Report violation of size constraints:
diff --git a/packages/kokkos/core/src/Kokkos_MemoryTraits.hpp b/packages/kokkos/core/src/Kokkos_MemoryTraits.hpp
index 762e1a4a5dc82a65a27436855703605161a3cfd5..c145d04a42b7415fd64d06cd2ae8cdeb5e880df2 100644
--- a/packages/kokkos/core/src/Kokkos_MemoryTraits.hpp
+++ b/packages/kokkos/core/src/Kokkos_MemoryTraits.hpp
@@ -48,17 +48,19 @@ template <unsigned T>
 struct MemoryTraits {
   //! Tag this class as a kokkos memory traits:
   using memory_traits = MemoryTraits<T>;
-  enum : bool {
-    is_unmanaged = (unsigned(0) != (T & unsigned(Kokkos::Unmanaged)))
-  };
-  enum : bool {
-    is_random_access = (unsigned(0) != (T & unsigned(Kokkos::RandomAccess)))
-  };
-  enum : bool { is_atomic = (unsigned(0) != (T & unsigned(Kokkos::Atomic))) };
-  enum : bool {
-    is_restrict = (unsigned(0) != (T & unsigned(Kokkos::Restrict)))
-  };
-  enum : bool { is_aligned = (unsigned(0) != (T & unsigned(Kokkos::Aligned))) };
+
+  static constexpr unsigned impl_value = T;
+
+  static constexpr bool is_unmanaged =
+      (unsigned(0) != (T & unsigned(Kokkos::Unmanaged)));
+  static constexpr bool is_random_access =
+      (unsigned(0) != (T & unsigned(Kokkos::RandomAccess)));
+  static constexpr bool is_atomic =
+      (unsigned(0) != (T & unsigned(Kokkos::Atomic)));
+  static constexpr bool is_restrict =
+      (unsigned(0) != (T & unsigned(Kokkos::Restrict)));
+  static constexpr bool is_aligned =
+      (unsigned(0) != (T & unsigned(Kokkos::Aligned)));
 };
 
 }  // namespace Kokkos
diff --git a/packages/kokkos/core/src/Kokkos_Pair.hpp b/packages/kokkos/core/src/Kokkos_Pair.hpp
index d42e07aa42369b1c8cb45decc1aa7c6238e7033f..7127c78280e2de74ab1e90c67bb0af9bc86ab080 100644
--- a/packages/kokkos/core/src/Kokkos_Pair.hpp
+++ b/packages/kokkos/core/src/Kokkos_Pair.hpp
@@ -62,8 +62,7 @@ struct pair {
   ///
   /// This calls the copy constructors of T1 and T2.  It won't compile
   /// if those copy constructors are not defined and public.
-#ifdef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC bug in NVHPC regarding constexpr
-                              // constructors used in device code
+#if defined(KOKKOS_COMPILER_NVHPC) && KOKKOS_COMPILER_NVHPC < 230700
   KOKKOS_FORCEINLINE_FUNCTION
 #else
   KOKKOS_FORCEINLINE_FUNCTION constexpr
@@ -75,8 +74,7 @@ struct pair {
   /// This calls the copy constructors of T1 and T2.  It won't compile
   /// if those copy constructors are not defined and public.
   template <class U, class V>
-#ifdef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC bug in NVHPC regarding constexpr
-                              // constructors used in device code
+#if defined(KOKKOS_COMPILER_NVHPC) && KOKKOS_COMPILER_NVHPC < 230700
   KOKKOS_FORCEINLINE_FUNCTION
 #else
   KOKKOS_FORCEINLINE_FUNCTION constexpr
diff --git a/packages/kokkos/core/src/Kokkos_Parallel.hpp b/packages/kokkos/core/src/Kokkos_Parallel.hpp
index 7c4207222662800481f4c395021437ee901ad1b2..484f6c0d5f4c02a1f506a88646329d5c46d8b310 100644
--- a/packages/kokkos/core/src/Kokkos_Parallel.hpp
+++ b/packages/kokkos/core/src/Kokkos_Parallel.hpp
@@ -399,7 +399,8 @@ inline void parallel_scan(const std::string& str, const ExecutionPolicy& policy,
 
   if constexpr (Kokkos::is_view<ReturnType>::value) {
     Kokkos::Impl::shared_allocation_tracking_disable();
-    Impl::ParallelScanWithTotal<FunctorType, ExecutionPolicy, ReturnType>
+    Impl::ParallelScanWithTotal<FunctorType, ExecutionPolicy,
+                                typename ReturnType::value_type>
         closure(functor, inner_policy, return_value);
     Kokkos::Impl::shared_allocation_tracking_enable();
     closure.execute();
diff --git a/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp b/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
index d44bd89a9b5ec85d205b835f11cfd6e64ecff79f..d499eba6dcca49a01a420872aa41d97c20d939f5 100644
--- a/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
+++ b/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
@@ -27,7 +27,6 @@ static_assert(false,
 #include <impl/Kokkos_FunctorAnalysis.hpp>
 #include <impl/Kokkos_Tools_Generic.hpp>
 #include <type_traits>
-#include <iostream>
 
 namespace Kokkos {
 
@@ -37,6 +36,7 @@ struct Sum {
   // Required
   using reducer    = Sum<Scalar, Space>;
   using value_type = std::remove_cv_t<Scalar>;
+  static_assert(!std::is_pointer_v<value_type> && !std::is_array_v<value_type>);
 
   using result_view_type = Kokkos::View<value_type, Space>;
 
@@ -81,6 +81,7 @@ struct Prod {
   // Required
   using reducer    = Prod<Scalar, Space>;
   using value_type = std::remove_cv_t<Scalar>;
+  static_assert(!std::is_pointer_v<value_type> && !std::is_array_v<value_type>);
 
   using result_view_type = Kokkos::View<value_type, Space>;
 
@@ -125,6 +126,7 @@ struct Min {
   // Required
   using reducer    = Min<Scalar, Space>;
   using value_type = std::remove_cv_t<Scalar>;
+  static_assert(!std::is_pointer_v<value_type> && !std::is_array_v<value_type>);
 
   using result_view_type = Kokkos::View<value_type, Space>;
 
@@ -171,6 +173,7 @@ struct Max {
   // Required
   using reducer    = Max<Scalar, Space>;
   using value_type = std::remove_cv_t<Scalar>;
+  static_assert(!std::is_pointer_v<value_type> && !std::is_array_v<value_type>);
 
   using result_view_type = Kokkos::View<value_type, Space>;
 
@@ -218,6 +221,7 @@ struct LAnd {
   // Required
   using reducer    = LAnd<Scalar, Space>;
   using value_type = std::remove_cv_t<Scalar>;
+  static_assert(!std::is_pointer_v<value_type> && !std::is_array_v<value_type>);
 
   using result_view_type = Kokkos::View<value_type, Space>;
 
@@ -263,6 +267,7 @@ struct LOr {
   // Required
   using reducer    = LOr<Scalar, Space>;
   using value_type = std::remove_cv_t<Scalar>;
+  static_assert(!std::is_pointer_v<value_type> && !std::is_array_v<value_type>);
 
   using result_view_type = Kokkos::View<value_type, Space>;
 
@@ -309,6 +314,7 @@ struct BAnd {
   // Required
   using reducer    = BAnd<Scalar, Space>;
   using value_type = std::remove_cv_t<Scalar>;
+  static_assert(!std::is_pointer_v<value_type> && !std::is_array_v<value_type>);
 
   using result_view_type = Kokkos::View<value_type, Space>;
 
@@ -355,6 +361,7 @@ struct BOr {
   // Required
   using reducer    = BOr<Scalar, Space>;
   using value_type = std::remove_cv_t<Scalar>;
+  static_assert(!std::is_pointer_v<value_type> && !std::is_array_v<value_type>);
 
   using result_view_type = Kokkos::View<value_type, Space>;
 
@@ -399,12 +406,6 @@ template <class Scalar, class Index>
 struct ValLocScalar {
   Scalar val;
   Index loc;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator=(const ValLocScalar& rhs) {
-    val = rhs.val;
-    loc = rhs.loc;
-  }
 };
 
 template <class Scalar, class Index, class Space>
@@ -412,6 +413,8 @@ struct MinLoc {
  private:
   using scalar_type = std::remove_cv_t<Scalar>;
   using index_type  = std::remove_cv_t<Index>;
+  static_assert(!std::is_pointer_v<scalar_type> &&
+                !std::is_array_v<scalar_type>);
 
  public:
   // Required
@@ -465,6 +468,8 @@ struct MaxLoc {
  private:
   using scalar_type = std::remove_cv_t<Scalar>;
   using index_type  = std::remove_cv_t<Index>;
+  static_assert(!std::is_pointer_v<scalar_type> &&
+                !std::is_array_v<scalar_type>);
 
  public:
   // Required
@@ -516,18 +521,14 @@ MaxLoc(View<ValLocScalar<Scalar, Index>, Properties...> const&)
 template <class Scalar>
 struct MinMaxScalar {
   Scalar min_val, max_val;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator=(const MinMaxScalar& rhs) {
-    min_val = rhs.min_val;
-    max_val = rhs.max_val;
-  }
 };
 
 template <class Scalar, class Space>
 struct MinMax {
  private:
   using scalar_type = std::remove_cv_t<Scalar>;
+  static_assert(!std::is_pointer_v<scalar_type> &&
+                !std::is_array_v<scalar_type>);
 
  public:
   // Required
@@ -584,14 +585,6 @@ template <class Scalar, class Index>
 struct MinMaxLocScalar {
   Scalar min_val, max_val;
   Index min_loc, max_loc;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator=(const MinMaxLocScalar& rhs) {
-    min_val = rhs.min_val;
-    min_loc = rhs.min_loc;
-    max_val = rhs.max_val;
-    max_loc = rhs.max_loc;
-  }
 };
 
 template <class Scalar, class Index, class Space>
@@ -599,6 +592,8 @@ struct MinMaxLoc {
  private:
   using scalar_type = std::remove_cv_t<Scalar>;
   using index_type  = std::remove_cv_t<Index>;
+  static_assert(!std::is_pointer_v<scalar_type> &&
+                !std::is_array_v<scalar_type>);
 
  public:
   // Required
@@ -668,6 +663,9 @@ struct MaxFirstLoc {
  private:
   using scalar_type = std::remove_cv_t<Scalar>;
   using index_type  = std::remove_cv_t<Index>;
+  static_assert(!std::is_pointer_v<scalar_type> &&
+                !std::is_array_v<scalar_type>);
+  static_assert(std::is_integral_v<index_type>);
 
  public:
   // Required
@@ -729,6 +727,9 @@ struct MaxFirstLocCustomComparator {
  private:
   using scalar_type = std::remove_cv_t<Scalar>;
   using index_type  = std::remove_cv_t<Index>;
+  static_assert(!std::is_pointer_v<scalar_type> &&
+                !std::is_array_v<scalar_type>);
+  static_assert(std::is_integral_v<index_type>);
 
  public:
   // Required
@@ -795,6 +796,9 @@ struct MinFirstLoc {
  private:
   using scalar_type = std::remove_cv_t<Scalar>;
   using index_type  = std::remove_cv_t<Index>;
+  static_assert(!std::is_pointer_v<scalar_type> &&
+                !std::is_array_v<scalar_type>);
+  static_assert(std::is_integral_v<index_type>);
 
  public:
   // Required
@@ -856,6 +860,9 @@ struct MinFirstLocCustomComparator {
  private:
   using scalar_type = std::remove_cv_t<Scalar>;
   using index_type  = std::remove_cv_t<Index>;
+  static_assert(!std::is_pointer_v<scalar_type> &&
+                !std::is_array_v<scalar_type>);
+  static_assert(std::is_integral_v<index_type>);
 
  public:
   // Required
@@ -922,6 +929,9 @@ struct MinMaxFirstLastLoc {
  private:
   using scalar_type = std::remove_cv_t<Scalar>;
   using index_type  = std::remove_cv_t<Index>;
+  static_assert(!std::is_pointer_v<scalar_type> &&
+                !std::is_array_v<scalar_type>);
+  static_assert(std::is_integral_v<index_type>);
 
  public:
   // Required
@@ -994,6 +1004,9 @@ struct MinMaxFirstLastLocCustomComparator {
  private:
   using scalar_type = std::remove_cv_t<Scalar>;
   using index_type  = std::remove_cv_t<Index>;
+  static_assert(!std::is_pointer_v<scalar_type> &&
+                !std::is_array_v<scalar_type>);
+  static_assert(std::is_integral_v<index_type>);
 
  public:
   // Required
@@ -1069,15 +1082,13 @@ MinMaxFirstLastLocCustomComparator(
 template <class Index>
 struct FirstLocScalar {
   Index min_loc_true;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator=(const FirstLocScalar& rhs) { min_loc_true = rhs.min_loc_true; }
 };
 
 template <class Index, class Space>
 struct FirstLoc {
  private:
   using index_type = std::remove_cv_t<Index>;
+  static_assert(std::is_integral_v<index_type>);
 
  public:
   // Required
@@ -1132,15 +1143,13 @@ FirstLoc(View<FirstLocScalar<Index>, Properties...> const&)
 template <class Index>
 struct LastLocScalar {
   Index max_loc_true;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator=(const LastLocScalar& rhs) { max_loc_true = rhs.max_loc_true; }
 };
 
 template <class Index, class Space>
 struct LastLoc {
  private:
   using index_type = std::remove_cv_t<Index>;
+  static_assert(std::is_integral_v<index_type>);
 
  public:
   // Required
@@ -1192,12 +1201,6 @@ LastLoc(View<LastLocScalar<Index>, Properties...> const&)
 template <class Index>
 struct StdIsPartScalar {
   Index max_loc_true, min_loc_false;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator=(const StdIsPartScalar& rhs) {
-    min_loc_false = rhs.min_loc_false;
-    max_loc_true  = rhs.max_loc_true;
-  }
 };
 
 //
@@ -1207,6 +1210,7 @@ template <class Index, class Space>
 struct StdIsPartitioned {
  private:
   using index_type = std::remove_cv_t<Index>;
+  static_assert(std::is_integral_v<index_type>);
 
  public:
   // Required
@@ -1264,11 +1268,6 @@ StdIsPartitioned(View<StdIsPartScalar<Index>, Properties...> const&)
 template <class Index>
 struct StdPartPointScalar {
   Index min_loc_false;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator=(const StdPartPointScalar& rhs) {
-    min_loc_false = rhs.min_loc_false;
-  }
 };
 
 //
@@ -1278,6 +1277,7 @@ template <class Index, class Space>
 struct StdPartitionPoint {
  private:
   using index_type = std::remove_cv_t<Index>;
+  static_assert(std::is_integral_v<index_type>);
 
  public:
   // Required
@@ -1331,6 +1331,46 @@ StdPartitionPoint(View<StdPartPointScalar<Index>, Properties...> const&)
 namespace Kokkos {
 namespace Impl {
 
+template <typename FunctorType, typename FunctorAnalysisReducerType,
+          typename Enable>
+class CombinedFunctorReducer {
+ public:
+  using functor_type = FunctorType;
+  using reducer_type = FunctorAnalysisReducerType;
+  CombinedFunctorReducer(const FunctorType& functor,
+                         const FunctorAnalysisReducerType& reducer)
+      : m_functor(functor), m_reducer(reducer) {}
+  KOKKOS_FUNCTION const FunctorType& get_functor() const { return m_functor; }
+  KOKKOS_FUNCTION const FunctorAnalysisReducerType& get_reducer() const {
+    return m_reducer;
+  }
+
+ private:
+  FunctorType m_functor;
+  FunctorAnalysisReducerType m_reducer;
+};
+template <typename FunctorType, typename FunctorAnalysisReducerType>
+class CombinedFunctorReducer<
+    FunctorType, FunctorAnalysisReducerType,
+    std::enable_if_t<std::is_same_v<
+        FunctorType, typename FunctorAnalysisReducerType::functor_type>>> {
+ public:
+  using functor_type = FunctorType;
+  using reducer_type = FunctorAnalysisReducerType;
+  CombinedFunctorReducer(const FunctorType& functor,
+                         const FunctorAnalysisReducerType&)
+      : m_reducer(functor) {}
+  KOKKOS_FUNCTION const FunctorType& get_functor() const {
+    return m_reducer.get_functor();
+  }
+  KOKKOS_FUNCTION const FunctorAnalysisReducerType& get_reducer() const {
+    return m_reducer;
+  }
+
+ private:
+  FunctorAnalysisReducerType m_reducer;
+};
+
 template <class T, class ReturnType, class ValueTraits>
 struct ParallelReduceReturnValue;
 
@@ -1396,12 +1436,12 @@ template <class ReturnType, class FunctorType>
 struct ParallelReduceReturnValue<
     std::enable_if_t<Kokkos::is_reducer<ReturnType>::value>, ReturnType,
     FunctorType> {
-  using return_type  = ReturnType;
+  using return_type  = typename ReturnType::result_view_type;
   using reducer_type = ReturnType;
   using value_type   = typename return_type::value_type;
 
-  static return_type return_value(ReturnType& return_val, const FunctorType&) {
-    return return_val;
+  static auto return_value(ReturnType& return_val, const FunctorType&) {
+    return return_val.view();
   }
 };
 
@@ -1449,29 +1489,41 @@ struct ParallelReduceAdaptor {
                                   const PolicyType& policy,
                                   const FunctorType& functor,
                                   ReturnType& return_value) {
-    uint64_t kpID = 0;
+    using PassedReducerType = typename return_value_adapter::reducer_type;
+    uint64_t kpID           = 0;
 
     PolicyType inner_policy = policy;
-    Kokkos::Tools::Impl::begin_parallel_reduce<
-        typename return_value_adapter::reducer_type>(inner_policy, functor,
-                                                     label, kpID);
-
+    Kokkos::Tools::Impl::begin_parallel_reduce<PassedReducerType>(
+        inner_policy, functor, label, kpID);
+
+    using ReducerSelector =
+        Kokkos::Impl::if_c<std::is_same<InvalidType, PassedReducerType>::value,
+                           FunctorType, PassedReducerType>;
+    using Analysis = FunctorAnalysis<FunctorPatternInterface::REDUCE,
+                                     PolicyType, typename ReducerSelector::type,
+                                     typename return_value_adapter::value_type>;
     Kokkos::Impl::shared_allocation_tracking_disable();
-    Impl::ParallelReduce<FunctorType, PolicyType,
-                         typename return_value_adapter::reducer_type>
-        closure(functor, inner_policy,
+    CombinedFunctorReducer functor_reducer(
+        functor, typename Analysis::Reducer(
+                     ReducerSelector::select(functor, return_value)));
+
+    // FIXME Remove "Wrapper" once all backends implement the new interface
+    Impl::ParallelReduce<decltype(functor_reducer), PolicyType,
+                         typename Impl::FunctorPolicyExecutionSpace<
+                             FunctorType, PolicyType>::execution_space>
+        closure(functor_reducer, inner_policy,
                 return_value_adapter::return_value(return_value, functor));
     Kokkos::Impl::shared_allocation_tracking_enable();
     closure.execute();
 
-    Kokkos::Tools::Impl::end_parallel_reduce<
-        typename return_value_adapter::reducer_type>(inner_policy, functor,
-                                                     label, kpID);
+    Kokkos::Tools::Impl::end_parallel_reduce<PassedReducerType>(
+        inner_policy, functor, label, kpID);
   }
 
   static constexpr bool is_array_reduction =
-      Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, PolicyType,
-                            FunctorType>::StaticValueSize == 0;
+      Impl::FunctorAnalysis<
+          Impl::FunctorPatternInterface::REDUCE, PolicyType, FunctorType,
+          typename return_value_adapter::value_type>::StaticValueSize == 0;
 
   template <typename Dummy = ReturnType>
   static inline std::enable_if_t<!(is_array_reduction &&
@@ -1771,7 +1823,7 @@ inline void parallel_reduce(
         nullptr) {
   using FunctorAnalysis =
       Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, PolicyType,
-                            FunctorType>;
+                            FunctorType, void>;
   using value_type = std::conditional_t<(FunctorAnalysis::StaticValueSize != 0),
                                         typename FunctorAnalysis::value_type,
                                         typename FunctorAnalysis::pointer_type>;
@@ -1796,7 +1848,7 @@ inline void parallel_reduce(
         nullptr) {
   using FunctorAnalysis =
       Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, PolicyType,
-                            FunctorType>;
+                            FunctorType, void>;
   using value_type = std::conditional_t<(FunctorAnalysis::StaticValueSize != 0),
                                         typename FunctorAnalysis::value_type,
                                         typename FunctorAnalysis::pointer_type>;
@@ -1821,7 +1873,7 @@ inline void parallel_reduce(const size_t& policy, const FunctorType& functor) {
                                               FunctorType>::policy_type;
   using FunctorAnalysis =
       Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, policy_type,
-                            FunctorType>;
+                            FunctorType, void>;
   using value_type = std::conditional_t<(FunctorAnalysis::StaticValueSize != 0),
                                         typename FunctorAnalysis::value_type,
                                         typename FunctorAnalysis::pointer_type>;
@@ -1848,7 +1900,7 @@ inline void parallel_reduce(const std::string& label, const size_t& policy,
                                               FunctorType>::policy_type;
   using FunctorAnalysis =
       Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, policy_type,
-                            FunctorType>;
+                            FunctorType, void>;
   using value_type = std::conditional_t<(FunctorAnalysis::StaticValueSize != 0),
                                         typename FunctorAnalysis::value_type,
                                         typename FunctorAnalysis::pointer_type>;
diff --git a/packages/kokkos/core/src/Kokkos_Printf.hpp b/packages/kokkos/core/src/Kokkos_Printf.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..39f95825c3822a1a72c69517fd0f928d66cf4c84
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_Printf.hpp
@@ -0,0 +1,54 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_PRINTF_HPP
+#define KOKKOS_PRINTF_HPP
+
+#include <Kokkos_Macros.hpp>
+
+#ifdef KOKKOS_ENABLE_SYCL
+#include <sycl/sycl.hpp>
+#else
+#include <cstdio>
+#endif
+
+namespace Kokkos {
+
+// In contrast to std::printf, return void to get a consistent behavior across
+// backends. The GPU backends always return 1 and NVHPC only compiles if we
+// don't ask for the return value.
+template <typename... Args>
+KOKKOS_FUNCTION void printf(const char* format, Args... args) {
+#ifdef KOKKOS_ENABLE_SYCL
+  // Some compilers warn if "args" is empty and format is not a string literal
+  if constexpr (sizeof...(Args) == 0)
+    sycl::ext::oneapi::experimental::printf("%s", format);
+  else
+    sycl::ext::oneapi::experimental::printf(format, args...);
+#else
+  if constexpr (sizeof...(Args) == 0) ::printf("%s", format);
+    // FIXME_OPENMPTARGET non-string-literal argument used in printf is not
+    // supported for spir64
+#if !(defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU))
+  else
+    ::printf(format, args...);
+#endif
+#endif
+}
+
+}  // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_PRINTF_HPP */
diff --git a/packages/kokkos/core/src/Kokkos_Profiling_ScopedRegion.hpp b/packages/kokkos/core/src/Kokkos_Profiling_ScopedRegion.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f45dfa324e9ff83672ea2cbaf7fcd6216f9f525c
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_Profiling_ScopedRegion.hpp
@@ -0,0 +1,51 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOSP_SCOPED_REGION_HPP
+#define KOKKOSP_SCOPED_REGION_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_PROFILING_SCOPEDREGION
+#endif
+
+#include <Kokkos_Macros.hpp>
+#include <impl/Kokkos_Profiling.hpp>
+
+#include <string>
+
+namespace Kokkos::Profiling {
+
+class [[nodiscard]] ScopedRegion {
+ public:
+  ScopedRegion(ScopedRegion const &) = delete;
+  ScopedRegion &operator=(ScopedRegion const &) = delete;
+
+#if defined(__has_cpp_attribute) && __has_cpp_attribute(nodiscard) >= 201907
+  [[nodiscard]]
+#endif
+  explicit ScopedRegion(std::string const &name) {
+    Kokkos::Profiling::pushRegion(name);
+  }
+  ~ScopedRegion() { Kokkos::Profiling::popRegion(); }
+};
+
+}  // namespace Kokkos::Profiling
+
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_CORE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_PROFILING_SCOPEDREGION
+#endif
+#endif
diff --git a/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp b/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp
index a192b77f2802f56f0e7921526d7267c09537036c..a925e32a339e70e74e8de7b75445815179040792 100644
--- a/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp
+++ b/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp
@@ -126,7 +126,7 @@ class ScratchMemorySpace {
       // mfh 23 Jun 2015: printf call consumes 25 registers
       // in a CUDA build, so only print in debug mode.  The
       // function still returns nullptr if not enough memory.
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+      Kokkos::printf(
           "ScratchMemorySpace<...>::get_shmem: Failed to allocate "
           "%ld byte(s); remaining capacity is %ld byte(s)\n",
           long(size), long(capacity));
diff --git a/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp b/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp
index 690c845b303316f6e40656eeca363a448e114857..869a5f8ec26a99e21be0404d3b60056bdd027775 100644
--- a/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp
+++ b/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp
@@ -347,9 +347,8 @@ class BasicTaskScheduler : public Impl::TaskSchedulerBase {
         if (nullptr != t) {
           // Increment reference count to track subsequent assignment.
           // This likely has to be SeqCst
-          Kokkos::Impl::desul_atomic_inc(&(t->m_ref_count),
-                                         Kokkos::Impl::MemoryOrderSeqCst(),
-                                         Kokkos::Impl::MemoryScopeDevice());
+          desul::atomic_inc(&(t->m_ref_count), desul::MemoryOrderSeqCst(),
+                            desul::MemoryScopeDevice());
           if (q != static_cast<queue_type const*>(t->m_queue)) {
             Kokkos::abort(
                 "Kokkos when_all Futures must be in the same scheduler");
@@ -445,9 +444,9 @@ class BasicTaskScheduler : public Impl::TaskSchedulerBase {
           //}
           // Increment reference count to track subsequent assignment.
           // This increment likely has to be SeqCst
-          Kokkos::Impl::desul_atomic_inc(&(arg_f.m_task->m_ref_count),
-                                         Kokkos::Impl::MemoryOrderSeqCst(),
-                                         Kokkos::Impl::MemoryScopeDevice());
+          desul::atomic_inc(&(arg_f.m_task->m_ref_count),
+                            desul::MemoryOrderSeqCst(),
+                            desul::MemoryScopeDevice());
           dep[i] = arg_f.m_task;
         }
       }
diff --git a/packages/kokkos/core/src/Kokkos_View.hpp b/packages/kokkos/core/src/Kokkos_View.hpp
index 85957ba8fa540d88a5786f8328ef0153e2686d0f..bcbb28014cd935acfb7c7919be867f5b769be7ef 100644
--- a/packages/kokkos/core/src/Kokkos_View.hpp
+++ b/packages/kokkos/core/src/Kokkos_View.hpp
@@ -34,6 +34,7 @@ static_assert(false,
 #include <View/Hooks/Kokkos_ViewHooks.hpp>
 
 #include <impl/Kokkos_Tools.hpp>
+#include <impl/Kokkos_Utilities.hpp>
 
 #ifdef KOKKOS_ENABLE_IMPL_MDSPAN
 #include <View/MDSpan/Kokkos_MDSpan_Extents.hpp>
@@ -315,8 +316,8 @@ struct ViewTraits {
       typename prop::specialize,
       typename data_analysis::specialize>; /* mapping specialization tag */
 
-  enum { rank = dimension::rank };
-  enum { rank_dynamic = dimension::rank_dynamic };
+  static constexpr unsigned rank         = dimension::rank;
+  static constexpr unsigned rank_dynamic = dimension::rank_dynamic;
 
   //------------------------------------
   // Execution space, memory space, memory access traits, and host mirror space.
@@ -494,17 +495,21 @@ constexpr bool is_assignable(const Kokkos::View<ViewTDst...>& dst,
 
 namespace Kokkos {
 
-namespace {
+// FIXME_OPENMPTARGET - The `declare target` is needed for the Intel GPUs with
+// the OpenMPTarget backend
+#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL_LLVM)
+#pragma omp declare target
+#endif
 
-constexpr Kokkos::Impl::ALL_t ALL = Kokkos::Impl::ALL_t();
+inline constexpr Kokkos::ALL_t ALL{};
 
-constexpr Kokkos::Impl::WithoutInitializing_t WithoutInitializing =
-    Kokkos::Impl::WithoutInitializing_t();
+#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL_LLVM)
+#pragma omp end declare target
+#endif
 
-constexpr Kokkos::Impl::AllowPadding_t AllowPadding =
-    Kokkos::Impl::AllowPadding_t();
+inline constexpr Kokkos::Impl::WithoutInitializing_t WithoutInitializing{};
 
-}  // namespace
+inline constexpr Kokkos::Impl::AllowPadding_t AllowPadding{};
 
 /** \brief  Create View allocation parameter bundle from argument list.
  *
@@ -640,13 +645,15 @@ class View : public ViewTraits<DataType, Properties...> {
   //----------------------------------------
   // Domain rank and extents
 
-  enum { Rank = map_type::Rank };
-
-  /** \brief rank() to be implemented
-   */
-  // KOKKOS_INLINE_FUNCTION
-  // static
-  // constexpr unsigned rank() { return map_type::Rank; }
+  static constexpr Impl::integral_constant<size_t, traits::dimension::rank>
+      rank = {};
+  static constexpr Impl::integral_constant<size_t,
+                                           traits::dimension::rank_dynamic>
+      rank_dynamic = {};
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
+  enum {Rank KOKKOS_DEPRECATED_WITH_COMMENT("Use rank instead.") =
+            map_type::Rank};
+#endif
 
   template <typename iType>
   KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t<
@@ -807,14 +814,14 @@ class View : public ViewTraits<DataType, Properties...> {
 
   template <typename... Is>
   static KOKKOS_FUNCTION void check_access_member_function_valid_args(Is...) {
-    static_assert(Rank <= sizeof...(Is), "");
+    static_assert(rank <= sizeof...(Is), "");
     static_assert(sizeof...(Is) <= 8, "");
     static_assert(Kokkos::Impl::are_integral<Is...>::value, "");
   }
 
   template <typename... Is>
   static KOKKOS_FUNCTION void check_operator_parens_valid_args(Is...) {
-    static_assert(Rank == sizeof...(Is), "");
+    static_assert(rank == sizeof...(Is), "");
     static_assert(Kokkos::Impl::are_integral<Is...>::value, "");
   }
 
@@ -825,7 +832,7 @@ class View : public ViewTraits<DataType, Properties...> {
   template <typename I0>
   KOKKOS_FORCEINLINE_FUNCTION
       std::enable_if_t<(Kokkos::Impl::always_true<I0>::value &&  //
-                        (1 == Rank) && is_default_map && !is_layout_stride),
+                        (1 == rank) && is_default_map && !is_layout_stride),
                        reference_type>
       operator()(I0 i0) const {
     check_operator_parens_valid_args(i0);
@@ -836,7 +843,7 @@ class View : public ViewTraits<DataType, Properties...> {
   template <typename I0>
   KOKKOS_FORCEINLINE_FUNCTION
       std::enable_if_t<(Kokkos::Impl::always_true<I0>::value &&  //
-                        (1 == Rank) && is_default_map && is_layout_stride),
+                        (1 == rank) && is_default_map && is_layout_stride),
                        reference_type>
       operator()(I0 i0) const {
     check_operator_parens_valid_args(i0);
@@ -849,7 +856,7 @@ class View : public ViewTraits<DataType, Properties...> {
 
   template <typename I0>
   KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
-      ((1 == Rank) && Kokkos::Impl::are_integral<I0>::value && !is_default_map),
+      ((1 == rank) && Kokkos::Impl::are_integral<I0>::value && !is_default_map),
       reference_type>
   operator[](I0 i0) const {
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0)
@@ -858,7 +865,7 @@ class View : public ViewTraits<DataType, Properties...> {
 
   template <typename I0>
   KOKKOS_FORCEINLINE_FUNCTION
-      std::enable_if_t<((1 == Rank) && Kokkos::Impl::are_integral<I0>::value &&
+      std::enable_if_t<((1 == rank) && Kokkos::Impl::are_integral<I0>::value &&
                         is_default_map && !is_layout_stride),
                        reference_type>
       operator[](I0 i0) const {
@@ -868,7 +875,7 @@ class View : public ViewTraits<DataType, Properties...> {
 
   template <typename I0>
   KOKKOS_FORCEINLINE_FUNCTION
-      std::enable_if_t<((1 == Rank) && Kokkos::Impl::are_integral<I0>::value &&
+      std::enable_if_t<((1 == rank) && Kokkos::Impl::are_integral<I0>::value &&
                         is_default_map && is_layout_stride),
                        reference_type>
       operator[](I0 i0) const {
@@ -880,48 +887,44 @@ class View : public ViewTraits<DataType, Properties...> {
   // Rank 2 default map operator()
 
   template <typename I0, typename I1>
-  KOKKOS_FORCEINLINE_FUNCTION
-      std::enable_if_t<(Kokkos::Impl::always_true<I0, I1>::value &&  //
-                        (2 == Rank) && is_default_map && is_layout_left &&
-                        (traits::rank_dynamic == 0)),
-                       reference_type>
-      operator()(I0 i0, I1 i1) const {
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
+      (Kokkos::Impl::always_true<I0, I1>::value &&  //
+       (2 == rank) && is_default_map && is_layout_left && (rank_dynamic == 0)),
+      reference_type>
+  operator()(I0 i0, I1 i1) const {
     check_operator_parens_valid_args(i0, i1);
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1)
     return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1];
   }
 
   template <typename I0, typename I1>
-  KOKKOS_FORCEINLINE_FUNCTION
-      std::enable_if_t<(Kokkos::Impl::always_true<I0, I1>::value &&  //
-                        (2 == Rank) && is_default_map && is_layout_left &&
-                        (traits::rank_dynamic != 0)),
-                       reference_type>
-      operator()(I0 i0, I1 i1) const {
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
+      (Kokkos::Impl::always_true<I0, I1>::value &&  //
+       (2 == rank) && is_default_map && is_layout_left && (rank_dynamic != 0)),
+      reference_type>
+  operator()(I0 i0, I1 i1) const {
     check_operator_parens_valid_args(i0, i1);
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1)
     return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1];
   }
 
   template <typename I0, typename I1>
-  KOKKOS_FORCEINLINE_FUNCTION
-      std::enable_if_t<(Kokkos::Impl::always_true<I0, I1>::value &&  //
-                        (2 == Rank) && is_default_map && is_layout_right &&
-                        (traits::rank_dynamic == 0)),
-                       reference_type>
-      operator()(I0 i0, I1 i1) const {
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
+      (Kokkos::Impl::always_true<I0, I1>::value &&  //
+       (2 == rank) && is_default_map && is_layout_right && (rank_dynamic == 0)),
+      reference_type>
+  operator()(I0 i0, I1 i1) const {
     check_operator_parens_valid_args(i0, i1);
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1)
     return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0];
   }
 
   template <typename I0, typename I1>
-  KOKKOS_FORCEINLINE_FUNCTION
-      std::enable_if_t<(Kokkos::Impl::always_true<I0, I1>::value &&  //
-                        (2 == Rank) && is_default_map && is_layout_right &&
-                        (traits::rank_dynamic != 0)),
-                       reference_type>
-      operator()(I0 i0, I1 i1) const {
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
+      (Kokkos::Impl::always_true<I0, I1>::value &&  //
+       (2 == rank) && is_default_map && is_layout_right && (rank_dynamic != 0)),
+      reference_type>
+  operator()(I0 i0, I1 i1) const {
     check_operator_parens_valid_args(i0, i1);
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1)
     return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0];
@@ -930,7 +933,7 @@ class View : public ViewTraits<DataType, Properties...> {
   template <typename I0, typename I1>
   KOKKOS_FORCEINLINE_FUNCTION
       std::enable_if_t<(Kokkos::Impl::always_true<I0, I1>::value &&  //
-                        (2 == Rank) && is_default_map && is_layout_stride),
+                        (2 == rank) && is_default_map && is_layout_stride),
                        reference_type>
       operator()(I0 i0, I1 i1) const {
     check_operator_parens_valid_args(i0, i1);
@@ -945,7 +948,7 @@ class View : public ViewTraits<DataType, Properties...> {
   template <typename... Is>
   KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
       (Kokkos::Impl::always_true<Is...>::value &&  //
-       (2 != Rank) && (1 != Rank) && (0 != Rank) && is_default_map),
+       (2 != rank) && (1 != rank) && (0 != rank) && is_default_map),
       reference_type>
   operator()(Is... indices) const {
     check_operator_parens_valid_args(indices...);
@@ -956,7 +959,7 @@ class View : public ViewTraits<DataType, Properties...> {
   template <typename... Is>
   KOKKOS_FORCEINLINE_FUNCTION
       std::enable_if_t<(Kokkos::Impl::always_true<Is...>::value &&  //
-                        ((0 == Rank) || !is_default_map)),
+                        ((0 == rank) || !is_default_map)),
                        reference_type>
       operator()(Is... indices) const {
     check_operator_parens_valid_args(indices...);
@@ -969,7 +972,7 @@ class View : public ViewTraits<DataType, Properties...> {
 
   template <typename... Is>
   KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
-      (Kokkos::Impl::always_true<Is...>::value && (0 == Rank)), reference_type>
+      (Kokkos::Impl::always_true<Is...>::value && (0 == rank)), reference_type>
   access(Is... extra) const {
     check_access_member_function_valid_args(extra...);
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, extra...)
@@ -982,7 +985,7 @@ class View : public ViewTraits<DataType, Properties...> {
   template <typename I0, typename... Is>
   KOKKOS_FORCEINLINE_FUNCTION
       std::enable_if_t<(Kokkos::Impl::always_true<I0, Is...>::value &&
-                        (1 == Rank) && !is_default_map),
+                        (1 == rank) && !is_default_map),
                        reference_type>
       access(I0 i0, Is... extra) const {
     check_access_member_function_valid_args(i0, extra...);
@@ -993,7 +996,7 @@ class View : public ViewTraits<DataType, Properties...> {
   template <typename I0, typename... Is>
   KOKKOS_FORCEINLINE_FUNCTION
       std::enable_if_t<(Kokkos::Impl::always_true<I0, Is...>::value &&
-                        (1 == Rank) && is_default_map && !is_layout_stride),
+                        (1 == rank) && is_default_map && !is_layout_stride),
                        reference_type>
       access(I0 i0, Is... extra) const {
     check_access_member_function_valid_args(i0, extra...);
@@ -1004,7 +1007,7 @@ class View : public ViewTraits<DataType, Properties...> {
   template <typename I0, typename... Is>
   KOKKOS_FORCEINLINE_FUNCTION
       std::enable_if_t<(Kokkos::Impl::always_true<I0, Is...>::value &&
-                        (1 == Rank) && is_default_map && is_layout_stride),
+                        (1 == rank) && is_default_map && is_layout_stride),
                        reference_type>
       access(I0 i0, Is... extra) const {
     check_access_member_function_valid_args(i0, extra...);
@@ -1018,7 +1021,7 @@ class View : public ViewTraits<DataType, Properties...> {
   template <typename I0, typename I1, typename... Is>
   KOKKOS_FORCEINLINE_FUNCTION
       std::enable_if_t<(Kokkos::Impl::always_true<I0, I1, Is...>::value &&
-                        (2 == Rank) && !is_default_map),
+                        (2 == rank) && !is_default_map),
                        reference_type>
       access(I0 i0, I1 i1, Is... extra) const {
     check_access_member_function_valid_args(i0, i1, extra...);
@@ -1028,8 +1031,8 @@ class View : public ViewTraits<DataType, Properties...> {
 
   template <typename I0, typename I1, typename... Is>
   KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
-      (Kokkos::Impl::always_true<I0, I1, Is...>::value && (2 == Rank) &&
-       is_default_map && is_layout_left && (traits::rank_dynamic == 0)),
+      (Kokkos::Impl::always_true<I0, I1, Is...>::value && (2 == rank) &&
+       is_default_map && is_layout_left && (rank_dynamic == 0)),
       reference_type>
   access(I0 i0, I1 i1, Is... extra) const {
     check_access_member_function_valid_args(i0, i1, extra...);
@@ -1039,8 +1042,8 @@ class View : public ViewTraits<DataType, Properties...> {
 
   template <typename I0, typename I1, typename... Is>
   KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
-      (Kokkos::Impl::always_true<I0, I1, Is...>::value && (2 == Rank) &&
-       is_default_map && is_layout_left && (traits::rank_dynamic != 0)),
+      (Kokkos::Impl::always_true<I0, I1, Is...>::value && (2 == rank) &&
+       is_default_map && is_layout_left && (rank_dynamic != 0)),
       reference_type>
   access(I0 i0, I1 i1, Is... extra) const {
     check_access_member_function_valid_args(i0, i1, extra...);
@@ -1050,8 +1053,8 @@ class View : public ViewTraits<DataType, Properties...> {
 
   template <typename I0, typename I1, typename... Is>
   KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
-      (Kokkos::Impl::always_true<I0, I1, Is...>::value && (2 == Rank) &&
-       is_default_map && is_layout_right && (traits::rank_dynamic == 0)),
+      (Kokkos::Impl::always_true<I0, I1, Is...>::value && (2 == rank) &&
+       is_default_map && is_layout_right && (rank_dynamic == 0)),
       reference_type>
   access(I0 i0, I1 i1, Is... extra) const {
     check_access_member_function_valid_args(i0, i1, extra...);
@@ -1061,8 +1064,8 @@ class View : public ViewTraits<DataType, Properties...> {
 
   template <typename I0, typename I1, typename... Is>
   KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
-      (Kokkos::Impl::always_true<I0, I1, Is...>::value && (2 == Rank) &&
-       is_default_map && is_layout_right && (traits::rank_dynamic != 0)),
+      (Kokkos::Impl::always_true<I0, I1, Is...>::value && (2 == rank) &&
+       is_default_map && is_layout_right && (rank_dynamic != 0)),
       reference_type>
   access(I0 i0, I1 i1, Is... extra) const {
     check_access_member_function_valid_args(i0, i1, extra...);
@@ -1073,7 +1076,7 @@ class View : public ViewTraits<DataType, Properties...> {
   template <typename I0, typename I1, typename... Is>
   KOKKOS_FORCEINLINE_FUNCTION
       std::enable_if_t<(Kokkos::Impl::always_true<I0, I1, Is...>::value &&
-                        (2 == Rank) && is_default_map && is_layout_stride),
+                        (2 == rank) && is_default_map && is_layout_stride),
                        reference_type>
       access(I0 i0, I1 i1, Is... extra) const {
     check_access_member_function_valid_args(i0, i1, extra...);
@@ -1088,7 +1091,7 @@ class View : public ViewTraits<DataType, Properties...> {
   template <typename I0, typename I1, typename I2, typename... Is>
   KOKKOS_FORCEINLINE_FUNCTION
       std::enable_if_t<(Kokkos::Impl::always_true<I0, I1, I2, Is...>::value &&
-                        (3 == Rank) && is_default_map),
+                        (3 == rank) && is_default_map),
                        reference_type>
       access(I0 i0, I1 i1, I2 i2, Is... extra) const {
     check_access_member_function_valid_args(i0, i1, i2, extra...);
@@ -1099,7 +1102,7 @@ class View : public ViewTraits<DataType, Properties...> {
   template <typename I0, typename I1, typename I2, typename... Is>
   KOKKOS_FORCEINLINE_FUNCTION
       std::enable_if_t<(Kokkos::Impl::always_true<I0, I1, I2, Is...>::value &&
-                        (3 == Rank) && !is_default_map),
+                        (3 == rank) && !is_default_map),
                        reference_type>
       access(I0 i0, I1 i1, I2 i2, Is... extra) const {
     check_access_member_function_valid_args(i0, i1, i2, extra...);
@@ -1112,7 +1115,7 @@ class View : public ViewTraits<DataType, Properties...> {
 
   template <typename I0, typename I1, typename I2, typename I3, typename... Is>
   KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
-      (Kokkos::Impl::always_true<I0, I1, I2, I3, Is...>::value && (4 == Rank) &&
+      (Kokkos::Impl::always_true<I0, I1, I2, I3, Is...>::value && (4 == rank) &&
        is_default_map),
       reference_type>
   access(I0 i0, I1 i1, I2 i2, I3 i3, Is... extra) const {
@@ -1123,7 +1126,7 @@ class View : public ViewTraits<DataType, Properties...> {
 
   template <typename I0, typename I1, typename I2, typename I3, typename... Is>
   KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
-      (Kokkos::Impl::always_true<I0, I1, I2, I3, Is...>::value && (4 == Rank) &&
+      (Kokkos::Impl::always_true<I0, I1, I2, I3, Is...>::value && (4 == rank) &&
        !is_default_map),
       reference_type>
   access(I0 i0, I1 i1, I2 i2, I3 i3, Is... extra) const {
@@ -1139,7 +1142,7 @@ class View : public ViewTraits<DataType, Properties...> {
             typename... Is>
   KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
       (Kokkos::Impl::always_true<I0, I1, I2, I3, I4, Is...>::value &&
-       (5 == Rank) && is_default_map),
+       (5 == rank) && is_default_map),
       reference_type>
   access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const {
     check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...);
@@ -1152,7 +1155,7 @@ class View : public ViewTraits<DataType, Properties...> {
             typename... Is>
   KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
       (Kokkos::Impl::always_true<I0, I1, I2, I3, I4, Is...>::value &&
-       (5 == Rank) && !is_default_map),
+       (5 == rank) && !is_default_map),
       reference_type>
   access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const {
     check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...);
@@ -1168,7 +1171,7 @@ class View : public ViewTraits<DataType, Properties...> {
             typename I5, typename... Is>
   KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
       (Kokkos::Impl::always_true<I0, I1, I2, I3, I4, I5, Is...>::value &&
-       (6 == Rank) && is_default_map),
+       (6 == rank) && is_default_map),
       reference_type>
   access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... extra) const {
     check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...);
@@ -1181,7 +1184,7 @@ class View : public ViewTraits<DataType, Properties...> {
             typename I5, typename... Is>
   KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
       (Kokkos::Impl::always_true<I0, I1, I2, I3, I4, I5, Is...>::value &&
-       (6 == Rank) && !is_default_map),
+       (6 == rank) && !is_default_map),
       reference_type>
   access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... extra) const {
     check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...);
@@ -1197,7 +1200,7 @@ class View : public ViewTraits<DataType, Properties...> {
             typename I5, typename I6, typename... Is>
   KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
       (Kokkos::Impl::always_true<I0, I1, I2, I3, I4, I5, I6, Is...>::value &&
-       (7 == Rank) && is_default_map),
+       (7 == rank) && is_default_map),
       reference_type>
   access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const {
     check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6,
@@ -1211,7 +1214,7 @@ class View : public ViewTraits<DataType, Properties...> {
             typename I5, typename I6, typename... Is>
   KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
       (Kokkos::Impl::always_true<I0, I1, I2, I3, I4, I5, I6, Is...>::value &&
-       (7 == Rank) && !is_default_map),
+       (7 == rank) && !is_default_map),
       reference_type>
   access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const {
     check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6,
@@ -1229,7 +1232,7 @@ class View : public ViewTraits<DataType, Properties...> {
   KOKKOS_FORCEINLINE_FUNCTION
       std::enable_if_t<(Kokkos::Impl::always_true<I0, I1, I2, I3, I4, I5, I6,
                                                   I7, Is...>::value &&
-                        (8 == Rank) && is_default_map),
+                        (8 == rank) && is_default_map),
                        reference_type>
       access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7,
              Is... extra) const {
@@ -1246,7 +1249,7 @@ class View : public ViewTraits<DataType, Properties...> {
   KOKKOS_FORCEINLINE_FUNCTION
       std::enable_if_t<(Kokkos::Impl::always_true<I0, I1, I2, I3, I4, I5, I6,
                                                   I7, Is...>::value &&
-                        (8 == Rank) && !is_default_map),
+                        (8 == rank) && !is_default_map),
                        reference_type>
       access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7,
              Is... extra) const {
@@ -1411,38 +1414,13 @@ class View : public ViewTraits<DataType, Properties...> {
     const std::string& alloc_name =
         Impl::get_property<Impl::LabelTag>(prop_copy);
     Impl::runtime_check_rank(
-        traits::rank, traits::rank_dynamic,
+        rank, rank_dynamic,
         std::is_same<typename traits::specialize, void>::value, i0, i1, i2, i3,
         i4, i5, i6, i7, alloc_name);
 
-//------------------------------------------------------------
-#if defined(KOKKOS_ENABLE_CUDA)
-    // If allocating in CudaUVMSpace must fence before and after
-    // the allocation to protect against possible concurrent access
-    // on the CPU and the GPU.
-    // Fence using the trait's execution space (which will be Kokkos::Cuda)
-    // to avoid incomplete type errors from using Kokkos::Cuda directly.
-    if (std::is_same<Kokkos::CudaUVMSpace,
-                     typename traits::device_type::memory_space>::value) {
-      typename traits::device_type::memory_space::execution_space().fence(
-          "Kokkos::View<...>::View: fence before allocating UVM");
-    }
-#endif
-    //------------------------------------------------------------
-
     Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared(
         prop_copy, arg_layout, Impl::ViewCtorProp<P...>::has_execution_space);
 
-//------------------------------------------------------------
-#if defined(KOKKOS_ENABLE_CUDA)
-    if (std::is_same<Kokkos::CudaUVMSpace,
-                     typename traits::device_type::memory_space>::value) {
-      typename traits::device_type::memory_space::execution_space().fence(
-          "Kokkos::View<...>::View: fence after allocating UVM");
-    }
-#endif
-    //------------------------------------------------------------
-
     // Setup and initialization complete, start tracking
     m_track.m_tracker.assign_allocated_record_to_uninitialized(record);
   }
@@ -1627,7 +1605,7 @@ class View : public ViewTraits<DataType, Properties...> {
         arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7);
 
     if (std::is_void<typename traits::specialize>::value &&
-        num_passed_args != traits::rank_dynamic) {
+        num_passed_args != rank_dynamic) {
       Kokkos::abort(
           "Kokkos::View::shmem_size() rank_dynamic != number of arguments.\n");
     }
@@ -1683,14 +1661,10 @@ class View : public ViewTraits<DataType, Properties...> {
   }
 };
 
-/** \brief Temporary free function rank()
- *         until rank() is implemented
- *         in the View
- */
 template <typename D, class... P>
-KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const View<D, P...>& V) {
-  return V.Rank;
-}  // Temporary until added to view
+KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const View<D, P...>&) {
+  return View<D, P...>::rank();
+}
 
 namespace Impl {
 
@@ -1706,7 +1680,7 @@ struct RankDataType<ValueType, 0> {
 
 template <unsigned N, typename... Args>
 KOKKOS_FUNCTION std::enable_if_t<
-    N == View<Args...>::Rank &&
+    N == View<Args...>::rank() &&
         std::is_same<typename ViewTraits<Args...>::specialize, void>::value,
     View<Args...>>
 as_view_of_rank_n(View<Args...> v) {
@@ -1717,7 +1691,7 @@ as_view_of_rank_n(View<Args...> v) {
 // never be called
 template <unsigned N, typename T, typename... Args>
 KOKKOS_FUNCTION std::enable_if_t<
-    N != View<T, Args...>::Rank &&
+    N != View<T, Args...>::rank() &&
         std::is_same<typename ViewTraits<T, Args...>::specialize, void>::value,
     View<typename RankDataType<typename View<T, Args...>::value_type, N>::type,
          Args...>>
@@ -1735,44 +1709,66 @@ void apply_to_view_of_static_rank(Function&& f, View<Args...> a) {
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
-template <class V, class... Args>
-using Subview =
-    typename Kokkos::Impl::ViewMapping<void /* deduce subview type from source
-                                               view traits */
-                                       ,
-                                       typename V::traits, Args...>::type;
+namespace Impl {
+template <class ValueType, class TypeList>
+struct TypeListToViewTraits;
+
+template <class ValueType, class... Properties>
+struct TypeListToViewTraits<ValueType, Kokkos::Impl::type_list<Properties...>> {
+  using type = ViewTraits<ValueType, Properties...>;
+};
+
+// It is not safe to assume that subviews of views with the Aligned memory trait
+// are also aligned. Hence, just remove that attribute for subviews.
+template <class D, class... P>
+struct RemoveAlignedMemoryTrait {
+ private:
+  using type_list_in  = Kokkos::Impl::type_list<P...>;
+  using memory_traits = typename ViewTraits<D, P...>::memory_traits;
+  using type_list_in_wo_memory_traits =
+      typename Kokkos::Impl::type_list_remove_first<memory_traits,
+                                                    type_list_in>::type;
+  using new_memory_traits =
+      Kokkos::MemoryTraits<memory_traits::impl_value & ~Kokkos::Aligned>;
+  using new_type_list = typename Kokkos::Impl::concat_type_list<
+      type_list_in_wo_memory_traits,
+      Kokkos::Impl::type_list<new_memory_traits>>::type;
+
+ public:
+  using type = typename TypeListToViewTraits<D, new_type_list>::type;
+};
+}  // namespace Impl
 
 template <class D, class... P, class... Args>
-KOKKOS_INLINE_FUNCTION
-    typename Kokkos::Impl::ViewMapping<void /* deduce subview type from source
-                                               view traits */
-                                       ,
-                                       ViewTraits<D, P...>, Args...>::type
-    subview(const View<D, P...>& src, Args... args) {
-  static_assert(View<D, P...>::Rank == sizeof...(Args),
+KOKKOS_INLINE_FUNCTION auto subview(const View<D, P...>& src, Args... args) {
+  static_assert(View<D, P...>::rank == sizeof...(Args),
                 "subview requires one argument for each source View rank");
 
   return typename Kokkos::Impl::ViewMapping<
       void /* deduce subview type from source view traits */
       ,
-      ViewTraits<D, P...>, Args...>::type(src, args...);
+      typename Impl::RemoveAlignedMemoryTrait<D, P...>::type,
+      Args...>::type(src, args...);
 }
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
 template <class MemoryTraits, class D, class... P, class... Args>
-KOKKOS_INLINE_FUNCTION typename Kokkos::Impl::ViewMapping<
-    void /* deduce subview type from source view traits */
-    ,
-    ViewTraits<D, P...>, Args...>::template apply<MemoryTraits>::type
-subview(const View<D, P...>& src, Args... args) {
-  static_assert(View<D, P...>::Rank == sizeof...(Args),
+KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION auto subview(const View<D, P...>& src,
+                                                      Args... args) {
+  static_assert(View<D, P...>::rank == sizeof...(Args),
                 "subview requires one argument for each source View rank");
+  static_assert(Kokkos::is_memory_traits<MemoryTraits>::value);
 
   return typename Kokkos::Impl::ViewMapping<
       void /* deduce subview type from source view traits */
       ,
-      ViewTraits<D, P...>,
-      Args...>::template apply<MemoryTraits>::type(src, args...);
+      typename Impl::RemoveAlignedMemoryTrait<D, P..., MemoryTraits>::type,
+      Args...>::type(src, args...);
 }
+#endif
+
+template <class V, class... Args>
+using Subview = decltype(subview(std::declval<V>(), std::declval<Args>()...));
 
 } /* namespace Kokkos */
 
@@ -1794,7 +1790,7 @@ KOKKOS_INLINE_FUNCTION bool operator==(const View<LT, LP...>& lhs,
                       typename rhs_traits::array_layout>::value &&
          std::is_same<typename lhs_traits::memory_space,
                       typename rhs_traits::memory_space>::value &&
-         unsigned(lhs_traits::rank) == unsigned(rhs_traits::rank) &&
+         View<LT, LP...>::rank() == View<RT, RP...>::rank() &&
          lhs.data() == rhs.data() && lhs.span() == rhs.span() &&
          lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) &&
          lhs.extent(2) == rhs.extent(2) && lhs.extent(3) == rhs.extent(3) &&
diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp
index a115bc90f886ee7f406f4c8c01849ab10497f5ef..b012f6a42a41ace7e502bd884f682c2189e5a2a6 100644
--- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp
+++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp
@@ -39,7 +39,10 @@ static_assert(false,
 
 // FIXME_OPENACC: Below macro is temporarily enabled to avoid issues on existing
 // OpenACC compilers not supporting lambda with parallel loops.
+// LLVM/Clacc compiler does not need this.
+#ifndef KOKKOS_COMPILER_CLANG
 #define KOKKOS_ENABLE_OPENACC_COLLAPSE_HIERARCHICAL_CONSTRUCTS
+#endif
 
 namespace Kokkos::Experimental::Impl {
 class OpenACCInternal;
diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp
index 1325e61e1d00024ccd932433841ebefdc4b79dd4..82d38586eb8fe35dea59925eafdf02a09ec1144b 100644
--- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp
+++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp
@@ -17,27 +17,40 @@
 #ifndef KOKKOS_OPENACC_FUNCTOR_ADAPTER_HPP
 #define KOKKOS_OPENACC_FUNCTOR_ADAPTER_HPP
 
+#include <OpenACC/Kokkos_OpenACC_Macros.hpp>
 #include <type_traits>
 
 namespace Kokkos::Experimental::Impl {
 
-template <class Functor, class Policy>
-class FunctorAdapter {
-  Functor m_functor;
-  using WorkTag = typename Policy::work_tag;
-
- public:
-  FunctorAdapter(Functor const &functor) : m_functor(functor) {}
-
-  template <class... Args>
-  KOKKOS_FUNCTION void operator()(Args &&... args) const {
-    if constexpr (std::is_void_v<WorkTag>) {
-      m_functor(static_cast<Args &&>(args)...);
-    } else {
-      m_functor(WorkTag(), static_cast<Args &&>(args)...);
-    }
+enum class RoutineClause { worker, seq };
+
+template <class Functor, class Policy, RoutineClause>
+class FunctorAdapter;
+
+#define KOKKOS_IMPL_ACC_FUNCTOR_ADAPTER(CLAUSE)                    \
+  template <class Functor, class Policy>                           \
+  class FunctorAdapter<Functor, Policy, RoutineClause::CLAUSE> {   \
+    Functor m_functor;                                             \
+    using WorkTag = typename Policy::work_tag;                     \
+                                                                   \
+   public:                                                         \
+    FunctorAdapter(Functor const &functor) : m_functor(functor) {} \
+                                                                   \
+    KOKKOS_IMPL_ACC_PRAGMA(routine CLAUSE)                         \
+    template <class... Args>                                       \
+    KOKKOS_FUNCTION void operator()(Args &&... args) const {       \
+      if constexpr (std::is_void_v<WorkTag>) {                     \
+        m_functor(static_cast<Args &&>(args)...);                  \
+      } else {                                                     \
+        m_functor(WorkTag(), static_cast<Args &&>(args)...);       \
+      }                                                            \
+    }                                                              \
   }
-};
+
+KOKKOS_IMPL_ACC_FUNCTOR_ADAPTER(worker);
+KOKKOS_IMPL_ACC_FUNCTOR_ADAPTER(seq);
+
+#undef KOKKOS_IMPL_ACC_FUNCTOR_ADAPTER
 
 }  // namespace Kokkos::Experimental::Impl
 
diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_MDRangePolicy.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_MDRangePolicy.hpp
index 4525f37a612d3abd8e5961d43256800cbb9675c4..9c58dd6fa6638d2faee3a5706d623d4672e6948c 100644
--- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_MDRangePolicy.hpp
+++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_MDRangePolicy.hpp
@@ -42,4 +42,19 @@ struct ThreadAndVectorNestLevel<Rank, Kokkos::Experimental::OpenACC,
 }  // namespace Impl
 }  // namespace Kokkos
 
+namespace Kokkos::Experimental::Impl {
+
+struct OpenACCCollapse {};
+struct OpenACCTile {};
+using OpenACCIterateLeft  = std::integral_constant<Iterate, Iterate::Left>;
+using OpenACCIterateRight = std::integral_constant<Iterate, Iterate::Right>;
+template <int N>
+using OpenACCMDRangeBegin = decltype(MDRangePolicy<OpenACC, Rank<N>>::m_lower);
+template <int N>
+using OpenACCMDRangeEnd = decltype(MDRangePolicy<OpenACC, Rank<N>>::m_upper);
+template <int N>
+using OpenACCMDRangeTile = decltype(MDRangePolicy<OpenACC, Rank<N>>::m_tile);
+
+}  // namespace Kokkos::Experimental::Impl
+
 #endif
diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp
index ac219527c6aa3b0dfb1779fa4a71a6342542525f..550436fe7beceba231454017eb709f27b1b9aa7e 100644
--- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp
+++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp
@@ -24,17 +24,6 @@
 
 namespace Kokkos::Experimental::Impl {
 
-struct OpenACCCollapse {};
-struct OpenACCTile {};
-using OpenACCIterateLeft  = std::integral_constant<Iterate, Iterate::Left>;
-using OpenACCIterateRight = std::integral_constant<Iterate, Iterate::Right>;
-template <int N>
-using OpenACCMDRangeBegin = decltype(MDRangePolicy<OpenACC, Rank<N>>::m_lower);
-template <int N>
-using OpenACCMDRangeEnd = decltype(MDRangePolicy<OpenACC, Rank<N>>::m_upper);
-template <int N>
-using OpenACCMDRangeTile = decltype(MDRangePolicy<OpenACC, Rank<N>>::m_tile);
-
 template <class Functor>
 void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft,
                                      Functor const& functor,
@@ -651,7 +640,9 @@ template <class Functor, class... Traits>
 class Kokkos::Impl::ParallelFor<Functor, Kokkos::MDRangePolicy<Traits...>,
                                 Kokkos::Experimental::OpenACC> {
   using Policy = MDRangePolicy<Traits...>;
-  Kokkos::Experimental::Impl::FunctorAdapter<Functor, Policy> m_functor;
+  Kokkos::Experimental::Impl::FunctorAdapter<
+      Functor, Policy, Kokkos::Experimental::Impl::RoutineClause::seq>
+      m_functor;
   Policy m_policy;
 
  public:
diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Range.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Range.hpp
index ede93ec19e122fe0795c0705d470fde6fe776b59..6ddfc352fc99fd3cbc50e5d694e04e806d978e4b 100644
--- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Range.hpp
+++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Range.hpp
@@ -78,7 +78,9 @@ template <class Functor, class... Traits>
 class Kokkos::Impl::ParallelFor<Functor, Kokkos::RangePolicy<Traits...>,
                                 Kokkos::Experimental::OpenACC> {
   using Policy = Kokkos::RangePolicy<Traits...>;
-  Kokkos::Experimental::Impl::FunctorAdapter<Functor, Policy> m_functor;
+  Kokkos::Experimental::Impl::FunctorAdapter<
+      Functor, Policy, Kokkos::Experimental::Impl::RoutineClause::seq>
+      m_functor;
   Policy m_policy;
   using ScheduleType = Kokkos::Experimental::Impl::OpenACCScheduleType<Policy>;
 
diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp
index c08c15879c3656f15665388cf3292e2e67ca16e5..4fce680aef09b45e4b91c6f703bd857bd975e4c3 100644
--- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp
+++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp
@@ -17,7 +17,6 @@
 #ifndef KOKKOS_OPENACC_PARALLEL_FOR_TEAM_HPP
 #define KOKKOS_OPENACC_PARALLEL_FOR_TEAM_HPP
 
-#include <openacc.h>
 #include <OpenACC/Kokkos_OpenACC_Team.hpp>
 #include <OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp>
 
@@ -32,7 +31,9 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
  private:
   using Policy = Kokkos::Impl::TeamPolicyInternal<Kokkos::Experimental::OpenACC,
                                                   Properties...>;
-  Kokkos::Experimental::Impl::FunctorAdapter<FunctorType, Policy> m_functor;
+  Kokkos::Experimental::Impl::FunctorAdapter<
+      FunctorType, Policy, Kokkos::Experimental::Impl::RoutineClause::seq>
+      m_functor;
   using Member = typename Policy::member_type;
 
   const Policy m_policy;
@@ -131,7 +132,9 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
  private:
   using Policy = Kokkos::Impl::TeamPolicyInternal<Kokkos::Experimental::OpenACC,
                                                   Properties...>;
-  Kokkos::Experimental::Impl::FunctorAdapter<FunctorType, Policy> m_functor;
+  Kokkos::Experimental::Impl::FunctorAdapter<
+      FunctorType, Policy, Kokkos::Experimental::Impl::RoutineClause::worker>
+      m_functor;
   using Member = typename Policy::member_type;
 
   const Policy m_policy;
diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2c7793dc11650de1d70de7c2516e5d4c7c3ae50b
--- /dev/null
+++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp
@@ -0,0 +1,472 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_OPENACC_PARALLEL_REDUCE_MDRANGE_HPP
+#define KOKKOS_OPENACC_PARALLEL_REDUCE_MDRANGE_HPP
+
+#include <OpenACC/Kokkos_OpenACC.hpp>
+#include <OpenACC/Kokkos_OpenACC_Macros.hpp>
+#include <OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp>
+#include <OpenACC/Kokkos_OpenACC_MDRangePolicy.hpp>
+#include <Kokkos_Parallel.hpp>
+
+namespace Kokkos::Experimental::Impl {
+
+// primary template: catch-all non-implemented custom reducers
+template <class Functor, class Reducer, class Policy,
+          bool = std::is_arithmetic_v<typename Reducer::value_type>>
+struct OpenACCParallelReduceMDRangeHelper {
+  OpenACCParallelReduceMDRangeHelper(Functor const&, Reducer const&,
+                                     Policy const&) {
+    static_assert(!Kokkos::Impl::always_true<Functor>::value,
+                  "not implemented");
+  }
+};
+}  // namespace Kokkos::Experimental::Impl
+
+template <class CombinedFunctorReducerType, class... Traits>
+class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
+                                   Kokkos::MDRangePolicy<Traits...>,
+                                   Kokkos::Experimental::OpenACC> {
+  using Policy      = MDRangePolicy<Traits...>;
+  using FunctorType = typename CombinedFunctorReducerType::functor_type;
+  using ReducerType = typename CombinedFunctorReducerType::reducer_type;
+
+  using Pointer   = typename ReducerType::pointer_type;
+  using ValueType = typename ReducerType::value_type;
+
+  CombinedFunctorReducerType m_functor_reducer;
+  Policy m_policy;
+  Pointer m_result_ptr;
+  bool m_result_ptr_on_device;
+
+ public:
+  template <class ViewType>
+  ParallelReduce(const CombinedFunctorReducerType& functor_reducer,
+                 const Policy& policy, const ViewType& result)
+      : m_functor_reducer(functor_reducer),
+        m_policy(policy),
+        m_result_ptr(result.data()),
+        m_result_ptr_on_device(
+            MemorySpaceAccess<Kokkos::Experimental::OpenACCSpace,
+                              typename ViewType::memory_space>::accessible) {}
+
+  void execute() const {
+    static_assert(1 < Policy::rank && Policy::rank < 7);
+    static_assert(Policy::inner_direction == Iterate::Left ||
+                  Policy::inner_direction == Iterate::Right);
+    constexpr int rank = Policy::rank;
+    ValueType val;
+    const ReducerType& reducer = m_functor_reducer.get_reducer();
+    reducer.init(&val);
+
+    for (int i = 0; i < rank; ++i) {
+      if (m_policy.m_lower[i] >= m_policy.m_upper[i]) {
+        if (m_result_ptr_on_device) {
+          acc_memcpy_to_device(m_result_ptr, &val, sizeof(ValueType));
+        } else {
+          *m_result_ptr = val;
+        }
+        return;
+      }
+    }
+
+    int const async_arg = m_policy.space().acc_async_queue();
+
+    Kokkos::Experimental::Impl::OpenACCParallelReduceMDRangeHelper(
+        Kokkos::Experimental::Impl::FunctorAdapter<
+            FunctorType, Policy,
+            Kokkos::Experimental::Impl::RoutineClause::seq>(
+            m_functor_reducer.get_functor()),
+        std::conditional_t<
+            std::is_same_v<FunctorType, typename ReducerType::functor_type>,
+            Sum<ValueType>, typename ReducerType::functor_type>(val),
+        m_policy);
+
+    // OpenACC backend supports only built-in Reducer types; thus
+    // reducer.final() below is a no-op.
+    reducer.final(&val);
+    // acc_wait(async_arg) in the below if-else statements is needed because the
+    // above OpenACC compute kernel can be executed asynchronously and val is a
+    // local host variable.
+    if (m_result_ptr_on_device) {
+      acc_memcpy_to_device_async(m_result_ptr, &val, sizeof(ValueType),
+                                 async_arg);
+      acc_wait(async_arg);
+    } else {
+      acc_wait(async_arg);
+      *m_result_ptr = val;
+    }
+  }
+};
+
+#define KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_DISPATCH_ITERATE(REDUCER,         \
+                                                             OPERATOR)        \
+  namespace Kokkos::Experimental::Impl {                                      \
+  template <class ValueType, class Functor>                                   \
+  void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval,    \
+                                      Functor const& afunctor,                \
+                                      OpenACCMDRangeBegin<2> const& begin,    \
+                                      OpenACCMDRangeEnd<2> const& end,        \
+                                      int async_arg) {                        \
+    auto val = aval;                                                          \
+    auto const functor(afunctor);                                             \
+    int begin1 = begin[1];                                                    \
+    int end1   = end[1];                                                      \
+    int begin0 = begin[0];                                                    \
+    int end0   = end[0];                                                      \
+    /* clang-format off */ \
+    KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector collapse(2) reduction(OPERATOR:val) copyin(functor) async(async_arg))                                                  \
+    /* clang-format on */                                                     \
+    for (auto i1 = begin1; i1 < end1; ++i1) {                                 \
+      for (auto i0 = begin0; i0 < end0; ++i0) {                               \
+        functor(i0, i1, val);                                                 \
+      }                                                                       \
+    }                                                                         \
+    aval = val;                                                               \
+  }                                                                           \
+                                                                              \
+  template <class ValueType, class Functor>                                   \
+  void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval,   \
+                                      Functor const& afunctor,                \
+                                      OpenACCMDRangeBegin<2> const& begin,    \
+                                      OpenACCMDRangeEnd<2> const& end,        \
+                                      int async_arg) {                        \
+    auto val = aval;                                                          \
+    auto const functor(afunctor);                                             \
+    int begin0 = begin[0];                                                    \
+    int end0   = end[0];                                                      \
+    int begin1 = begin[1];                                                    \
+    int end1   = end[1];                                                      \
+    /* clang-format off */ \
+    KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector collapse(2) reduction(OPERATOR:val) copyin(functor) async(async_arg))                                                  \
+    /* clang-format on */                                                     \
+    for (auto i0 = begin0; i0 < end0; ++i0) {                                 \
+      for (auto i1 = begin1; i1 < end1; ++i1) {                               \
+        functor(i0, i1, val);                                                 \
+      }                                                                       \
+    }                                                                         \
+    aval = val;                                                               \
+  }                                                                           \
+                                                                              \
+  template <class ValueType, class Functor>                                   \
+  void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval,    \
+                                      Functor const& afunctor,                \
+                                      OpenACCMDRangeBegin<3> const& begin,    \
+                                      OpenACCMDRangeEnd<3> const& end,        \
+                                      int async_arg) {                        \
+    auto val = aval;                                                          \
+    auto const functor(afunctor);                                             \
+    int begin2 = begin[2];                                                    \
+    int end2   = end[2];                                                      \
+    int begin1 = begin[1];                                                    \
+    int end1   = end[1];                                                      \
+    int begin0 = begin[0];                                                    \
+    int end0   = end[0];                                                      \
+    /* clang-format off */                                                  \
+    KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector collapse(3) reduction( \
+        OPERATOR                                                            \
+        : val) copyin(functor) async(async_arg)) \
+    /* clang-format on */                                                     \
+    for (auto i2 = begin2; i2 < end2; ++i2) {                                 \
+      for (auto i1 = begin1; i1 < end1; ++i1) {                               \
+        for (auto i0 = begin0; i0 < end0; ++i0) {                             \
+          functor(i0, i1, i2, val);                                           \
+        }                                                                     \
+      }                                                                       \
+    }                                                                         \
+    aval = val;                                                               \
+  }                                                                           \
+                                                                              \
+  template <class ValueType, class Functor>                                   \
+  void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval,   \
+                                      Functor const& afunctor,                \
+                                      OpenACCMDRangeBegin<3> const& begin,    \
+                                      OpenACCMDRangeEnd<3> const& end,        \
+                                      int async_arg) {                        \
+    auto val = aval;                                                          \
+    auto const functor(afunctor);                                             \
+    int begin0 = begin[0];                                                    \
+    int end0   = end[0];                                                      \
+    int begin1 = begin[1];                                                    \
+    int end1   = end[1];                                                      \
+    int begin2 = begin[2];                                                    \
+    int end2   = end[2];                                                      \
+    /* clang-format off */                                                  \
+    KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector collapse(3) reduction( \
+        OPERATOR                                                            \
+        : val) copyin(functor) async(async_arg)) \
+    /* clang-format on */                                                     \
+    for (auto i0 = begin0; i0 < end0; ++i0) {                                 \
+      for (auto i1 = begin1; i1 < end1; ++i1) {                               \
+        for (auto i2 = begin2; i2 < end2; ++i2) {                             \
+          functor(i0, i1, i2, val);                                           \
+        }                                                                     \
+      }                                                                       \
+    }                                                                         \
+    aval = val;                                                               \
+  }                                                                           \
+                                                                              \
+  template <class ValueType, class Functor>                                   \
+  void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval,    \
+                                      Functor const& afunctor,                \
+                                      OpenACCMDRangeBegin<4> const& begin,    \
+                                      OpenACCMDRangeEnd<4> const& end,        \
+                                      int async_arg) {                        \
+    auto val = aval;                                                          \
+    auto const functor(afunctor);                                             \
+    int begin3 = begin[3];                                                    \
+    int end3   = end[3];                                                      \
+    int begin2 = begin[2];                                                    \
+    int end2   = end[2];                                                      \
+    int begin1 = begin[1];                                                    \
+    int end1   = end[1];                                                      \
+    int begin0 = begin[0];                                                    \
+    int end0   = end[0];                                                      \
+    /* clang-format off */ \
+    KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector collapse(4) reduction(OPERATOR:val) copyin(functor) async(async_arg))                                                  \
+    /* clang-format on */                                                     \
+    for (auto i3 = begin3; i3 < end3; ++i3) {                                 \
+      for (auto i2 = begin2; i2 < end2; ++i2) {                               \
+        for (auto i1 = begin1; i1 < end1; ++i1) {                             \
+          for (auto i0 = begin0; i0 < end0; ++i0) {                           \
+            functor(i0, i1, i2, i3, val);                                     \
+          }                                                                   \
+        }                                                                     \
+      }                                                                       \
+    }                                                                         \
+    aval = val;                                                               \
+  }                                                                           \
+                                                                              \
+  template <class ValueType, class Functor>                                   \
+  void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval,   \
+                                      Functor const& afunctor,                \
+                                      OpenACCMDRangeBegin<4> const& begin,    \
+                                      OpenACCMDRangeEnd<4> const& end,        \
+                                      int async_arg) {                        \
+    auto val = aval;                                                          \
+    auto const functor(afunctor);                                             \
+    int begin0 = begin[0];                                                    \
+    int end0   = end[0];                                                      \
+    int begin1 = begin[1];                                                    \
+    int end1   = end[1];                                                      \
+    int begin2 = begin[2];                                                    \
+    int end2   = end[2];                                                      \
+    int begin3 = begin[3];                                                    \
+    int end3   = end[3];                                                      \
+    /* clang-format off */ \
+    KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector collapse(4) reduction(OPERATOR:val) copyin(functor) async(async_arg))                                                  \
+    /* clang-format on */                                                     \
+    for (auto i0 = begin0; i0 < end0; ++i0) {                                 \
+      for (auto i1 = begin1; i1 < end1; ++i1) {                               \
+        for (auto i2 = begin2; i2 < end2; ++i2) {                             \
+          for (auto i3 = begin3; i3 < end3; ++i3) {                           \
+            functor(i0, i1, i2, i3, val);                                     \
+          }                                                                   \
+        }                                                                     \
+      }                                                                       \
+    }                                                                         \
+    aval = val;                                                               \
+  }                                                                           \
+                                                                              \
+  template <class ValueType, class Functor>                                   \
+  void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval,    \
+                                      Functor const& afunctor,                \
+                                      OpenACCMDRangeBegin<5> const& begin,    \
+                                      OpenACCMDRangeEnd<5> const& end,        \
+                                      int async_arg) {                        \
+    auto val = aval;                                                          \
+    auto const functor(afunctor);                                             \
+    int begin4 = begin[4];                                                    \
+    int end4   = end[4];                                                      \
+    int begin3 = begin[3];                                                    \
+    int end3   = end[3];                                                      \
+    int begin2 = begin[2];                                                    \
+    int end2   = end[2];                                                      \
+    int begin1 = begin[1];                                                    \
+    int end1   = end[1];                                                      \
+    int begin0 = begin[0];                                                    \
+    int end0   = end[0];                                                      \
+    /* clang-format off */ \
+    KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector collapse(5) reduction(OPERATOR:val) copyin(functor) async(async_arg))                                                  \
+    /* clang-format on */                                                     \
+    for (auto i4 = begin4; i4 < end4; ++i4) {                                 \
+      for (auto i3 = begin3; i3 < end3; ++i3) {                               \
+        for (auto i2 = begin2; i2 < end2; ++i2) {                             \
+          for (auto i1 = begin1; i1 < end1; ++i1) {                           \
+            for (auto i0 = begin0; i0 < end0; ++i0) {                         \
+              functor(i0, i1, i2, i3, i4, val);                               \
+            }                                                                 \
+          }                                                                   \
+        }                                                                     \
+      }                                                                       \
+    }                                                                         \
+    aval = val;                                                               \
+  }                                                                           \
+                                                                              \
+  template <class ValueType, class Functor>                                   \
+  void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval,   \
+                                      Functor const& afunctor,                \
+                                      OpenACCMDRangeBegin<5> const& begin,    \
+                                      OpenACCMDRangeEnd<5> const& end,        \
+                                      int async_arg) {                        \
+    auto val = aval;                                                          \
+    auto const functor(afunctor);                                             \
+    int begin0 = begin[0];                                                    \
+    int end0   = end[0];                                                      \
+    int begin1 = begin[1];                                                    \
+    int end1   = end[1];                                                      \
+    int begin2 = begin[2];                                                    \
+    int end2   = end[2];                                                      \
+    int begin3 = begin[3];                                                    \
+    int end3   = end[3];                                                      \
+    int begin4 = begin[4];                                                    \
+    int end4   = end[4];                                                      \
+    /* clang-format off */ \
+    KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector collapse(5) reduction(OPERATOR:val) copyin(functor) async(async_arg))                                                  \
+    /* clang-format on */                                                     \
+    for (auto i0 = begin0; i0 < end0; ++i0) {                                 \
+      for (auto i1 = begin1; i1 < end1; ++i1) {                               \
+        for (auto i2 = begin2; i2 < end2; ++i2) {                             \
+          for (auto i3 = begin3; i3 < end3; ++i3) {                           \
+            for (auto i4 = begin4; i4 < end4; ++i4) {                         \
+              functor(i0, i1, i2, i3, i4, val);                               \
+            }                                                                 \
+          }                                                                   \
+        }                                                                     \
+      }                                                                       \
+    }                                                                         \
+    aval = val;                                                               \
+  }                                                                           \
+                                                                              \
+  template <class ValueType, class Functor>                                   \
+  void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval,    \
+                                      Functor const& afunctor,                \
+                                      OpenACCMDRangeBegin<6> const& begin,    \
+                                      OpenACCMDRangeEnd<6> const& end,        \
+                                      int async_arg) {                        \
+    auto val = aval;                                                          \
+    auto const functor(afunctor);                                             \
+    int begin5 = begin[5];                                                    \
+    int end5   = end[5];                                                      \
+    int begin4 = begin[4];                                                    \
+    int end4   = end[4];                                                      \
+    int begin3 = begin[3];                                                    \
+    int end3   = end[3];                                                      \
+    int begin2 = begin[2];                                                    \
+    int end2   = end[2];                                                      \
+    int begin1 = begin[1];                                                    \
+    int end1   = end[1];                                                      \
+    int begin0 = begin[0];                                                    \
+    int end0   = end[0];                                                      \
+    /* clang-format off */ \
+    KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector collapse(6) reduction(OPERATOR:val) copyin(functor) async(async_arg))                                                  \
+    /* clang-format on */                                                     \
+    for (auto i5 = begin5; i5 < end5; ++i5) {                                 \
+      for (auto i4 = begin4; i4 < end4; ++i4) {                               \
+        for (auto i3 = begin3; i3 < end3; ++i3) {                             \
+          for (auto i2 = begin2; i2 < end2; ++i2) {                           \
+            for (auto i1 = begin1; i1 < end1; ++i1) {                         \
+              for (auto i0 = begin0; i0 < end0; ++i0) {                       \
+                functor(i0, i1, i2, i3, i4, i5, val);                         \
+              }                                                               \
+            }                                                                 \
+          }                                                                   \
+        }                                                                     \
+      }                                                                       \
+    }                                                                         \
+    aval = val;                                                               \
+  }                                                                           \
+                                                                              \
+  template <class ValueType, class Functor>                                   \
+  void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval,   \
+                                      Functor const& afunctor,                \
+                                      OpenACCMDRangeBegin<6> const& begin,    \
+                                      OpenACCMDRangeEnd<6> const& end,        \
+                                      int async_arg) {                        \
+    auto val = aval;                                                          \
+    auto const functor(afunctor);                                             \
+    int begin0 = begin[0];                                                    \
+    int end0   = end[0];                                                      \
+    int begin1 = begin[1];                                                    \
+    int end1   = end[1];                                                      \
+    int begin2 = begin[2];                                                    \
+    int end2   = end[2];                                                      \
+    int begin3 = begin[3];                                                    \
+    int end3   = end[3];                                                      \
+    int begin4 = begin[4];                                                    \
+    int end4   = end[4];                                                      \
+    int begin5 = begin[5];                                                    \
+    int end5   = end[5];                                                      \
+    /* clang-format off */ \
+    KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector collapse(6) reduction(OPERATOR:val) copyin(functor) async(async_arg))                                                  \
+    /* clang-format on */                                                     \
+    for (auto i0 = begin0; i0 < end0; ++i0) {                                 \
+      for (auto i1 = begin1; i1 < end1; ++i1) {                               \
+        for (auto i2 = begin2; i2 < end2; ++i2) {                             \
+          for (auto i3 = begin3; i3 < end3; ++i3) {                           \
+            for (auto i4 = begin4; i4 < end4; ++i4) {                         \
+              for (auto i5 = begin5; i5 < end5; ++i5) {                       \
+                functor(i0, i1, i2, i3, i4, i5, val);                         \
+              }                                                               \
+            }                                                                 \
+          }                                                                   \
+        }                                                                     \
+      }                                                                       \
+    }                                                                         \
+    aval = val;                                                               \
+  }                                                                           \
+  }  // namespace Kokkos::Experimental::Impl
+
+#define KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_MDRANGE_HELPER(REDUCER, OPERATOR) \
+  KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_DISPATCH_ITERATE(REDUCER, OPERATOR)     \
+  template <class Functor, class Scalar, class Space, class... Traits>        \
+  struct Kokkos::Experimental::Impl::OpenACCParallelReduceMDRangeHelper<      \
+      Functor, Kokkos::REDUCER<Scalar, Space>,                                \
+      Kokkos::MDRangePolicy<Traits...>, true> {                               \
+    using Policy    = MDRangePolicy<Traits...>;                               \
+    using Reducer   = REDUCER<Scalar, Space>;                                 \
+    using ValueType = typename Reducer::value_type;                           \
+                                                                              \
+    OpenACCParallelReduceMDRangeHelper(Functor const& functor,                \
+                                       Reducer const& reducer,                \
+                                       Policy const& policy) {                \
+      ValueType val;                                                          \
+      reducer.init(val);                                                      \
+                                                                              \
+      int const async_arg = policy.space().acc_async_queue();                 \
+                                                                              \
+      OpenACCParallelReduce##REDUCER(                                         \
+          std::integral_constant<Iterate, Policy::inner_direction>(), val,    \
+          functor, policy.m_lower, policy.m_upper, async_arg);                \
+                                                                              \
+      reducer.reference() = val;                                              \
+    }                                                                         \
+  }
+
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_MDRANGE_HELPER(Sum, +);
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_MDRANGE_HELPER(Prod, *);
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_MDRANGE_HELPER(Min, min);
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_MDRANGE_HELPER(Max, max);
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_MDRANGE_HELPER(LAnd, &&);
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_MDRANGE_HELPER(LOr, ||);
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_MDRANGE_HELPER(BAnd, &);
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_MDRANGE_HELPER(BOr, |);
+
+#undef KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_MDRANGE_HELPER
+#undef KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_DISPATCH_ITERATE
+
+#endif
diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp
index 278d9c1d5a68ba8d5db7d9e14bca860331cd1bdf..b61a05a8ee1b20581670ee460d703e74b8804137 100644
--- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp
+++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp
@@ -38,62 +38,76 @@ struct OpenACCParallelReduceHelper {
 
 }  // namespace Kokkos::Experimental::Impl
 
-template <class Functor, class ReducerType, class... Traits>
-class Kokkos::Impl::ParallelReduce<Functor, Kokkos::RangePolicy<Traits...>,
-                                   ReducerType, Kokkos::Experimental::OpenACC> {
-  using Policy = RangePolicy<Traits...>;
-
-  using ReducerConditional =
-      if_c<std::is_same_v<InvalidType, ReducerType>, Functor, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using Analysis =
-      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, ReducerTypeFwd>;
-
-  using Pointer   = typename Analysis::pointer_type;
-  using ValueType = typename Analysis::value_type;
-
-  Functor m_functor;
+template <class CombinedFunctorReducerType, class... Traits>
+class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
+                                   Kokkos::RangePolicy<Traits...>,
+                                   Kokkos::Experimental::OpenACC> {
+  using Policy      = RangePolicy<Traits...>;
+  using FunctorType = typename CombinedFunctorReducerType::functor_type;
+  using ReducerType = typename CombinedFunctorReducerType::reducer_type;
+
+  using Pointer   = typename ReducerType::pointer_type;
+  using ValueType = typename ReducerType::value_type;
+
+  CombinedFunctorReducerType m_functor_reducer;
   Policy m_policy;
-  ReducerType m_reducer;
   Pointer m_result_ptr;
+  bool m_result_ptr_on_device;
 
  public:
-  ParallelReduce(Functor const& functor, Policy const& policy,
-                 ReducerType const& reducer)
-      : m_functor(functor),
-        m_policy(policy),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()) {}
-
   template <class ViewType>
-  ParallelReduce(
-      const Functor& functor, const Policy& policy, const ViewType& result,
-      std::enable_if_t<Kokkos::is_view<ViewType>::value, void*> = nullptr)
-      : m_functor(functor),
+  ParallelReduce(CombinedFunctorReducerType const& functor_reducer,
+                 Policy const& policy, ViewType const& result)
+      : m_functor_reducer(functor_reducer),
         m_policy(policy),
-        m_reducer(InvalidType()),
-        m_result_ptr(result.data()) {}
+        m_result_ptr(result.data()),
+        m_result_ptr_on_device(
+            MemorySpaceAccess<Kokkos::Experimental::OpenACCSpace,
+                              typename ViewType::memory_space>::accessible) {}
 
   void execute() const {
     auto const begin = m_policy.begin();
     auto const end   = m_policy.end();
 
+    ValueType val;
+    ReducerType const& reducer = m_functor_reducer.get_reducer();
+    reducer.init(&val);
+
     if (end <= begin) {
+      if (m_result_ptr_on_device == false) {
+        *m_result_ptr = val;
+      } else {
+        acc_memcpy_to_device(m_result_ptr, &val, sizeof(ValueType));
+      }
       return;
     }
 
-    ValueType val;
-    typename Analysis::Reducer final_reducer(
-        &ReducerConditional::select(m_functor, m_reducer));
-    final_reducer.init(&val);
+    int const async_arg = m_policy.space().acc_async_queue();
 
     Kokkos::Experimental::Impl::OpenACCParallelReduceHelper(
-        Kokkos::Experimental::Impl::FunctorAdapter<Functor, Policy>(m_functor),
-        std::conditional_t<is_reducer_v<ReducerType>, ReducerType,
-                           Sum<ValueType>>(val),
+        Kokkos::Experimental::Impl::FunctorAdapter<
+            FunctorType, Policy,
+            Kokkos::Experimental::Impl::RoutineClause::seq>(
+            m_functor_reducer.get_functor()),
+        std::conditional_t<
+            std::is_same_v<FunctorType, typename ReducerType::functor_type>,
+            Sum<ValueType>, typename ReducerType::functor_type>(val),
         m_policy);
 
-    *m_result_ptr = val;
+    // OpenACC backend supports only built-in Reducer types; thus
+    // reducer.final() below is a no-op.
+    reducer.final(&val);
+    // acc_wait(async_arg) in the below if-else statements is needed because the
+    // above OpenACC compute kernel can be executed asynchronously and val is a
+    // local host variable.
+    if (m_result_ptr_on_device == false) {
+      acc_wait(async_arg);
+      *m_result_ptr = val;
+    } else {
+      acc_memcpy_to_device_async(m_result_ptr, &val, sizeof(ValueType),
+                                 async_arg);
+      acc_wait(async_arg);
+    }
   }
 };
 
diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3223ce3f9afa36ba6ebf5e529686bf3ef8a5d6f0
--- /dev/null
+++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp
@@ -0,0 +1,449 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_OPENACC_PARALLEL_REDUCE_TEAM_HPP
+#define KOKKOS_OPENACC_PARALLEL_REDUCE_TEAM_HPP
+
+#include <OpenACC/Kokkos_OpenACC_Team.hpp>
+#include <OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp>
+#include <OpenACC/Kokkos_OpenACC_Macros.hpp>
+
+#ifdef KOKKOS_ENABLE_OPENACC_COLLAPSE_HIERARCHICAL_CONSTRUCTS
+#define KOKKOS_IMPL_OPENACC_LOOP_CLAUSE \
+  Kokkos::Experimental::Impl::RoutineClause::seq
+#else
+#define KOKKOS_IMPL_OPENACC_LOOP_CLAUSE \
+  Kokkos::Experimental::Impl::RoutineClause::worker
+#endif
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+// Hierarchical Parallelism -> Team level implementation
+namespace Kokkos::Experimental::Impl {
+
+// primary template: catch-all non-implemented custom reducers
+template <class Functor, class Reducer, class Policy,
+          bool = std::is_arithmetic_v<typename Reducer::value_type>>
+struct OpenACCParallelReduceTeamHelper {
+  OpenACCParallelReduceTeamHelper(Functor const&, Reducer const&,
+                                  Policy const&) {
+    static_assert(!Kokkos::Impl::always_true<Functor>::value,
+                  "not implemented");
+  }
+};
+
+}  // namespace Kokkos::Experimental::Impl
+
+template <class CombinedFunctorReducerType, class... Properties>
+class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
+                                   Kokkos::TeamPolicy<Properties...>,
+                                   Kokkos::Experimental::OpenACC> {
+ private:
+  using Policy =
+      TeamPolicyInternal<Kokkos::Experimental::OpenACC, Properties...>;
+  using FunctorType = typename CombinedFunctorReducerType::functor_type;
+  using ReducerType = typename CombinedFunctorReducerType::reducer_type;
+
+  using value_type   = typename ReducerType::value_type;
+  using pointer_type = typename ReducerType::pointer_type;
+
+  CombinedFunctorReducerType m_functor_reducer;
+  Policy m_policy;
+  pointer_type m_result_ptr;
+  bool m_result_ptr_on_device;
+
+ public:
+  void execute() const {
+    auto league_size   = m_policy.league_size();
+    auto team_size     = m_policy.team_size();
+    auto vector_length = m_policy.impl_vector_length();
+
+    int const async_arg = m_policy.space().acc_async_queue();
+    value_type val;
+    const ReducerType& reducer = m_functor_reducer.get_reducer();
+    reducer.init(&val);
+    if (league_size <= 0) {
+      if (m_result_ptr_on_device == false) {
+        *m_result_ptr = val;
+      } else {
+        acc_memcpy_to_device(m_result_ptr, &val, sizeof(value_type));
+      }
+      return;
+    }
+
+    Kokkos::Experimental::Impl::OpenACCParallelReduceTeamHelper(
+        Kokkos::Experimental::Impl::FunctorAdapter<
+            FunctorType, Policy, KOKKOS_IMPL_OPENACC_LOOP_CLAUSE>(
+            m_functor_reducer.get_functor()),
+        std::conditional_t<
+            std::is_same_v<FunctorType, typename ReducerType::functor_type>,
+            Sum<value_type>, typename ReducerType::functor_type>(val),
+        m_policy);
+
+    // OpenACC backend supports only built-in Reducer types; thus
+    // reducer.final() below is a no-op.
+    reducer.final(&val);
+    // acc_wait(async_arg) in the below if-else statements is needed because the
+    // above OpenACC compute kernel can be executed asynchronously and val is a
+    // local host variable.
+    if (m_result_ptr_on_device == false) {
+      acc_wait(async_arg);
+      *m_result_ptr = val;
+    } else {
+      acc_memcpy_to_device_async(m_result_ptr, &val, sizeof(value_type),
+                                 async_arg);
+      acc_wait(async_arg);
+    }
+  }
+
+  template <class ViewType>
+  ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer,
+                 const Policy& arg_policy, const ViewType& arg_result_view)
+      : m_functor_reducer(arg_functor_reducer),
+        m_policy(arg_policy),
+        m_result_ptr(arg_result_view.data()),
+        m_result_ptr_on_device(
+            MemorySpaceAccess<Kokkos::Experimental::OpenACCSpace,
+                              typename ViewType::memory_space>::accessible) {}
+};
+
+namespace Kokkos {
+
+// Hierarchical Parallelism -> Team thread level implementation
+// FIXME_OPENACC: custom reduction is not implemented.
+template <typename iType, class Lambda, typename ValueType, class JoinType>
+KOKKOS_INLINE_FUNCTION void parallel_reduce(
+    const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::OpenACCTeamMember>&
+        loop_boundaries,
+    const Lambda& lambda, const JoinType& join, ValueType& init_result) {
+  static_assert(!Kokkos::Impl::always_true<Lambda>::value,
+                "custom reduction is not implemented");
+}
+
+// Hierarchical Parallelism -> Thread vector level implementation
+// FIXME_OPENACC: custom reduction is not implemented.
+template <typename iType, class Lambda, typename ValueType, class JoinType>
+KOKKOS_INLINE_FUNCTION void parallel_reduce(
+    const Impl::ThreadVectorRangeBoundariesStruct<
+        iType, Impl::OpenACCTeamMember>& loop_boundaries,
+    const Lambda& lambda, const JoinType& join, ValueType& init_result) {
+  static_assert(!Kokkos::Impl::always_true<Lambda>::value,
+                "custom reduction is not implemented");
+}
+
+}  // namespace Kokkos
+
+#ifdef KOKKOS_ENABLE_OPENACC_COLLAPSE_HIERARCHICAL_CONSTRUCTS
+
+#define KOKKOS_IMPL_ACC_REDUCE_TEAM_PRAGMA \
+  vector vector_length(team_size* vector_length)
+#define KOKKOS_IMPL_ACC_REDUCE_TEAM_ITRS league_size* team_size* vector_length
+#define KOKKOS_IMPL_ACC_REDUCE_TEAM_LEAGUE_ID_INIT \
+  i / (team_size * vector_length)
+
+namespace Kokkos {
+
+// Hierarchical Parallelism -> Team thread level implementation
+#pragma acc routine seq
+template <typename iType, class Lambda, typename ValueType>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<!Kokkos::is_reducer_v<ValueType>>
+parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
+                    iType, Impl::OpenACCTeamMember>& loop_boundaries,
+                const Lambda& lambda, ValueType& result) {
+  ValueType tmp = ValueType();
+  iType j_start =
+      loop_boundaries.team.team_rank() / loop_boundaries.team.vector_length();
+  if (j_start == 0) {
+#pragma acc loop seq
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++)
+      lambda(i, tmp);
+    result = tmp;
+  }
+}
+
+#pragma acc routine seq
+template <typename iType, class Lambda, typename ReducerType>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer_v<ReducerType>>
+parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
+                    iType, Impl::OpenACCTeamMember>& loop_boundaries,
+                const Lambda& lambda, const ReducerType& reducer) {
+  using ValueType = typename ReducerType::value_type;
+  ValueType tmp;
+  reducer.init(tmp);
+  iType j_start =
+      loop_boundaries.team.team_rank() / loop_boundaries.team.vector_length();
+  if (j_start == 0) {
+#pragma acc loop seq
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++)
+      lambda(i, tmp);
+    reducer.reference() = tmp;
+  }
+}
+
+// Hierarchical Parallelism -> Thread vector level implementation
+#pragma acc routine seq
+template <typename iType, class Lambda, typename ValueType>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<!Kokkos::is_reducer_v<ValueType>>
+parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<
+                    iType, Impl::OpenACCTeamMember>& loop_boundaries,
+                const Lambda& lambda, ValueType& result) {
+  ValueType tmp = ValueType();
+  iType j_start =
+      loop_boundaries.team.team_rank() % loop_boundaries.team.vector_length();
+  if (j_start == 0) {
+#pragma acc loop seq
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+      lambda(i, tmp);
+    }
+    result = tmp;
+  }
+}
+
+#pragma acc routine seq
+template <typename iType, class Lambda, typename ReducerType>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer_v<ReducerType>>
+parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<
+                    iType, Impl::OpenACCTeamMember>& loop_boundaries,
+                const Lambda& lambda, const ReducerType& reducer) {
+  using ValueType = typename ReducerType::value_type;
+  ValueType tmp;
+  reducer.init(tmp);
+  iType j_start =
+      loop_boundaries.team.team_rank() % loop_boundaries.team.vector_length();
+  if (j_start == 0) {
+#pragma acc loop seq
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+      lambda(i, tmp);
+    }
+    reducer.reference() = tmp;
+  }
+}
+
+// Hierarchical Parallelism -> Team vector level implementation
+#pragma acc routine seq
+template <typename iType, class Lambda, typename ValueType>
+KOKKOS_INLINE_FUNCTION void parallel_reduce(
+    const Impl::TeamVectorRangeBoundariesStruct<iType, Impl::OpenACCTeamMember>&
+        loop_boundaries,
+    const Lambda& lambda, ValueType& result) {
+  ValueType tmp = ValueType();
+  iType j_start =
+      loop_boundaries.team.team_rank() % loop_boundaries.team.vector_length();
+  if (j_start == 0) {
+#pragma acc loop seq
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+      lambda(i, tmp);
+    }
+    result = tmp;
+  }
+}
+
+}  // namespace Kokkos
+
+#else /* #ifdef KOKKOS_ENABLE_OPENACC_COLLAPSE_HIERARCHICAL_CONSTRUCTS */
+
+#define KOKKOS_IMPL_ACC_REDUCE_TEAM_PRAGMA \
+  num_workers(team_size) vector_length(vector_length)
+#define KOKKOS_IMPL_ACC_REDUCE_TEAM_ITRS league_size
+#define KOKKOS_IMPL_ACC_REDUCE_TEAM_LEAGUE_ID_INIT i
+
+// FIXME_OPENACC: below implementation conforms to the OpenACC standard, but
+// the NVHPC compiler (V22.11) fails due to the lack of support for lambda
+// expressions containing parallel loops.
+
+namespace Kokkos {
+
+// Hierarchical Parallelism -> Team thread level implementation
+#pragma acc routine worker
+template <typename iType, class Lambda, typename ValueType>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<!Kokkos::is_reducer_v<ValueType>>
+parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
+                    iType, Impl::OpenACCTeamMember>& loop_boundaries,
+                const Lambda& lambda, ValueType& result) {
+  ValueType tmp = ValueType();
+#pragma acc loop worker reduction(+ : tmp)
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++)
+    lambda(i, tmp);
+  result = tmp;
+}
+
+#define KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_THREAD(REDUCER, OPERATOR)   \
+  KOKKOS_IMPL_ACC_PRAGMA(routine worker)                                     \
+  template <typename iType, class Lambda, class Scalar, class Space>         \
+  KOKKOS_INLINE_FUNCTION                                                     \
+      std::enable_if_t<Kokkos::is_reducer_v<Kokkos::REDUCER<Scalar, Space>>> \
+      parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<           \
+                          iType, Impl::OpenACCTeamMember>& loop_boundaries,  \
+                      const Lambda& lambda,                                  \
+                      const Kokkos::REDUCER<Scalar, Space>& reducer) {       \
+    using ValueType = typename Kokkos::REDUCER<Scalar, Space>::value_type;   \
+    ValueType tmp   = ValueType();                                           \
+    reducer.init(tmp);                                                       \
+    KOKKOS_IMPL_ACC_PRAGMA(loop worker reduction(OPERATOR : tmp))            \
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++)      \
+      lambda(i, tmp);                                                        \
+    reducer.reference() = tmp;                                               \
+  }
+
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_THREAD(Sum, +);
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_THREAD(Prod, *);
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_THREAD(Min, min);
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_THREAD(Max, max);
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_THREAD(LAnd, &&);
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_THREAD(LOr, ||);
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_THREAD(BAnd, &);
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_THREAD(BOr, |);
+
+// Hierarchical Parallelism -> Thread vector level implementation
+#pragma acc routine vector
+template <typename iType, class Lambda, typename ValueType>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<!Kokkos::is_reducer_v<ValueType>>
+parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<
+                    iType, Impl::OpenACCTeamMember>& loop_boundaries,
+                const Lambda& lambda, ValueType& result) {
+  ValueType tmp = ValueType();
+#pragma acc loop vector reduction(+ : tmp)
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+    lambda(i, tmp);
+  }
+  result = tmp;
+}
+
+#define KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_THREAD_VECTOR(REDUCER, OPERATOR) \
+  KOKKOS_IMPL_ACC_PRAGMA(routine vector)                                     \
+  template <typename iType, class Lambda, class Scalar, class Space>         \
+  KOKKOS_INLINE_FUNCTION                                                     \
+      std::enable_if_t<Kokkos::is_reducer_v<Kokkos::REDUCER<Scalar, Space>>> \
+      parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<         \
+                          iType, Impl::OpenACCTeamMember>& loop_boundaries,  \
+                      const Lambda& lambda,                                  \
+                      const Kokkos::REDUCER<Scalar, Space>& reducer) {       \
+    using ValueType = typename Kokkos::REDUCER<Scalar, Space>::value_type;   \
+    ValueType tmp;                                                           \
+    reducer.init(tmp);                                                       \
+    KOKKOS_IMPL_ACC_PRAGMA(loop vector reduction(OPERATOR : tmp))            \
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {    \
+      lambda(i, tmp);                                                        \
+    }                                                                        \
+    reducer.reference() = tmp;                                               \
+  }
+
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_THREAD_VECTOR(Sum, +);
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_THREAD_VECTOR(Prod, *);
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_THREAD_VECTOR(Min, min);
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_THREAD_VECTOR(Max, max);
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_THREAD_VECTOR(LAnd, &&);
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_THREAD_VECTOR(LOr, ||);
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_THREAD_VECTOR(BAnd, &);
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_THREAD_VECTOR(BOr, |);
+
+// Hierarchical Parallelism -> Team vector level implementation
+#pragma acc routine vector
+template <typename iType, class Lambda, typename ValueType>
+KOKKOS_INLINE_FUNCTION void parallel_reduce(
+    const Impl::TeamVectorRangeBoundariesStruct<iType, Impl::OpenACCTeamMember>&
+        loop_boundaries,
+    const Lambda& lambda, ValueType& result) {
+  ValueType tmp = ValueType();
+#pragma acc loop vector reduction(+ : tmp)
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+    lambda(i, tmp);
+  }
+  result = tmp;
+}
+
+}  // namespace Kokkos
+
+#undef KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_THREAD
+#undef KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_THREAD_VECTOR
+
+#endif /* #ifdef KOKKOS_ENABLE_OPENACC_COLLAPSE_HIERARCHICAL_CONSTRUCTS */
+
+#define KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_DISPATCH_SCHEDULE(REDUCER,     \
+                                                              OPERATOR)    \
+  namespace Kokkos::Experimental::Impl {                                   \
+  template <class Policy, class ValueType, class Functor>                  \
+  void OpenACCParallelReduceTeam##REDUCER(Policy const policy,             \
+                                          ValueType& aval,                 \
+                                          Functor const& afunctor,         \
+                                          int async_arg) {                 \
+    auto const functor       = afunctor;                                   \
+    auto val                 = aval;                                       \
+    auto const league_size   = policy.league_size();                       \
+    auto const team_size     = policy.team_size();                         \
+    auto const vector_length = policy.impl_vector_length();                \
+    /* clang-format off */ \
+    KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang num_gangs(league_size) KOKKOS_IMPL_ACC_REDUCE_TEAM_PRAGMA reduction(OPERATOR : val) copyin(functor) async(async_arg))                                               \
+    /* clang-format on */                                                  \
+    for (int i = 0; i < KOKKOS_IMPL_ACC_REDUCE_TEAM_ITRS; i++) {           \
+      int league_id = KOKKOS_IMPL_ACC_REDUCE_TEAM_LEAGUE_ID_INIT;          \
+      typename Policy::member_type team(league_id, league_size, team_size, \
+                                        vector_length);                    \
+      functor(team, val);                                                  \
+    }                                                                      \
+    aval = val;                                                            \
+  }                                                                        \
+  }  // namespace Kokkos::Experimental::Impl
+
+#define KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_HELPER(REDUCER, OPERATOR) \
+  KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_DISPATCH_SCHEDULE(REDUCER, OPERATOR) \
+                                                                           \
+  template <class Functor, class Scalar, class Space, class... Traits>     \
+  struct Kokkos::Experimental::Impl::OpenACCParallelReduceTeamHelper<      \
+      Functor, Kokkos::REDUCER<Scalar, Space>,                             \
+      Kokkos::Impl::TeamPolicyInternal<Traits...>, true> {                 \
+    using Policy    = Kokkos::Impl::TeamPolicyInternal<Traits...>;         \
+    using Reducer   = REDUCER<Scalar, Space>;                              \
+    using ValueType = typename Reducer::value_type;                        \
+                                                                           \
+    OpenACCParallelReduceTeamHelper(Functor const& functor,                \
+                                    Reducer const& reducer,                \
+                                    Policy const& policy) {                \
+      auto league_size   = policy.league_size();                           \
+      auto team_size     = policy.team_size();                             \
+      auto vector_length = policy.impl_vector_length();                    \
+                                                                           \
+      if (league_size <= 0) {                                              \
+        return;                                                            \
+      }                                                                    \
+                                                                           \
+      ValueType val;                                                       \
+      reducer.init(val);                                                   \
+                                                                           \
+      int const async_arg = policy.space().acc_async_queue();              \
+                                                                           \
+      OpenACCParallelReduceTeam##REDUCER(policy, val, functor, async_arg); \
+                                                                           \
+      reducer.reference() = val;                                           \
+    }                                                                      \
+  }
+
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_HELPER(Sum, +);
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_HELPER(Prod, *);
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_HELPER(Min, min);
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_HELPER(Max, max);
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_HELPER(LAnd, &&);
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_HELPER(LOr, ||);
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_HELPER(BAnd, &);
+KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_HELPER(BOr, |);
+
+#undef KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_TEAM_HELPER
+#undef KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_DISPATCH_SCHEDULE
+#undef KOKKOS_IMPL_ACC_REDUCE_TEAM_PRAGMA
+#undef KOKKOS_IMPL_ACC_REDUCE_TEAM_ITRS
+#undef KOKKOS_IMPL_ACC_REDUCE_TEAM_LEAGUE_ID_INIT
+
+#endif /* #ifndef KOKKOS_OPENACC_PARALLEL_REDUCE_TEAM_HPP */
diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c6d3267bdb0a0ec53ef540d8d383e645460373f4
--- /dev/null
+++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp
@@ -0,0 +1,318 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_OPENACC_PARALLEL_SCAN_RANGE_HPP
+#define KOKKOS_OPENACC_PARALLEL_SCAN_RANGE_HPP
+
+#include <OpenACC/Kokkos_OpenACC.hpp>
+#include <OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp>
+#include <OpenACC/Kokkos_OpenACC_Macros.hpp>
+#include <Kokkos_Parallel.hpp>
+
+// Clacc uses an alternative implementation to work around not-yet-implemented
+// OpenACC features: Clacc does not fully support private clauses for
+// gang-private variables, and the alternative implementation allocates
+// the gang-private arrays on GPU global memory using array expansion,
+// instead of using the private clause.
+/* clang-format off */
+#ifdef KOKKOS_COMPILER_CLANG
+#define KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(THREADID) \
+  element_values[team_id * 2 * chunk_size + THREADID]
+#define KOKKOS_IMPL_ACC_ELEMENT_VALUES_CLAUSE create(element_values [0:num_elements])
+#else
+#define KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(THREADID) element_values[THREADID]
+#define KOKKOS_IMPL_ACC_ELEMENT_VALUES_CLAUSE private(element_values [0:num_elements])
+#endif
+/* clang-format on */
+
+namespace Kokkos::Impl {
+
+template <class Functor, class GivenValueType, class... Traits>
+class ParallelScanOpenACCBase {
+ protected:
+  using Policy = Kokkos::RangePolicy<Traits...>;
+  using Analysis =
+      Kokkos::Impl::FunctorAnalysis<Kokkos::Impl::FunctorPatternInterface::SCAN,
+                                    Policy, Functor, GivenValueType>;
+  using PointerType = typename Analysis::pointer_type;
+  using ValueType   = typename Analysis::value_type;
+  using MemberType  = typename Policy::member_type;
+  using IndexType   = typename Policy::index_type;
+  Functor m_functor;
+  Policy m_policy;
+  ValueType* m_result_ptr;
+  bool m_result_ptr_device_accessible;
+  static constexpr MemberType default_scan_chunk_size = 128;
+
+ public:
+  ParallelScanOpenACCBase(Functor const& arg_functor, Policy const& arg_policy,
+                          ValueType* arg_result_ptr,
+                          bool arg_result_ptr_device_accessible)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_result_ptr(arg_result_ptr),
+        m_result_ptr_device_accessible(arg_result_ptr_device_accessible) {}
+
+  // This function implements the parallel scan alogithm based on the parallel
+  // prefix sum algorithm proposed by Hillis and Steele (doi:10.1145/7902.7903),
+  // which offers a shorter span and more parallelism but may not be
+  // work-efficient.
+  void OpenACCParallelScanRangePolicy(const IndexType begin,
+                                      const IndexType end, IndexType chunk_size,
+                                      const int async_arg) const {
+    if (chunk_size > 1) {
+      if (!Impl::is_integral_power_of_two(chunk_size))
+        Kokkos::abort(
+            "RangePolicy blocking granularity must be power of two to be used "
+            "with OpenACC parallel_scan()");
+    } else {
+      chunk_size = default_scan_chunk_size;
+    }
+    const Kokkos::Experimental::Impl::FunctorAdapter<
+        Functor, Policy, Kokkos::Experimental::Impl::RoutineClause::seq>
+        functor(m_functor);
+    const IndexType N        = end - begin;
+    const IndexType n_chunks = (N + chunk_size - 1) / chunk_size;
+#ifdef KOKKOS_COMPILER_CLANG
+    int const num_elements = n_chunks * 2 * chunk_size;
+#else
+    int const num_elements = 2 * chunk_size;
+#endif
+    Kokkos::View<ValueType*, Kokkos::Experimental::OpenACCSpace> chunk_values(
+        "Kokkos::OpenACCParallelScan::chunk_values", n_chunks);
+    Kokkos::View<ValueType*, Kokkos::Experimental::OpenACCSpace> offset_values(
+        "Kokkos::OpenACCParallelScan::offset_values", n_chunks);
+    Kokkos::View<ValueType, Kokkos::Experimental::OpenACCSpace> m_result_total(
+        "Kokkos::OpenACCParallelScan::m_result_total");
+    std::unique_ptr<ValueType[]> element_values_owner(
+        new ValueType[num_elements]);
+    ValueType* element_values = element_values_owner.get();
+    typename Analysis::Reducer final_reducer(m_functor);
+
+#pragma acc enter data copyin(functor, final_reducer) \
+    copyin(chunk_values, offset_values) async(async_arg)
+
+    /* clang-format off */
+KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector_length(chunk_size) KOKKOS_IMPL_ACC_ELEMENT_VALUES_CLAUSE present(functor, chunk_values, final_reducer) async(async_arg))
+    /* clang-format on */
+    for (IndexType team_id = 0; team_id < n_chunks; ++team_id) {
+      IndexType current_step = 0;
+      IndexType next_step    = 1;
+      IndexType temp;
+#pragma acc loop vector
+      for (IndexType thread_id = 0; thread_id < chunk_size; ++thread_id) {
+        const IndexType local_offset = team_id * chunk_size;
+        const IndexType idx          = local_offset + thread_id;
+        ValueType update;
+        final_reducer.init(&update);
+        if ((idx > 0) && (idx < N)) functor(idx - 1, update, false);
+        KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(thread_id) = update;
+      }
+      for (IndexType step_size = 1; step_size < chunk_size; step_size *= 2) {
+#pragma acc loop vector
+        for (IndexType thread_id = 0; thread_id < chunk_size; ++thread_id) {
+          if (thread_id < step_size) {
+            KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(next_step * chunk_size +
+                                            thread_id) =
+                KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(current_step * chunk_size +
+                                                thread_id);
+          } else {
+            ValueType localValue = KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(
+                current_step * chunk_size + thread_id);
+            final_reducer.join(&localValue, &KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(
+                                                current_step * chunk_size +
+                                                thread_id - step_size));
+            KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(next_step * chunk_size +
+                                            thread_id) = localValue;
+          }
+        }
+        temp         = current_step;
+        current_step = next_step;
+        next_step    = temp;
+      }
+      chunk_values(team_id) = KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(
+          current_step * chunk_size + chunk_size - 1);
+    }
+
+    ValueType tempValue;
+#pragma acc parallel loop seq num_gangs(1) num_workers(1) vector_length(1) \
+    present(chunk_values, offset_values, final_reducer) async(async_arg)
+    for (IndexType team_id = 0; team_id < n_chunks; ++team_id) {
+      if (team_id == 0) {
+        final_reducer.init(&offset_values(0));
+        final_reducer.init(&tempValue);
+      } else {
+        final_reducer.join(&tempValue, &chunk_values(team_id - 1));
+        offset_values(team_id) = tempValue;
+      }
+    }
+
+    /* clang-format off */
+KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector_length(chunk_size) KOKKOS_IMPL_ACC_ELEMENT_VALUES_CLAUSE present(functor, offset_values, final_reducer) copyin(m_result_total) async(async_arg))
+    /* clang-format on */
+    for (IndexType team_id = 0; team_id < n_chunks; ++team_id) {
+      IndexType current_step = 0;
+      IndexType next_step    = 1;
+      IndexType temp;
+#pragma acc loop vector
+      for (IndexType thread_id = 0; thread_id < chunk_size; ++thread_id) {
+        const IndexType local_offset = team_id * chunk_size;
+        const IndexType idx          = local_offset + thread_id;
+        ValueType update;
+        final_reducer.init(&update);
+        if (thread_id == 0) {
+          final_reducer.join(&update, &offset_values(team_id));
+        }
+        if ((idx > 0) && (idx < N)) functor(idx - 1, update, false);
+        KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(thread_id) = update;
+      }
+      for (IndexType step_size = 1; step_size < chunk_size; step_size *= 2) {
+#pragma acc loop vector
+        for (IndexType thread_id = 0; thread_id < chunk_size; ++thread_id) {
+          if (thread_id < step_size) {
+            KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(next_step * chunk_size +
+                                            thread_id) =
+                KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(current_step * chunk_size +
+                                                thread_id);
+          } else {
+            ValueType localValue = KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(
+                current_step * chunk_size + thread_id);
+            final_reducer.join(&localValue, &KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(
+                                                current_step * chunk_size +
+                                                thread_id - step_size));
+            KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(next_step * chunk_size +
+                                            thread_id) = localValue;
+          }
+        }
+        temp         = current_step;
+        current_step = next_step;
+        next_step    = temp;
+      }
+#pragma acc loop vector
+      for (IndexType thread_id = 0; thread_id < chunk_size; ++thread_id) {
+        const IndexType local_offset = team_id * chunk_size;
+        const IndexType idx          = local_offset + thread_id;
+        ValueType update             = KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(
+            current_step * chunk_size + thread_id);
+        if (idx < N) functor(idx, update, true);
+        if (idx == N - 1) {
+          if (m_result_ptr_device_accessible) {
+            *m_result_ptr = update;
+          } else {
+            m_result_total() = update;
+          }
+        }
+      }
+    }
+    if (!m_result_ptr_device_accessible && m_result_ptr != nullptr) {
+      DeepCopy<HostSpace, Kokkos::Experimental::OpenACCSpace,
+               Kokkos::Experimental::OpenACC>(m_policy.space(), m_result_ptr,
+                                              m_result_total.data(),
+                                              sizeof(ValueType));
+    }
+
+#pragma acc exit data delete (functor, chunk_values, offset_values, \
+                              final_reducer)async(async_arg)
+    acc_wait(async_arg);
+  }
+
+  void execute() const {
+    const IndexType begin = m_policy.begin();
+    const IndexType end   = m_policy.end();
+    IndexType chunk_size  = m_policy.chunk_size();
+
+    if (end <= begin) {
+      if (!m_result_ptr_device_accessible && m_result_ptr != nullptr) {
+        *m_result_ptr = 0;
+      }
+      return;
+    }
+
+    int const async_arg = m_policy.space().acc_async_queue();
+
+    OpenACCParallelScanRangePolicy(begin, end, chunk_size, async_arg);
+  }
+};
+
+}  // namespace Kokkos::Impl
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+template <class Functor, class... Traits>
+class Kokkos::Impl::ParallelScan<Functor, Kokkos::RangePolicy<Traits...>,
+                                 Kokkos::Experimental::OpenACC>
+    : public ParallelScanOpenACCBase<Functor, void, Traits...> {
+  using base_t    = ParallelScanOpenACCBase<Functor, void, Traits...>;
+  using IndexType = typename base_t::IndexType;
+
+ public:
+  void execute() const {
+    const IndexType begin = base_t::m_policy.begin();
+    const IndexType end   = base_t::m_policy.end();
+    IndexType chunk_size  = base_t::m_policy.chunk_size();
+
+    int const async_arg = base_t::m_policy.space().acc_async_queue();
+
+    base_t::OpenACCParallelScanRangePolicy(begin, end, chunk_size, async_arg);
+  }
+
+  ParallelScan(const Functor& arg_functor,
+               const typename base_t::Policy& arg_policy)
+      : base_t(arg_functor, arg_policy, nullptr, false) {}
+};
+
+template <class FunctorType, class ReturnType, class... Traits>
+class Kokkos::Impl::ParallelScanWithTotal<
+    FunctorType, Kokkos::RangePolicy<Traits...>, ReturnType,
+    Kokkos::Experimental::OpenACC>
+    : public ParallelScanOpenACCBase<FunctorType, ReturnType, Traits...> {
+  using base_t    = ParallelScanOpenACCBase<FunctorType, ReturnType, Traits...>;
+  using IndexType = typename base_t::IndexType;
+
+ public:
+  void execute() const {
+    const IndexType begin = base_t::m_policy.begin();
+    const IndexType end   = base_t::m_policy.end();
+    IndexType chunk_size  = base_t::m_policy.chunk_size();
+
+    if (end <= begin) {
+      if (!base_t::m_result_ptr_device_accessible &&
+          base_t::m_result_ptr != nullptr) {
+        *base_t::m_result_ptr = 0;
+      }
+      return;
+    }
+
+    int const async_arg = base_t::m_policy.space().acc_async_queue();
+
+    base_t::OpenACCParallelScanRangePolicy(begin, end, chunk_size, async_arg);
+  }
+
+  template <class ViewType>
+  ParallelScanWithTotal(const FunctorType& arg_functor,
+                        const typename base_t::Policy& arg_policy,
+                        const ViewType& arg_result_view)
+      : base_t(arg_functor, arg_policy, arg_result_view.data(),
+               MemorySpaceAccess<Kokkos::Experimental::OpenACCSpace,
+                                 typename ViewType::memory_space>::accessible) {
+  }
+};
+
+#undef KOKKOS_IMPL_ACC_ACCESS_ELEMENTS
+#undef KOKKOS_IMPL_ACC_ELEMENT_VALUES_CLAUSE
+
+#endif
diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Team.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Team.hpp
index 3a46f2c483440f85a94a6ed6dc12c37aa402c8a3..4ec71f56ef662fb3a4cc7488bc28a65957eedb76 100644
--- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Team.hpp
+++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Team.hpp
@@ -82,7 +82,7 @@ class OpenACCTeamMember {
   // FIXME_OPENACC: team_broadcast() is not implemented.
   template <class ValueType>
   KOKKOS_FUNCTION void team_broadcast(ValueType& value, int thread_id) const {
-    static_assert(Kokkos::Impl::always_false<ValueType>::value,
+    static_assert(!Kokkos::Impl::always_true<ValueType>::value,
                   "Kokkos Error: team_broadcast() is not implemented for the "
                   "OpenACC backend");
     return ValueType();
@@ -99,7 +99,7 @@ class OpenACCTeamMember {
   template <class ValueType, class JoinOp>
   KOKKOS_FUNCTION ValueType team_reduce(const ValueType& value,
                                         const JoinOp& op_in) const {
-    static_assert(Kokkos::Impl::always_false<ValueType>::value,
+    static_assert(!Kokkos::Impl::always_true<ValueType>::value,
                   "Kokkos Error: team_reduce() is not implemented for the "
                   "OpenACC backend");
     return ValueType();
@@ -110,7 +110,7 @@ class OpenACCTeamMember {
   KOKKOS_FUNCTION ArgType team_scan(const ArgType& /*value*/,
                                     ArgType* const /*global_accum*/) const {
     static_assert(
-        Kokkos::Impl::always_false<ArgType>::value,
+        !Kokkos::Impl::always_true<ArgType>::value,
         "Kokkos Error: team_scan() is not implemented for the OpenACC backend");
     return ArgType();
   }
@@ -163,37 +163,37 @@ class TeamPolicyInternal<Kokkos::Experimental::OpenACC, Properties...>
   // implementations.
   template <class FunctorType>
   static int team_size_max(const FunctorType&, const ParallelForTag&) {
-    return DEFAULT_TEAM_SIZE_MAX;
+    return default_team_size_max;
   }
 
   template <class FunctorType>
   static int team_size_max(const FunctorType&, const ParallelReduceTag&) {
-    return DEFAULT_TEAM_SIZE_MAX;
+    return default_team_size_max;
   }
 
   template <class FunctorType, class ReducerType>
   static int team_size_max(const FunctorType&, const ReducerType&,
                            const ParallelReduceTag&) {
-    return DEFAULT_TEAM_SIZE_MAX;
+    return default_team_size_max;
   }
 
   // FIXME_OPENACC: update team_size_recommended() APIs with realistic
   // implementations.
   template <class FunctorType>
   static int team_size_recommended(const FunctorType&, const ParallelForTag&) {
-    return DEFAULT_TEAM_SIZE_REC;
+    return default_team_size;
   }
 
   template <class FunctorType>
   static int team_size_recommended(const FunctorType&,
                                    const ParallelReduceTag&) {
-    return DEFAULT_TEAM_SIZE_REC;
+    return default_team_size;
   }
 
   template <class FunctorType, class ReducerType>
   static int team_size_recommended(const FunctorType&, const ReducerType&,
                                    const ParallelReduceTag&) {
-    return DEFAULT_TEAM_SIZE_REC;
+    return default_team_size;
   }
 
   //----------------------------------------
@@ -208,7 +208,9 @@ class TeamPolicyInternal<Kokkos::Experimental::OpenACC, Properties...>
   std::array<size_t, 2> m_thread_scratch_size;
   bool m_tune_team_size;
   bool m_tune_vector_length;
-  constexpr static const size_t default_team_size =
+  constexpr static int default_team_size_max =
+      OpenACCTeamMember::DEFAULT_TEAM_SIZE_MAX;
+  constexpr static int default_team_size =
       OpenACCTeamMember::DEFAULT_TEAM_SIZE_REC;
   int m_chunk_size;
 
@@ -226,8 +228,8 @@ class TeamPolicyInternal<Kokkos::Experimental::OpenACC, Properties...>
  public:
   bool impl_auto_team_size() const { return m_tune_team_size; }
   bool impl_auto_vector_length() const { return m_tune_vector_length; }
-  void impl_set_team_size(const size_t size) { m_team_size = size; }
-  void impl_set_vector_length(const size_t length) {
+  void impl_set_team_size(const int size) { m_team_size = size; }
+  void impl_set_vector_length(const int length) {
     m_tune_vector_length = length;
   }
   int impl_vector_length() const { return m_vector_length; }
@@ -348,7 +350,7 @@ class TeamPolicyInternal<Kokkos::Experimental::OpenACC, Properties...>
         m_chunk_size(0) {
     init(league_size_request, team_size_request, 1);
   }
-  static size_t vector_length_max() {
+  static int vector_length_max() {
     return 32; /* TODO: this is bad. Need logic that is compiler and backend
                   aware */
   }
diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
index c8a6dfec6f71d901f7108b020ed691be5795d9ab..faa50aa7c388eb247913d12a9c6f2c8d676d4e05 100644
--- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
+++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
@@ -22,10 +22,12 @@
 namespace Kokkos::Experimental::Impl {
 
 struct OpenACC_Traits {
-#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
-    defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)
+#if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU)
   static constexpr acc_device_t dev_type     = acc_device_nvidia;
   static constexpr bool may_fallback_to_host = false;
+#elif defined(KOKKOS_ARCH_AMD_GPU)
+  static constexpr acc_device_t dev_type     = acc_device_radeon;
+  static constexpr bool may_fallback_to_host = false;
 #else
   static constexpr acc_device_t dev_type     = acc_device_not_host;
   static constexpr bool may_fallback_to_host = true;
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.cpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9a169a435c7380a226b7f157ad399ac2a3340665
--- /dev/null
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.cpp
@@ -0,0 +1,120 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
+#include <iostream>
+
+#include <OpenMP/Kokkos_OpenMP.hpp>
+#include <OpenMP/Kokkos_OpenMP_Instance.hpp>
+
+#include <impl/Kokkos_ExecSpaceManager.hpp>
+
+namespace Kokkos {
+
+OpenMP::OpenMP()
+    : m_space_instance(&Impl::OpenMPInternal::singleton(),
+                       [](Impl::OpenMPInternal *) {}) {
+  Impl::OpenMPInternal::singleton().verify_is_initialized(
+      "OpenMP instance constructor");
+}
+
+OpenMP::OpenMP(int pool_size)
+    : m_space_instance(new Impl::OpenMPInternal(pool_size),
+                       [](Impl::OpenMPInternal *ptr) {
+                         ptr->finalize();
+                         delete ptr;
+                       }) {
+  Impl::OpenMPInternal::singleton().verify_is_initialized(
+      "OpenMP instance constructor");
+}
+
+int OpenMP::impl_get_current_max_threads() noexcept {
+  return Impl::OpenMPInternal::get_current_max_threads();
+}
+
+void OpenMP::impl_initialize(InitializationSettings const &settings) {
+  Impl::OpenMPInternal::singleton().initialize(
+      settings.has_num_threads() ? settings.get_num_threads() : -1);
+}
+
+void OpenMP::impl_finalize() { Impl::OpenMPInternal::singleton().finalize(); }
+
+void OpenMP::print_configuration(std::ostream &os, bool /*verbose*/) const {
+  os << "Host Parallel Execution Space:\n";
+  os << "  KOKKOS_ENABLE_OPENMP: yes\n";
+
+  os << "\nOpenMP Runtime Configuration:\n";
+
+  m_space_instance->print_configuration(os);
+}
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
+int OpenMP::concurrency(OpenMP const &instance) {
+  return instance.impl_thread_pool_size();
+}
+#else
+int OpenMP::concurrency() const { return impl_thread_pool_size(); }
+#endif
+
+void OpenMP::fence(const std::string &name) const {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::OpenMP>(
+      name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, []() {});
+}
+
+bool OpenMP::impl_is_initialized() noexcept {
+  return Impl::OpenMPInternal::singleton().is_initialized();
+}
+
+bool OpenMP::in_parallel(OpenMP const &exec_space) noexcept {
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+  return (
+      (exec_space.impl_internal_space_instance()->m_level < omp_get_level()) &&
+      (!Impl::t_openmp_instance ||
+       Impl::t_openmp_instance->m_level < omp_get_level()));
+#else
+  return exec_space.impl_internal_space_instance()->m_level < omp_get_level();
+#endif
+}
+
+int OpenMP::impl_thread_pool_size() const noexcept {
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+  return OpenMP::in_parallel(*this)
+             ? omp_get_num_threads()
+             : (Impl::t_openmp_instance
+                    ? Impl::t_openmp_instance->m_pool_size
+                    : impl_internal_space_instance()->m_pool_size);
+#else
+  return OpenMP::in_parallel(*this)
+             ? omp_get_num_threads()
+             : impl_internal_space_instance()->m_pool_size;
+#endif
+}
+
+int OpenMP::impl_max_hardware_threads() noexcept {
+  return Impl::g_openmp_hardware_max_threads;
+}
+
+namespace Impl {
+
+int g_openmp_space_factory_initialized =
+    initialize_space_factory<OpenMP>("050_OpenMP");
+
+}  // namespace Impl
+
+}  // namespace Kokkos
diff --git a/packages/kokkos/core/src/Kokkos_OpenMP.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp
similarity index 78%
rename from packages/kokkos/core/src/Kokkos_OpenMP.hpp
rename to packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp
index bbe008afd959f5db2e7eaa2478e98a0f005c509a..594f40d5245a48d3665db56d1fd432982cb1c8a8 100644
--- a/packages/kokkos/core/src/Kokkos_OpenMP.hpp
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp
@@ -43,6 +43,8 @@ static_assert(false,
 #include <impl/Kokkos_Profiling_Interface.hpp>
 #include <impl/Kokkos_InitializationSettings.hpp>
 
+#include <omp.h>
+
 #include <vector>
 
 /*--------------------------------------------------------------------------*/
@@ -51,7 +53,12 @@ namespace Kokkos {
 
 namespace Impl {
 class OpenMPInternal;
-}
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+// FIXME_OPENMP we can remove this after we remove partition_master
+inline thread_local OpenMPInternal* t_openmp_instance = nullptr;
+#endif
+}  // namespace Impl
 
 /// \class OpenMP
 /// \brief Kokkos device for multicore processors in the host memory space.
@@ -81,7 +88,7 @@ class OpenMP {
   void print_configuration(std::ostream& os, bool verbose = false) const;
 
   /// \brief is the instance running a parallel algorithm
-  inline static bool in_parallel(OpenMP const& = OpenMP()) noexcept;
+  static bool in_parallel(OpenMP const& = OpenMP()) noexcept;
 
   /// \brief Wait until all dispatched functors complete on the given instance
   ///
@@ -109,7 +116,11 @@ class OpenMP {
       int requested_partition_size = 0);
 #endif
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
   static int concurrency(OpenMP const& = OpenMP());
+#else
+  int concurrency() const;
+#endif
 
   static void impl_initialize(InitializationSettings const&);
 
@@ -120,15 +131,15 @@ class OpenMP {
   /// \brief Free any resources being consumed by the default execution space
   static void impl_finalize();
 
-  inline static int impl_thread_pool_size(OpenMP const& = OpenMP()) noexcept;
+  int impl_thread_pool_size() const noexcept;
+
+  int impl_thread_pool_size(int depth) const;
 
   /** \brief  The rank of the executing thread in this thread pool */
   inline static int impl_thread_pool_rank() noexcept;
 
-  inline static int impl_thread_pool_size(int depth, OpenMP const& = OpenMP());
-
   // use UniqueToken
-  inline static int impl_max_hardware_threads() noexcept;
+  static int impl_max_hardware_threads() noexcept;
 
   // use UniqueToken
   KOKKOS_INLINE_FUNCTION
@@ -154,6 +165,42 @@ class OpenMP {
   Kokkos::Impl::HostSharedPtr<Impl::OpenMPInternal> m_space_instance;
 };
 
+inline int OpenMP::impl_thread_pool_rank() noexcept {
+  // FIXME_OPENMP Can we remove this when removing partition_master? It's only
+  // used in one partition_master test
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+  KOKKOS_IF_ON_HOST(
+      (return Impl::t_openmp_instance ? 0 : omp_get_thread_num();))
+#else
+  KOKKOS_IF_ON_HOST((return omp_get_thread_num();))
+#endif
+
+  KOKKOS_IF_ON_DEVICE((return -1;))
+}
+
+inline void OpenMP::impl_static_fence(std::string const& name) {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::OpenMP>(
+      name,
+      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+          GlobalDeviceSynchronization,
+      []() {});
+}
+
+inline bool OpenMP::is_asynchronous(OpenMP const& /*instance*/) noexcept {
+  return false;
+}
+
+inline int OpenMP::impl_thread_pool_size(int depth) const {
+  return depth < 2 ? impl_thread_pool_size() : 1;
+}
+
+KOKKOS_INLINE_FUNCTION
+int OpenMP::impl_hardware_thread_id() noexcept {
+  KOKKOS_IF_ON_HOST((return omp_get_thread_num();))
+
+  KOKKOS_IF_ON_DEVICE((return -1;))
+}
+
 namespace Tools {
 namespace Experimental {
 template <>
@@ -187,7 +234,6 @@ struct MemorySpaceAccess<Kokkos::OpenMP::memory_space,
 
 #include <OpenMP/Kokkos_OpenMP_Instance.hpp>
 #include <OpenMP/Kokkos_OpenMP_Team.hpp>
-#include <OpenMP/Kokkos_OpenMP_Parallel.hpp>
 #include <OpenMP/Kokkos_OpenMP_Task.hpp>
 
 #include <KokkosExp_MDRangePolicy.hpp>
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp
index b1ccac51562781983b4e86fdfffe9d50d420b3a1..44f0fbc180a7466d8ac89e20becad6080b53c8ba 100644
--- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp
@@ -241,16 +241,20 @@ void OpenMPInternal::initialize(int thread_count) {
   }
 
   {
-    if (Kokkos::show_warnings() && nullptr == std::getenv("OMP_PROC_BIND")) {
+    if (Kokkos::show_warnings() && !std::getenv("OMP_PROC_BIND")) {
       std::cerr
           << R"WARNING(Kokkos::OpenMP::initialize WARNING: OMP_PROC_BIND environment variable not set
   In general, for best performance with OpenMP 4.0 or better set OMP_PROC_BIND=spread and OMP_PLACES=threads
   For best performance with OpenMP 3.1 set OMP_PROC_BIND=true
   For unit testing set OMP_PROC_BIND=false
 )WARNING" << std::endl;
-    }
 
-    OpenMP::memory_space space;
+      if (mpi_detected()) {
+        std::cerr
+            << R"WARNING(MPI detected: For OpenMP binding to work as intended, MPI ranks must be bound to exclusive CPU sets.
+)WARNING" << std::endl;
+      }
+    }
 
     // Before any other call to OMP query the maximum number of threads
     // and save the value for re-initialization unit testing.
@@ -324,8 +328,6 @@ void OpenMPInternal::initialize(int thread_count) {
     std::cerr << "                                    Requested: "
               << thread_count << " threads per process." << std::endl;
   }
-  // Init the array used for arbitrarily sized atomics
-  init_lock_array_host_space();
 
   m_initialized = true;
 }
@@ -384,68 +386,4 @@ bool OpenMPInternal::verify_is_initialized(const char *const label) const {
   return m_initialized;
 }
 }  // namespace Impl
-
-//----------------------------------------------------------------------------
-
-OpenMP::OpenMP()
-    : m_space_instance(&Impl::OpenMPInternal::singleton(),
-                       [](Impl::OpenMPInternal *) {}) {
-  Impl::OpenMPInternal::singleton().verify_is_initialized(
-      "OpenMP instance constructor");
-}
-
-OpenMP::OpenMP(int pool_size)
-    : m_space_instance(new Impl::OpenMPInternal(pool_size),
-                       [](Impl::OpenMPInternal *ptr) {
-                         ptr->finalize();
-                         delete ptr;
-                       }) {
-  Impl::OpenMPInternal::singleton().verify_is_initialized(
-      "OpenMP instance constructor");
-}
-
-int OpenMP::impl_get_current_max_threads() noexcept {
-  return Impl::OpenMPInternal::get_current_max_threads();
-}
-
-void OpenMP::impl_initialize(InitializationSettings const &settings) {
-  Impl::OpenMPInternal::singleton().initialize(
-      settings.has_num_threads() ? settings.get_num_threads() : -1);
-}
-
-void OpenMP::impl_finalize() { Impl::OpenMPInternal::singleton().finalize(); }
-
-void OpenMP::print_configuration(std::ostream &os, bool /*verbose*/) const {
-  os << "Host Parallel Execution Space:\n";
-  os << "  KOKKOS_ENABLE_OPENMP: yes\n";
-
-  os << "OpenMP Atomics:\n";
-  os << "  KOKKOS_ENABLE_OPENMP_ATOMICS: ";
-#ifdef KOKKOS_ENABLE_OPENMP_ATOMICS
-  os << "yes\n";
-#else
-  os << "no\n";
-#endif
-
-  os << "\nOpenMP Runtime Configuration:\n";
-
-  m_space_instance->print_configuration(os);
-}
-
-int OpenMP::concurrency(OpenMP const &instance) {
-  return impl_thread_pool_size(instance);
-}
-
-void OpenMP::fence(const std::string &name) const {
-  Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::OpenMP>(
-      name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, []() {});
-}
-
-namespace Impl {
-
-int g_openmp_space_factory_initialized =
-    initialize_space_factory<OpenMP>("050_OpenMP");
-
-}  // namespace Impl
-
 }  // namespace Kokkos
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp
index e2d52a141ab085dce80bd0b9280a2b776c27019e..03f5fff395a850651cd1ee894b94bc4bbed0da8e 100644
--- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp
@@ -24,14 +24,13 @@
     "You enabled Kokkos OpenMP support without enabling OpenMP in the compiler!"
 #endif
 
-#include <Kokkos_OpenMP.hpp>
+#include <OpenMP/Kokkos_OpenMP.hpp>
 
 #include <impl/Kokkos_Traits.hpp>
 #include <impl/Kokkos_HostThreadTeam.hpp>
 
 #include <Kokkos_Atomic.hpp>
 
-#include <Kokkos_UniqueToken.hpp>
 #include <impl/Kokkos_ConcurrentBitset.hpp>
 
 #include <omp.h>
@@ -41,6 +40,18 @@
 #include <type_traits>
 #include <vector>
 
+/*--------------------------------------------------------------------------*/
+namespace Kokkos {
+namespace Impl {
+
+inline bool execute_in_serial(OpenMP const& space = OpenMP()) {
+  return (OpenMP::in_parallel(space) &&
+          !(omp_get_nested() && (omp_get_level() == 1)));
+}
+
+}  // namespace Impl
+}  // namespace Kokkos
+
 namespace Kokkos {
 namespace Impl {
 
@@ -48,13 +59,8 @@ class OpenMPInternal;
 
 inline int g_openmp_hardware_max_threads = 1;
 
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
-// FIXME_OPENMP we can remove this after we remove partition_master
-inline thread_local OpenMPInternal* t_openmp_instance = nullptr;
-#endif
-
 struct OpenMPTraits {
-  static int constexpr MAX_THREAD_COUNT = 512;
+  static constexpr int MAX_THREAD_COUNT = 512;
 };
 
 class OpenMPInternal {
@@ -117,102 +123,6 @@ class OpenMPInternal {
 };
 
 }  // namespace Impl
-inline bool OpenMP::impl_is_initialized() noexcept {
-  return Impl::OpenMPInternal::singleton().is_initialized();
-}
-
-inline bool OpenMP::in_parallel(OpenMP const& exec_space) noexcept {
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
-  return (
-      (exec_space.impl_internal_space_instance()->m_level < omp_get_level()) &&
-      (!Impl::t_openmp_instance ||
-       Impl::t_openmp_instance->m_level < omp_get_level()));
-#else
-  return exec_space.impl_internal_space_instance()->m_level < omp_get_level();
-#endif
-}
-
-inline int OpenMP::impl_thread_pool_size(OpenMP const& exec_space) noexcept {
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
-  return OpenMP::in_parallel(exec_space)
-             ? omp_get_num_threads()
-             : (Impl::t_openmp_instance
-                    ? Impl::t_openmp_instance->m_pool_size
-                    : exec_space.impl_internal_space_instance()->m_pool_size);
-#else
-  return OpenMP::in_parallel(exec_space)
-             ? omp_get_num_threads()
-             : exec_space.impl_internal_space_instance()->m_pool_size;
-#endif
-}
-
-inline int OpenMP::impl_thread_pool_rank() noexcept {
-  // FIXME_OPENMP Can we remove this when removing partition_master? It's only
-  // used in one partition_master test
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
-  KOKKOS_IF_ON_HOST(
-      (return Impl::t_openmp_instance ? 0 : omp_get_thread_num();))
-#else
-  KOKKOS_IF_ON_HOST((return omp_get_thread_num();))
-#endif
-
-  KOKKOS_IF_ON_DEVICE((return -1;))
-}
-
-inline void OpenMP::impl_static_fence(std::string const& name) {
-  Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::OpenMP>(
-      name,
-      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
-          GlobalDeviceSynchronization,
-      []() {});
-}
-
-inline bool OpenMP::is_asynchronous(OpenMP const& /*instance*/) noexcept {
-  return false;
-}
-
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
-template <typename F>
-KOKKOS_DEPRECATED void OpenMP::partition_master(F const& f, int num_partitions,
-                                                int partition_size) {
-#if _OPENMP >= 201511
-  if (omp_get_max_active_levels() > 1) {
-#else
-  if (omp_get_nested()) {
-#endif
-    using Exec = Impl::OpenMPInternal;
-
-    Exec* prev_instance = &Impl::OpenMPInternal::singleton();
-
-    Exec::validate_partition_impl(prev_instance->m_pool_size, num_partitions,
-                                  partition_size);
-
-    OpenMP::memory_space space;
-
-#pragma omp parallel num_threads(num_partitions)
-    {
-      Exec thread_local_instance(partition_size);
-      Impl::t_openmp_instance = &thread_local_instance;
-
-      size_t pool_reduce_bytes  = 32 * partition_size;
-      size_t team_reduce_bytes  = 32 * partition_size;
-      size_t team_shared_bytes  = 1024 * partition_size;
-      size_t thread_local_bytes = 1024;
-
-      thread_local_instance.resize_thread_data(
-          pool_reduce_bytes, team_reduce_bytes, team_shared_bytes,
-          thread_local_bytes);
-
-      omp_set_num_threads(partition_size);
-      f(omp_get_thread_num(), omp_get_num_threads());
-      Impl::t_openmp_instance = nullptr;
-    }
-  } else {
-    // nested openmp not enabled
-    f(0, 1);
-  }
-}
-#endif
 
 namespace Experimental {
 
@@ -237,124 +147,8 @@ class MasterLock<OpenMP> {
 };
 #endif
 
-template <>
-class UniqueToken<OpenMP, UniqueTokenScope::Instance> {
- private:
-  using buffer_type = Kokkos::View<uint32_t*, Kokkos::HostSpace>;
-  int m_count;
-  buffer_type m_buffer_view;
-  uint32_t volatile* m_buffer;
-
- public:
-  using execution_space = OpenMP;
-  using size_type       = int;
-
-  /// \brief create object size for concurrency on the given instance
-  ///
-  /// This object should not be shared between instances
-  UniqueToken(execution_space const& = execution_space()) noexcept
-      : m_count(::Kokkos::OpenMP::impl_thread_pool_size()),
-        m_buffer_view(buffer_type()),
-        m_buffer(nullptr) {}
-
-  UniqueToken(size_type max_size, execution_space const& = execution_space())
-      : m_count(max_size),
-        m_buffer_view("UniqueToken::m_buffer_view",
-                      ::Kokkos::Impl::concurrent_bitset::buffer_bound(m_count)),
-        m_buffer(m_buffer_view.data()) {}
-
-  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
-  KOKKOS_INLINE_FUNCTION
-  int size() const noexcept {
-    KOKKOS_IF_ON_HOST((return m_count;))
-
-    KOKKOS_IF_ON_DEVICE((return 0;))
-  }
-
-  /// \brief acquire value such that 0 <= value < size()
-  KOKKOS_INLINE_FUNCTION
-  int acquire() const noexcept {
-    KOKKOS_IF_ON_HOST(
-        (if (m_count >= ::Kokkos::OpenMP::impl_thread_pool_size()) return ::
-             Kokkos::OpenMP::impl_thread_pool_rank();
-         const ::Kokkos::pair<int, int> result =
-             ::Kokkos::Impl::concurrent_bitset::acquire_bounded(
-                 m_buffer, m_count, ::Kokkos::Impl::clock_tic() % m_count);
-
-         if (result.first < 0) {
-           ::Kokkos::abort(
-               "UniqueToken<OpenMP> failure to acquire tokens, no tokens "
-               "available");
-         }
-
-         return result.first;))
-
-    KOKKOS_IF_ON_DEVICE((return 0;))
-  }
-
-  /// \brief release a value acquired by generate
-  KOKKOS_INLINE_FUNCTION
-  void release(int i) const noexcept {
-    KOKKOS_IF_ON_HOST(
-        (if (m_count < ::Kokkos::OpenMP::impl_thread_pool_size()) {
-          ::Kokkos::Impl::concurrent_bitset::release(m_buffer, i);
-        }))
-
-    KOKKOS_IF_ON_DEVICE(((void)i;))
-  }
-};
-
-template <>
-class UniqueToken<OpenMP, UniqueTokenScope::Global> {
- public:
-  using execution_space = OpenMP;
-  using size_type       = int;
-
-  /// \brief create object size for concurrency on the given instance
-  ///
-  /// This object should not be shared between instances
-  UniqueToken(execution_space const& = execution_space()) noexcept {}
-
-  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
-  KOKKOS_INLINE_FUNCTION
-  int size() const noexcept {
-    KOKKOS_IF_ON_HOST((return Kokkos::Impl::g_openmp_hardware_max_threads;))
-
-    KOKKOS_IF_ON_DEVICE((return 0;))
-  }
-
-  /// \brief acquire value such that 0 <= value < size()
-  // FIXME this is wrong when using nested parallelism. In that case multiple
-  // threads have the same thread ID.
-  KOKKOS_INLINE_FUNCTION
-  int acquire() const noexcept {
-    KOKKOS_IF_ON_HOST((return omp_get_thread_num();))
-
-    KOKKOS_IF_ON_DEVICE((return 0;))
-  }
-
-  /// \brief release a value acquired by generate
-  KOKKOS_INLINE_FUNCTION
-  void release(int) const noexcept {}
-};
-
 }  // namespace Experimental
 
-inline int OpenMP::impl_thread_pool_size(int depth, OpenMP const& exec_space) {
-  return depth < 2 ? impl_thread_pool_size(exec_space) : 1;
-}
-
-KOKKOS_INLINE_FUNCTION
-int OpenMP::impl_hardware_thread_id() noexcept {
-  KOKKOS_IF_ON_HOST((return omp_get_thread_num();))
-
-  KOKKOS_IF_ON_DEVICE((return -1;))
-}
-
-inline int OpenMP::impl_max_hardware_threads() noexcept {
-  return Impl::g_openmp_hardware_max_threads;
-}
-
 namespace Experimental {
 namespace Impl {
 // Partitioning an Execution Space: expects space and integer arguments for
@@ -404,10 +198,54 @@ std::vector<OpenMP> partition_space(OpenMP const& main_instance, Args... args) {
 
 template <typename T>
 std::vector<OpenMP> partition_space(OpenMP const& main_instance,
-                                    std::vector<T>& weights) {
+                                    std::vector<T> const& weights) {
   return Impl::create_OpenMP_instances(main_instance, weights);
 }
 }  // namespace Experimental
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+template <typename F>
+KOKKOS_DEPRECATED void OpenMP::partition_master(F const& f, int num_partitions,
+                                                int partition_size) {
+#if _OPENMP >= 201511
+  if (omp_get_max_active_levels() > 1) {
+#else
+  if (omp_get_nested()) {
+#endif
+    using Exec = Impl::OpenMPInternal;
+
+    Exec* prev_instance = &Impl::OpenMPInternal::singleton();
+
+    Exec::validate_partition_impl(prev_instance->m_pool_size, num_partitions,
+                                  partition_size);
+
+    OpenMP::memory_space space;
+
+#pragma omp parallel num_threads(num_partitions)
+    {
+      Exec thread_local_instance(partition_size);
+      Impl::t_openmp_instance = &thread_local_instance;
+
+      size_t pool_reduce_bytes  = 32 * partition_size;
+      size_t team_reduce_bytes  = 32 * partition_size;
+      size_t team_shared_bytes  = 1024 * partition_size;
+      size_t thread_local_bytes = 1024;
+
+      thread_local_instance.resize_thread_data(
+          pool_reduce_bytes, team_reduce_bytes, team_shared_bytes,
+          thread_local_bytes);
+
+      omp_set_num_threads(partition_size);
+      f(omp_get_thread_num(), omp_get_num_threads());
+      Impl::t_openmp_instance = nullptr;
+    }
+  } else {
+    // nested openmp not enabled
+    f(0, 1);
+  }
+}
+#endif
+
 }  // namespace Kokkos
 
 #endif
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
deleted file mode 100644
index cbd687bec57025a19bf9a22694c6b1123f8bbdd2..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
+++ /dev/null
@@ -1,1387 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#ifndef KOKKOS_OPENMP_PARALLEL_HPP
-#define KOKKOS_OPENMP_PARALLEL_HPP
-
-#include <Kokkos_Macros.hpp>
-#if defined(KOKKOS_ENABLE_OPENMP)
-
-#include <omp.h>
-#include <OpenMP/Kokkos_OpenMP_Instance.hpp>
-
-#include <KokkosExp_MDRangePolicy.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#define KOKKOS_PRAGMA_IVDEP_IF_ENABLED
-#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
-    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
-#undef KOKKOS_PRAGMA_IVDEP_IF_ENABLED
-#define KOKKOS_PRAGMA_IVDEP_IF_ENABLED _Pragma("ivdep")
-#endif
-
-#ifndef KOKKOS_COMPILER_NVHPC
-#define KOKKOS_OPENMP_OPTIONAL_CHUNK_SIZE , m_policy.chunk_size()
-#else
-#define KOKKOS_OPENMP_OPTIONAL_CHUNK_SIZE
-#endif
-
-namespace Kokkos {
-namespace Impl {
-
-inline bool execute_in_serial(OpenMP const& space = OpenMP()) {
-  return (OpenMP::in_parallel(space) &&
-          !(omp_get_nested() && (omp_get_level() == 1)));
-}
-
-template <class FunctorType, class... Traits>
-class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::OpenMP> {
- private:
-  using Policy    = Kokkos::RangePolicy<Traits...>;
-  using WorkTag   = typename Policy::work_tag;
-  using WorkRange = typename Policy::WorkRange;
-  using Member    = typename Policy::member_type;
-
-  OpenMPInternal* m_instance;
-  const FunctorType m_functor;
-  const Policy m_policy;
-
-  inline static void exec_range(const FunctorType& functor, const Member ibeg,
-                                const Member iend) {
-    KOKKOS_PRAGMA_IVDEP_IF_ENABLED
-    for (auto iwork = ibeg; iwork < iend; ++iwork) {
-      exec_work(functor, iwork);
-    }
-  }
-
-  template <class Enable = WorkTag>
-  inline static std::enable_if_t<std::is_void<WorkTag>::value &&
-                                 std::is_same<Enable, WorkTag>::value>
-  exec_work(const FunctorType& functor, const Member iwork) {
-    functor(iwork);
-  }
-
-  template <class Enable = WorkTag>
-  inline static std::enable_if_t<!std::is_void<WorkTag>::value &&
-                                 std::is_same<Enable, WorkTag>::value>
-  exec_work(const FunctorType& functor, const Member iwork) {
-    functor(WorkTag{}, iwork);
-  }
-
-  template <class Policy>
-  std::enable_if_t<std::is_same<typename Policy::schedule_type::type,
-                                Kokkos::Dynamic>::value>
-  execute_parallel() const {
-    // prevent bug in NVHPC 21.9/CUDA 11.4 (entering zero iterations loop)
-    if (m_policy.begin() >= m_policy.end()) return;
-#pragma omp parallel for schedule(dynamic KOKKOS_OPENMP_OPTIONAL_CHUNK_SIZE) \
-    num_threads(m_instance->thread_pool_size())
-    KOKKOS_PRAGMA_IVDEP_IF_ENABLED
-    for (auto iwork = m_policy.begin(); iwork < m_policy.end(); ++iwork) {
-      exec_work(m_functor, iwork);
-    }
-  }
-
-  template <class Policy>
-  std::enable_if_t<!std::is_same<typename Policy::schedule_type::type,
-                                 Kokkos::Dynamic>::value>
-  execute_parallel() const {
-// Specifying an chunksize with GCC compiler leads to performance regression
-// with static schedule.
-#ifdef KOKKOS_COMPILER_GNU
-#pragma omp parallel for schedule(static) \
-    num_threads(m_instance->thread_pool_size())
-#else
-#pragma omp parallel for schedule(static KOKKOS_OPENMP_OPTIONAL_CHUNK_SIZE) \
-    num_threads(m_instance->thread_pool_size())
-#endif
-    KOKKOS_PRAGMA_IVDEP_IF_ENABLED
-    for (auto iwork = m_policy.begin(); iwork < m_policy.end(); ++iwork) {
-      exec_work(m_functor, iwork);
-    }
-  }
-
- public:
-  inline void execute() const {
-    if (execute_in_serial(m_policy.space())) {
-      exec_range(m_functor, m_policy.begin(), m_policy.end());
-      return;
-    }
-
-#ifndef KOKKOS_INTERNAL_DISABLE_NATIVE_OPENMP
-    execute_parallel<Policy>();
-#else
-    constexpr bool is_dynamic =
-        std::is_same<typename Policy::schedule_type::type,
-                     Kokkos::Dynamic>::value;
-#pragma omp parallel num_threads(m_instance->thread_pool_size())
-    {
-      HostThreadTeamData& data = *(m_instance->get_thread_data());
-
-      data.set_work_partition(m_policy.end() - m_policy.begin(),
-                              m_policy.chunk_size());
-
-      if (is_dynamic) {
-        // Make sure work partition is set before stealing
-        if (data.pool_rendezvous()) data.pool_rendezvous_release();
-      }
-
-      std::pair<int64_t, int64_t> range(0, 0);
-
-      do {
-        range = is_dynamic ? data.get_work_stealing_chunk()
-                           : data.get_work_partition();
-
-        exec_range(m_functor, range.first + m_policy.begin(),
-                   range.second + m_policy.begin());
-
-      } while (is_dynamic && 0 <= range.first);
-    }
-#endif
-  }
-
-  inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy)
-      : m_instance(nullptr), m_functor(arg_functor), m_policy(arg_policy) {
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
-    if (t_openmp_instance) {
-      m_instance = t_openmp_instance;
-    } else {
-      m_instance = arg_policy.space().impl_internal_space_instance();
-    }
-#else
-    m_instance = arg_policy.space().impl_internal_space_instance();
-#endif
-  }
-};
-
-// MDRangePolicy impl
-template <class FunctorType, class... Traits>
-class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
-                  Kokkos::OpenMP> {
- private:
-  using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
-  using Policy        = typename MDRangePolicy::impl_range_policy;
-  using WorkTag       = typename MDRangePolicy::work_tag;
-
-  using WorkRange = typename Policy::WorkRange;
-  using Member    = typename Policy::member_type;
-
-  using index_type   = typename Policy::index_type;
-  using iterate_type = typename Kokkos::Impl::HostIterateTile<
-      MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void>;
-
-  OpenMPInternal* m_instance;
-  const iterate_type m_iter;
-
-  inline void exec_range(const Member ibeg, const Member iend) const {
-    KOKKOS_PRAGMA_IVDEP_IF_ENABLED
-    for (Member iwork = ibeg; iwork < iend; ++iwork) {
-      m_iter(iwork);
-    }
-  }
-
-  template <class Policy>
-  typename std::enable_if_t<std::is_same<typename Policy::schedule_type::type,
-                                         Kokkos::Dynamic>::value>
-  execute_parallel() const {
-#pragma omp parallel for schedule(dynamic, 1) \
-    num_threads(m_instance->thread_pool_size())
-    KOKKOS_PRAGMA_IVDEP_IF_ENABLED
-    for (index_type iwork = 0; iwork < m_iter.m_rp.m_num_tiles; ++iwork) {
-      m_iter(iwork);
-    }
-  }
-
-  template <class Policy>
-  typename std::enable_if<!std::is_same<typename Policy::schedule_type::type,
-                                        Kokkos::Dynamic>::value>::type
-  execute_parallel() const {
-#pragma omp parallel for schedule(static, 1) \
-    num_threads(m_instance->thread_pool_size())
-    KOKKOS_PRAGMA_IVDEP_IF_ENABLED
-    for (index_type iwork = 0; iwork < m_iter.m_rp.m_num_tiles; ++iwork) {
-      m_iter(iwork);
-    }
-  }
-
- public:
-  inline void execute() const {
-#ifndef KOKKOS_COMPILER_INTEL
-    if (execute_in_serial(m_iter.m_rp.space())) {
-      exec_range(0, m_iter.m_rp.m_num_tiles);
-      return;
-    }
-#endif
-
-#ifndef KOKKOS_INTERNAL_DISABLE_NATIVE_OPENMP
-    execute_parallel<Policy>();
-#else
-    constexpr bool is_dynamic =
-        std::is_same<typename Policy::schedule_type::type,
-                     Kokkos::Dynamic>::value;
-
-#pragma omp parallel num_threads(m_instance->thread_pool_size())
-    {
-      HostThreadTeamData& data = *(m_instance->get_thread_data());
-
-      data.set_work_partition(m_iter.m_rp.m_num_tiles, 1);
-
-      if (is_dynamic) {
-        // Make sure work partition is set before stealing
-        if (data.pool_rendezvous()) data.pool_rendezvous_release();
-      }
-
-      std::pair<int64_t, int64_t> range(0, 0);
-
-      do {
-        range = is_dynamic ? data.get_work_stealing_chunk()
-                           : data.get_work_partition();
-
-        exec_range(range.first, range.second);
-
-      } while (is_dynamic && 0 <= range.first);
-    }
-    // END #pragma omp parallel
-#endif
-  }
-
-  inline ParallelFor(const FunctorType& arg_functor, MDRangePolicy arg_policy)
-      : m_instance(nullptr), m_iter(arg_policy, arg_functor) {
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
-    if (t_openmp_instance) {
-      m_instance = t_openmp_instance;
-    } else {
-      m_instance = arg_policy.space().impl_internal_space_instance();
-    }
-#else
-    m_instance = arg_policy.space().impl_internal_space_instance();
-#endif
-  }
-  template <typename Policy, typename Functor>
-  static int max_tile_size_product(const Policy&, const Functor&) {
-    /**
-     * 1024 here is just our guess for a reasonable max tile size,
-     * it isn't a hardware constraint. If people see a use for larger
-     * tile size products, we're happy to change this.
-     */
-    return 1024;
-  }
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-template <class FunctorType, class ReducerType, class... Traits>
-class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
-                     Kokkos::OpenMP> {
- private:
-  using Policy = Kokkos::RangePolicy<Traits...>;
-
-  using WorkTag   = typename Policy::work_tag;
-  using WorkRange = typename Policy::WorkRange;
-  using Member    = typename Policy::member_type;
-
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using WorkTagFwd =
-      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
-                         void>;
-
-  // Static Assert WorkTag void if ReducerType not InvalidType
-  using Analysis =
-      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, ReducerTypeFwd>;
-
-  using pointer_type   = typename Analysis::pointer_type;
-  using reference_type = typename Analysis::reference_type;
-
-  OpenMPInternal* m_instance;
-  const FunctorType m_functor;
-  const Policy m_policy;
-  const ReducerType m_reducer;
-  const pointer_type m_result_ptr;
-
-  template <class TagType>
-  inline static std::enable_if_t<std::is_void<TagType>::value> exec_range(
-      const FunctorType& functor, const Member ibeg, const Member iend,
-      reference_type update) {
-    for (Member iwork = ibeg; iwork < iend; ++iwork) {
-      functor(iwork, update);
-    }
-  }
-
-  template <class TagType>
-  inline static std::enable_if_t<!std::is_void<TagType>::value> exec_range(
-      const FunctorType& functor, const Member ibeg, const Member iend,
-      reference_type update) {
-    const TagType t{};
-    for (Member iwork = ibeg; iwork < iend; ++iwork) {
-      functor(t, iwork, update);
-    }
-  }
-
- public:
-  inline void execute() const {
-    typename Analysis::Reducer final_reducer(
-        &ReducerConditional::select(m_functor, m_reducer));
-
-    if (m_policy.end() <= m_policy.begin()) {
-      if (m_result_ptr) {
-        final_reducer.init(m_result_ptr);
-        final_reducer.final(m_result_ptr);
-      }
-      return;
-    }
-    enum {
-      is_dynamic = std::is_same<typename Policy::schedule_type::type,
-                                Kokkos::Dynamic>::value
-    };
-
-    const size_t pool_reduce_bytes =
-        Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));
-
-    m_instance->acquire_lock();
-
-    m_instance->resize_thread_data(pool_reduce_bytes, 0  // team_reduce_bytes
-                                   ,
-                                   0  // team_shared_bytes
-                                   ,
-                                   0  // thread_local_bytes
-    );
-
-    if (execute_in_serial(m_policy.space())) {
-      const pointer_type ptr =
-          m_result_ptr
-              ? m_result_ptr
-              : pointer_type(
-                    m_instance->get_thread_data(0)->pool_reduce_local());
-
-      reference_type update = final_reducer.init(ptr);
-
-      ParallelReduce::template exec_range<WorkTag>(m_functor, m_policy.begin(),
-                                                   m_policy.end(), update);
-
-      final_reducer.final(ptr);
-      return;
-    }
-    const int pool_size = m_instance->thread_pool_size();
-#pragma omp parallel num_threads(pool_size)
-    {
-      HostThreadTeamData& data = *(m_instance->get_thread_data());
-
-      data.set_work_partition(m_policy.end() - m_policy.begin(),
-                              m_policy.chunk_size());
-
-      if (is_dynamic) {
-        // Make sure work partition is set before stealing
-        if (data.pool_rendezvous()) data.pool_rendezvous_release();
-      }
-
-      reference_type update = final_reducer.init(
-          reinterpret_cast<pointer_type>(data.pool_reduce_local()));
-
-      std::pair<int64_t, int64_t> range(0, 0);
-
-      do {
-        range = is_dynamic ? data.get_work_stealing_chunk()
-                           : data.get_work_partition();
-
-        ParallelReduce::template exec_range<WorkTag>(
-            m_functor, range.first + m_policy.begin(),
-            range.second + m_policy.begin(), update);
-
-      } while (is_dynamic && 0 <= range.first);
-    }
-
-    // Reduction:
-
-    const pointer_type ptr =
-        pointer_type(m_instance->get_thread_data(0)->pool_reduce_local());
-
-    for (int i = 1; i < pool_size; ++i) {
-      final_reducer.join(
-          ptr, reinterpret_cast<pointer_type>(
-                   m_instance->get_thread_data(i)->pool_reduce_local()));
-    }
-
-    final_reducer.final(ptr);
-
-    if (m_result_ptr) {
-      const int n = Analysis::value_count(
-          ReducerConditional::select(m_functor, m_reducer));
-
-      for (int j = 0; j < n; ++j) {
-        m_result_ptr[j] = ptr[j];
-      }
-    }
-
-    m_instance->release_lock();
-  }
-
-  //----------------------------------------
-
-  template <class ViewType>
-  inline ParallelReduce(
-      const FunctorType& arg_functor, Policy arg_policy,
-      const ViewType& arg_view,
-      std::enable_if_t<Kokkos::is_view<ViewType>::value &&
-                           !Kokkos::is_reducer<ReducerType>::value,
-                       void*> = nullptr)
-      : m_instance(nullptr),
-        m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(InvalidType()),
-        m_result_ptr(arg_view.data()) {
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
-    if (t_openmp_instance) {
-      m_instance = t_openmp_instance;
-    } else {
-      m_instance = arg_policy.space().impl_internal_space_instance();
-    }
-#else
-    m_instance = arg_policy.space().impl_internal_space_instance();
-#endif
-    /*static_assert( std::is_same< typename ViewType::memory_space
-                                    , Kokkos::HostSpace >::value
-      , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
-      );*/
-  }
-
-  inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy,
-                        const ReducerType& reducer)
-      : m_instance(nullptr),
-        m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()) {
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
-    if (t_openmp_instance) {
-      m_instance = t_openmp_instance;
-    } else {
-      m_instance = arg_policy.space().impl_internal_space_instance();
-    }
-#else
-    m_instance = arg_policy.space().impl_internal_space_instance();
-#endif
-    /*static_assert( std::is_same< typename ViewType::memory_space
-                                    , Kokkos::HostSpace >::value
-      , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
-      );*/
-  }
-};
-
-// MDRangePolicy impl
-template <class FunctorType, class ReducerType, class... Traits>
-class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
-                     Kokkos::OpenMP> {
- private:
-  using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
-  using Policy        = typename MDRangePolicy::impl_range_policy;
-
-  using WorkTag   = typename MDRangePolicy::work_tag;
-  using WorkRange = typename Policy::WorkRange;
-  using Member    = typename Policy::member_type;
-
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using WorkTagFwd =
-      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
-                         void>;
-
-  using Analysis = FunctorAnalysis<FunctorPatternInterface::REDUCE,
-                                   MDRangePolicy, ReducerTypeFwd>;
-
-  using pointer_type   = typename Analysis::pointer_type;
-  using value_type     = typename Analysis::value_type;
-  using reference_type = typename Analysis::reference_type;
-
-  using iterate_type =
-      typename Kokkos::Impl::HostIterateTile<MDRangePolicy, FunctorType,
-                                             WorkTag, reference_type>;
-
-  OpenMPInternal* m_instance;
-  const iterate_type m_iter;
-  const ReducerType m_reducer;
-  const pointer_type m_result_ptr;
-
-  inline void exec_range(const Member ibeg, const Member iend,
-                         reference_type update) const {
-    for (Member iwork = ibeg; iwork < iend; ++iwork) {
-      m_iter(iwork, update);
-    }
-  }
-
- public:
-  inline void execute() const {
-    const size_t pool_reduce_bytes = Analysis::value_size(
-        ReducerConditional::select(m_iter.m_func, m_reducer));
-
-    m_instance->acquire_lock();
-
-    m_instance->resize_thread_data(pool_reduce_bytes, 0  // team_reduce_bytes
-                                   ,
-                                   0  // team_shared_bytes
-                                   ,
-                                   0  // thread_local_bytes
-    );
-
-    typename Analysis::Reducer final_reducer(
-        &ReducerConditional::select(m_iter.m_func, m_reducer));
-
-#ifndef KOKKOS_COMPILER_INTEL
-    if (execute_in_serial(m_iter.m_rp.space())) {
-      const pointer_type ptr =
-          m_result_ptr
-              ? m_result_ptr
-              : pointer_type(
-                    m_instance->get_thread_data(0)->pool_reduce_local());
-
-      reference_type update = final_reducer.init(ptr);
-
-      ParallelReduce::exec_range(0, m_iter.m_rp.m_num_tiles, update);
-
-      final_reducer.final(ptr);
-
-      m_instance->release_lock();
-
-      return;
-    }
-#endif
-
-    enum {
-      is_dynamic = std::is_same<typename Policy::schedule_type::type,
-                                Kokkos::Dynamic>::value
-    };
-
-    const int pool_size = m_instance->thread_pool_size();
-#pragma omp parallel num_threads(pool_size)
-    {
-      HostThreadTeamData& data = *(m_instance->get_thread_data());
-
-      data.set_work_partition(m_iter.m_rp.m_num_tiles, 1);
-
-      if (is_dynamic) {
-        // Make sure work partition is set before stealing
-        if (data.pool_rendezvous()) data.pool_rendezvous_release();
-      }
-
-      reference_type update = final_reducer.init(
-          reinterpret_cast<pointer_type>(data.pool_reduce_local()));
-
-      std::pair<int64_t, int64_t> range(0, 0);
-
-      do {
-        range = is_dynamic ? data.get_work_stealing_chunk()
-                           : data.get_work_partition();
-
-        ParallelReduce::exec_range(range.first, range.second, update);
-
-      } while (is_dynamic && 0 <= range.first);
-    }
-    // END #pragma omp parallel
-
-    // Reduction:
-
-    const pointer_type ptr =
-        pointer_type(m_instance->get_thread_data(0)->pool_reduce_local());
-
-    for (int i = 1; i < pool_size; ++i) {
-      final_reducer.join(
-          ptr, reinterpret_cast<pointer_type>(
-                   m_instance->get_thread_data(i)->pool_reduce_local()));
-    }
-
-    final_reducer.final(ptr);
-
-    if (m_result_ptr) {
-      const int n = Analysis::value_count(
-          ReducerConditional::select(m_iter.m_func, m_reducer));
-
-      for (int j = 0; j < n; ++j) {
-        m_result_ptr[j] = ptr[j];
-      }
-    }
-
-    m_instance->release_lock();
-  }
-
-  //----------------------------------------
-
-  template <class ViewType>
-  inline ParallelReduce(
-      const FunctorType& arg_functor, MDRangePolicy arg_policy,
-      const ViewType& arg_view,
-      std::enable_if_t<Kokkos::is_view<ViewType>::value &&
-                           !Kokkos::is_reducer<ReducerType>::value,
-                       void*> = nullptr)
-      : m_instance(nullptr),
-        m_iter(arg_policy, arg_functor),
-        m_reducer(InvalidType()),
-        m_result_ptr(arg_view.data()) {
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
-    if (t_openmp_instance) {
-      m_instance = t_openmp_instance;
-    } else {
-      m_instance = arg_policy.space().impl_internal_space_instance();
-    }
-#else
-    m_instance = arg_policy.space().impl_internal_space_instance();
-#endif
-    /*static_assert( std::is_same< typename ViewType::memory_space
-                                    , Kokkos::HostSpace >::value
-      , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
-      );*/
-  }
-
-  inline ParallelReduce(const FunctorType& arg_functor,
-                        MDRangePolicy arg_policy, const ReducerType& reducer)
-      : m_instance(nullptr),
-        m_iter(arg_policy, arg_functor),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()) {
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
-    if (t_openmp_instance) {
-      m_instance = t_openmp_instance;
-    } else {
-      m_instance = arg_policy.space().impl_internal_space_instance();
-    }
-#else
-    m_instance = arg_policy.space().impl_internal_space_instance();
-#endif
-    /*static_assert( std::is_same< typename ViewType::memory_space
-                                    , Kokkos::HostSpace >::value
-      , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
-      );*/
-  }
-  template <typename Policy, typename Functor>
-  static int max_tile_size_product(const Policy&, const Functor&) {
-    /**
-     * 1024 here is just our guess for a reasonable max tile size,
-     * it isn't a hardware constraint. If people see a use for larger
-     * tile size products, we're happy to change this.
-     */
-    return 1024;
-  }
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-template <class FunctorType, class... Traits>
-class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
-                   Kokkos::OpenMP> {
- private:
-  using Policy = Kokkos::RangePolicy<Traits...>;
-
-  using Analysis =
-      FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType>;
-
-  using WorkTag   = typename Policy::work_tag;
-  using WorkRange = typename Policy::WorkRange;
-  using Member    = typename Policy::member_type;
-
-  using pointer_type   = typename Analysis::pointer_type;
-  using reference_type = typename Analysis::reference_type;
-
-  OpenMPInternal* m_instance;
-  const FunctorType m_functor;
-  const Policy m_policy;
-
-  template <class TagType>
-  inline static std::enable_if_t<std::is_void<TagType>::value> exec_range(
-      const FunctorType& functor, const Member ibeg, const Member iend,
-      reference_type update, const bool final) {
-    for (Member iwork = ibeg; iwork < iend; ++iwork) {
-      functor(iwork, update, final);
-    }
-  }
-
-  template <class TagType>
-  inline static std::enable_if_t<!std::is_void<TagType>::value> exec_range(
-      const FunctorType& functor, const Member ibeg, const Member iend,
-      reference_type update, const bool final) {
-    const TagType t{};
-    for (Member iwork = ibeg; iwork < iend; ++iwork) {
-      functor(t, iwork, update, final);
-    }
-  }
-
- public:
-  inline void execute() const {
-    const int value_count          = Analysis::value_count(m_functor);
-    const size_t pool_reduce_bytes = 2 * Analysis::value_size(m_functor);
-
-    m_instance->resize_thread_data(pool_reduce_bytes, 0  // team_reduce_bytes
-                                   ,
-                                   0  // team_shared_bytes
-                                   ,
-                                   0  // thread_local_bytes
-    );
-
-    if (execute_in_serial(m_policy.space())) {
-      typename Analysis::Reducer final_reducer(&m_functor);
-
-      reference_type update = final_reducer.init(
-          pointer_type(m_instance->get_thread_data(0)->pool_reduce_local()));
-
-      ParallelScan::template exec_range<WorkTag>(m_functor, m_policy.begin(),
-                                                 m_policy.end(), update, true);
-
-      return;
-    }
-
-#pragma omp parallel num_threads(m_instance->thread_pool_size())
-    {
-      HostThreadTeamData& data = *(m_instance->get_thread_data());
-      typename Analysis::Reducer final_reducer(&m_functor);
-
-      const WorkRange range(m_policy, omp_get_thread_num(),
-                            omp_get_num_threads());
-
-      reference_type update_sum = final_reducer.init(
-          reinterpret_cast<pointer_type>(data.pool_reduce_local()));
-
-      ParallelScan::template exec_range<WorkTag>(
-          m_functor, range.begin(), range.end(), update_sum, false);
-
-      if (data.pool_rendezvous()) {
-        pointer_type ptr_prev = nullptr;
-
-        const int n = omp_get_num_threads();
-
-        for (int i = 0; i < n; ++i) {
-          pointer_type ptr =
-              (pointer_type)data.pool_member(i)->pool_reduce_local();
-
-          if (i) {
-            for (int j = 0; j < value_count; ++j) {
-              ptr[j + value_count] = ptr_prev[j + value_count];
-            }
-            final_reducer.join(ptr + value_count, ptr_prev);
-          } else {
-            final_reducer.init(ptr + value_count);
-          }
-
-          ptr_prev = ptr;
-        }
-
-        data.pool_rendezvous_release();
-      }
-
-      reference_type update_base = final_reducer.reference(
-          reinterpret_cast<pointer_type>(data.pool_reduce_local()) +
-          value_count);
-
-      ParallelScan::template exec_range<WorkTag>(
-          m_functor, range.begin(), range.end(), update_base, true);
-    }
-  }
-
-  //----------------------------------------
-
-  inline ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy)
-      : m_instance(nullptr), m_functor(arg_functor), m_policy(arg_policy) {
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
-    if (t_openmp_instance) {
-      m_instance = t_openmp_instance;
-    } else {
-      m_instance = arg_policy.space().impl_internal_space_instance();
-    }
-#else
-    m_instance = arg_policy.space().impl_internal_space_instance();
-#endif
-  }
-
-  //----------------------------------------
-};
-
-template <class FunctorType, class ReturnType, class... Traits>
-class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
-                            ReturnType, Kokkos::OpenMP> {
- private:
-  using Policy = Kokkos::RangePolicy<Traits...>;
-
-  using Analysis =
-      FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType>;
-
-  using WorkTag   = typename Policy::work_tag;
-  using WorkRange = typename Policy::WorkRange;
-  using Member    = typename Policy::member_type;
-
-  using value_type     = typename Analysis::value_type;
-  using pointer_type   = typename Analysis::pointer_type;
-  using reference_type = typename Analysis::reference_type;
-
-  OpenMPInternal* m_instance;
-  const FunctorType m_functor;
-  const Policy m_policy;
-  const pointer_type m_result_ptr;
-
-  template <class TagType>
-  inline static std::enable_if_t<std::is_void<TagType>::value> exec_range(
-      const FunctorType& functor, const Member ibeg, const Member iend,
-      reference_type update, const bool final) {
-    for (Member iwork = ibeg; iwork < iend; ++iwork) {
-      functor(iwork, update, final);
-    }
-  }
-
-  template <class TagType>
-  inline static std::enable_if_t<!std::is_void<TagType>::value> exec_range(
-      const FunctorType& functor, const Member ibeg, const Member iend,
-      reference_type update, const bool final) {
-    const TagType t{};
-    for (Member iwork = ibeg; iwork < iend; ++iwork) {
-      functor(t, iwork, update, final);
-    }
-  }
-
- public:
-  inline void execute() const {
-    const int value_count          = Analysis::value_count(m_functor);
-    const size_t pool_reduce_bytes = 2 * Analysis::value_size(m_functor);
-
-    m_instance->acquire_lock();
-
-    m_instance->resize_thread_data(pool_reduce_bytes, 0  // team_reduce_bytes
-                                   ,
-                                   0  // team_shared_bytes
-                                   ,
-                                   0  // thread_local_bytes
-    );
-
-    if (execute_in_serial(m_policy.space())) {
-      typename Analysis::Reducer final_reducer(&m_functor);
-
-      reference_type update = final_reducer.init(
-          pointer_type(m_instance->get_thread_data(0)->pool_reduce_local()));
-
-      this->template exec_range<WorkTag>(m_functor, m_policy.begin(),
-                                         m_policy.end(), update, true);
-
-      *m_result_ptr = update;
-
-      m_instance->release_lock();
-
-      return;
-    }
-
-#pragma omp parallel num_threads(m_instance->thread_pool_size())
-    {
-      HostThreadTeamData& data = *(m_instance->get_thread_data());
-      typename Analysis::Reducer final_reducer(&m_functor);
-
-      const WorkRange range(m_policy, omp_get_thread_num(),
-                            omp_get_num_threads());
-      reference_type update_sum = final_reducer.init(
-          reinterpret_cast<pointer_type>(data.pool_reduce_local()));
-
-      ParallelScanWithTotal::template exec_range<WorkTag>(
-          m_functor, range.begin(), range.end(), update_sum, false);
-
-      if (data.pool_rendezvous()) {
-        pointer_type ptr_prev = nullptr;
-
-        const int n = omp_get_num_threads();
-
-        for (int i = 0; i < n; ++i) {
-          pointer_type ptr =
-              (pointer_type)data.pool_member(i)->pool_reduce_local();
-
-          if (i) {
-            for (int j = 0; j < value_count; ++j) {
-              ptr[j + value_count] = ptr_prev[j + value_count];
-            }
-            final_reducer.join(ptr + value_count, ptr_prev);
-          } else {
-            final_reducer.init(ptr + value_count);
-          }
-
-          ptr_prev = ptr;
-        }
-
-        data.pool_rendezvous_release();
-      }
-
-      reference_type update_base = final_reducer.reference(
-          reinterpret_cast<pointer_type>(data.pool_reduce_local()) +
-          value_count);
-
-      ParallelScanWithTotal::template exec_range<WorkTag>(
-          m_functor, range.begin(), range.end(), update_base, true);
-
-      if (omp_get_thread_num() == omp_get_num_threads() - 1) {
-        *m_result_ptr = update_base;
-      }
-    }
-
-    m_instance->release_lock();
-  }
-
-  //----------------------------------------
-
-  template <class ViewType>
-  ParallelScanWithTotal(const FunctorType& arg_functor,
-                        const Policy& arg_policy,
-                        const ViewType& arg_result_view)
-      : m_instance(nullptr),
-        m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_result_ptr(arg_result_view.data()) {
-    static_assert(
-        Kokkos::Impl::MemorySpaceAccess<typename ViewType::memory_space,
-                                        Kokkos::HostSpace>::accessible,
-        "Kokkos::OpenMP parallel_scan result must be host-accessible!");
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
-    if (t_openmp_instance) {
-      m_instance = t_openmp_instance;
-    } else {
-      m_instance = arg_policy.space().impl_internal_space_instance();
-    }
-#else
-    m_instance = arg_policy.space().impl_internal_space_instance();
-#endif
-  }
-
-  //----------------------------------------
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-template <class FunctorType, class... Properties>
-class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
-                  Kokkos::OpenMP> {
- private:
-  enum { TEAM_REDUCE_SIZE = 512 };
-
-  using Policy =
-      Kokkos::Impl::TeamPolicyInternal<Kokkos::OpenMP, Properties...>;
-  using WorkTag  = typename Policy::work_tag;
-  using SchedTag = typename Policy::schedule_type::type;
-  using Member   = typename Policy::member_type;
-
-  OpenMPInternal* m_instance;
-  const FunctorType m_functor;
-  const Policy m_policy;
-  const size_t m_shmem_size;
-
-  template <class TagType>
-  inline static std::enable_if_t<(std::is_void<TagType>::value)> exec_team(
-      const FunctorType& functor, HostThreadTeamData& data,
-      const int league_rank_begin, const int league_rank_end,
-      const int league_size) {
-    for (int r = league_rank_begin; r < league_rank_end;) {
-      functor(Member(data, r, league_size));
-
-      if (++r < league_rank_end) {
-        // Don't allow team members to lap one another
-        // so that they don't overwrite shared memory.
-        if (data.team_rendezvous()) {
-          data.team_rendezvous_release();
-        }
-      }
-    }
-  }
-
-  template <class TagType>
-  inline static std::enable_if_t<(!std::is_void<TagType>::value)> exec_team(
-      const FunctorType& functor, HostThreadTeamData& data,
-      const int league_rank_begin, const int league_rank_end,
-      const int league_size) {
-    const TagType t{};
-
-    for (int r = league_rank_begin; r < league_rank_end;) {
-      functor(t, Member(data, r, league_size));
-
-      if (++r < league_rank_end) {
-        // Don't allow team members to lap one another
-        // so that they don't overwrite shared memory.
-        if (data.team_rendezvous()) {
-          data.team_rendezvous_release();
-        }
-      }
-    }
-  }
-
- public:
-  inline void execute() const {
-    enum { is_dynamic = std::is_same<SchedTag, Kokkos::Dynamic>::value };
-
-    const size_t pool_reduce_size  = 0;  // Never shrinks
-    const size_t team_reduce_size  = TEAM_REDUCE_SIZE * m_policy.team_size();
-    const size_t team_shared_size  = m_shmem_size;
-    const size_t thread_local_size = 0;  // Never shrinks
-
-    m_instance->acquire_lock();
-
-    m_instance->resize_thread_data(pool_reduce_size, team_reduce_size,
-                                   team_shared_size, thread_local_size);
-
-    if (execute_in_serial(m_policy.space())) {
-      ParallelFor::template exec_team<WorkTag>(
-          m_functor, *(m_instance->get_thread_data()), 0,
-          m_policy.league_size(), m_policy.league_size());
-
-      m_instance->release_lock();
-
-      return;
-    }
-
-#pragma omp parallel num_threads(m_instance->thread_pool_size())
-    {
-      HostThreadTeamData& data = *(m_instance->get_thread_data());
-
-      const int active = data.organize_team(m_policy.team_size());
-
-      if (active) {
-        data.set_work_partition(
-            m_policy.league_size(),
-            (0 < m_policy.chunk_size() ? m_policy.chunk_size()
-                                       : m_policy.team_iter()));
-      }
-
-      if (is_dynamic) {
-        // Must synchronize to make sure each team has set its
-        // partition before beginning the work stealing loop.
-        if (data.pool_rendezvous()) data.pool_rendezvous_release();
-      }
-
-      if (active) {
-        std::pair<int64_t, int64_t> range(0, 0);
-
-        do {
-          range = is_dynamic ? data.get_work_stealing_chunk()
-                             : data.get_work_partition();
-
-          ParallelFor::template exec_team<WorkTag>(m_functor, data, range.first,
-                                                   range.second,
-                                                   m_policy.league_size());
-
-        } while (is_dynamic && 0 <= range.first);
-      }
-
-      data.disband_team();
-    }
-
-    m_instance->release_lock();
-  }
-
-  inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
-      : m_instance(nullptr),
-        m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
-                     FunctorTeamShmemSize<FunctorType>::value(
-                         arg_functor, arg_policy.team_size())) {
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
-    if (t_openmp_instance) {
-      m_instance = t_openmp_instance;
-    } else {
-      m_instance = arg_policy.space().impl_internal_space_instance();
-    }
-#else
-    m_instance = arg_policy.space().impl_internal_space_instance();
-#endif
-  }
-};
-
-//----------------------------------------------------------------------------
-
-template <class FunctorType, class ReducerType, class... Properties>
-class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
-                     ReducerType, Kokkos::OpenMP> {
- private:
-  enum { TEAM_REDUCE_SIZE = 512 };
-
-  using Policy =
-      Kokkos::Impl::TeamPolicyInternal<Kokkos::OpenMP, Properties...>;
-
-  using WorkTag  = typename Policy::work_tag;
-  using SchedTag = typename Policy::schedule_type::type;
-  using Member   = typename Policy::member_type;
-
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using WorkTagFwd =
-      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
-                         void>;
-
-  using Analysis =
-      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, ReducerTypeFwd>;
-
-  using pointer_type   = typename Analysis::pointer_type;
-  using reference_type = typename Analysis::reference_type;
-
-  OpenMPInternal* m_instance;
-  const FunctorType m_functor;
-  const Policy m_policy;
-  const ReducerType m_reducer;
-  const pointer_type m_result_ptr;
-  const int m_shmem_size;
-
-  template <class TagType>
-  inline static std::enable_if_t<(std::is_void<TagType>::value)> exec_team(
-      const FunctorType& functor, HostThreadTeamData& data,
-      reference_type& update, const int league_rank_begin,
-      const int league_rank_end, const int league_size) {
-    for (int r = league_rank_begin; r < league_rank_end;) {
-      functor(Member(data, r, league_size), update);
-
-      if (++r < league_rank_end) {
-        // Don't allow team members to lap one another
-        // so that they don't overwrite shared memory.
-        if (data.team_rendezvous()) {
-          data.team_rendezvous_release();
-        }
-      }
-    }
-  }
-
-  template <class TagType>
-  inline static std::enable_if_t<(!std::is_void<TagType>::value)> exec_team(
-      const FunctorType& functor, HostThreadTeamData& data,
-      reference_type& update, const int league_rank_begin,
-      const int league_rank_end, const int league_size) {
-    const TagType t{};
-
-    for (int r = league_rank_begin; r < league_rank_end;) {
-      functor(t, Member(data, r, league_size), update);
-
-      if (++r < league_rank_end) {
-        // Don't allow team members to lap one another
-        // so that they don't overwrite shared memory.
-        if (data.team_rendezvous()) {
-          data.team_rendezvous_release();
-        }
-      }
-    }
-  }
-
- public:
-  inline void execute() const {
-    enum { is_dynamic = std::is_same<SchedTag, Kokkos::Dynamic>::value };
-
-    typename Analysis::Reducer final_reducer(
-        &ReducerConditional::select(m_functor, m_reducer));
-
-    if (m_policy.league_size() == 0 || m_policy.team_size() == 0) {
-      if (m_result_ptr) {
-        final_reducer.init(m_result_ptr);
-        final_reducer.final(m_result_ptr);
-      }
-      return;
-    }
-
-    const size_t pool_reduce_size =
-        Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));
-
-    const size_t team_reduce_size  = TEAM_REDUCE_SIZE * m_policy.team_size();
-    const size_t team_shared_size  = m_shmem_size + m_policy.scratch_size(1);
-    const size_t thread_local_size = 0;  // Never shrinks
-
-    m_instance->acquire_lock();
-
-    m_instance->resize_thread_data(pool_reduce_size, team_reduce_size,
-                                   team_shared_size, thread_local_size);
-
-    if (execute_in_serial(m_policy.space())) {
-      HostThreadTeamData& data = *(m_instance->get_thread_data());
-      pointer_type ptr =
-          m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());
-      reference_type update       = final_reducer.init(ptr);
-      const int league_rank_begin = 0;
-      const int league_rank_end   = m_policy.league_size();
-      ParallelReduce::template exec_team<WorkTag>(
-          m_functor, data, update, league_rank_begin, league_rank_end,
-          m_policy.league_size());
-
-      final_reducer.final(ptr);
-
-      m_instance->release_lock();
-
-      return;
-    }
-
-    const int pool_size = m_instance->thread_pool_size();
-#pragma omp parallel num_threads(pool_size)
-    {
-      HostThreadTeamData& data = *(m_instance->get_thread_data());
-
-      const int active = data.organize_team(m_policy.team_size());
-
-      if (active) {
-        data.set_work_partition(
-            m_policy.league_size(),
-            (0 < m_policy.chunk_size() ? m_policy.chunk_size()
-                                       : m_policy.team_iter()));
-      }
-
-      if (is_dynamic) {
-        // Must synchronize to make sure each team has set its
-        // partition before beginning the work stealing loop.
-        if (data.pool_rendezvous()) data.pool_rendezvous_release();
-      }
-
-      if (active) {
-        reference_type update = final_reducer.init(
-            reinterpret_cast<pointer_type>(data.pool_reduce_local()));
-
-        std::pair<int64_t, int64_t> range(0, 0);
-
-        do {
-          range = is_dynamic ? data.get_work_stealing_chunk()
-                             : data.get_work_partition();
-
-          ParallelReduce::template exec_team<WorkTag>(m_functor, data, update,
-                                                      range.first, range.second,
-                                                      m_policy.league_size());
-
-        } while (is_dynamic && 0 <= range.first);
-      } else {
-        final_reducer.init(
-            reinterpret_cast<pointer_type>(data.pool_reduce_local()));
-      }
-
-      data.disband_team();
-
-      //  This thread has updated 'pool_reduce_local()' with its
-      //  contributions to the reduction.  The parallel region is
-      //  about to terminate and the master thread will load and
-      //  reduce each 'pool_reduce_local()' contribution.
-      //  Must 'memory_fence()' to guarantee that storing the update to
-      //  'pool_reduce_local()' will complete before this thread
-      //  exits the parallel region.
-
-      memory_fence();
-    }
-
-    // Reduction:
-
-    const pointer_type ptr =
-        pointer_type(m_instance->get_thread_data(0)->pool_reduce_local());
-
-    for (int i = 1; i < pool_size; ++i) {
-      final_reducer.join(
-          ptr, reinterpret_cast<pointer_type>(
-                   m_instance->get_thread_data(i)->pool_reduce_local()));
-    }
-
-    final_reducer.final(ptr);
-
-    if (m_result_ptr) {
-      const int n = Analysis::value_count(
-          ReducerConditional::select(m_functor, m_reducer));
-
-      for (int j = 0; j < n; ++j) {
-        m_result_ptr[j] = ptr[j];
-      }
-    }
-
-    m_instance->release_lock();
-  }
-
-  //----------------------------------------
-
-  template <class ViewType>
-  inline ParallelReduce(
-      const FunctorType& arg_functor, const Policy& arg_policy,
-      const ViewType& arg_result,
-      std::enable_if_t<Kokkos::is_view<ViewType>::value &&
-                           !Kokkos::is_reducer<ReducerType>::value,
-                       void*> = nullptr)
-      : m_instance(nullptr),
-        m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(InvalidType()),
-        m_result_ptr(arg_result.data()),
-        m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
-                     FunctorTeamShmemSize<FunctorType>::value(
-                         arg_functor, arg_policy.team_size())) {
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
-    if (t_openmp_instance) {
-      m_instance = t_openmp_instance;
-    } else {
-      m_instance = arg_policy.space().impl_internal_space_instance();
-    }
-#else
-    m_instance = arg_policy.space().impl_internal_space_instance();
-#endif
-  }
-
-  inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy,
-                        const ReducerType& reducer)
-      : m_instance(nullptr),
-        m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()),
-        m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
-                     FunctorTeamShmemSize<FunctorType>::value(
-                         arg_functor, arg_policy.team_size())) {
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
-    if (t_openmp_instance) {
-      m_instance = t_openmp_instance;
-    } else {
-      m_instance = arg_policy.space().impl_internal_space_instance();
-    }
-#else
-    m_instance = arg_policy.space().impl_internal_space_instance();
-#endif
-    /*static_assert( std::is_same< typename ViewType::memory_space
-                            , Kokkos::HostSpace >::value
-    , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
-    );*/
-  }
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#undef KOKKOS_PRAGMA_IVDEP_IF_ENABLED
-#undef KOKKOS_OPENMP_OPTIONAL_CHUNK_SIZE
-
-#endif
-#endif /* KOKKOS_OPENMP_PARALLEL_HPP */
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..96dc664eb79a58fc0c9363eca928e2d7455333dd
--- /dev/null
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp
@@ -0,0 +1,433 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_OPENMP_PARALLEL_FOR_HPP
+#define KOKKOS_OPENMP_PARALLEL_FOR_HPP
+
+#include <omp.h>
+#include <OpenMP/Kokkos_OpenMP_Instance.hpp>
+#include <KokkosExp_MDRangePolicy.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#define KOKKOS_PRAGMA_IVDEP_IF_ENABLED
+#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
+    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
+#undef KOKKOS_PRAGMA_IVDEP_IF_ENABLED
+#define KOKKOS_PRAGMA_IVDEP_IF_ENABLED _Pragma("ivdep")
+#endif
+
+#ifndef KOKKOS_COMPILER_NVHPC
+#define KOKKOS_OPENMP_OPTIONAL_CHUNK_SIZE , m_policy.chunk_size()
+#else
+#define KOKKOS_OPENMP_OPTIONAL_CHUNK_SIZE
+#endif
+
+namespace Kokkos {
+namespace Impl {
+
+template <class FunctorType, class... Traits>
+class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::OpenMP> {
+ private:
+  using Policy  = Kokkos::RangePolicy<Traits...>;
+  using WorkTag = typename Policy::work_tag;
+  using Member  = typename Policy::member_type;
+
+  OpenMPInternal* m_instance;
+  const FunctorType m_functor;
+  const Policy m_policy;
+
+  inline static void exec_range(const FunctorType& functor, const Member ibeg,
+                                const Member iend) {
+    KOKKOS_PRAGMA_IVDEP_IF_ENABLED
+    for (auto iwork = ibeg; iwork < iend; ++iwork) {
+      exec_work(functor, iwork);
+    }
+  }
+
+  template <class Enable = WorkTag>
+  inline static std::enable_if_t<std::is_void<WorkTag>::value &&
+                                 std::is_same<Enable, WorkTag>::value>
+  exec_work(const FunctorType& functor, const Member iwork) {
+    functor(iwork);
+  }
+
+  template <class Enable = WorkTag>
+  inline static std::enable_if_t<!std::is_void<WorkTag>::value &&
+                                 std::is_same<Enable, WorkTag>::value>
+  exec_work(const FunctorType& functor, const Member iwork) {
+    functor(WorkTag{}, iwork);
+  }
+
+  template <class Policy>
+  std::enable_if_t<std::is_same<typename Policy::schedule_type::type,
+                                Kokkos::Dynamic>::value>
+  execute_parallel() const {
+    // prevent bug in NVHPC 21.9/CUDA 11.4 (entering zero iterations loop)
+    if (m_policy.begin() >= m_policy.end()) return;
+#pragma omp parallel for schedule(dynamic KOKKOS_OPENMP_OPTIONAL_CHUNK_SIZE) \
+    num_threads(m_instance->thread_pool_size())
+    KOKKOS_PRAGMA_IVDEP_IF_ENABLED
+    for (auto iwork = m_policy.begin(); iwork < m_policy.end(); ++iwork) {
+      exec_work(m_functor, iwork);
+    }
+  }
+
+  template <class Policy>
+  std::enable_if_t<!std::is_same<typename Policy::schedule_type::type,
+                                 Kokkos::Dynamic>::value>
+  execute_parallel() const {
+// Specifying an chunksize with GCC compiler leads to performance regression
+// with static schedule.
+#ifdef KOKKOS_COMPILER_GNU
+#pragma omp parallel for schedule(static) \
+    num_threads(m_instance->thread_pool_size())
+#else
+#pragma omp parallel for schedule(static KOKKOS_OPENMP_OPTIONAL_CHUNK_SIZE) \
+    num_threads(m_instance->thread_pool_size())
+#endif
+    KOKKOS_PRAGMA_IVDEP_IF_ENABLED
+    for (auto iwork = m_policy.begin(); iwork < m_policy.end(); ++iwork) {
+      exec_work(m_functor, iwork);
+    }
+  }
+
+ public:
+  inline void execute() const {
+    if (execute_in_serial(m_policy.space())) {
+      exec_range(m_functor, m_policy.begin(), m_policy.end());
+      return;
+    }
+
+#ifndef KOKKOS_INTERNAL_DISABLE_NATIVE_OPENMP
+    execute_parallel<Policy>();
+#else
+    constexpr bool is_dynamic =
+        std::is_same<typename Policy::schedule_type::type,
+                     Kokkos::Dynamic>::value;
+#pragma omp parallel num_threads(m_instance->thread_pool_size())
+    {
+      HostThreadTeamData& data = *(m_instance->get_thread_data());
+
+      data.set_work_partition(m_policy.end() - m_policy.begin(),
+                              m_policy.chunk_size());
+
+      if (is_dynamic) {
+        // Make sure work partition is set before stealing
+        if (data.pool_rendezvous()) data.pool_rendezvous_release();
+      }
+
+      std::pair<int64_t, int64_t> range(0, 0);
+
+      do {
+        range = is_dynamic ? data.get_work_stealing_chunk()
+                           : data.get_work_partition();
+
+        exec_range(m_functor, range.first + m_policy.begin(),
+                   range.second + m_policy.begin());
+
+      } while (is_dynamic && 0 <= range.first);
+    }
+#endif
+  }
+
+  inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy)
+      : m_instance(nullptr), m_functor(arg_functor), m_policy(arg_policy) {
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+    if (t_openmp_instance) {
+      m_instance = t_openmp_instance;
+    } else {
+      m_instance = arg_policy.space().impl_internal_space_instance();
+    }
+#else
+    m_instance = arg_policy.space().impl_internal_space_instance();
+#endif
+  }
+};
+
+// MDRangePolicy impl
+template <class FunctorType, class... Traits>
+class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
+                  Kokkos::OpenMP> {
+ private:
+  using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
+  using Policy        = typename MDRangePolicy::impl_range_policy;
+  using WorkTag       = typename MDRangePolicy::work_tag;
+
+  using Member = typename Policy::member_type;
+
+  using index_type   = typename Policy::index_type;
+  using iterate_type = typename Kokkos::Impl::HostIterateTile<
+      MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void>;
+
+  OpenMPInternal* m_instance;
+  const iterate_type m_iter;
+
+  inline void exec_range(const Member ibeg, const Member iend) const {
+    KOKKOS_PRAGMA_IVDEP_IF_ENABLED
+    for (Member iwork = ibeg; iwork < iend; ++iwork) {
+      m_iter(iwork);
+    }
+  }
+
+  template <class Policy>
+  typename std::enable_if_t<std::is_same<typename Policy::schedule_type::type,
+                                         Kokkos::Dynamic>::value>
+  execute_parallel() const {
+#pragma omp parallel for schedule(dynamic, 1) \
+    num_threads(m_instance->thread_pool_size())
+    KOKKOS_PRAGMA_IVDEP_IF_ENABLED
+    for (index_type iwork = 0; iwork < m_iter.m_rp.m_num_tiles; ++iwork) {
+      m_iter(iwork);
+    }
+  }
+
+  template <class Policy>
+  typename std::enable_if<!std::is_same<typename Policy::schedule_type::type,
+                                        Kokkos::Dynamic>::value>::type
+  execute_parallel() const {
+#pragma omp parallel for schedule(static, 1) \
+    num_threads(m_instance->thread_pool_size())
+    KOKKOS_PRAGMA_IVDEP_IF_ENABLED
+    for (index_type iwork = 0; iwork < m_iter.m_rp.m_num_tiles; ++iwork) {
+      m_iter(iwork);
+    }
+  }
+
+ public:
+  inline void execute() const {
+#ifndef KOKKOS_COMPILER_INTEL
+    if (execute_in_serial(m_iter.m_rp.space())) {
+      exec_range(0, m_iter.m_rp.m_num_tiles);
+      return;
+    }
+#endif
+
+#ifndef KOKKOS_INTERNAL_DISABLE_NATIVE_OPENMP
+    execute_parallel<Policy>();
+#else
+    constexpr bool is_dynamic =
+        std::is_same<typename Policy::schedule_type::type,
+                     Kokkos::Dynamic>::value;
+
+#pragma omp parallel num_threads(m_instance->thread_pool_size())
+    {
+      HostThreadTeamData& data = *(m_instance->get_thread_data());
+
+      data.set_work_partition(m_iter.m_rp.m_num_tiles, 1);
+
+      if (is_dynamic) {
+        // Make sure work partition is set before stealing
+        if (data.pool_rendezvous()) data.pool_rendezvous_release();
+      }
+
+      std::pair<int64_t, int64_t> range(0, 0);
+
+      do {
+        range = is_dynamic ? data.get_work_stealing_chunk()
+                           : data.get_work_partition();
+
+        exec_range(range.first, range.second);
+
+      } while (is_dynamic && 0 <= range.first);
+    }
+    // END #pragma omp parallel
+#endif
+  }
+
+  inline ParallelFor(const FunctorType& arg_functor, MDRangePolicy arg_policy)
+      : m_instance(nullptr), m_iter(arg_policy, arg_functor) {
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+    if (t_openmp_instance) {
+      m_instance = t_openmp_instance;
+    } else {
+      m_instance = arg_policy.space().impl_internal_space_instance();
+    }
+#else
+    m_instance = arg_policy.space().impl_internal_space_instance();
+#endif
+  }
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy&, const Functor&) {
+    /**
+     * 1024 here is just our guess for a reasonable max tile size,
+     * it isn't a hardware constraint. If people see a use for larger
+     * tile size products, we're happy to change this.
+     */
+    return 1024;
+  }
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template <class FunctorType, class... Properties>
+class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
+                  Kokkos::OpenMP> {
+ private:
+  enum { TEAM_REDUCE_SIZE = 512 };
+
+  using Policy =
+      Kokkos::Impl::TeamPolicyInternal<Kokkos::OpenMP, Properties...>;
+  using WorkTag  = typename Policy::work_tag;
+  using SchedTag = typename Policy::schedule_type::type;
+  using Member   = typename Policy::member_type;
+
+  OpenMPInternal* m_instance;
+  const FunctorType m_functor;
+  const Policy m_policy;
+  const size_t m_shmem_size;
+
+  template <class TagType>
+  inline static std::enable_if_t<(std::is_void<TagType>::value)> exec_team(
+      const FunctorType& functor, HostThreadTeamData& data,
+      const int league_rank_begin, const int league_rank_end,
+      const int league_size) {
+    for (int r = league_rank_begin; r < league_rank_end;) {
+      functor(Member(data, r, league_size));
+
+      if (++r < league_rank_end) {
+        // Don't allow team members to lap one another
+        // so that they don't overwrite shared memory.
+        if (data.team_rendezvous()) {
+          data.team_rendezvous_release();
+        }
+      }
+    }
+  }
+
+  template <class TagType>
+  inline static std::enable_if_t<(!std::is_void<TagType>::value)> exec_team(
+      const FunctorType& functor, HostThreadTeamData& data,
+      const int league_rank_begin, const int league_rank_end,
+      const int league_size) {
+    const TagType t{};
+
+    for (int r = league_rank_begin; r < league_rank_end;) {
+      functor(t, Member(data, r, league_size));
+
+      if (++r < league_rank_end) {
+        // Don't allow team members to lap one another
+        // so that they don't overwrite shared memory.
+        if (data.team_rendezvous()) {
+          data.team_rendezvous_release();
+        }
+      }
+    }
+  }
+
+ public:
+  inline void execute() const {
+    enum { is_dynamic = std::is_same<SchedTag, Kokkos::Dynamic>::value };
+
+    const size_t pool_reduce_size  = 0;  // Never shrinks
+    const size_t team_reduce_size  = TEAM_REDUCE_SIZE * m_policy.team_size();
+    const size_t team_shared_size  = m_shmem_size;
+    const size_t thread_local_size = 0;  // Never shrinks
+
+    m_instance->acquire_lock();
+
+    m_instance->resize_thread_data(pool_reduce_size, team_reduce_size,
+                                   team_shared_size, thread_local_size);
+
+    if (execute_in_serial(m_policy.space())) {
+      ParallelFor::template exec_team<WorkTag>(
+          m_functor, *(m_instance->get_thread_data()), 0,
+          m_policy.league_size(), m_policy.league_size());
+
+      m_instance->release_lock();
+
+      return;
+    }
+
+#pragma omp parallel num_threads(m_instance->thread_pool_size())
+    {
+      HostThreadTeamData& data = *(m_instance->get_thread_data());
+
+      const int active = data.organize_team(m_policy.team_size());
+
+      if (active) {
+        data.set_work_partition(
+            m_policy.league_size(),
+            (0 < m_policy.chunk_size() ? m_policy.chunk_size()
+                                       : m_policy.team_iter()));
+      }
+
+      if (is_dynamic) {
+        // Must synchronize to make sure each team has set its
+        // partition before beginning the work stealing loop.
+        if (data.pool_rendezvous()) data.pool_rendezvous_release();
+      }
+
+      if (active) {
+        std::pair<int64_t, int64_t> range(0, 0);
+
+        do {
+          range = is_dynamic ? data.get_work_stealing_chunk()
+                             : data.get_work_partition();
+
+          ParallelFor::template exec_team<WorkTag>(m_functor, data, range.first,
+                                                   range.second,
+                                                   m_policy.league_size());
+
+        } while (is_dynamic && 0 <= range.first);
+      }
+
+      data.disband_team();
+    }
+
+    m_instance->release_lock();
+  }
+
+  inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
+      : m_instance(nullptr),
+        m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
+                     FunctorTeamShmemSize<FunctorType>::value(
+                         arg_functor, arg_policy.team_size())) {
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+    if (t_openmp_instance) {
+      m_instance = t_openmp_instance;
+    } else {
+      m_instance = arg_policy.space().impl_internal_space_instance();
+    }
+#else
+    m_instance = arg_policy.space().impl_internal_space_instance();
+#endif
+  }
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#undef KOKKOS_PRAGMA_IVDEP_IF_ENABLED
+#undef KOKKOS_OPENMP_OPTIONAL_CHUNK_SIZE
+
+#endif /* KOKKOS_OPENMP_PARALLEL_FOR_HPP */
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..52cdef18e65965da7d1dcdbb263e98df6b4a60f0
--- /dev/null
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp
@@ -0,0 +1,567 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_OPENMP_PARALLEL_REDUCE_HPP
+#define KOKKOS_OPENMP_PARALLEL_REDUCE_HPP
+
+#include <omp.h>
+#include <OpenMP/Kokkos_OpenMP_Instance.hpp>
+#include <KokkosExp_MDRangePolicy.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+template <class CombinedFunctorReducerType, class... Traits>
+class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
+                     Kokkos::OpenMP> {
+ private:
+  using Policy      = Kokkos::RangePolicy<Traits...>;
+  using FunctorType = typename CombinedFunctorReducerType::functor_type;
+  using ReducerType = typename CombinedFunctorReducerType::reducer_type;
+
+  using WorkTag = typename Policy::work_tag;
+  using Member  = typename Policy::member_type;
+
+  using pointer_type   = typename ReducerType::pointer_type;
+  using reference_type = typename ReducerType::reference_type;
+
+  OpenMPInternal* m_instance;
+  const CombinedFunctorReducerType m_functor_reducer;
+  const Policy m_policy;
+  const pointer_type m_result_ptr;
+
+  template <class TagType>
+  inline static std::enable_if_t<std::is_void<TagType>::value> exec_range(
+      const FunctorType& functor, const Member ibeg, const Member iend,
+      reference_type update) {
+    for (Member iwork = ibeg; iwork < iend; ++iwork) {
+      functor(iwork, update);
+    }
+  }
+
+  template <class TagType>
+  inline static std::enable_if_t<!std::is_void<TagType>::value> exec_range(
+      const FunctorType& functor, const Member ibeg, const Member iend,
+      reference_type update) {
+    const TagType t{};
+    for (Member iwork = ibeg; iwork < iend; ++iwork) {
+      functor(t, iwork, update);
+    }
+  }
+
+ public:
+  inline void execute() const {
+    const ReducerType& reducer = m_functor_reducer.get_reducer();
+
+    if (m_policy.end() <= m_policy.begin()) {
+      if (m_result_ptr) {
+        reducer.init(m_result_ptr);
+        reducer.final(m_result_ptr);
+      }
+      return;
+    }
+    enum {
+      is_dynamic = std::is_same<typename Policy::schedule_type::type,
+                                Kokkos::Dynamic>::value
+    };
+
+    const size_t pool_reduce_bytes = reducer.value_size();
+
+    m_instance->acquire_lock();
+
+    m_instance->resize_thread_data(pool_reduce_bytes, 0  // team_reduce_bytes
+                                   ,
+                                   0  // team_shared_bytes
+                                   ,
+                                   0  // thread_local_bytes
+    );
+
+    if (execute_in_serial(m_policy.space())) {
+      const pointer_type ptr =
+          m_result_ptr
+              ? m_result_ptr
+              : pointer_type(
+                    m_instance->get_thread_data(0)->pool_reduce_local());
+
+      reference_type update = reducer.init(ptr);
+
+      ParallelReduce::template exec_range<WorkTag>(
+          m_functor_reducer.get_functor(), m_policy.begin(), m_policy.end(),
+          update);
+
+      reducer.final(ptr);
+      return;
+    }
+    const int pool_size = m_instance->thread_pool_size();
+#pragma omp parallel num_threads(pool_size)
+    {
+      HostThreadTeamData& data = *(m_instance->get_thread_data());
+
+      data.set_work_partition(m_policy.end() - m_policy.begin(),
+                              m_policy.chunk_size());
+
+      if (is_dynamic) {
+        // Make sure work partition is set before stealing
+        if (data.pool_rendezvous()) data.pool_rendezvous_release();
+      }
+
+      reference_type update = reducer.init(
+          reinterpret_cast<pointer_type>(data.pool_reduce_local()));
+
+      std::pair<int64_t, int64_t> range(0, 0);
+
+      do {
+        range = is_dynamic ? data.get_work_stealing_chunk()
+                           : data.get_work_partition();
+
+        ParallelReduce::template exec_range<WorkTag>(
+            m_functor_reducer.get_functor(), range.first + m_policy.begin(),
+            range.second + m_policy.begin(), update);
+
+      } while (is_dynamic && 0 <= range.first);
+    }
+
+    // Reduction:
+
+    const pointer_type ptr =
+        pointer_type(m_instance->get_thread_data(0)->pool_reduce_local());
+
+    for (int i = 1; i < pool_size; ++i) {
+      reducer.join(ptr,
+                   reinterpret_cast<pointer_type>(
+                       m_instance->get_thread_data(i)->pool_reduce_local()));
+    }
+
+    reducer.final(ptr);
+
+    if (m_result_ptr) {
+      const int n = reducer.value_count();
+
+      for (int j = 0; j < n; ++j) {
+        m_result_ptr[j] = ptr[j];
+      }
+    }
+
+    m_instance->release_lock();
+  }
+
+  //----------------------------------------
+
+  template <class ViewType>
+  inline ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer,
+                        Policy arg_policy, const ViewType& arg_view)
+      : m_instance(nullptr),
+        m_functor_reducer(arg_functor_reducer),
+        m_policy(arg_policy),
+        m_result_ptr(arg_view.data()) {
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+    if (t_openmp_instance) {
+      m_instance = t_openmp_instance;
+    } else {
+      m_instance = arg_policy.space().impl_internal_space_instance();
+    }
+#else
+    m_instance = arg_policy.space().impl_internal_space_instance();
+#endif
+    static_assert(
+        Kokkos::Impl::MemorySpaceAccess<typename ViewType::memory_space,
+                                        Kokkos::HostSpace>::accessible,
+        "Kokkos::OpenMP reduce result must be a View accessible from "
+        "HostSpace");
+  }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+// MDRangePolicy impl
+template <class CombinedFunctorReducerType, class... Traits>
+class ParallelReduce<CombinedFunctorReducerType,
+                     Kokkos::MDRangePolicy<Traits...>, Kokkos::OpenMP> {
+ private:
+  using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
+  using Policy        = typename MDRangePolicy::impl_range_policy;
+  using FunctorType   = typename CombinedFunctorReducerType::functor_type;
+  using ReducerType   = typename CombinedFunctorReducerType::reducer_type;
+
+  using WorkTag = typename MDRangePolicy::work_tag;
+  using Member  = typename Policy::member_type;
+
+  using pointer_type   = typename ReducerType::pointer_type;
+  using value_type     = typename ReducerType::value_type;
+  using reference_type = typename ReducerType::reference_type;
+
+  using iterate_type = typename Kokkos::Impl::HostIterateTile<
+      MDRangePolicy, CombinedFunctorReducerType, WorkTag, reference_type>;
+
+  OpenMPInternal* m_instance;
+  const iterate_type m_iter;
+  const pointer_type m_result_ptr;
+
+  inline void exec_range(const Member ibeg, const Member iend,
+                         reference_type update) const {
+    for (Member iwork = ibeg; iwork < iend; ++iwork) {
+      m_iter(iwork, update);
+    }
+  }
+
+ public:
+  inline void execute() const {
+    const ReducerType& reducer     = m_iter.m_func.get_reducer();
+    const size_t pool_reduce_bytes = reducer.value_size();
+
+    m_instance->acquire_lock();
+
+    m_instance->resize_thread_data(pool_reduce_bytes, 0  // team_reduce_bytes
+                                   ,
+                                   0  // team_shared_bytes
+                                   ,
+                                   0  // thread_local_bytes
+    );
+
+#ifndef KOKKOS_COMPILER_INTEL
+    if (execute_in_serial(m_iter.m_rp.space())) {
+      const pointer_type ptr =
+          m_result_ptr
+              ? m_result_ptr
+              : pointer_type(
+                    m_instance->get_thread_data(0)->pool_reduce_local());
+
+      reference_type update = reducer.init(ptr);
+
+      ParallelReduce::exec_range(0, m_iter.m_rp.m_num_tiles, update);
+
+      reducer.final(ptr);
+
+      m_instance->release_lock();
+
+      return;
+    }
+#endif
+
+    enum {
+      is_dynamic = std::is_same<typename Policy::schedule_type::type,
+                                Kokkos::Dynamic>::value
+    };
+
+    const int pool_size = m_instance->thread_pool_size();
+#pragma omp parallel num_threads(pool_size)
+    {
+      HostThreadTeamData& data = *(m_instance->get_thread_data());
+
+      data.set_work_partition(m_iter.m_rp.m_num_tiles, 1);
+
+      if (is_dynamic) {
+        // Make sure work partition is set before stealing
+        if (data.pool_rendezvous()) data.pool_rendezvous_release();
+      }
+
+      reference_type update = reducer.init(
+          reinterpret_cast<pointer_type>(data.pool_reduce_local()));
+
+      std::pair<int64_t, int64_t> range(0, 0);
+
+      do {
+        range = is_dynamic ? data.get_work_stealing_chunk()
+                           : data.get_work_partition();
+
+        ParallelReduce::exec_range(range.first, range.second, update);
+
+      } while (is_dynamic && 0 <= range.first);
+    }
+    // END #pragma omp parallel
+
+    // Reduction:
+
+    const pointer_type ptr =
+        pointer_type(m_instance->get_thread_data(0)->pool_reduce_local());
+
+    for (int i = 1; i < pool_size; ++i) {
+      reducer.join(ptr,
+                   reinterpret_cast<pointer_type>(
+                       m_instance->get_thread_data(i)->pool_reduce_local()));
+    }
+
+    reducer.final(ptr);
+
+    if (m_result_ptr) {
+      const int n = reducer.value_count();
+
+      for (int j = 0; j < n; ++j) {
+        m_result_ptr[j] = ptr[j];
+      }
+    }
+
+    m_instance->release_lock();
+  }
+
+  //----------------------------------------
+
+  template <class ViewType>
+  ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer,
+                 MDRangePolicy arg_policy, const ViewType& arg_view)
+      : m_instance(nullptr),
+        m_iter(arg_policy, arg_functor_reducer),
+        m_result_ptr(arg_view.data()) {
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+    if (t_openmp_instance) {
+      m_instance = t_openmp_instance;
+    } else {
+      m_instance = arg_policy.space().impl_internal_space_instance();
+    }
+#else
+    m_instance = arg_policy.space().impl_internal_space_instance();
+#endif
+    static_assert(
+        Kokkos::Impl::MemorySpaceAccess<typename ViewType::memory_space,
+                                        Kokkos::HostSpace>::accessible,
+        "Kokkos::OpenMP reduce result must be a View accessible from "
+        "HostSpace");
+  }
+
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy&, const Functor&) {
+    /**
+     * 1024 here is just our guess for a reasonable max tile size,
+     * it isn't a hardware constraint. If people see a use for larger
+     * tile size products, we're happy to change this.
+     */
+    return 1024;
+  }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+template <class CombinedFunctorReducerType, class... Properties>
+class ParallelReduce<CombinedFunctorReducerType,
+                     Kokkos::TeamPolicy<Properties...>, Kokkos::OpenMP> {
+ private:
+  enum { TEAM_REDUCE_SIZE = 512 };
+
+  using Policy =
+      Kokkos::Impl::TeamPolicyInternal<Kokkos::OpenMP, Properties...>;
+  using FunctorType = typename CombinedFunctorReducerType::functor_type;
+  using ReducerType = typename CombinedFunctorReducerType::reducer_type;
+
+  using WorkTag  = typename Policy::work_tag;
+  using SchedTag = typename Policy::schedule_type::type;
+  using Member   = typename Policy::member_type;
+
+  using pointer_type   = typename ReducerType::pointer_type;
+  using reference_type = typename ReducerType::reference_type;
+
+  OpenMPInternal* m_instance;
+  const CombinedFunctorReducerType m_functor_reducer;
+  const Policy m_policy;
+  const pointer_type m_result_ptr;
+  const int m_shmem_size;
+
+  template <class TagType>
+  inline static std::enable_if_t<(std::is_void<TagType>::value)> exec_team(
+      const FunctorType& functor, HostThreadTeamData& data,
+      reference_type& update, const int league_rank_begin,
+      const int league_rank_end, const int league_size) {
+    for (int r = league_rank_begin; r < league_rank_end;) {
+      functor(Member(data, r, league_size), update);
+
+      if (++r < league_rank_end) {
+        // Don't allow team members to lap one another
+        // so that they don't overwrite shared memory.
+        if (data.team_rendezvous()) {
+          data.team_rendezvous_release();
+        }
+      }
+    }
+  }
+
+  template <class TagType>
+  inline static std::enable_if_t<(!std::is_void<TagType>::value)> exec_team(
+      const FunctorType& functor, HostThreadTeamData& data,
+      reference_type& update, const int league_rank_begin,
+      const int league_rank_end, const int league_size) {
+    const TagType t{};
+
+    for (int r = league_rank_begin; r < league_rank_end;) {
+      functor(t, Member(data, r, league_size), update);
+
+      if (++r < league_rank_end) {
+        // Don't allow team members to lap one another
+        // so that they don't overwrite shared memory.
+        if (data.team_rendezvous()) {
+          data.team_rendezvous_release();
+        }
+      }
+    }
+  }
+
+ public:
+  inline void execute() const {
+    enum { is_dynamic = std::is_same<SchedTag, Kokkos::Dynamic>::value };
+
+    const ReducerType& reducer = m_functor_reducer.get_reducer();
+
+    if (m_policy.league_size() == 0 || m_policy.team_size() == 0) {
+      if (m_result_ptr) {
+        reducer.init(m_result_ptr);
+        reducer.final(m_result_ptr);
+      }
+      return;
+    }
+
+    const size_t pool_reduce_size = reducer.value_size();
+
+    const size_t team_reduce_size  = TEAM_REDUCE_SIZE * m_policy.team_size();
+    const size_t team_shared_size  = m_shmem_size + m_policy.scratch_size(1);
+    const size_t thread_local_size = 0;  // Never shrinks
+
+    m_instance->acquire_lock();
+
+    m_instance->resize_thread_data(pool_reduce_size, team_reduce_size,
+                                   team_shared_size, thread_local_size);
+
+    if (execute_in_serial(m_policy.space())) {
+      HostThreadTeamData& data = *(m_instance->get_thread_data());
+      pointer_type ptr =
+          m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());
+      reference_type update       = reducer.init(ptr);
+      const int league_rank_begin = 0;
+      const int league_rank_end   = m_policy.league_size();
+      ParallelReduce::template exec_team<WorkTag>(
+          m_functor_reducer.get_functor(), data, update, league_rank_begin,
+          league_rank_end, m_policy.league_size());
+
+      reducer.final(ptr);
+
+      m_instance->release_lock();
+
+      return;
+    }
+
+    const int pool_size = m_instance->thread_pool_size();
+#pragma omp parallel num_threads(pool_size)
+    {
+      HostThreadTeamData& data = *(m_instance->get_thread_data());
+
+      const int active = data.organize_team(m_policy.team_size());
+
+      if (active) {
+        data.set_work_partition(
+            m_policy.league_size(),
+            (0 < m_policy.chunk_size() ? m_policy.chunk_size()
+                                       : m_policy.team_iter()));
+      }
+
+      if (is_dynamic) {
+        // Must synchronize to make sure each team has set its
+        // partition before beginning the work stealing loop.
+        if (data.pool_rendezvous()) data.pool_rendezvous_release();
+      }
+
+      if (active) {
+        reference_type update = reducer.init(
+            reinterpret_cast<pointer_type>(data.pool_reduce_local()));
+
+        std::pair<int64_t, int64_t> range(0, 0);
+
+        do {
+          range = is_dynamic ? data.get_work_stealing_chunk()
+                             : data.get_work_partition();
+
+          ParallelReduce::template exec_team<WorkTag>(
+              m_functor_reducer.get_functor(), data, update, range.first,
+              range.second, m_policy.league_size());
+
+        } while (is_dynamic && 0 <= range.first);
+      } else {
+        reducer.init(reinterpret_cast<pointer_type>(data.pool_reduce_local()));
+      }
+
+      data.disband_team();
+
+      //  This thread has updated 'pool_reduce_local()' with its
+      //  contributions to the reduction.  The parallel region is
+      //  about to terminate and the master thread will load and
+      //  reduce each 'pool_reduce_local()' contribution.
+      //  Must 'memory_fence()' to guarantee that storing the update to
+      //  'pool_reduce_local()' will complete before this thread
+      //  exits the parallel region.
+
+      memory_fence();
+    }
+
+    // Reduction:
+
+    const pointer_type ptr =
+        pointer_type(m_instance->get_thread_data(0)->pool_reduce_local());
+
+    for (int i = 1; i < pool_size; ++i) {
+      reducer.join(ptr,
+                   reinterpret_cast<pointer_type>(
+                       m_instance->get_thread_data(i)->pool_reduce_local()));
+    }
+
+    reducer.final(ptr);
+
+    if (m_result_ptr) {
+      const int n = reducer.value_count();
+
+      for (int j = 0; j < n; ++j) {
+        m_result_ptr[j] = ptr[j];
+      }
+    }
+
+    m_instance->release_lock();
+  }
+
+  //----------------------------------------
+
+  template <class ViewType>
+  inline ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer,
+                        const Policy& arg_policy, const ViewType& arg_result)
+      : m_instance(nullptr),
+        m_functor_reducer(arg_functor_reducer),
+        m_policy(arg_policy),
+        m_result_ptr(arg_result.data()),
+        m_shmem_size(
+            arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
+            FunctorTeamShmemSize<FunctorType>::value(
+                arg_functor_reducer.get_functor(), arg_policy.team_size())) {
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+    if (t_openmp_instance) {
+      m_instance = t_openmp_instance;
+    } else {
+      m_instance = arg_policy.space().impl_internal_space_instance();
+    }
+#else
+    m_instance = arg_policy.space().impl_internal_space_instance();
+#endif
+
+    static_assert(
+        Kokkos::Impl::MemorySpaceAccess<typename ViewType::memory_space,
+                                        Kokkos::HostSpace>::accessible,
+        "Kokkos::OpenMP reduce result must be a View accessible from "
+        "HostSpace");
+  }
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif /* KOKKOS_OPENMP_PARALLEL_REDUCE_HPP */
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Scan.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Scan.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..02707e7fbee1f2dd4b63397135152d7515fa7c36
--- /dev/null
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Scan.hpp
@@ -0,0 +1,312 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_OPENMP_PARALLEL_SCAN_HPP
+#define KOKKOS_OPENMP_PARALLEL_SCAN_HPP
+
+#include <omp.h>
+#include <OpenMP/Kokkos_OpenMP_Instance.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template <class FunctorType, class... Traits>
+class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
+                   Kokkos::OpenMP> {
+ private:
+  using Policy = Kokkos::RangePolicy<Traits...>;
+
+  using Analysis =
+      FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType, void>;
+
+  using WorkTag   = typename Policy::work_tag;
+  using WorkRange = typename Policy::WorkRange;
+  using Member    = typename Policy::member_type;
+
+  using pointer_type   = typename Analysis::pointer_type;
+  using reference_type = typename Analysis::reference_type;
+
+  OpenMPInternal* m_instance;
+  const FunctorType m_functor;
+  const Policy m_policy;
+
+  template <class TagType>
+  inline static std::enable_if_t<std::is_void<TagType>::value> exec_range(
+      const FunctorType& functor, const Member ibeg, const Member iend,
+      reference_type update, const bool final) {
+    for (Member iwork = ibeg; iwork < iend; ++iwork) {
+      functor(iwork, update, final);
+    }
+  }
+
+  template <class TagType>
+  inline static std::enable_if_t<!std::is_void<TagType>::value> exec_range(
+      const FunctorType& functor, const Member ibeg, const Member iend,
+      reference_type update, const bool final) {
+    const TagType t{};
+    for (Member iwork = ibeg; iwork < iend; ++iwork) {
+      functor(t, iwork, update, final);
+    }
+  }
+
+ public:
+  inline void execute() const {
+    const int value_count          = Analysis::value_count(m_functor);
+    const size_t pool_reduce_bytes = 2 * Analysis::value_size(m_functor);
+
+    m_instance->resize_thread_data(pool_reduce_bytes, 0  // team_reduce_bytes
+                                   ,
+                                   0  // team_shared_bytes
+                                   ,
+                                   0  // thread_local_bytes
+    );
+
+    if (execute_in_serial(m_policy.space())) {
+      typename Analysis::Reducer final_reducer(m_functor);
+
+      reference_type update = final_reducer.init(
+          pointer_type(m_instance->get_thread_data(0)->pool_reduce_local()));
+
+      ParallelScan::template exec_range<WorkTag>(m_functor, m_policy.begin(),
+                                                 m_policy.end(), update, true);
+
+      return;
+    }
+
+#pragma omp parallel num_threads(m_instance->thread_pool_size())
+    {
+      HostThreadTeamData& data = *(m_instance->get_thread_data());
+      typename Analysis::Reducer final_reducer(m_functor);
+
+      const WorkRange range(m_policy, omp_get_thread_num(),
+                            omp_get_num_threads());
+
+      reference_type update_sum = final_reducer.init(
+          reinterpret_cast<pointer_type>(data.pool_reduce_local()));
+
+      ParallelScan::template exec_range<WorkTag>(
+          m_functor, range.begin(), range.end(), update_sum, false);
+
+      if (data.pool_rendezvous()) {
+        pointer_type ptr_prev = nullptr;
+
+        const int n = omp_get_num_threads();
+
+        for (int i = 0; i < n; ++i) {
+          pointer_type ptr =
+              (pointer_type)data.pool_member(i)->pool_reduce_local();
+
+          if (i) {
+            for (int j = 0; j < value_count; ++j) {
+              ptr[j + value_count] = ptr_prev[j + value_count];
+            }
+            final_reducer.join(ptr + value_count, ptr_prev);
+          } else {
+            final_reducer.init(ptr + value_count);
+          }
+
+          ptr_prev = ptr;
+        }
+
+        data.pool_rendezvous_release();
+      }
+
+      reference_type update_base = final_reducer.reference(
+          reinterpret_cast<pointer_type>(data.pool_reduce_local()) +
+          value_count);
+
+      ParallelScan::template exec_range<WorkTag>(
+          m_functor, range.begin(), range.end(), update_base, true);
+    }
+  }
+
+  //----------------------------------------
+
+  inline ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy)
+      : m_instance(nullptr), m_functor(arg_functor), m_policy(arg_policy) {
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+    if (t_openmp_instance) {
+      m_instance = t_openmp_instance;
+    } else {
+      m_instance = arg_policy.space().impl_internal_space_instance();
+    }
+#else
+    m_instance = arg_policy.space().impl_internal_space_instance();
+#endif
+  }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+template <class FunctorType, class ReturnType, class... Traits>
+class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
+                            ReturnType, Kokkos::OpenMP> {
+ private:
+  using Policy = Kokkos::RangePolicy<Traits...>;
+
+  using Analysis = FunctorAnalysis<FunctorPatternInterface::SCAN, Policy,
+                                   FunctorType, ReturnType>;
+
+  using WorkTag   = typename Policy::work_tag;
+  using WorkRange = typename Policy::WorkRange;
+  using Member    = typename Policy::member_type;
+
+  using value_type     = typename Analysis::value_type;
+  using pointer_type   = typename Analysis::pointer_type;
+  using reference_type = typename Analysis::reference_type;
+
+  OpenMPInternal* m_instance;
+  const FunctorType m_functor;
+  const Policy m_policy;
+  const pointer_type m_result_ptr;
+
+  template <class TagType>
+  inline static std::enable_if_t<std::is_void<TagType>::value> exec_range(
+      const FunctorType& functor, const Member ibeg, const Member iend,
+      reference_type update, const bool final) {
+    for (Member iwork = ibeg; iwork < iend; ++iwork) {
+      functor(iwork, update, final);
+    }
+  }
+
+  template <class TagType>
+  inline static std::enable_if_t<!std::is_void<TagType>::value> exec_range(
+      const FunctorType& functor, const Member ibeg, const Member iend,
+      reference_type update, const bool final) {
+    const TagType t{};
+    for (Member iwork = ibeg; iwork < iend; ++iwork) {
+      functor(t, iwork, update, final);
+    }
+  }
+
+ public:
+  inline void execute() const {
+    const int value_count          = Analysis::value_count(m_functor);
+    const size_t pool_reduce_bytes = 2 * Analysis::value_size(m_functor);
+
+    m_instance->acquire_lock();
+
+    m_instance->resize_thread_data(pool_reduce_bytes, 0  // team_reduce_bytes
+                                   ,
+                                   0  // team_shared_bytes
+                                   ,
+                                   0  // thread_local_bytes
+    );
+
+    if (execute_in_serial(m_policy.space())) {
+      typename Analysis::Reducer final_reducer(m_functor);
+
+      reference_type update = final_reducer.init(
+          pointer_type(m_instance->get_thread_data(0)->pool_reduce_local()));
+
+      this->template exec_range<WorkTag>(m_functor, m_policy.begin(),
+                                         m_policy.end(), update, true);
+
+      *m_result_ptr = update;
+
+      m_instance->release_lock();
+
+      return;
+    }
+
+#pragma omp parallel num_threads(m_instance->thread_pool_size())
+    {
+      HostThreadTeamData& data = *(m_instance->get_thread_data());
+      typename Analysis::Reducer final_reducer(m_functor);
+
+      const WorkRange range(m_policy, omp_get_thread_num(),
+                            omp_get_num_threads());
+      reference_type update_sum = final_reducer.init(
+          reinterpret_cast<pointer_type>(data.pool_reduce_local()));
+
+      ParallelScanWithTotal::template exec_range<WorkTag>(
+          m_functor, range.begin(), range.end(), update_sum, false);
+
+      if (data.pool_rendezvous()) {
+        pointer_type ptr_prev = nullptr;
+
+        const int n = omp_get_num_threads();
+
+        for (int i = 0; i < n; ++i) {
+          pointer_type ptr =
+              (pointer_type)data.pool_member(i)->pool_reduce_local();
+
+          if (i) {
+            for (int j = 0; j < value_count; ++j) {
+              ptr[j + value_count] = ptr_prev[j + value_count];
+            }
+            final_reducer.join(ptr + value_count, ptr_prev);
+          } else {
+            final_reducer.init(ptr + value_count);
+          }
+
+          ptr_prev = ptr;
+        }
+
+        data.pool_rendezvous_release();
+      }
+
+      reference_type update_base = final_reducer.reference(
+          reinterpret_cast<pointer_type>(data.pool_reduce_local()) +
+          value_count);
+
+      ParallelScanWithTotal::template exec_range<WorkTag>(
+          m_functor, range.begin(), range.end(), update_base, true);
+
+      if (omp_get_thread_num() == omp_get_num_threads() - 1) {
+        *m_result_ptr = update_base;
+      }
+    }
+
+    m_instance->release_lock();
+  }
+
+  //----------------------------------------
+
+  template <class ViewType>
+  ParallelScanWithTotal(const FunctorType& arg_functor,
+                        const Policy& arg_policy,
+                        const ViewType& arg_result_view)
+      : m_instance(nullptr),
+        m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_result_ptr(arg_result_view.data()) {
+    static_assert(
+        Kokkos::Impl::MemorySpaceAccess<typename ViewType::memory_space,
+                                        Kokkos::HostSpace>::accessible,
+        "Kokkos::OpenMP parallel_scan result must be host-accessible!");
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+    if (t_openmp_instance) {
+      m_instance = t_openmp_instance;
+    } else {
+      m_instance = arg_policy.space().impl_internal_space_instance();
+    }
+#else
+    m_instance = arg_policy.space().impl_internal_space_instance();
+#endif
+  }
+
+  //----------------------------------------
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif /* KOKKOS_OPENMP_PARALLEL_REDUCE_SCAN_HPP */
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp
index d6fd45ae9ae5e3cfa07bbbbe0055fcdb40e4534e..01b66948654c8d7e8c3f3a450a7f738f6f01bb9c 100644
--- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp
@@ -20,10 +20,11 @@
 #include <Kokkos_Macros.hpp>
 #if defined(KOKKOS_ENABLE_OPENMP) && defined(KOKKOS_ENABLE_TASKDAG)
 
+#include <Kokkos_Atomic.hpp>
 #include <Kokkos_TaskScheduler_fwd.hpp>
 
 #include <impl/Kokkos_HostThreadTeam.hpp>
-#include <Kokkos_OpenMP.hpp>
+#include <OpenMP/Kokkos_OpenMP.hpp>
 
 #include <type_traits>
 #include <cassert>
@@ -156,7 +157,7 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::OpenMP, QueueType>> {
   }
 
   static uint32_t get_max_team_count(execution_space const& espace) {
-    return static_cast<uint32_t>(OpenMP::impl_thread_pool_size(espace));
+    return static_cast<uint32_t>(espace.impl_thread_pool_size());
   }
 
   // TODO @tasking @optimization DSH specialize this for trivially destructible
@@ -189,7 +190,8 @@ class TaskQueueSpecializationConstrained<
     using task_base_type = typename scheduler_type::task_base;
     using queue_type     = typename scheduler_type::queue_type;
 
-    if (1 == OpenMP::impl_thread_pool_size()) {
+    execution_space exec;
+    if (1 == exec.impl_thread_pool_size()) {
       task_base_type* const end = (task_base_type*)task_base_type::EndTag;
 
       HostThreadTeamData& team_data_single =
@@ -286,7 +288,9 @@ class TaskQueueSpecializationConstrained<
 
               // If 0 == m_ready_count then set task = 0
 
-              if (*((volatile int*)&team_queue.m_ready_count) > 0) {
+              if (desul::atomic_load(&team_queue.m_ready_count,
+                                     desul::MemoryOrderAcquire(),
+                                     desul::MemoryScopeDevice()) > 0) {
                 task = end;
                 // Attempt to acquire a task
                 // Loop by priority and then type
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Team.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Team.hpp
index 280b1701ad603a812bd07c4fd22f62a4be6a3da8..dbc30c5d02f6f24d55abe6c21612e88eb30ff045 100644
--- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Team.hpp
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Team.hpp
@@ -59,7 +59,7 @@ class TeamPolicyInternal<Kokkos::OpenMP, Properties...>
 
   template <class FunctorType>
   int team_size_max(const FunctorType&, const ParallelForTag&) const {
-    int pool_size = traits::execution_space::impl_thread_pool_size(1, m_space);
+    int pool_size          = m_space.impl_thread_pool_size(1);
     int max_host_team_size = Impl::HostThreadTeamData::max_team_members;
     return pool_size < max_host_team_size ? pool_size : max_host_team_size;
   }
@@ -68,7 +68,7 @@ class TeamPolicyInternal<Kokkos::OpenMP, Properties...>
 
   template <class FunctorType>
   int team_size_max(const FunctorType&, const ParallelReduceTag&) const {
-    int pool_size = traits::execution_space::impl_thread_pool_size(1, m_space);
+    int pool_size          = m_space.impl_thread_pool_size(1);
     int max_host_team_size = Impl::HostThreadTeamData::max_team_members;
     return pool_size < max_host_team_size ? pool_size : max_host_team_size;
   }
@@ -79,12 +79,12 @@ class TeamPolicyInternal<Kokkos::OpenMP, Properties...>
   }
   template <class FunctorType>
   int team_size_recommended(const FunctorType&, const ParallelForTag&) const {
-    return traits::execution_space::impl_thread_pool_size(2, m_space);
+    return m_space.impl_thread_pool_size(2);
   }
   template <class FunctorType>
   int team_size_recommended(const FunctorType&,
                             const ParallelReduceTag&) const {
-    return traits::execution_space::impl_thread_pool_size(2, m_space);
+    return m_space.impl_thread_pool_size(2);
   }
   template <class FunctorType, class ReducerType>
   inline int team_size_recommended(const FunctorType& f, const ReducerType&,
@@ -120,10 +120,8 @@ class TeamPolicyInternal<Kokkos::OpenMP, Properties...>
   typename traits::execution_space m_space;
 
   inline void init(const int league_size_request, const int team_size_request) {
-    const int pool_size =
-        traits::execution_space::impl_thread_pool_size(0, m_space);
-    const int team_grain =
-        traits::execution_space::impl_thread_pool_size(2, m_space);
+    const int pool_size          = m_space.impl_thread_pool_size(0);
+    const int team_grain         = m_space.impl_thread_pool_size(2);
     const int max_host_team_size = Impl::HostThreadTeamData::max_team_members;
     const int team_max =
         ((pool_size < max_host_team_size) ? pool_size : max_host_team_size);
@@ -192,8 +190,7 @@ class TeamPolicyInternal<Kokkos::OpenMP, Properties...>
         m_tune_team(true),
         m_tune_vector(false),
         m_space(space) {
-    init(league_size_request,
-         traits::execution_space::impl_thread_pool_size(2, m_space));
+    init(league_size_request, m_space.impl_thread_pool_size(2));
   }
 
   TeamPolicyInternal(const typename traits::execution_space& space,
@@ -207,8 +204,7 @@ class TeamPolicyInternal<Kokkos::OpenMP, Properties...>
         m_tune_team(true),
         m_tune_vector(true),
         m_space(space) {
-    init(league_size_request,
-         traits::execution_space::impl_thread_pool_size(2, m_space));
+    init(league_size_request, m_space.impl_thread_pool_size(2));
   }
 
   TeamPolicyInternal(const typename traits::execution_space& space,
@@ -242,8 +238,7 @@ class TeamPolicyInternal<Kokkos::OpenMP, Properties...>
         m_chunk_size(0),
         m_tune_team(true),
         m_tune_vector(false) {
-    init(league_size_request,
-         traits::execution_space::impl_thread_pool_size(2, m_space));
+    init(league_size_request, m_space.impl_thread_pool_size(2));
   }
 
   TeamPolicyInternal(int league_size_request,
@@ -255,8 +250,7 @@ class TeamPolicyInternal<Kokkos::OpenMP, Properties...>
         m_chunk_size(0),
         m_tune_team(true),
         m_tune_vector(true) {
-    init(league_size_request,
-         traits::execution_space::impl_thread_pool_size(2, m_space));
+    init(league_size_request, m_space.impl_thread_pool_size(2));
   }
 
   TeamPolicyInternal(int league_size_request, int team_size_request,
@@ -310,9 +304,7 @@ class TeamPolicyInternal<Kokkos::OpenMP, Properties...>
  private:
   /** \brief finalize chunk_size if it was set to AUTO*/
   inline void set_auto_chunk_size() {
-    int concurrency =
-        traits::execution_space::impl_thread_pool_size(0, m_space) /
-        m_team_alloc;
+    int concurrency = m_space.impl_thread_pool_size(0) / m_team_alloc;
     if (concurrency == 0) concurrency = 1;
 
     if (m_chunk_size > 0) {
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_UniqueToken.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_UniqueToken.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a37e1758a26104784a816a311bbae7c7475f6040
--- /dev/null
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_UniqueToken.hpp
@@ -0,0 +1,129 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_OPENMP_UNIQUE_TOKEN_HPP
+#define KOKKOS_OPENMP_UNIQUE_TOKEN_HPP
+
+#include <Kokkos_UniqueToken.hpp>
+
+namespace Kokkos::Experimental {
+template <>
+class UniqueToken<OpenMP, UniqueTokenScope::Instance> {
+ public:
+  using execution_space = OpenMP;
+  using size_type       = int;
+
+ private:
+  using buffer_type = Kokkos::View<uint32_t*, Kokkos::HostSpace>;
+  execution_space m_exec;
+  size_type m_count;
+  buffer_type m_buffer_view;
+  uint32_t volatile* m_buffer;
+
+ public:
+  /// \brief create object size for concurrency on the given instance
+  ///
+  /// This object should not be shared between instances
+  UniqueToken(execution_space const& exec = execution_space()) noexcept
+      : m_exec(exec),
+        m_count(m_exec.impl_thread_pool_size()),
+        m_buffer_view(buffer_type()),
+        m_buffer(nullptr) {}
+
+  UniqueToken(size_type max_size,
+              execution_space const& exec = execution_space())
+      : m_exec(exec),
+        m_count(max_size),
+        m_buffer_view("UniqueToken::m_buffer_view",
+                      ::Kokkos::Impl::concurrent_bitset::buffer_bound(m_count)),
+        m_buffer(m_buffer_view.data()) {}
+
+  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
+  KOKKOS_INLINE_FUNCTION
+  int size() const noexcept {
+    KOKKOS_IF_ON_HOST((return m_count;))
+
+    KOKKOS_IF_ON_DEVICE((return 0;))
+  }
+
+  /// \brief acquire value such that 0 <= value < size()
+  KOKKOS_INLINE_FUNCTION
+  int acquire() const noexcept {
+    KOKKOS_IF_ON_HOST(
+        (if (m_count >= m_exec.impl_thread_pool_size()) return m_exec
+             .impl_thread_pool_rank();
+         const ::Kokkos::pair<int, int> result =
+             ::Kokkos::Impl::concurrent_bitset::acquire_bounded(
+                 m_buffer, m_count, ::Kokkos::Impl::clock_tic() % m_count);
+
+         if (result.first < 0) {
+           ::Kokkos::abort(
+               "UniqueToken<OpenMP> failure to acquire tokens, no tokens "
+               "available");
+         }
+
+         return result.first;))
+
+    KOKKOS_IF_ON_DEVICE((return 0;))
+  }
+
+  /// \brief release a value acquired by generate
+  KOKKOS_INLINE_FUNCTION
+  void release(int i) const noexcept {
+    KOKKOS_IF_ON_HOST((if (m_count < m_exec.impl_thread_pool_size()) {
+      ::Kokkos::Impl::concurrent_bitset::release(m_buffer, i);
+    }))
+
+    KOKKOS_IF_ON_DEVICE(((void)i;))
+  }
+};
+
+template <>
+class UniqueToken<OpenMP, UniqueTokenScope::Global> {
+ public:
+  using execution_space = OpenMP;
+  using size_type       = int;
+
+  /// \brief create object size for concurrency on the given instance
+  ///
+  /// This object should not be shared between instances
+  UniqueToken(execution_space const& = execution_space()) noexcept {}
+
+  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
+  KOKKOS_INLINE_FUNCTION
+  int size() const noexcept {
+    KOKKOS_IF_ON_HOST((return Kokkos::Impl::g_openmp_hardware_max_threads;))
+
+    KOKKOS_IF_ON_DEVICE((return 0;))
+  }
+
+  /// \brief acquire value such that 0 <= value < size()
+  // FIXME this is wrong when using nested parallelism. In that case multiple
+  // threads have the same thread ID.
+  KOKKOS_INLINE_FUNCTION
+  int acquire() const noexcept {
+    KOKKOS_IF_ON_HOST((return omp_get_thread_num();))
+
+    KOKKOS_IF_ON_DEVICE((return 0;))
+  }
+
+  /// \brief release a value acquired by generate
+  KOKKOS_INLINE_FUNCTION
+  void release(int) const noexcept {}
+};
+}  // namespace Kokkos::Experimental
+
+#endif
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp
index 6cc52815de9a6200b450fcd4c218fbceff340247..a030a2b70682a8d00cc7b3fa00a5a2341ae40682 100644
--- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp
@@ -17,7 +17,7 @@
 #ifndef KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP
 #define KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP
 
-#include <Kokkos_OpenMP.hpp>
+#include <OpenMP/Kokkos_OpenMP.hpp>
 
 namespace Kokkos {
 namespace Impl {
@@ -49,7 +49,8 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
     // We need to introduce pool_size to work around NVHPC 22.5 ICE
     // We need to use [[maybe_unused]] to work around an unused-variable warning
     // from HIP
-    [[maybe_unused]] int pool_size = OpenMP::impl_thread_pool_size();
+    OpenMP exec;
+    [[maybe_unused]] int pool_size = exec.impl_thread_pool_size();
 #pragma omp parallel num_threads(pool_size)
     {
       // Spin until COMPLETED_TOKEN.
diff --git a/packages/kokkos/core/src/Kokkos_OpenMPTarget.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp
similarity index 98%
rename from packages/kokkos/core/src/Kokkos_OpenMPTarget.hpp
rename to packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp
index 4bcfed90e39d831f4cbf2c7299b6f16b0f8d7acf..adf972dd081f6a6c055d39965b1d06c4a6daf03e 100644
--- a/packages/kokkos/core/src/Kokkos_OpenMPTarget.hpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp
@@ -30,7 +30,7 @@ static_assert(false,
 
 #include <cstddef>
 #include <iosfwd>
-#include <Kokkos_OpenMPTargetSpace.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp>
 #include <Kokkos_ScratchSpace.hpp>
 #include <Kokkos_Parallel.hpp>
 #include <Kokkos_TaskScheduler.hpp>
@@ -141,7 +141,6 @@ struct DeviceTypeTraits<::Kokkos::Experimental::OpenMPTarget> {
 /*--------------------------------------------------------------------------*/
 /*--------------------------------------------------------------------------*/
 
-#include <OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp>
 #include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp>
 #include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp>
 #include <OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp>
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
index f30abb0c875ac7454e280ba4b6697d2fa5a26539..81fbc56de00535c03975f361447cdd14fd78f0f4 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
@@ -33,8 +33,8 @@
 #include <sstream>
 #include <cstring>
 
-#include <Kokkos_OpenMPTarget.hpp>
-#include <Kokkos_OpenMPTargetSpace.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp>
 #include <impl/Kokkos_Error.hpp>
 #include <Kokkos_Atomic.hpp>
 #include <impl/Kokkos_MemorySpace.hpp>
@@ -165,39 +165,6 @@ SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, void>::
 }  // namespace Impl
 }  // namespace Kokkos
 
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-/*
-namespace Kokkos {
-namespace {
-  const unsigned HOST_SPACE_ATOMIC_MASK = 0xFFFF;
-  const unsigned HOST_SPACE_ATOMIC_XOR_MASK = 0x5A39;
-  static int HOST_SPACE_ATOMIC_LOCKS[HOST_SPACE_ATOMIC_MASK+1];
-}
-
-namespace Impl {
-void init_lock_array_host_space() {
-  static int is_initialized = 0;
-  if(! is_initialized)
-    for(int i = 0; i < static_cast<int> (HOST_SPACE_ATOMIC_MASK+1); i++)
-      HOST_SPACE_ATOMIC_LOCKS[i] = 0;
-}
-
-bool lock_address_host_space(void* ptr) {
-  return 0 == atomic_compare_exchange( &HOST_SPACE_ATOMIC_LOCKS[
-      (( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^
-HOST_SPACE_ATOMIC_XOR_MASK] , 0 , 1);
-}
-
-void unlock_address_host_space(void* ptr) {
-   atomic_exchange( &HOST_SPACE_ATOMIC_LOCKS[
-      (( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^
-HOST_SPACE_ATOMIC_XOR_MASK] , 0);
-}
-
-}
-}*/
-
 //==============================================================================
 // <editor-fold desc="Explicit instantiations of CRTP Base classes"> {{{1
 
diff --git a/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp
similarity index 89%
rename from packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp
rename to packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp
index ca015da379f4c312d00ae4c91c20893fa8702022..e5b33d0982f83e9bb5ae720734d3f0081fbec4ba 100644
--- a/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp
@@ -35,37 +35,6 @@ static_assert(false,
 #include <Kokkos_HostSpace.hpp>
 #include <omp.h>
 
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-namespace Impl {
-
-/// \brief Initialize lock array for arbitrary size atomics.
-///
-/// Arbitrary atomics are implemented using a hash table of locks
-/// where the hash value is derived from the address of the
-/// object for which an atomic operation is performed.
-/// This function initializes the locks to zero (unset).
-// void init_lock_array_host_space();
-
-/// \brief Acquire a lock for the address
-///
-/// This function tries to acquire the lock for the hash value derived
-/// from the provided ptr. If the lock is successfully acquired the
-/// function returns true. Otherwise it returns false.
-// bool lock_address_host_space(void* ptr);
-
-/// \brief Release lock for the address
-///
-/// This function releases the lock for the hash value derived
-/// from the provided ptr. This function should only be called
-/// after previously successfully acquiring a lock with
-/// lock_address.
-// void unlock_address_host_space(void* ptr);
-
-}  // namespace Impl
-}  // namespace Kokkos
-
 namespace Kokkos {
 namespace Impl {
 
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
index 40da73ebc641c11e1e0b9a8ce14134f72ea73f23..1902c38409a98d80689cbb13a9879d7a7db23ec2 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
@@ -72,8 +72,9 @@ void OpenMPTargetExec::verify_initialized(const char* const label) {
 void* OpenMPTargetExec::m_scratch_ptr         = nullptr;
 int64_t OpenMPTargetExec::m_scratch_size      = 0;
 int* OpenMPTargetExec::m_lock_array           = nullptr;
-int64_t OpenMPTargetExec::m_lock_size         = 0;
+uint64_t OpenMPTargetExec::m_lock_size        = 0;
 uint32_t* OpenMPTargetExec::m_uniquetoken_ptr = nullptr;
+int OpenMPTargetExec::MAX_ACTIVE_THREADS      = 0;
 
 void OpenMPTargetExec::clear_scratch() {
   Kokkos::Experimental::OpenMPTargetSpace space;
@@ -100,11 +101,26 @@ void OpenMPTargetExec::resize_scratch(int64_t team_size, int64_t shmem_size_L0,
   const int64_t shmem_size =
       shmem_size_L0 + shmem_size_L1;  // L0 + L1 scratch memory per team.
   const int64_t padding = shmem_size * 10 / 100;  // Padding per team.
+
+  // Maximum active teams possible.
+  // The number should not exceed the maximum in-flight teams possible or the
+  // league_size.
+  int max_active_teams =
+      std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size);
+
+  // max_active_teams is the number of active teams on the given hardware.
+  // We set the number of teams to be twice the number of max_active_teams for
+  // the compiler to pick the right number in its case.
+  // FIXME_OPENMPTARGET: Cray compiler did not yet implement omp_set_num_teams.
+#if !defined(KOKKOS_COMPILER_CRAY_LLVM)
+  omp_set_num_teams(max_active_teams * 2);
+#endif
+
   // Total amount of scratch memory allocated is depenedent
   // on the maximum number of in-flight teams possible.
   int64_t total_size =
       (shmem_size + OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE + padding) *
-      std::min(MAX_ACTIVE_THREADS / team_size, league_size);
+      max_active_teams * 2;
 
   if (total_size > m_scratch_size) {
     space.deallocate(m_scratch_ptr, m_scratch_size);
@@ -131,9 +147,10 @@ int* OpenMPTargetExec::get_lock_array(int num_teams) {
 
     for (int i = 0; i < lock_array_elem; ++i) h_lock_array[i] = 0;
 
-    KOKKOS_IMPL_OMPT_SAFE_CALL(
-        omp_target_memcpy(m_lock_array, h_lock_array, m_lock_size, 0, 0,
-                          omp_get_default_device(), omp_get_initial_device()));
+    if (0 < m_lock_size)
+      KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(
+          m_lock_array, h_lock_array, m_lock_size, 0, 0,
+          omp_get_default_device(), omp_get_initial_device()));
 
     omp_target_free(h_lock_array, omp_get_initial_device());
   }
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp
deleted file mode 100644
index 6d62a3c7e4b7d7b9f0cbeee441c92c38c99dbee6..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp
+++ /dev/null
@@ -1,1929 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#ifndef KOKKOS_OPENMPTARGETEXEC_HPP
-#define KOKKOS_OPENMPTARGETEXEC_HPP
-
-#include <impl/Kokkos_Traits.hpp>
-#include <impl/Kokkos_Spinwait.hpp>
-
-#include <Kokkos_Atomic.hpp>
-#include "Kokkos_OpenMPTarget_Abort.hpp"
-
-// FIXME_OPENMPTARGET - Using this macro to implement a workaround for
-// hierarchical reducers. It avoids hitting the code path which we wanted to
-// write but doesn't work. undef'ed at the end.
-// Intel compilers prefer the non-workaround version.
-#ifndef KOKKOS_ARCH_INTEL_GPU
-#define KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
-#endif
-
-// FIXME_OPENMPTARGET - Using this macro to implement a workaround for
-// hierarchical scan. It avoids hitting the code path which we wanted to
-// write but doesn't work. undef'ed at the end.
-#ifndef KOKKOS_ARCH_INTEL_GPU
-#define KOKKOS_IMPL_TEAM_SCAN_WORKAROUND
-#endif
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-template <class Reducer>
-struct OpenMPTargetReducerWrapper {
-  using value_type = typename Reducer::value_type;
-
-  // Using a generic unknown Reducer for the OpenMPTarget backend is not
-  // implemented.
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type&, const value_type&) = delete;
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type&, const volatile value_type&) = delete;
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type&) = delete;
-};
-
-template <class Scalar, class Space>
-struct OpenMPTargetReducerWrapper<Sum<Scalar, Space>> {
- public:
-  // Required
-  using value_type = std::remove_cv_t<Scalar>;
-
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) { dest += src; }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    dest += src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val = reduction_identity<value_type>::sum();
-  }
-};
-
-template <class Scalar, class Space>
-struct OpenMPTargetReducerWrapper<Prod<Scalar, Space>> {
- public:
-  // Required
-  using value_type = std::remove_cv_t<Scalar>;
-
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) { dest *= src; }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    dest *= src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val = reduction_identity<value_type>::prod();
-  }
-};
-
-template <class Scalar, class Space>
-struct OpenMPTargetReducerWrapper<Min<Scalar, Space>> {
- public:
-  // Required
-  using value_type = std::remove_cv_t<Scalar>;
-
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    if (src < dest) dest = src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    if (src < dest) dest = src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val = reduction_identity<value_type>::min();
-  }
-};
-
-template <class Scalar, class Space>
-struct OpenMPTargetReducerWrapper<Max<Scalar, Space>> {
- public:
-  // Required
-  using value_type = std::remove_cv_t<Scalar>;
-
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    if (src > dest) dest = src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    if (src > dest) dest = src;
-  }
-
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val = reduction_identity<value_type>::max();
-  }
-};
-
-template <class Scalar, class Space>
-struct OpenMPTargetReducerWrapper<LAnd<Scalar, Space>> {
- public:
-  // Required
-  using value_type = std::remove_cv_t<Scalar>;
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    dest = dest && src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    dest = dest && src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val = reduction_identity<value_type>::land();
-  }
-};
-
-template <class Scalar, class Space>
-struct OpenMPTargetReducerWrapper<LOr<Scalar, Space>> {
- public:
-  // Required
-  using value_type = std::remove_cv_t<Scalar>;
-
-  using result_view_type = Kokkos::View<value_type, Space>;
-
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    dest = dest || src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    dest = dest || src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val = reduction_identity<value_type>::lor();
-  }
-};
-
-template <class Scalar, class Space>
-struct OpenMPTargetReducerWrapper<BAnd<Scalar, Space>> {
- public:
-  // Required
-  using value_type = std::remove_cv_t<Scalar>;
-
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    dest = dest & src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    dest = dest & src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val = reduction_identity<value_type>::band();
-  }
-};
-
-template <class Scalar, class Space>
-struct OpenMPTargetReducerWrapper<BOr<Scalar, Space>> {
- public:
-  // Required
-  using value_type = std::remove_cv_t<Scalar>;
-
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    dest = dest | src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    dest = dest | src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val = reduction_identity<value_type>::bor();
-  }
-};
-
-template <class Scalar, class Index, class Space>
-struct OpenMPTargetReducerWrapper<MinLoc<Scalar, Index, Space>> {
- private:
-  using scalar_type = std::remove_cv_t<Scalar>;
-  using index_type  = std::remove_cv_t<Index>;
-
- public:
-  // Required
-  using value_type = ValLocScalar<scalar_type, index_type>;
-
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    if (src.val < dest.val) dest = src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    if (src.val < dest.val) dest = src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val.val = reduction_identity<scalar_type>::min();
-    val.loc = reduction_identity<index_type>::min();
-  }
-};
-
-template <class Scalar, class Index, class Space>
-struct OpenMPTargetReducerWrapper<MaxLoc<Scalar, Index, Space>> {
- private:
-  using scalar_type = std::remove_cv_t<Scalar>;
-  using index_type  = std::remove_cv_t<Index>;
-
- public:
-  // Required
-  using value_type = ValLocScalar<scalar_type, index_type>;
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    if (src.val > dest.val) dest = src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    if (src.val > dest.val) dest = src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val.val = reduction_identity<scalar_type>::max();
-    val.loc = reduction_identity<index_type>::min();
-  }
-};
-
-template <class Scalar, class Space>
-struct OpenMPTargetReducerWrapper<MinMax<Scalar, Space>> {
- private:
-  using scalar_type = std::remove_cv_t<Scalar>;
-
- public:
-  // Required
-  using value_type = MinMaxScalar<scalar_type>;
-
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    if (src.min_val < dest.min_val) {
-      dest.min_val = src.min_val;
-    }
-    if (src.max_val > dest.max_val) {
-      dest.max_val = src.max_val;
-    }
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    if (src.min_val < dest.min_val) {
-      dest.min_val = src.min_val;
-    }
-    if (src.max_val > dest.max_val) {
-      dest.max_val = src.max_val;
-    }
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val.max_val = reduction_identity<scalar_type>::max();
-    val.min_val = reduction_identity<scalar_type>::min();
-  }
-};
-
-template <class Scalar, class Index, class Space>
-struct OpenMPTargetReducerWrapper<MinMaxLoc<Scalar, Index, Space>> {
- private:
-  using scalar_type = std::remove_cv_t<Scalar>;
-  using index_type  = std::remove_cv_t<Index>;
-
- public:
-  // Required
-  using value_type = MinMaxLocScalar<scalar_type, index_type>;
-
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    if (src.min_val < dest.min_val) {
-      dest.min_val = src.min_val;
-      dest.min_loc = src.min_loc;
-    }
-    if (src.max_val > dest.max_val) {
-      dest.max_val = src.max_val;
-      dest.max_loc = src.max_loc;
-    }
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    if (src.min_val < dest.min_val) {
-      dest.min_val = src.min_val;
-      dest.min_loc = src.min_loc;
-    }
-    if (src.max_val > dest.max_val) {
-      dest.max_val = src.max_val;
-      dest.max_loc = src.max_loc;
-    }
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val.max_val = reduction_identity<scalar_type>::max();
-    val.min_val = reduction_identity<scalar_type>::min();
-    val.max_loc = reduction_identity<index_type>::min();
-    val.min_loc = reduction_identity<index_type>::min();
-  }
-};
-
-//
-// specialize for MaxFirstLoc
-//
-template <class Scalar, class Index, class Space>
-struct OpenMPTargetReducerWrapper<MaxFirstLoc<Scalar, Index, Space>> {
- private:
-  using scalar_type = std::remove_cv_t<Scalar>;
-  using index_type  = std::remove_cv_t<Index>;
-
- public:
-  // Required
-  using value_type = ValLocScalar<scalar_type, index_type>;
-
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    if (dest.val < src.val) {
-      dest = src;
-    } else if (!(src.val < dest.val)) {
-      dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc;
-    }
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    if (dest.val < src.val) {
-      dest = src;
-    } else if (!(src.val < dest.val)) {
-      dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc;
-    }
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val.val = reduction_identity<scalar_type>::max();
-    val.loc = reduction_identity<index_type>::min();
-  }
-#pragma omp end declare target
-};
-
-//
-// specialize for MinFirstLoc
-//
-template <class Scalar, class Index, class Space>
-struct OpenMPTargetReducerWrapper<MinFirstLoc<Scalar, Index, Space>> {
- private:
-  using scalar_type = std::remove_cv_t<Scalar>;
-  using index_type  = std::remove_cv_t<Index>;
-
- public:
-  // Required
-  using value_type = ValLocScalar<scalar_type, index_type>;
-
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    if (src.val < dest.val) {
-      dest = src;
-    } else if (!(dest.val < src.val)) {
-      dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc;
-    }
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    if (src.val < dest.val) {
-      dest = src;
-    } else if (!(dest.val < src.val)) {
-      dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc;
-    }
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val.val = reduction_identity<scalar_type>::min();
-    val.loc = reduction_identity<index_type>::min();
-  }
-#pragma omp end declare target
-};
-
-//
-// specialize for MinMaxFirstLastLoc
-//
-template <class Scalar, class Index, class Space>
-struct OpenMPTargetReducerWrapper<MinMaxFirstLastLoc<Scalar, Index, Space>> {
- private:
-  using scalar_type = std::remove_cv_t<Scalar>;
-  using index_type  = std::remove_cv_t<Index>;
-
- public:
-  // Required
-  using value_type = MinMaxLocScalar<scalar_type, index_type>;
-
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    if (src.min_val < dest.min_val) {
-      dest.min_val = src.min_val;
-      dest.min_loc = src.min_loc;
-    } else if (!(dest.min_val < src.min_val)) {
-      dest.min_loc = (src.min_loc < dest.min_loc) ? src.min_loc : dest.min_loc;
-    }
-
-    if (dest.max_val < src.max_val) {
-      dest.max_val = src.max_val;
-      dest.max_loc = src.max_loc;
-    } else if (!(src.max_val < dest.max_val)) {
-      dest.max_loc = (src.max_loc > dest.max_loc) ? src.max_loc : dest.max_loc;
-    }
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    if (src.min_val < dest.min_val) {
-      dest.min_val = src.min_val;
-      dest.min_loc = src.min_loc;
-    } else if (!(dest.min_val < src.min_val)) {
-      dest.min_loc = (src.min_loc < dest.min_loc) ? src.min_loc : dest.min_loc;
-    }
-
-    if (dest.max_val < src.max_val) {
-      dest.max_val = src.max_val;
-      dest.max_loc = src.max_loc;
-    } else if (!(src.max_val < dest.max_val)) {
-      dest.max_loc = (src.max_loc > dest.max_loc) ? src.max_loc : dest.max_loc;
-    }
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val.max_val = reduction_identity<scalar_type>::max();
-    val.min_val = reduction_identity<scalar_type>::min();
-    val.max_loc = reduction_identity<index_type>::max();
-    val.min_loc = reduction_identity<index_type>::min();
-  }
-#pragma omp end declare target
-};
-
-//
-// specialize for FirstLoc
-//
-template <class Index, class Space>
-struct OpenMPTargetReducerWrapper<FirstLoc<Index, Space>> {
- private:
-  using index_type = std::remove_cv_t<Index>;
-
- public:
-  // Required
-  using value_type = FirstLocScalar<index_type>;
-
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    dest.min_loc_true = (src.min_loc_true < dest.min_loc_true)
-                            ? src.min_loc_true
-                            : dest.min_loc_true;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    dest.min_loc_true = (src.min_loc_true < dest.min_loc_true)
-                            ? src.min_loc_true
-                            : dest.min_loc_true;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val.min_loc_true = reduction_identity<index_type>::min();
-  }
-#pragma omp end declare target
-};
-
-//
-// specialize for LastLoc
-//
-template <class Index, class Space>
-struct OpenMPTargetReducerWrapper<LastLoc<Index, Space>> {
- private:
-  using index_type = std::remove_cv_t<Index>;
-
- public:
-  // Required
-  using value_type = LastLocScalar<index_type>;
-
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    dest.max_loc_true = (src.max_loc_true > dest.max_loc_true)
-                            ? src.max_loc_true
-                            : dest.max_loc_true;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    dest.max_loc_true = (src.max_loc_true > dest.max_loc_true)
-                            ? src.max_loc_true
-                            : dest.max_loc_true;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val.max_loc_true = reduction_identity<index_type>::max();
-  }
-#pragma omp end declare target
-};
-
-//
-// specialize for StdIsPartitioned
-//
-template <class Index, class Space>
-struct OpenMPTargetReducerWrapper<StdIsPartitioned<Index, Space>> {
- private:
-  using index_type = std::remove_cv_t<Index>;
-
- public:
-  // Required
-  using value_type = StdIsPartScalar<index_type>;
-
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    dest.max_loc_true = (dest.max_loc_true < src.max_loc_true)
-                            ? src.max_loc_true
-                            : dest.max_loc_true;
-
-    dest.min_loc_false = (dest.min_loc_false < src.min_loc_false)
-                             ? dest.min_loc_false
-                             : src.min_loc_false;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    dest.max_loc_true = (dest.max_loc_true < src.max_loc_true)
-                            ? src.max_loc_true
-                            : dest.max_loc_true;
-
-    dest.min_loc_false = (dest.min_loc_false < src.min_loc_false)
-                             ? dest.min_loc_false
-                             : src.min_loc_false;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val.max_loc_true  = ::Kokkos::reduction_identity<index_type>::max();
-    val.min_loc_false = ::Kokkos::reduction_identity<index_type>::min();
-  }
-#pragma omp end declare target
-};
-
-//
-// specialize for StdPartitionPoint
-//
-template <class Index, class Space>
-struct OpenMPTargetReducerWrapper<StdPartitionPoint<Index, Space>> {
- private:
-  using index_type = std::remove_cv_t<Index>;
-
- public:
-  // Required
-  using value_type = StdPartPointScalar<index_type>;
-
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    dest.min_loc_false = (dest.min_loc_false < src.min_loc_false)
-                             ? dest.min_loc_false
-                             : src.min_loc_false;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    dest.min_loc_false = (dest.min_loc_false < src.min_loc_false)
-                             ? dest.min_loc_false
-                             : src.min_loc_false;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val.min_loc_false = ::Kokkos::reduction_identity<index_type>::min();
-  }
-#pragma omp end declare target
-};
-
-/*
-template<class ReducerType>
-class OpenMPTargetReducerWrapper {
-  public:
-    const ReducerType& reducer;
-    using value_type = typename ReducerType::value_type;
-    value_type& value;
-
-    KOKKOS_INLINE_FUNCTION
-    void join(const value_type& upd) {
-      reducer.join(value,upd);
-    }
-
-    KOKKOS_INLINE_FUNCTION
-    void init(const value_type& upd) {
-      reducer.init(value,upd);
-    }
-};*/
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-namespace Kokkos {
-namespace Impl {
-
-//----------------------------------------------------------------------------
-/** \brief  Data for OpenMPTarget thread execution */
-
-class OpenMPTargetExec {
- public:
-  // FIXME_OPENMPTARGET - Currently the maximum number of
-  // teams possible is calculated based on NVIDIA's Volta GPU. In
-  // future this value should be based on the chosen architecture for the
-  // OpenMPTarget backend.
-  static constexpr int MAX_ACTIVE_THREADS = 2080 * 80;
-  static constexpr int MAX_ACTIVE_TEAMS   = MAX_ACTIVE_THREADS / 32;
-
- private:
-  static void* scratch_ptr;
-
- public:
-  static void verify_is_process(const char* const);
-  static void verify_initialized(const char* const);
-
-  static int* get_lock_array(int num_teams);
-  static void* get_scratch_ptr();
-  static void clear_scratch();
-  static void clear_lock_array();
-  static void resize_scratch(int64_t team_reduce_bytes,
-                             int64_t team_shared_bytes,
-                             int64_t thread_local_bytes, int64_t league_size);
-
-  static void* m_scratch_ptr;
-  static int64_t m_scratch_size;
-  static int* m_lock_array;
-  static int64_t m_lock_size;
-  static uint32_t* m_uniquetoken_ptr;
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-class OpenMPTargetExecTeamMember {
- public:
-  static constexpr int TEAM_REDUCE_SIZE = 512;
-
-  using execution_space      = Kokkos::Experimental::OpenMPTarget;
-  using scratch_memory_space = execution_space::scratch_memory_space;
-  using team_handle          = OpenMPTargetExecTeamMember;
-
-  scratch_memory_space m_team_shared;
-  size_t m_team_scratch_size[2];
-  int m_team_rank;
-  int m_team_size;
-  int m_league_rank;
-  int m_league_size;
-  int m_vector_length;
-  int m_vector_lane;
-  int m_shmem_block_index;
-  void* m_glb_scratch;
-  void* m_reduce_scratch;
-
- public:
-  KOKKOS_INLINE_FUNCTION
-  const execution_space::scratch_memory_space& team_shmem() const {
-    return m_team_shared.set_team_thread_mode(0, 1, 0);
-  }
-
-  // set_team_thread_mode routine parameters for future understanding:
-  // first parameter - scratch level.
-  // second parameter - size multiplier for advancing scratch ptr after a
-  // request was serviced. third parameter - offset size multiplier from current
-  // scratch ptr when returning a ptr for a request.
-  KOKKOS_INLINE_FUNCTION
-  const execution_space::scratch_memory_space& team_scratch(int level) const {
-    return m_team_shared.set_team_thread_mode(level, 1, 0);
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  const execution_space::scratch_memory_space& thread_scratch(int level) const {
-    return m_team_shared.set_team_thread_mode(level, team_size(), team_rank());
-  }
-
-  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank; }
-  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size; }
-  KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank; }
-  KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size; }
-  KOKKOS_INLINE_FUNCTION void* impl_reduce_scratch() const {
-    return m_reduce_scratch;
-  }
-
-  KOKKOS_INLINE_FUNCTION void team_barrier() const {
-#pragma omp barrier
-  }
-
-  template <class ValueType>
-  KOKKOS_INLINE_FUNCTION void team_broadcast(ValueType& value,
-                                             int thread_id) const {
-    // Make sure there is enough scratch space:
-    using type = std::conditional_t<(sizeof(ValueType) < TEAM_REDUCE_SIZE),
-                                    ValueType, void>;
-    type* team_scratch =
-        reinterpret_cast<type*>(static_cast<char*>(m_glb_scratch) +
-                                TEAM_REDUCE_SIZE * omp_get_team_num());
-#pragma omp barrier
-    if (team_rank() == thread_id) *team_scratch = value;
-#pragma omp barrier
-    value = *team_scratch;
-  }
-
-  template <class Closure, class ValueType>
-  KOKKOS_INLINE_FUNCTION void team_broadcast(const Closure& f, ValueType& value,
-                                             const int& thread_id) const {
-    f(value);
-    team_broadcast(value, thread_id);
-  }
-
-  // FIXME_OPENMPTARGET this function has the wrong interface and currently
-  // ignores the reducer passed.
-  template <class ValueType, class JoinOp>
-  KOKKOS_INLINE_FUNCTION ValueType team_reduce(const ValueType& value,
-                                               const JoinOp&) const {
-#pragma omp barrier
-
-    using value_type = ValueType;
-    //    const JoinLambdaAdapter<value_type, JoinOp> op(op_in);
-
-    // Make sure there is enough scratch space:
-    using type = std::conditional_t<(sizeof(value_type) < TEAM_REDUCE_SIZE),
-                                    value_type, void>;
-
-    const int n_values = TEAM_REDUCE_SIZE / sizeof(value_type);
-    type* team_scratch =
-        reinterpret_cast<type*>(static_cast<char*>(m_glb_scratch) +
-                                TEAM_REDUCE_SIZE * omp_get_team_num());
-    for (int i = m_team_rank; i < n_values; i += m_team_size) {
-      team_scratch[i] = value_type();
-    }
-
-#pragma omp barrier
-
-    for (int k = 0; k < m_team_size; k += n_values) {
-      if ((k <= m_team_rank) && (k + n_values > m_team_rank))
-        team_scratch[m_team_rank % n_values] += value;
-#pragma omp barrier
-    }
-
-    for (int d = 1; d < n_values; d *= 2) {
-      if ((m_team_rank + d < n_values) && (m_team_rank % (2 * d) == 0)) {
-        team_scratch[m_team_rank] += team_scratch[m_team_rank + d];
-      }
-#pragma omp barrier
-    }
-    return team_scratch[0];
-  }
-  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
-   *          with intra-team non-deterministic ordering accumulation.
-   *
-   *  The global inter-team accumulation value will, at the end of the
-   *  league's parallel execution, be the scan's total.
-   *  Parallel execution ordering of the league's teams is non-deterministic.
-   *  As such the base value for each team's scan operation is similarly
-   *  non-deterministic.
-   */
-  template <typename ArgType>
-  KOKKOS_INLINE_FUNCTION ArgType
-  team_scan(const ArgType& /*value*/, ArgType* const /*global_accum*/) const {
-    // FIXME_OPENMPTARGET
-    /*  // Make sure there is enough scratch space:
-      using type =
-        std::conditional_t<(sizeof(ArgType) < TEAM_REDUCE_SIZE), ArgType, void>;
-
-      volatile type * const work_value  = ((type*) m_exec.scratch_thread());
-
-      *work_value = value ;
-
-      memory_fence();
-
-      if ( team_fan_in() ) {
-        // The last thread to synchronize returns true, all other threads wait
-      for team_fan_out()
-        // m_team_base[0]                 == highest ranking team member
-        // m_team_base[ m_team_size - 1 ] == lowest ranking team member
-        //
-        // 1) copy from lower to higher rank, initialize lowest rank to zero
-        // 2) prefix sum from lowest to highest rank, skipping lowest rank
-
-        type accum = 0 ;
-
-        if ( global_accum ) {
-          for ( int i = m_team_size ; i-- ; ) {
-            type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i
-      )->scratch_thread()); accum += val ;
-          }
-          accum = atomic_fetch_add( global_accum , accum );
-        }
-
-        for ( int i = m_team_size ; i-- ; ) {
-          type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i
-      )->scratch_thread()); const type offset = accum ; accum += val ; val =
-      offset ;
-        }
-
-        memory_fence();
-      }
-
-      team_fan_out();
-
-      return *work_value ;*/
-    return ArgType();
-  }
-
-  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
-   *
-   *  The highest rank thread can compute the reduction total as
-   *    reduction_total = dev.team_scan( value ) + value ;
-   */
-  template <typename Type>
-  KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value) const {
-    return this->template team_scan<Type>(value, 0);
-  }
-
-  //----------------------------------------
-  // Private for the driver
-
- private:
-  using space = execution_space::scratch_memory_space;
-
- public:
-  // FIXME_OPENMPTARGET - 512(16*32) bytes at the begining of the scratch space
-  // for each league is saved for reduction. It should actually be based on the
-  // ValueType of the reduction variable.
-  inline OpenMPTargetExecTeamMember(
-      const int league_rank, const int league_size, const int team_size,
-      const int vector_length  // const TeamPolicyInternal< OpenMPTarget,
-                               // Properties ...> & team
-      ,
-      void* const glb_scratch, const int shmem_block_index,
-      const size_t shmem_size_L0, const size_t shmem_size_L1)
-      : m_team_scratch_size{shmem_size_L0, shmem_size_L1},
-        m_team_rank(0),
-        m_team_size(team_size),
-        m_league_rank(league_rank),
-        m_league_size(league_size),
-        m_vector_length(vector_length),
-        m_shmem_block_index(shmem_block_index),
-        m_glb_scratch(glb_scratch) {
-    const int omp_tid = omp_get_thread_num();
-
-    // The scratch memory allocated is a sum of TEAM_REDUCE_SIZE, L0 shmem size
-    // and L1 shmem size. TEAM_REDUCE_SIZE = 512 bytes saved per team for
-    // hierarchical reduction. There is an additional 10% of the requested
-    // scratch memory allocated per team as padding. Hence the product with 0.1.
-    const int reduce_offset =
-        m_shmem_block_index *
-        (shmem_size_L0 + shmem_size_L1 +
-         ((shmem_size_L0 + shmem_size_L1) * 0.1) + TEAM_REDUCE_SIZE);
-    const int l0_offset = reduce_offset + TEAM_REDUCE_SIZE;
-    const int l1_offset = l0_offset + shmem_size_L0;
-    m_team_shared       = scratch_memory_space(
-        (static_cast<char*>(glb_scratch) + l0_offset), shmem_size_L0,
-        static_cast<char*>(glb_scratch) + l1_offset, shmem_size_L1);
-    m_reduce_scratch = static_cast<char*>(glb_scratch) + reduce_offset;
-    m_league_rank    = league_rank;
-    m_team_rank      = omp_tid;
-    m_vector_lane    = 0;
-  }
-
-  static inline int team_reduce_size() { return TEAM_REDUCE_SIZE; }
-};
-
-template <class... Properties>
-class TeamPolicyInternal<Kokkos::Experimental::OpenMPTarget, Properties...>
-    : public PolicyTraits<Properties...> {
- public:
-  //! Tag this class as a kokkos execution policy
-  using execution_policy = TeamPolicyInternal;
-
-  using traits = PolicyTraits<Properties...>;
-
-  //----------------------------------------
-
-  template <class FunctorType>
-  inline static int team_size_max(const FunctorType&, const ParallelForTag&) {
-    return 256;
-  }
-
-  template <class FunctorType>
-  inline static int team_size_max(const FunctorType&,
-                                  const ParallelReduceTag&) {
-    return 256;
-  }
-
-  template <class FunctorType, class ReducerType>
-  inline static int team_size_max(const FunctorType&, const ReducerType&,
-                                  const ParallelReduceTag&) {
-    return 256;
-  }
-
-  template <class FunctorType>
-  inline static int team_size_recommended(const FunctorType&,
-                                          const ParallelForTag&) {
-    return 128;
-  }
-
-  template <class FunctorType>
-  inline static int team_size_recommended(const FunctorType&,
-                                          const ParallelReduceTag&) {
-    return 128;
-  }
-
-  template <class FunctorType, class ReducerType>
-  inline static int team_size_recommended(const FunctorType&,
-                                          const ReducerType&,
-                                          const ParallelReduceTag&) {
-    return 128;
-  }
-
-  //----------------------------------------
-
- private:
-  int m_league_size;
-  int m_team_size;
-  int m_vector_length;
-  int m_team_alloc;
-  int m_team_iter;
-  std::array<size_t, 2> m_team_scratch_size;
-  std::array<size_t, 2> m_thread_scratch_size;
-  bool m_tune_team_size;
-  bool m_tune_vector_length;
-  constexpr const static size_t default_team_size = 256;
-  int m_chunk_size;
-
-  inline void init(const int league_size_request, const int team_size_request,
-                   const int vector_length_request) {
-    m_league_size = league_size_request;
-
-    // Minimum team size should be 32 for OpenMPTarget backend.
-    if (team_size_request < 32) {
-      Kokkos::Impl::OpenMPTarget_abort(
-          "OpenMPTarget backend requires a minimum of 32 threads per team.\n");
-    } else
-      m_team_size = team_size_request;
-
-    m_vector_length = vector_length_request;
-    set_auto_chunk_size();
-  }
-
-  template <typename ExecSpace, typename... OtherProperties>
-  friend class TeamPolicyInternal;
-
- public:
-  // FIXME_OPENMPTARGET : Currently this routine is a copy of the Cuda
-  // implementation, but this has to be tailored to be architecture specific.
-  inline static int scratch_size_max(int level) {
-    return (
-        level == 0 ? 1024 * 40 :  // 48kB is the max for CUDA, but we need some
-                                  // for team_member.reduce etc.
-            20 * 1024 *
-                1024);  // arbitrarily setting this to 20MB, for a Volta V100
-                        // that would give us about 3.2GB for 2 teams per SM
-  }
-  inline bool impl_auto_team_size() const { return m_tune_team_size; }
-  inline bool impl_auto_vector_length() const { return m_tune_vector_length; }
-  inline void impl_set_team_size(const size_t size) { m_team_size = size; }
-  inline void impl_set_vector_length(const size_t length) {
-    m_tune_vector_length = length;
-  }
-  inline int impl_vector_length() const { return m_vector_length; }
-  inline int team_size() const { return m_team_size; }
-  inline int league_size() const { return m_league_size; }
-  inline size_t scratch_size(const int& level, int team_size_ = -1) const {
-    if (team_size_ < 0) team_size_ = m_team_size;
-    return m_team_scratch_size[level] +
-           team_size_ * m_thread_scratch_size[level];
-  }
-
-  inline Kokkos::Experimental::OpenMPTarget space() const {
-    return Kokkos::Experimental::OpenMPTarget();
-  }
-
-  template <class... OtherProperties>
-  TeamPolicyInternal(const TeamPolicyInternal<OtherProperties...>& p)
-      : m_league_size(p.m_league_size),
-        m_team_size(p.m_team_size),
-        m_vector_length(p.m_vector_length),
-        m_team_alloc(p.m_team_alloc),
-        m_team_iter(p.m_team_iter),
-        m_team_scratch_size(p.m_team_scratch_size),
-        m_thread_scratch_size(p.m_thread_scratch_size),
-        m_tune_team_size(p.m_tune_team_size),
-        m_tune_vector_length(p.m_tune_vector_length),
-        m_chunk_size(p.m_chunk_size) {}
-
-  /** \brief  Specify league size, request team size */
-  TeamPolicyInternal(const typename traits::execution_space&,
-                     int league_size_request, int team_size_request,
-                     int vector_length_request = 1)
-      : m_team_scratch_size{0, 0},
-        m_thread_scratch_size{0, 0},
-        m_tune_team_size(false),
-        m_tune_vector_length(false),
-        m_chunk_size(0) {
-    init(league_size_request, team_size_request, vector_length_request);
-  }
-
-  TeamPolicyInternal(const typename traits::execution_space&,
-                     int league_size_request,
-                     const Kokkos::AUTO_t& /* team_size_request */
-                     ,
-                     int vector_length_request = 1)
-      : m_team_scratch_size{0, 0},
-        m_thread_scratch_size{0, 0},
-        m_tune_team_size(true),
-        m_tune_vector_length(false),
-        m_chunk_size(0) {
-    init(league_size_request, default_team_size / vector_length_request,
-         vector_length_request);
-  }
-
-  TeamPolicyInternal(const typename traits::execution_space&,
-                     int league_size_request,
-                     const Kokkos::AUTO_t& /* team_size_request */
-                     ,
-                     const Kokkos::AUTO_t& /* vector_length_request */)
-      : m_team_scratch_size{0, 0},
-        m_thread_scratch_size{0, 0},
-        m_tune_team_size(true),
-        m_tune_vector_length(true),
-        m_chunk_size(0) {
-    init(league_size_request, default_team_size, 1);
-  }
-  TeamPolicyInternal(const typename traits::execution_space&,
-                     int league_size_request, int team_size_request,
-                     const Kokkos::AUTO_t& /* vector_length_request */)
-      : m_team_scratch_size{0, 0},
-        m_thread_scratch_size{0, 0},
-        m_tune_team_size(false),
-        m_tune_vector_length(true),
-        m_chunk_size(0) {
-    init(league_size_request, team_size_request, 1);
-  }
-
-  TeamPolicyInternal(int league_size_request, int team_size_request,
-                     int vector_length_request = 1)
-      : m_team_scratch_size{0, 0},
-        m_thread_scratch_size{0, 0},
-        m_tune_team_size(false),
-        m_tune_vector_length(false),
-        m_chunk_size(0) {
-    init(league_size_request, team_size_request, vector_length_request);
-  }
-
-  TeamPolicyInternal(int league_size_request,
-                     const Kokkos::AUTO_t& /* team_size_request */
-                     ,
-                     int vector_length_request = 1)
-      : m_team_scratch_size{0, 0},
-        m_thread_scratch_size{0, 0},
-        m_tune_team_size(true),
-        m_tune_vector_length(false),
-        m_chunk_size(0) {
-    init(league_size_request, default_team_size / vector_length_request,
-         vector_length_request);
-  }
-
-  TeamPolicyInternal(int league_size_request,
-                     const Kokkos::AUTO_t& /* team_size_request */
-                     ,
-                     const Kokkos::AUTO_t& /* vector_length_request */)
-      : m_team_scratch_size{0, 0},
-        m_thread_scratch_size{0, 0},
-        m_tune_team_size(true),
-        m_tune_vector_length(true),
-        m_chunk_size(0) {
-    init(league_size_request, default_team_size, 1);
-  }
-  TeamPolicyInternal(int league_size_request, int team_size_request,
-                     const Kokkos::AUTO_t& /* vector_length_request */)
-      : m_team_scratch_size{0, 0},
-        m_thread_scratch_size{0, 0},
-        m_tune_team_size(false),
-        m_tune_vector_length(true),
-        m_chunk_size(0) {
-    init(league_size_request, team_size_request, 1);
-  }
-  inline static size_t vector_length_max() {
-    return 32; /* TODO: this is bad. Need logic that is compiler and backend
-                  aware */
-  }
-  inline int team_alloc() const { return m_team_alloc; }
-  inline int team_iter() const { return m_team_iter; }
-
-  inline int chunk_size() const { return m_chunk_size; }
-
-  /** \brief set chunk_size to a discrete value*/
-  inline TeamPolicyInternal& set_chunk_size(
-      typename traits::index_type chunk_size_) {
-    m_chunk_size = chunk_size_;
-    return *this;
-  }
-
-  /** \brief set per team scratch size for a specific level of the scratch
-   * hierarchy */
-  inline TeamPolicyInternal& set_scratch_size(const int& level,
-                                              const PerTeamValue& per_team) {
-    m_team_scratch_size[level] = per_team.value;
-    return *this;
-  }
-
-  /** \brief set per thread scratch size for a specific level of the scratch
-   * hierarchy */
-  inline TeamPolicyInternal& set_scratch_size(
-      const int& level, const PerThreadValue& per_thread) {
-    m_thread_scratch_size[level] = per_thread.value;
-    return *this;
-  }
-
-  /** \brief set per thread and per team scratch size for a specific level of
-   * the scratch hierarchy */
-  inline TeamPolicyInternal& set_scratch_size(
-      const int& level, const PerTeamValue& per_team,
-      const PerThreadValue& per_thread) {
-    m_team_scratch_size[level]   = per_team.value;
-    m_thread_scratch_size[level] = per_thread.value;
-    return *this;
-  }
-
- private:
-  /** \brief finalize chunk_size if it was set to AUTO*/
-  inline void set_auto_chunk_size() {
-    int concurrency = 2048 * 128;
-
-    if (concurrency == 0) concurrency = 1;
-
-    if (m_chunk_size > 0) {
-      if (!Impl::is_integral_power_of_two(m_chunk_size))
-        Kokkos::abort("TeamPolicy blocking granularity must be power of two");
-    }
-
-    int new_chunk_size = 1;
-    while (new_chunk_size * 100 * concurrency < m_league_size)
-      new_chunk_size *= 2;
-    if (new_chunk_size < 128) {
-      new_chunk_size = 1;
-      while ((new_chunk_size * 40 * concurrency < m_league_size) &&
-             (new_chunk_size < 128))
-        new_chunk_size *= 2;
-    }
-    m_chunk_size = new_chunk_size;
-  }
-
- public:
-  using member_type = Impl::OpenMPTargetExecTeamMember;
-};
-}  // namespace Impl
-
-}  // namespace Kokkos
-
-namespace Kokkos {
-
-template <typename iType>
-KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
-    iType, Impl::OpenMPTargetExecTeamMember>
-TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread,
-                const iType& count) {
-  return Impl::TeamThreadRangeBoundariesStruct<
-      iType, Impl::OpenMPTargetExecTeamMember>(thread, count);
-}
-
-template <typename iType1, typename iType2>
-KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
-    std::common_type_t<iType1, iType2>, Impl::OpenMPTargetExecTeamMember>
-TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread,
-                const iType1& begin, const iType2& end) {
-  using iType = std::common_type_t<iType1, iType2>;
-  return Impl::TeamThreadRangeBoundariesStruct<
-      iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(begin),
-                                               iType(end));
-}
-
-template <typename iType>
-KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
-    iType, Impl::OpenMPTargetExecTeamMember>
-ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread,
-                  const iType& count) {
-  return Impl::ThreadVectorRangeBoundariesStruct<
-      iType, Impl::OpenMPTargetExecTeamMember>(thread, count);
-}
-
-template <typename iType1, typename iType2>
-KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
-    std::common_type_t<iType1, iType2>, Impl::OpenMPTargetExecTeamMember>
-ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread,
-                  const iType1& arg_begin, const iType2& arg_end) {
-  using iType = std::common_type_t<iType1, iType2>;
-  return Impl::ThreadVectorRangeBoundariesStruct<
-      iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(arg_begin),
-                                               iType(arg_end));
-}
-
-template <typename iType>
-KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct<
-    iType, Impl::OpenMPTargetExecTeamMember>
-TeamVectorRange(const Impl::OpenMPTargetExecTeamMember& thread,
-                const iType& count) {
-  return Impl::TeamVectorRangeBoundariesStruct<
-      iType, Impl::OpenMPTargetExecTeamMember>(thread, count);
-}
-
-template <typename iType1, typename iType2>
-KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct<
-    std::common_type_t<iType1, iType2>, Impl::OpenMPTargetExecTeamMember>
-TeamVectorRange(const Impl::OpenMPTargetExecTeamMember& thread,
-                const iType1& arg_begin, const iType2& arg_end) {
-  using iType = std::common_type_t<iType1, iType2>;
-  return Impl::TeamVectorRangeBoundariesStruct<
-      iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(arg_begin),
-                                               iType(arg_end));
-}
-
-KOKKOS_INLINE_FUNCTION
-Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember> PerTeam(
-    const Impl::OpenMPTargetExecTeamMember& thread) {
-  return Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember>(thread);
-}
-
-KOKKOS_INLINE_FUNCTION
-Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember> PerThread(
-    const Impl::OpenMPTargetExecTeamMember& thread) {
-  return Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember>(thread);
-}
-}  // namespace Kokkos
-
-namespace Kokkos {
-
-/** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each
- * i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all threads of the the calling thread team.
- */
-template <typename iType, class Lambda>
-KOKKOS_INLINE_FUNCTION void parallel_for(
-    const Impl::TeamThreadRangeBoundariesStruct<
-        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-    const Lambda& lambda) {
-#pragma omp for nowait schedule(static, 1)
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i);
-}
-
-/** \brief  Inter-thread vector parallel_reduce. Executes lambda(iType i,
- * ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all threads of the the calling thread team
- * and a summation of val is performed and put into result.
- */
-
-template <typename iType, class Lambda, typename ValueType>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<!Kokkos::is_reducer<ValueType>::value>
-parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
-                    iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-                const Lambda& lambda, ValueType& result) {
-  // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
-  // elements in the array <= 32. For reduction we allocate, 16 bytes per
-  // element in the scratch space, hence, 16*32 = 512.
-  static_assert(sizeof(ValueType) <=
-                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
-
-  ValueType* TeamThread_scratch =
-      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
-
-#pragma omp barrier
-  TeamThread_scratch[0] = ValueType();
-#pragma omp barrier
-
-  if constexpr (std::is_arithmetic<ValueType>::value) {
-#pragma omp for reduction(+ : TeamThread_scratch[:1])
-    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-      ValueType tmp = ValueType();
-      lambda(i, tmp);
-      TeamThread_scratch[0] += tmp;
-    }
-  } else {
-#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
-
-#pragma omp for reduction(custom : TeamThread_scratch[:1])
-    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-      ValueType tmp = ValueType();
-      lambda(i, tmp);
-      TeamThread_scratch[0] += tmp;
-    }
-  }
-
-  result = TeamThread_scratch[0];
-}
-
-#if !defined(KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND)
-// For some reason the actual version we wanted to write doesn't work
-// and crashes. We should try this with every new compiler
-// This is the variant we actually wanted to write
-template <typename iType, class Lambda, typename ReducerType>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
-parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
-                    iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-                const Lambda& lambda, ReducerType result) {
-  using ValueType = typename ReducerType::value_type;
-
-#pragma omp declare reduction(                                               \
-    custominner:ValueType                                                    \
-    : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
-    initializer(                                                             \
-        Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
-
-  // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
-  // elements in the array <= 32. For reduction we allocate, 16 bytes per
-  // element in the scratch space, hence, 16*32 = 512.
-  static_assert(sizeof(ValueType) <=
-                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
-
-  ValueType* TeamThread_scratch =
-      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
-
-#pragma omp barrier
-  Impl::OpenMPTargetReducerWrapper<ReducerType>::init(TeamThread_scratch[0]);
-#pragma omp barrier
-
-#pragma omp for reduction(custominner : TeamThread_scratch[:1])
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-    lambda(i, TeamThread_scratch[0]);
-  }
-  result.reference() = TeamThread_scratch[0];
-}
-#else
-template <typename iType, class Lambda, typename ReducerType>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
-parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
-                    iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-                const Lambda& lambda, ReducerType result) {
-  using ValueType = typename ReducerType::value_type;
-
-  // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
-  // elements in the array <= 32. For reduction we allocate, 16 bytes per
-  // element in the scratch space, hence, 16*32 = 512.
-  static_assert(sizeof(ValueType) <=
-                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
-
-  ValueType* TeamThread_scratch =
-      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
-
-#pragma omp declare reduction(                                               \
-    omp_red_teamthread_reducer:ValueType                                     \
-    : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
-    initializer(                                                             \
-        Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
-
-#pragma omp barrier
-  ValueType tmp;
-  result.init(tmp);
-  TeamThread_scratch[0] = tmp;
-#pragma omp barrier
-
-  iType team_size = iType(omp_get_num_threads());
-#pragma omp for reduction(omp_red_teamthread_reducer \
-                          : TeamThread_scratch[:1]) schedule(static, 1)
-  for (iType t = 0; t < team_size; t++) {
-    ValueType tmp2;
-    result.init(tmp2);
-
-    for (iType i = loop_boundaries.start + t; i < loop_boundaries.end;
-         i += team_size) {
-      lambda(i, tmp2);
-    }
-
-    // FIXME_OPENMPTARGET: Join should work but doesn't. Every threads gets a
-    // private TeamThread_scratch[0] and at the end of the for-loop the `join`
-    // operation is performed by OpenMP itself and hence the simple assignment
-    // works.
-    //    result.join(TeamThread_scratch[0], tmp2);
-    TeamThread_scratch[0] = tmp2;
-  }
-
-  result.reference() = TeamThread_scratch[0];
-}
-#endif  // KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
-
-/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i,
- * ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the the calling thread
- * and a reduction of val is performed using JoinType(ValueType& val, const
- * ValueType& update) and put into init_result. The input value of init_result
- * is used as initializer for temporary variables of ValueType. Therefore the
- * input value should be the neutral element with respect to the join operation
- * (e.g. '0 for +-' or '1 for *').
- */
-template <typename iType, class Lambda, typename ValueType, class JoinType>
-KOKKOS_INLINE_FUNCTION void parallel_reduce(
-    const Impl::TeamThreadRangeBoundariesStruct<
-        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-    const Lambda& lambda, const JoinType& join, ValueType& init_result) {
-  ValueType* TeamThread_scratch =
-      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
-
-  // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
-  // elements in the array <= 32. For reduction we allocate, 16 bytes per
-  // element in the scratch space, hence, 16*32 = 512.
-  static_assert(sizeof(ValueType) <=
-                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
-
-  // FIXME_OPENMPTARGET: Still need to figure out how to get value_count here.
-  const int value_count = 1;
-
-#pragma omp barrier
-  TeamThread_scratch[0] = init_result;
-#pragma omp barrier
-
-#pragma omp for
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-    lambda(i, TeamThread_scratch[omp_get_num_threads() * value_count]);
-  }
-
-  // Reduce all partial results within a team.
-  const int team_size      = omp_get_num_threads();
-  int tree_neighbor_offset = 1;
-  do {
-#pragma omp for
-    for (int i = 0; i < team_size - tree_neighbor_offset;
-         i += 2 * tree_neighbor_offset) {
-      const int neighbor = i + tree_neighbor_offset;
-      join(lambda, &TeamThread_scratch[i * value_count],
-           &TeamThread_scratch[neighbor * value_count]);
-    }
-    tree_neighbor_offset *= 2;
-  } while (tree_neighbor_offset < team_size);
-  init_result = TeamThread_scratch[0];
-}
-
-// This is largely the same code as in HIP and CUDA except for the member name
-template <typename iType, class FunctorType>
-KOKKOS_INLINE_FUNCTION void parallel_scan(
-    const Impl::TeamThreadRangeBoundariesStruct<
-        iType, Impl::OpenMPTargetExecTeamMember>& loop_bounds,
-    const FunctorType& lambda) {
-  using Analysis   = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
-                                         TeamPolicy<Experimental::OpenMPTarget>,
-                                         FunctorType>;
-  using value_type = typename Analysis::value_type;
-
-  const auto start = loop_bounds.start;
-  const auto end   = loop_bounds.end;
-  //   Note this thing is called .member in the CUDA specialization of
-  //   TeamThreadRangeBoundariesStruct
-  auto& member         = loop_bounds.team;
-  const auto team_rank = member.team_rank();
-
-#if defined(KOKKOS_IMPL_TEAM_SCAN_WORKAROUND)
-  value_type scan_val = value_type();
-
-  if (team_rank == 0) {
-    for (iType i = start; i < end; ++i) {
-      lambda(i, scan_val, true);
-    }
-  }
-#pragma omp barrier
-#else
-  const auto team_size = member.team_size();
-  const auto nchunk    = (end - start + team_size - 1) / team_size;
-  value_type accum     = 0;
-  // each team has to process one or
-  //      more chunks of the prefix scan
-  for (iType i = 0; i < nchunk; ++i) {
-    auto ii = start + i * team_size + team_rank;
-    // local accumulation for this chunk
-    value_type local_accum = 0;
-    // user updates value with prefix value
-    if (ii < loop_bounds.end) lambda(ii, local_accum, false);
-    // perform team scan
-    local_accum = member.team_scan(local_accum);
-    // add this blocks accum to total accumulation
-    auto val = accum + local_accum;
-    // user updates their data with total accumulation
-    if (ii < loop_bounds.end) lambda(ii, val, true);
-    // the last value needs to be propogated to next chunk
-    if (team_rank == team_size - 1) accum = val;
-    // broadcast last value to rest of the team
-    member.team_broadcast(accum, team_size - 1);
-  }
-#endif
-}
-
-}  // namespace Kokkos
-
-namespace Kokkos {
-/** \brief  Intra-thread vector parallel_for. Executes lambda(iType i) for each
- * i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the the calling thread.
- */
-template <typename iType, class Lambda>
-KOKKOS_INLINE_FUNCTION void parallel_for(
-    const Impl::ThreadVectorRangeBoundariesStruct<
-        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-    const Lambda& lambda) {
-#pragma omp simd
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i);
-}
-
-/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i,
- * ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the the calling thread
- * and a summation of val is performed and put into result.
- */
-template <typename iType, class Lambda, typename ValueType>
-KOKKOS_INLINE_FUNCTION void parallel_reduce(
-    const Impl::ThreadVectorRangeBoundariesStruct<
-        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-    const Lambda& lambda, ValueType& result) {
-  ValueType vector_reduce = ValueType();
-
-  if constexpr (std::is_arithmetic<ValueType>::value) {
-#pragma omp simd reduction(+ : vector_reduce)
-    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-      ValueType tmp = ValueType();
-      lambda(i, tmp);
-      vector_reduce += tmp;
-    }
-  } else {
-#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
-
-#pragma omp simd reduction(custom : vector_reduce)
-    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-      lambda(i, vector_reduce);
-    }
-  }
-
-  result = vector_reduce;
-}
-
-template <typename iType, class Lambda, typename ReducerType>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
-parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<
-                    iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-                const Lambda& lambda, ReducerType const& result) {
-  using ValueType = typename ReducerType::value_type;
-
-#pragma omp declare reduction(                                               \
-    custom:ValueType                                                         \
-    : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
-    initializer(                                                             \
-        Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
-
-  ValueType vector_reduce;
-  Impl::OpenMPTargetReducerWrapper<ReducerType>::init(vector_reduce);
-
-#pragma omp simd reduction(custom : vector_reduce)
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-    lambda(i, vector_reduce);
-  }
-
-  result.reference() = vector_reduce;
-}
-
-/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i,
- * ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the the calling thread
- * and a reduction of val is performed using JoinType(ValueType& val, const
- * ValueType& update) and put into init_result. The input value of init_result
- * is used as initializer for temporary variables of ValueType. Therefore the
- * input value should be the neutral element with respect to the join operation
- * (e.g. '0 for +-' or '1 for *').
- */
-template <typename iType, class Lambda, typename ValueType, class JoinType>
-KOKKOS_INLINE_FUNCTION void parallel_reduce(
-    const Impl::ThreadVectorRangeBoundariesStruct<
-        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-    const Lambda& lambda, const JoinType& join, ValueType& init_result) {
-  ValueType result = init_result;
-
-  // FIXME_OPENMPTARGET think about omp simd
-  // join does not work with omp reduction clause
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-    ValueType tmp = ValueType();
-    lambda(i, tmp);
-    join(result, tmp);
-  }
-
-  init_result = result;
-}
-
-/** \brief  Intra-thread vector parallel exclusive prefix sum. Executes
- * lambda(iType i, ValueType & val, bool final) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan
- * operation is performed. Depending on the target execution space the operator
- * might be called twice: once with final=false and once with final=true. When
- * final==true val contains the prefix sum value. The contribution of this "i"
- * needs to be added to val no matter whether final==true or not. In a serial
- * execution (i.e. team_size==1) the operator is only called once with
- * final==true. Scan_val will be set to the final sum value over all vector
- * lanes.
- */
-template <typename iType, class FunctorType>
-KOKKOS_INLINE_FUNCTION void parallel_scan(
-    const Impl::ThreadVectorRangeBoundariesStruct<
-        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-    const FunctorType& lambda) {
-  using Analysis   = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
-                                         TeamPolicy<Experimental::OpenMPTarget>,
-                                         FunctorType>;
-  using value_type = typename Analysis::value_type;
-
-  value_type scan_val = value_type();
-
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end; ++i) {
-    lambda(i, scan_val, true);
-  }
-}
-
-}  // namespace Kokkos
-
-#ifdef KOKKOS_IMPL_TEAM_SCAN_WORKAROUND
-#undef KOKKOS_IMPL_TEAM_SCAN_WORKAROUND
-#endif
-
-namespace Kokkos {
-/** \brief  Intra-team vector parallel_for. Executes lambda(iType i) for each
- * i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the the calling team.
- */
-template <typename iType, class Lambda>
-KOKKOS_INLINE_FUNCTION void parallel_for(
-    const Impl::TeamVectorRangeBoundariesStruct<
-        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-    const Lambda& lambda) {
-#pragma omp for simd nowait schedule(static, 1)
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i);
-}
-
-/** \brief  Intra-team vector parallel_reduce. Executes lambda(iType i,
- * ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the the calling team
- * and a summation of val is performed and put into result.
- */
-template <typename iType, class Lambda, typename ValueType>
-KOKKOS_INLINE_FUNCTION void parallel_reduce(
-    const Impl::TeamVectorRangeBoundariesStruct<
-        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-    const Lambda& lambda, ValueType& result) {
-  // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
-  // elements in the array <= 32. For reduction we allocate, 16 bytes per
-  // element in the scratch space, hence, 16*32 = 512.
-  static_assert(sizeof(ValueType) <=
-                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
-
-  ValueType* TeamVector_scratch =
-      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
-
-#pragma omp barrier
-  TeamVector_scratch[0] = ValueType();
-#pragma omp barrier
-
-  if constexpr (std::is_arithmetic<ValueType>::value) {
-#pragma omp for simd reduction(+ : TeamVector_scratch[:1])
-    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-      ValueType tmp = ValueType();
-      lambda(i, tmp);
-      TeamVector_scratch[0] += tmp;
-    }
-  } else {
-#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
-
-#pragma omp for simd reduction(custom : TeamVector_scratch[:1])
-    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-      ValueType tmp = ValueType();
-      lambda(i, tmp);
-      TeamVector_scratch[0] += tmp;
-    }
-  }
-
-  result = TeamVector_scratch[0];
-}
-
-#if !defined(KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND)
-template <typename iType, class Lambda, typename ReducerType>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
-parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
-                    iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-                const Lambda& lambda, ReducerType const& result) {
-  using ValueType = typename ReducerType::value_type;
-
-  // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
-  // elements in the array <= 32. For reduction we allocate, 16 bytes per
-  // element in the scratch space, hence, 16*32 = 512.
-  static_assert(sizeof(ValueType) <=
-                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
-
-#pragma omp declare reduction(                                               \
-    custom:ValueType                                                         \
-    : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
-    initializer(                                                             \
-        Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
-
-  ValueType* TeamVector_scratch =
-      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
-
-#pragma omp barrier
-  Impl::OpenMPTargetReducerWrapper<ReducerType>::init(TeamVector_scratch[0]);
-#pragma omp barrier
-
-#pragma omp for simd reduction(custom : TeamVector_scratch[:1])
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-    lambda(i, TeamVector_scratch[0]);
-  }
-
-  result.reference() = TeamVector_scratch[0];
-}
-#else
-template <typename iType, class Lambda, typename ReducerType>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
-parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
-                    iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-                const Lambda& lambda, ReducerType const& result) {
-  using ValueType = typename ReducerType::value_type;
-
-  // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
-  // elements in the array <= 32. For reduction we allocate, 16 bytes per
-  // element in the scratch space, hence, 16*32 = 512.
-  static_assert(sizeof(ValueType) <=
-                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
-
-  ValueType* TeamVector_scratch =
-      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
-
-#pragma omp declare reduction(                                               \
-    omp_red_teamthread_reducer:ValueType                                     \
-    : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
-    initializer(                                                             \
-        Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
-
-#pragma omp barrier
-  ValueType tmp;
-  result.init(tmp);
-  TeamVector_scratch[0] = tmp;
-#pragma omp barrier
-
-  iType team_size = iType(omp_get_num_threads());
-#pragma omp for simd reduction(omp_red_teamthread_reducer \
-                               : TeamVector_scratch[:1]) schedule(static, 1)
-  for (iType t = 0; t < team_size; t++) {
-    ValueType tmp2;
-    result.init(tmp2);
-
-    for (iType i = loop_boundaries.start + t; i < loop_boundaries.end;
-         i += team_size) {
-      lambda(i, tmp2);
-    }
-    TeamVector_scratch[0] = tmp2;
-  }
-
-  result.reference() = TeamVector_scratch[0];
-}
-#endif  // KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
-}  // namespace Kokkos
-
-#ifdef KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
-#undef KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
-#endif
-
-namespace Kokkos {
-
-template <class FunctorType>
-KOKKOS_INLINE_FUNCTION void single(
-    const Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember>&
-    /*single_struct*/,
-    const FunctorType& lambda) {
-  lambda();
-}
-
-template <class FunctorType>
-KOKKOS_INLINE_FUNCTION void single(
-    const Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember>&
-        single_struct,
-    const FunctorType& lambda) {
-  if (single_struct.team_member.team_rank() == 0) lambda();
-}
-
-template <class FunctorType, class ValueType>
-KOKKOS_INLINE_FUNCTION void single(
-    const Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember>&
-    /*single_struct*/,
-    const FunctorType& lambda, ValueType& val) {
-  lambda(val);
-}
-
-template <class FunctorType, class ValueType>
-KOKKOS_INLINE_FUNCTION void single(
-    const Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember>&
-        single_struct,
-    const FunctorType& lambda, ValueType& val) {
-  if (single_struct.team_member.team_rank() == 0) {
-    lambda(val);
-  }
-  single_struct.team_member.team_broadcast(val, 0);
-}
-}  // namespace Kokkos
-
-#endif /* #ifndef KOKKOS_OPENMPTARGETEXEC_HPP */
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
index 4a33961205224faea928b14eeb29d572ad44174c..9e8844a6f20846ee675d24c66c987fd3706288c1 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
@@ -27,10 +27,11 @@
 // constructor. undef'ed at the end
 #define KOKKOS_IMPL_OPENMPTARGET_WORKAROUND
 
-#include <Kokkos_OpenMPTarget.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget.hpp>
 #include <OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp>
 #include <OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp>
 #include <impl/Kokkos_ExecSpaceManager.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp>
 
 #include <sstream>
 
@@ -65,18 +66,41 @@ void OpenMPTargetInternal::fence(const std::string& name,
         [&]() {});
   }
 }
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
-int OpenMPTargetInternal::concurrency() {
-#else
 int OpenMPTargetInternal::concurrency() const {
+  int max_threads = 2048 * 80;
+#if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU)
+  int max_threads_sm = 2048;
+#if defined(KOKKOS_ARCH_AMPERE86)
+  max_threads = max_threads_sm * 84;
+#elif defined(KOKKOS_ARCH_AMPERE80)
+  max_threads = max_threads_sm * 108;
+#elif defined(KOKKOS_ARCH_VOLTA72)
+  max_threads = max_threads_sm * 84;
+#elif defined(KOKKOS_ARCH_VOLTA70)
+  max_threads = max_threads_sm * 80;
+#elif defined(KOKKOS_ARCH_PASCAL60) || defined(KOKKOS_ARCH_PASCAL61)
+  max_threads = max_threads_sm * 60;
+#endif
+#elif defined(KOKKOS_ARCH_INTEL_GPU)
+#pragma omp target map(max_threads)
+  { max_threads = omp_get_num_procs(); }
+
+  // Multiply the number of processors with the SIMD length.
+  max_threads *= 32;
 #endif
-  return 128000;  // FIXME_OPENMPTARGET
+
+  return max_threads;
 }
 const char* OpenMPTargetInternal::name() { return "OpenMPTarget"; }
 void OpenMPTargetInternal::print_configuration(std::ostream& os,
                                                bool /*verbose*/) const {
   // FIXME_OPENMPTARGET
   os << "Using OpenMPTarget\n";
+#if defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU)
+  os << "Defined KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU: Workaround "
+        "for "
+        "hierarchical parallelism for Intel GPUs.";
+#endif
 }
 
 void OpenMPTargetInternal::impl_finalize() {
@@ -88,15 +112,18 @@ void OpenMPTargetInternal::impl_finalize() {
     Kokkos::kokkos_free<Kokkos::Experimental::OpenMPTargetSpace>(
         space.m_uniquetoken_ptr);
 }
+
 void OpenMPTargetInternal::impl_initialize() {
   m_is_initialized = true;
 
+  Kokkos::Impl::OpenMPTargetExec::MAX_ACTIVE_THREADS = concurrency();
+
   // FIXME_OPENMPTARGET:  Only fix the number of teams for NVIDIA architectures
   // from Pascal and upwards.
-#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) ||    \
-    defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
-    defined(KOKKOS_ARCH_HOPPER)
-#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300)
+  // FIXME_OPENMPTARGTE: Cray compiler did not yet implement omp_set_num_teams.
+#if !defined(KOKKOS_COMPILER_CRAY_LLVM)
+#if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) && defined(KOKKOS_COMPILER_CLANG) && \
+    (KOKKOS_COMPILER_CLANG >= 1300)
   omp_set_num_teams(512);
 #endif
 #endif
@@ -131,9 +158,15 @@ uint32_t OpenMPTarget::impl_instance_id() const noexcept {
   return m_space_instance->impl_get_instance_id();
 }
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
 int OpenMPTarget::concurrency() {
   return Impl::OpenMPTargetInternal::impl_singleton()->concurrency();
 }
+#else
+int OpenMPTarget::concurrency() const {
+  return m_space_instance->concurrency();
+}
+#endif
 
 void OpenMPTarget::fence(const std::string& name) {
   Impl::OpenMPTargetInternal::impl_singleton()->fence(name);
@@ -182,9 +215,10 @@ UniqueToken<Kokkos::Experimental::OpenMPTarget,
         Kokkos::kokkos_malloc<Kokkos::Experimental::OpenMPTargetSpace>(
             "Kokkos::OpenMPTarget::m_uniquetoken_ptr", size));
     std::vector<uint32_t> h_buf(count, 0);
-    KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(ptr, h_buf.data(), size, 0, 0,
-                                                 omp_get_default_device(),
-                                                 omp_get_initial_device()));
+    if (0 < size)
+      KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(ptr, h_buf.data(), size, 0,
+                                                   0, omp_get_default_device(),
+                                                   omp_get_initial_device()));
 
     Kokkos::Impl::OpenMPTargetExec::m_uniquetoken_ptr = ptr;
   }
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp
index 9f4349c00ef65c404eefbf7352449c50d67a51cf..bea3bb3b12b8f6161159871516eaca663b99d7b5 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp
@@ -37,7 +37,7 @@ class OpenMPTargetInternal {
              openmp_fence_is_static is_static = openmp_fence_is_static::no);
 
   /** \brief  Return the maximum amount of concurrency.  */
-  int concurrency();
+  int concurrency() const;
 
   //! Print configuration information to the given output stream.
   void print_configuration(std::ostream& os, bool verbose) const;
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
index 71ce4b18f2896545b1d4d50edef1590f561e5a22..9767d8e53eff19f1b888c3e0fbf78e2623cf3416 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
@@ -20,1253 +20,654 @@
 #include <omp.h>
 #include <sstream>
 #include <Kokkos_Parallel.hpp>
-#include <OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Spinwait.hpp>
+
+#include <Kokkos_Atomic.hpp>
+#include "Kokkos_OpenMPTarget_Abort.hpp"
+
+// Intel architectures prefer the classical hierarchical parallelism that relies
+// on OpenMP.
+#if defined(KOKKOS_ARCH_INTEL_GPU)
+#define KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU
+#endif
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
 
 namespace Kokkos {
 namespace Impl {
 
-template <class FunctorType, class... Traits>
-class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
-                  Kokkos::Experimental::OpenMPTarget> {
- private:
-  using Policy    = Kokkos::RangePolicy<Traits...>;
-  using WorkTag   = typename Policy::work_tag;
-  using WorkRange = typename Policy::WorkRange;
-  using Member    = typename Policy::member_type;
-
-  const FunctorType m_functor;
-  const Policy m_policy;
+class OpenMPTargetExecTeamMember {
+ public:
+  static constexpr int TEAM_REDUCE_SIZE = 512;
+
+  using execution_space      = Kokkos::Experimental::OpenMPTarget;
+  using scratch_memory_space = execution_space::scratch_memory_space;
+  using team_handle          = OpenMPTargetExecTeamMember;
+
+  scratch_memory_space m_team_shared;
+  size_t m_team_scratch_size[2];
+  int m_team_rank;
+  int m_team_size;
+  int m_league_rank;
+  int m_league_size;
+  int m_vector_length;
+  int m_vector_lane;
+  int m_shmem_block_index;
+  void* m_glb_scratch;
+  void* m_reduce_scratch;
 
  public:
-  void execute() const { execute_impl<WorkTag>(); }
-
-  template <class TagType>
-  void execute_impl() const {
-    OpenMPTargetExec::verify_is_process(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    OpenMPTargetExec::verify_initialized(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    const auto begin = m_policy.begin();
-    const auto end   = m_policy.end();
-
-    if (end <= begin) return;
-
-    FunctorType a_functor(m_functor);
-
-#pragma omp target teams distribute parallel for map(to : a_functor)
-    for (auto i = begin; i < end; ++i) {
-      if constexpr (std::is_void<TagType>::value) {
-        a_functor(i);
-      } else {
-        a_functor(TagType(), i);
-      }
-    }
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space& team_shmem() const {
+    return m_team_shared.set_team_thread_mode(0, 1, 0);
   }
 
-  ParallelFor(const FunctorType& arg_functor, Policy arg_policy)
-      : m_functor(arg_functor), m_policy(arg_policy) {}
-};
+  // set_team_thread_mode routine parameters for future understanding:
+  // first parameter - scratch level.
+  // second parameter - size multiplier for advancing scratch ptr after a
+  // request was serviced. third parameter - offset size multiplier from current
+  // scratch ptr when returning a ptr for a request.
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space& team_scratch(int level) const {
+    return m_team_shared.set_team_thread_mode(level, 1, 0);
+  }
 
-}  // namespace Impl
-}  // namespace Kokkos
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space& thread_scratch(int level) const {
+    return m_team_shared.set_team_thread_mode(level, team_size(), team_rank());
+  }
 
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
+  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank; }
+  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size; }
+  KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank; }
+  KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size; }
+  KOKKOS_INLINE_FUNCTION void* impl_reduce_scratch() const {
+    return m_reduce_scratch;
+  }
 
-namespace Kokkos {
-namespace Impl {
+  KOKKOS_INLINE_FUNCTION void team_barrier() const {
+#pragma omp barrier
+  }
 
-// This class has the memcpy routine that is commonly used by ParallelReduce
-// over RangePolicy and TeamPolicy.
-template <class PointerType>
-struct ParallelReduceCommon {
-  // Copy the result back to device if the view is on the device.
-  static void memcpy_result(PointerType dest, PointerType src, size_t size,
-                            bool ptr_on_device) {
-    if (ptr_on_device) {
-      KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(dest, src, size, 0, 0,
-                                                   omp_get_default_device(),
-                                                   omp_get_initial_device()));
-    } else {
-      *dest = *src;
-    }
+  template <class ValueType>
+  KOKKOS_INLINE_FUNCTION void team_broadcast(ValueType& value,
+                                             int thread_id) const {
+    // Make sure there is enough scratch space:
+    using type = std::conditional_t<(sizeof(ValueType) < TEAM_REDUCE_SIZE),
+                                    ValueType, void>;
+    type* team_scratch =
+        reinterpret_cast<type*>(static_cast<char*>(m_glb_scratch) +
+                                TEAM_REDUCE_SIZE * omp_get_team_num());
+#pragma omp barrier
+    if (team_rank() == thread_id) *team_scratch = value;
+#pragma omp barrier
+    value = *team_scratch;
   }
-};
 
-template <class FunctorType, class PolicyType, class ReducerType,
-          class PointerType, class ValueType>
-struct ParallelReduceSpecialize {
-  inline static void execute(const FunctorType& /*f*/, const PolicyType& /*p*/,
-                             PointerType /*result_ptr*/) {
-    constexpr int FunctorHasJoin =
-        Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, PolicyType,
-                              FunctorType>::has_join_member_function;
-    constexpr int UseReducerType = is_reducer<ReducerType>::value;
-
-    std::stringstream error_message;
-    error_message << "Error: Invalid Specialization " << FunctorHasJoin << ' '
-                  << UseReducerType << '\n';
-    // FIXME_OPENMPTARGET
-    OpenMPTarget_abort(error_message.str().c_str());
+  template <class Closure, class ValueType>
+  KOKKOS_INLINE_FUNCTION void team_broadcast(const Closure& f, ValueType& value,
+                                             const int& thread_id) const {
+    f(value);
+    team_broadcast(value, thread_id);
   }
-};
 
-template <class FunctorType, class ReducerType, class PointerType,
-          class ValueType, class... PolicyArgs>
-struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>,
-                                ReducerType, PointerType, ValueType> {
-  using PolicyType = Kokkos::RangePolicy<PolicyArgs...>;
-  using TagType    = typename PolicyType::work_tag;
-  using ReducerTypeFwd =
-      std::conditional_t<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using Analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
-                                         PolicyType, ReducerTypeFwd>;
-  using ReferenceType = typename Analysis::reference_type;
-
-  using ParReduceCommon = ParallelReduceCommon<PointerType>;
-
-  static void execute_reducer(const FunctorType& f, const PolicyType& p,
-                              PointerType result_ptr, bool ptr_on_device) {
-    OpenMPTargetExec::verify_is_process(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    OpenMPTargetExec::verify_initialized(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    const auto begin = p.begin();
-    const auto end   = p.end();
-
-    ValueType result;
-    OpenMPTargetReducerWrapper<ReducerType>::init(result);
-
-    // Initialize and copy back the result even if it is a zero length
-    // reduction.
-    if (end <= begin) {
-      ParReduceCommon::memcpy_result(result_ptr, &result, sizeof(ValueType),
-                                     ptr_on_device);
-      return;
-    }
+  // FIXME_OPENMPTARGET this function has the wrong interface and currently
+  // ignores the reducer passed.
+  template <class ValueType, class JoinOp>
+  KOKKOS_INLINE_FUNCTION ValueType team_reduce(const ValueType& value,
+                                               const JoinOp&) const {
+#pragma omp barrier
 
-#pragma omp declare reduction(                                         \
-    custom:ValueType                                                   \
-    : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
-    initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
-
-#pragma omp target teams distribute parallel for map(to                    \
-                                                     : f) reduction(custom \
-                                                                    : result)
-    for (auto i = begin; i < end; ++i) {
-      if constexpr (std::is_void<TagType>::value) {
-        f(i, result);
-      } else {
-        f(TagType(), i, result);
-      }
+    using value_type = ValueType;
+    //    const JoinLambdaAdapter<value_type, JoinOp> op(op_in);
+
+    // Make sure there is enough scratch space:
+    using type = std::conditional_t<(sizeof(value_type) < TEAM_REDUCE_SIZE),
+                                    value_type, void>;
+
+    const int n_values = TEAM_REDUCE_SIZE / sizeof(value_type);
+    type* team_scratch =
+        reinterpret_cast<type*>(static_cast<char*>(m_glb_scratch) +
+                                TEAM_REDUCE_SIZE * omp_get_team_num());
+    for (int i = m_team_rank; i < n_values; i += m_team_size) {
+      team_scratch[i] = value_type();
     }
 
-    ParReduceCommon::memcpy_result(result_ptr, &result, sizeof(ValueType),
-                                   ptr_on_device);
-  }
+#pragma omp barrier
 
-  template <class TagType, int NumReductions>
-  static void execute_array(const FunctorType& f, const PolicyType& p,
-                            PointerType result_ptr, bool ptr_on_device) {
-    OpenMPTargetExec::verify_is_process(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    OpenMPTargetExec::verify_initialized(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    const auto begin = p.begin();
-    const auto end   = p.end();
-
-    // Enter the loop if the reduction is on a scalar type.
-    if constexpr (NumReductions == 1) {
-      ValueType result = ValueType();
-
-      // Initialize and copy back the result even if it is a zero length
-      // reduction.
-      if (end <= begin) {
-        ParReduceCommon::memcpy_result(result_ptr, &result, sizeof(ValueType),
-                                       ptr_on_device);
-        return;
-      }
-      // Case where reduction is on a native data type.
-      if constexpr (std::is_arithmetic<ValueType>::value) {
-#pragma omp target teams distribute parallel for \
-         map(to:f) reduction(+: result)
-        for (auto i = begin; i < end; ++i)
-
-          if constexpr (std::is_void<TagType>::value) {
-            f(i, result);
-          } else {
-            f(TagType(), i, result);
-          }
-      } else {
-#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
-#pragma omp target teams distribute parallel for map(to                    \
-                                                     : f) reduction(custom \
-                                                                    : result)
-        for (auto i = begin; i < end; ++i)
-
-          if constexpr (std::is_void<TagType>::value) {
-            f(i, result);
-          } else {
-            f(TagType(), i, result);
-          }
-      }
+    for (int k = 0; k < m_team_size; k += n_values) {
+      if ((k <= m_team_rank) && (k + n_values > m_team_rank))
+        team_scratch[m_team_rank % n_values] += value;
+#pragma omp barrier
+    }
 
-      ParReduceCommon::memcpy_result(result_ptr, &result, sizeof(ValueType),
-                                     ptr_on_device);
-    } else {
-      ValueType result[NumReductions] = {};
-
-      // Initialize and copy back the result even if it is a zero length
-      // reduction.
-      if (end <= begin) {
-        ParReduceCommon::memcpy_result(result_ptr, result,
-                                       NumReductions * sizeof(ValueType),
-                                       ptr_on_device);
-        return;
+    for (int d = 1; d < n_values; d *= 2) {
+      if ((m_team_rank + d < n_values) && (m_team_rank % (2 * d) == 0)) {
+        team_scratch[m_team_rank] += team_scratch[m_team_rank + d];
       }
-#pragma omp target teams distribute parallel for map(to:f) reduction(+:result[:NumReductions])
-      for (auto i = begin; i < end; ++i) {
-        if constexpr (std::is_void<TagType>::value) {
-          f(i, result);
-        } else {
-          f(TagType(), i, result);
-        }
-      }
-
-      ParReduceCommon::memcpy_result(
-          result_ptr, result, NumReductions * sizeof(ValueType), ptr_on_device);
+#pragma omp barrier
     }
+    return team_scratch[0];
   }
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
+   *          with intra-team non-deterministic ordering accumulation.
+   *
+   *  The global inter-team accumulation value will, at the end of the
+   *  league's parallel execution, be the scan's total.
+   *  Parallel execution ordering of the league's teams is non-deterministic.
+   *  As such the base value for each team's scan operation is similarly
+   *  non-deterministic.
+   */
+  template <typename ArgType>
+  KOKKOS_INLINE_FUNCTION ArgType
+  team_scan(const ArgType& /*value*/, ArgType* const /*global_accum*/) const {
+    // FIXME_OPENMPTARGET
+    /*  // Make sure there is enough scratch space:
+      using type =
+        std::conditional_t<(sizeof(ArgType) < TEAM_REDUCE_SIZE), ArgType, void>;
 
-  static void execute_init_join(const FunctorType& f, const PolicyType& p,
-                                PointerType ptr, const bool ptr_on_device) {
-    const auto begin = p.begin();
-    const auto end   = p.end();
-
-    using FunctorAnalysis =
-        Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, PolicyType,
-                              FunctorType>;
-    constexpr int HasInit = FunctorAnalysis::has_init_member_function;
-
-    // Initialize the result pointer.
-
-    const auto size = end - begin;
-
-    // FIXME_OPENMPTARGET: The team size and MAX_ACTIVE_THREADS are currently
-    // based on NVIDIA-V100 and should be modifid to be based on the
-    // architecture in the future.
-    const int max_team_threads = 32;
-    const int max_teams =
-        OpenMPTargetExec::MAX_ACTIVE_THREADS / max_team_threads;
-    // Number of elements in the reduction
-    const auto value_count = FunctorAnalysis::value_count(f);
-
-    // Allocate scratch per active thread. Achieved by setting the first
-    // parameter of `resize_scratch=1`.
-    OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType),
-                                     std::numeric_limits<int64_t>::max());
-    ValueType* scratch_ptr =
-        static_cast<ValueType*>(OpenMPTargetExec::get_scratch_ptr());
-
-#pragma omp target map(to : f) is_device_ptr(scratch_ptr)
-    {
-      typename FunctorAnalysis::Reducer final_reducer(&f);
-      // Enter this loop if the functor has an `init`
-      if constexpr (HasInit) {
-        // The `init` routine needs to be called on the device since it might
-        // need device members.
-        final_reducer.init(scratch_ptr);
-        final_reducer.final(scratch_ptr);
-      } else {
-        for (int i = 0; i < value_count; ++i) {
-          static_cast<ValueType*>(scratch_ptr)[i] = ValueType();
-        }
+      volatile type * const work_value  = ((type*) m_exec.scratch_thread());
 
-        final_reducer.final(scratch_ptr);
-      }
-    }
+      *work_value = value ;
 
-    if (end <= begin) {
-      // If there is no work to be done, copy back the initialized values and
-      // exit.
-      if (!ptr_on_device)
-        KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(
-            ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
-            omp_get_initial_device(), omp_get_default_device()));
-      else
-        KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(
-            ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
-            omp_get_default_device(), omp_get_default_device()));
-
-      return;
-    }
+      memory_fence();
 
-#pragma omp target teams num_teams(max_teams) thread_limit(max_team_threads) \
-    map(to                                                                   \
-        : f) is_device_ptr(scratch_ptr)
-    {
-      typename FunctorAnalysis::Reducer final_reducer(&f);
-#pragma omp parallel
-      {
-        const int team_num    = omp_get_team_num();
-        const int num_teams   = omp_get_num_teams();
-        const auto chunk_size = size / num_teams;
-        const auto team_begin = begin + team_num * chunk_size;
-        const auto team_end =
-            (team_num == num_teams - 1) ? end : (team_begin + chunk_size);
-        ValueType* team_scratch =
-            scratch_ptr + team_num * max_team_threads * value_count;
-        ReferenceType result = final_reducer.init(
-            &team_scratch[omp_get_thread_num() * value_count]);
-
-        // Accumulate partial results in thread specific storage.
-#pragma omp for simd
-        for (auto i = team_begin; i < team_end; ++i) {
-          if constexpr (std::is_void<TagType>::value) {
-            f(i, result);
-          } else {
-            f(TagType(), i, result);
+      if ( team_fan_in() ) {
+        // The last thread to synchronize returns true, all other threads wait
+      for team_fan_out()
+        // m_team_base[0]                 == highest ranking team member
+        // m_team_base[ m_team_size - 1 ] == lowest ranking team member
+        //
+        // 1) copy from lower to higher rank, initialize lowest rank to zero
+        // 2) prefix sum from lowest to highest rank, skipping lowest rank
+
+        type accum = 0 ;
+
+        if ( global_accum ) {
+          for ( int i = m_team_size ; i-- ; ) {
+            type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i
+      )->scratch_thread()); accum += val ;
           }
+          accum = atomic_fetch_add( global_accum , accum );
         }
 
-        // Reduce all paritial results within a team.
-        const int team_size      = max_team_threads;
-        int tree_neighbor_offset = 1;
-        do {
-#pragma omp for simd
-          for (int i = 0; i < team_size - tree_neighbor_offset;
-               i += 2 * tree_neighbor_offset) {
-            const int neighbor = i + tree_neighbor_offset;
-            final_reducer.join(&team_scratch[i * value_count],
-                               &team_scratch[neighbor * value_count]);
-          }
-          tree_neighbor_offset *= 2;
-        } while (tree_neighbor_offset < team_size);
-      }  // end parallel
-    }    // end target
-
-    int tree_neighbor_offset = 1;
-    do {
-#pragma omp target teams distribute parallel for simd map(to   \
-                                                          : f) \
-    is_device_ptr(scratch_ptr)
-      for (int i = 0; i < max_teams - tree_neighbor_offset;
-           i += 2 * tree_neighbor_offset) {
-        typename FunctorAnalysis::Reducer final_reducer(&f);
-        ValueType* team_scratch = scratch_ptr;
-        const int team_offset   = max_team_threads * value_count;
-        final_reducer.join(
-            &team_scratch[i * team_offset],
-            &team_scratch[(i + tree_neighbor_offset) * team_offset]);
-
-        // If `final` is provided by the functor.
-        // Do the final only once at the end.
-        if (tree_neighbor_offset * 2 >= max_teams && omp_get_team_num() == 0 &&
-            omp_get_thread_num() == 0) {
-          final_reducer.final(scratch_ptr);
+        for ( int i = m_team_size ; i-- ; ) {
+          type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i
+      )->scratch_thread()); const type offset = accum ; accum += val ; val =
+      offset ;
         }
+
+        memory_fence();
       }
-      tree_neighbor_offset *= 2;
-    } while (tree_neighbor_offset < max_teams);
-
-    // If the result view is on the host, copy back the values via memcpy.
-    if (!ptr_on_device)
-      KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(
-          ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
-          omp_get_initial_device(), omp_get_default_device()));
-    else
-      KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(
-          ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
-          omp_get_default_device(), omp_get_default_device()));
-  }
-};
 
-template <class FunctorType, class ReducerType, class... Traits>
-class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
-                     Kokkos::Experimental::OpenMPTarget> {
- private:
-  using Policy = Kokkos::RangePolicy<Traits...>;
-
-  using WorkTag   = typename Policy::work_tag;
-  using WorkRange = typename Policy::WorkRange;
-
-  using ReducerTypeFwd =
-      std::conditional_t<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using Analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
-                                         Policy, ReducerTypeFwd>;
-
-  using pointer_type   = typename Analysis::pointer_type;
-  using reference_type = typename Analysis::reference_type;
-
-  static constexpr int HasJoin =
-      Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, Policy,
-                            FunctorType>::has_join_member_function;
-  static constexpr int UseReducer = is_reducer<ReducerType>::value;
-  static constexpr int IsArray    = std::is_pointer<reference_type>::value;
-
-  using ParReduceSpecialize =
-      ParallelReduceSpecialize<FunctorType, Policy, ReducerType, pointer_type,
-                               typename Analysis::value_type>;
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-  const ReducerType m_reducer;
-  const pointer_type m_result_ptr;
-  bool m_result_ptr_on_device;
-  const int m_result_ptr_num_elems;
-  using TagType = typename Policy::work_tag;
+      team_fan_out();
 
- public:
-  void execute() const {
-    if constexpr (HasJoin) {
-      // Enter this loop if the Functor has a init-join.
-      ParReduceSpecialize::execute_init_join(m_functor, m_policy, m_result_ptr,
-                                             m_result_ptr_on_device);
-    } else if constexpr (UseReducer) {
-      // Enter this loop if the Functor is a reducer type.
-      ParReduceSpecialize::execute_reducer(m_functor, m_policy, m_result_ptr,
-                                           m_result_ptr_on_device);
-    } else if constexpr (IsArray) {
-      // Enter this loop if the reduction is on an array and the routine is
-      // templated over the size of the array.
-      if (m_result_ptr_num_elems <= 2) {
-        ParReduceSpecialize::template execute_array<TagType, 2>(
-            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
-      } else if (m_result_ptr_num_elems <= 4) {
-        ParReduceSpecialize::template execute_array<TagType, 4>(
-            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
-      } else if (m_result_ptr_num_elems <= 8) {
-        ParReduceSpecialize::template execute_array<TagType, 8>(
-            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
-      } else if (m_result_ptr_num_elems <= 16) {
-        ParReduceSpecialize::template execute_array<TagType, 16>(
-            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
-      } else if (m_result_ptr_num_elems <= 32) {
-        ParReduceSpecialize::template execute_array<TagType, 32>(
-            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
-      } else {
-        Kokkos::abort("array reduction length must be <= 32");
-      }
-    } else {
-      // This loop handles the basic scalar reduction.
-      ParReduceSpecialize::template execute_array<TagType, 1>(
-          m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
-    }
+      return *work_value ;*/
+    return ArgType();
   }
 
-  template <class ViewType>
-  ParallelReduce(const FunctorType& arg_functor, Policy& arg_policy,
-                 const ViewType& arg_result_view,
-                 std::enable_if_t<Kokkos::is_view<ViewType>::value &&
-                                      !Kokkos::is_reducer<ReducerType>::value,
-                                  void*> = nullptr)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(InvalidType()),
-        m_result_ptr(arg_result_view.data()),
-        m_result_ptr_on_device(
-            MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
-                              typename ViewType::memory_space>::accessible),
-        m_result_ptr_num_elems(arg_result_view.size()) {}
-
-  ParallelReduce(const FunctorType& arg_functor, Policy& arg_policy,
-                 const ReducerType& reducer)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()),
-        m_result_ptr_on_device(
-            MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
-                              typename ReducerType::result_view_type::
-                                  memory_space>::accessible),
-        m_result_ptr_num_elems(reducer.view().size()) {}
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
+   *
+   *  The highest rank thread can compute the reduction total as
+   *    reduction_total = dev.team_scan( value ) + value ;
+   */
+  template <typename Type>
+  KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value) const {
+    return this->template team_scan<Type>(value, 0);
+  }
 
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
+  //----------------------------------------
+  // Private for the driver
 
-namespace Kokkos {
-namespace Impl {
+ private:
+  using space = execution_space::scratch_memory_space;
 
-template <class FunctorType, class... Traits>
-class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
-                   Kokkos::Experimental::OpenMPTarget> {
- protected:
-  using Policy = Kokkos::RangePolicy<Traits...>;
+ public:
+  // FIXME_OPENMPTARGET - 512(16*32) bytes at the begining of the scratch space
+  // for each league is saved for reduction. It should actually be based on the
+  // ValueType of the reduction variable.
+  inline OpenMPTargetExecTeamMember(
+      const int league_rank, const int league_size, const int team_size,
+      const int vector_length  // const TeamPolicyInternal< OpenMPTarget,
+                               // Properties ...> & team
+      ,
+      void* const glb_scratch, const int shmem_block_index,
+      const size_t shmem_size_L0, const size_t shmem_size_L1)
+      : m_team_scratch_size{shmem_size_L0, shmem_size_L1},
+        m_team_rank(0),
+        m_team_size(team_size),
+        m_league_rank(league_rank),
+        m_league_size(league_size),
+        m_vector_length(vector_length),
+        m_shmem_block_index(shmem_block_index),
+        m_glb_scratch(glb_scratch) {
+    const int omp_tid = omp_get_thread_num();
+
+    // The scratch memory allocated is a sum of TEAM_REDUCE_SIZE, L0 shmem size
+    // and L1 shmem size. TEAM_REDUCE_SIZE = 512 bytes saved per team for
+    // hierarchical reduction. There is an additional 10% of the requested
+    // scratch memory allocated per team as padding. Hence the product with 0.1.
+    const int reduce_offset =
+        m_shmem_block_index *
+        (shmem_size_L0 + shmem_size_L1 +
+         ((shmem_size_L0 + shmem_size_L1) * 0.1) + TEAM_REDUCE_SIZE);
+    const int l0_offset = reduce_offset + TEAM_REDUCE_SIZE;
+    const int l1_offset = l0_offset + shmem_size_L0;
+    m_team_shared       = scratch_memory_space(
+        (static_cast<char*>(glb_scratch) + l0_offset), shmem_size_L0,
+        static_cast<char*>(glb_scratch) + l1_offset, shmem_size_L1);
+    m_reduce_scratch = static_cast<char*>(glb_scratch) + reduce_offset;
+    m_league_rank    = league_rank;
+    m_team_rank      = omp_tid;
+    m_vector_lane    = 0;
+  }
 
-  using WorkTag   = typename Policy::work_tag;
-  using WorkRange = typename Policy::WorkRange;
-  using Member    = typename Policy::member_type;
-  using idx_type  = typename Policy::index_type;
+  static inline int team_reduce_size() { return TEAM_REDUCE_SIZE; }
+};
 
-  using Analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
-                                         Policy, FunctorType>;
+template <class... Properties>
+class TeamPolicyInternal<Kokkos::Experimental::OpenMPTarget, Properties...>
+    : public PolicyTraits<Properties...> {
+ public:
+  //! Tag this class as a kokkos execution policy
+  using execution_policy = TeamPolicyInternal;
 
-  using value_type     = typename Analysis::value_type;
-  using pointer_type   = typename Analysis::pointer_type;
-  using reference_type = typename Analysis::reference_type;
+  using traits = PolicyTraits<Properties...>;
 
-  const FunctorType m_functor;
-  const Policy m_policy;
+  //----------------------------------------
 
-  value_type* m_result_ptr;
-  const bool m_result_ptr_device_accessible;
+  template <class FunctorType>
+  inline static int team_size_max(const FunctorType&, const ParallelForTag&) {
+    return 256;
+  }
 
-  template <class TagType>
-  std::enable_if_t<std::is_void<TagType>::value> call_with_tag(
-      const FunctorType& f, const idx_type& idx, value_type& val,
-      const bool& is_final) const {
-    f(idx, val, is_final);
+  template <class FunctorType>
+  inline static int team_size_max(const FunctorType&,
+                                  const ParallelReduceTag&) {
+    return 256;
   }
-  template <class TagType>
-  std::enable_if_t<!std::is_void<TagType>::value> call_with_tag(
-      const FunctorType& f, const idx_type& idx, value_type& val,
-      const bool& is_final) const {
-    f(WorkTag(), idx, val, is_final);
+
+  template <class FunctorType, class ReducerType>
+  inline static int team_size_max(const FunctorType&, const ReducerType&,
+                                  const ParallelReduceTag&) {
+    return 256;
   }
 
- public:
-  void impl_execute(
-      Kokkos::View<value_type**, Kokkos::LayoutRight,
-                   Kokkos::Experimental::OpenMPTargetSpace>
-          element_values,
-      Kokkos::View<value_type*, Kokkos::Experimental::OpenMPTargetSpace>
-          chunk_values,
-      Kokkos::View<int64_t, Kokkos::Experimental::OpenMPTargetSpace> count)
-      const {
-    const idx_type N          = m_policy.end() - m_policy.begin();
-    const idx_type chunk_size = 128;
-    const idx_type n_chunks   = (N + chunk_size - 1) / chunk_size;
-    idx_type nteams           = n_chunks > 512 ? 512 : n_chunks;
-    idx_type team_size        = 128;
-
-    FunctorType a_functor(m_functor);
-#pragma omp target teams distribute map(to                             \
-                                        : a_functor) num_teams(nteams) \
-    thread_limit(team_size)
-    for (idx_type team_id = 0; team_id < n_chunks; ++team_id) {
-      typename Analysis::Reducer final_reducer(&a_functor);
-#pragma omp parallel num_threads(team_size)
-      {
-        const idx_type local_offset = team_id * chunk_size;
-
-#pragma omp for
-        for (idx_type i = 0; i < chunk_size; ++i) {
-          const idx_type idx = local_offset + i;
-          value_type val;
-          final_reducer.init(&val);
-          if (idx < N) call_with_tag<WorkTag>(a_functor, idx, val, false);
-          element_values(team_id, i) = val;
-        }
-#pragma omp barrier
-        if (omp_get_thread_num() == 0) {
-          value_type sum;
-          final_reducer.init(&sum);
-          for (idx_type i = 0; i < chunk_size; ++i) {
-            final_reducer.join(&sum, &element_values(team_id, i));
-            element_values(team_id, i) = sum;
-          }
-          chunk_values(team_id) = sum;
-        }
-#pragma omp barrier
-        if (omp_get_thread_num() == 0) {
-          if (Kokkos::atomic_fetch_add(&count(), 1) == n_chunks - 1) {
-            value_type sum;
-            final_reducer.init(&sum);
-            for (idx_type i = 0; i < n_chunks; ++i) {
-              final_reducer.join(&sum, &chunk_values(i));
-              chunk_values(i) = sum;
-            }
-          }
-        }
-      }
-    }
+  template <class FunctorType>
+  inline static int team_size_recommended(const FunctorType&,
+                                          const ParallelForTag&) {
+    return 128;
+  }
 
-#pragma omp target teams distribute map(to                             \
-                                        : a_functor) num_teams(nteams) \
-    thread_limit(team_size)
-    for (idx_type team_id = 0; team_id < n_chunks; ++team_id) {
-      typename Analysis::Reducer final_reducer(&a_functor);
-#pragma omp parallel num_threads(team_size)
-      {
-        const idx_type local_offset = team_id * chunk_size;
-        value_type offset_value;
-        if (team_id > 0)
-          offset_value = chunk_values(team_id - 1);
-        else
-          final_reducer.init(&offset_value);
-
-#pragma omp for
-        for (idx_type i = 0; i < chunk_size; ++i) {
-          const idx_type idx = local_offset + i;
-          value_type local_offset_value;
-          if (i > 0) {
-            local_offset_value = element_values(team_id, i - 1);
-            // FIXME_OPENMPTARGET We seem to access memory illegaly on AMD GPUs
-#ifdef KOKKOS_ARCH_VEGA
-            if constexpr (Analysis::has_join_member_function) {
-              if constexpr (std::is_void_v<WorkTag>)
-                a_functor.join(local_offset_value, offset_value);
-              else
-                a_functor.join(WorkTag{}, local_offset_value, offset_value);
-            } else
-              local_offset_value += offset_value;
-#else
-            final_reducer.join(&local_offset_value, &offset_value);
-#endif
-          } else
-            local_offset_value = offset_value;
-          if (idx < N)
-            call_with_tag<WorkTag>(a_functor, idx, local_offset_value, true);
-          if (idx == N - 1 && m_result_ptr_device_accessible)
-            *m_result_ptr = local_offset_value;
-        }
-      }
-    }
+  template <class FunctorType>
+  inline static int team_size_recommended(const FunctorType&,
+                                          const ParallelReduceTag&) {
+    return 128;
   }
 
-  void execute() const {
-    OpenMPTargetExec::verify_is_process(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    OpenMPTargetExec::verify_initialized(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    const idx_type N          = m_policy.end() - m_policy.begin();
-    const idx_type chunk_size = 128;
-    const idx_type n_chunks   = (N + chunk_size - 1) / chunk_size;
-
-    // This could be scratch memory per team
-    Kokkos::View<value_type**, Kokkos::LayoutRight,
-                 Kokkos::Experimental::OpenMPTargetSpace>
-        element_values("element_values", n_chunks, chunk_size);
-    Kokkos::View<value_type*, Kokkos::Experimental::OpenMPTargetSpace>
-        chunk_values("chunk_values", n_chunks);
-    Kokkos::View<int64_t, Kokkos::Experimental::OpenMPTargetSpace> count(
-        "Count");
-
-    impl_execute(element_values, chunk_values, count);
+  template <class FunctorType, class ReducerType>
+  inline static int team_size_recommended(const FunctorType&,
+                                          const ReducerType&,
+                                          const ParallelReduceTag&) {
+    return 128;
   }
 
   //----------------------------------------
 
-  ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy,
-               pointer_type arg_result_ptr           = nullptr,
-               bool arg_result_ptr_device_accessible = false)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_result_ptr(arg_result_ptr),
-        m_result_ptr_device_accessible(arg_result_ptr_device_accessible) {}
-
-  //----------------------------------------
-};
+ private:
+  int m_league_size;
+  int m_team_size;
+  int m_vector_length;
+  int m_team_alloc;
+  int m_team_iter;
+  std::array<size_t, 2> m_team_scratch_size;
+  std::array<size_t, 2> m_thread_scratch_size;
+  bool m_tune_team_size;
+  bool m_tune_vector_length;
+  constexpr const static size_t default_team_size = 256;
+  int m_chunk_size;
+
+  inline void init(const int league_size_request, const int team_size_request,
+                   const int vector_length_request) {
+    m_league_size = league_size_request;
+
+    // Minimum team size should be 32 for OpenMPTarget backend.
+    if (team_size_request < 32) {
+      Kokkos::Impl::OpenMPTarget_abort(
+          "OpenMPTarget backend requires a minimum of 32 threads per team.\n");
+    } else
+      m_team_size = team_size_request;
+
+    m_vector_length = vector_length_request;
+    set_auto_chunk_size();
+  }
 
-template <class FunctorType, class ReturnType, class... Traits>
-class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
-                            ReturnType, Kokkos::Experimental::OpenMPTarget>
-    : public ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
-                          Kokkos::Experimental::OpenMPTarget> {
-  using base_t     = ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
-                              Kokkos::Experimental::OpenMPTarget>;
-  using value_type = typename base_t::value_type;
+  template <typename ExecSpace, typename... OtherProperties>
+  friend class TeamPolicyInternal;
 
  public:
-  void execute() const {
-    OpenMPTargetExec::verify_is_process(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    OpenMPTargetExec::verify_initialized(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    const int64_t N        = base_t::m_policy.end() - base_t::m_policy.begin();
-    const int chunk_size   = 128;
-    const int64_t n_chunks = (N + chunk_size - 1) / chunk_size;
-
-    if (N > 0) {
-      // This could be scratch memory per team
-      Kokkos::View<value_type**, Kokkos::LayoutRight,
-                   Kokkos::Experimental::OpenMPTargetSpace>
-          element_values("element_values", n_chunks, chunk_size);
-      Kokkos::View<value_type*, Kokkos::Experimental::OpenMPTargetSpace>
-          chunk_values("chunk_values", n_chunks);
-      Kokkos::View<int64_t, Kokkos::Experimental::OpenMPTargetSpace> count(
-          "Count");
-
-      base_t::impl_execute(element_values, chunk_values, count);
-
-      if (!base_t::m_result_ptr_device_accessible) {
-        const int size = base_t::Analysis::value_size(base_t::m_functor);
-        DeepCopy<HostSpace, Kokkos::Experimental::OpenMPTargetSpace>(
-            base_t::m_result_ptr, chunk_values.data() + (n_chunks - 1), size);
-      }
-    } else if (!base_t::m_result_ptr_device_accessible) {
-      *base_t::m_result_ptr = 0;
-    }
+  // FIXME_OPENMPTARGET : Currently this routine is a copy of the Cuda
+  // implementation, but this has to be tailored to be architecture specific.
+  inline static int scratch_size_max(int level) {
+    return (
+        level == 0 ? 1024 * 40 :  // 48kB is the max for CUDA, but we need some
+                                  // for team_member.reduce etc.
+            20 * 1024 *
+                1024);  // arbitrarily setting this to 20MB, for a Volta V100
+                        // that would give us about 3.2GB for 2 teams per SM
   }
-
-  template <class ViewType>
-  ParallelScanWithTotal(const FunctorType& arg_functor,
-                        const typename base_t::Policy& arg_policy,
-                        const ViewType& arg_result_view)
-      : base_t(arg_functor, arg_policy, arg_result_view.data(),
-               MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
-                                 typename ViewType::memory_space>::accessible) {
+  inline bool impl_auto_team_size() const { return m_tune_team_size; }
+  inline bool impl_auto_vector_length() const { return m_tune_vector_length; }
+  inline void impl_set_team_size(const size_t size) { m_team_size = size; }
+  inline void impl_set_vector_length(const size_t length) {
+    m_tune_vector_length = length;
+  }
+  inline int impl_vector_length() const { return m_vector_length; }
+  inline int team_size() const { return m_team_size; }
+  inline int league_size() const { return m_league_size; }
+  inline size_t scratch_size(const int& level, int team_size_ = -1) const {
+    if (team_size_ < 0) team_size_ = m_team_size;
+    return m_team_scratch_size[level] +
+           team_size_ * m_thread_scratch_size[level];
   }
-};
-}  // namespace Impl
-}  // namespace Kokkos
 
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
+  inline Kokkos::Experimental::OpenMPTarget space() const {
+    return Kokkos::Experimental::OpenMPTarget();
+  }
 
-namespace Kokkos {
-namespace Impl {
+  template <class... OtherProperties>
+  TeamPolicyInternal(const TeamPolicyInternal<OtherProperties...>& p)
+      : m_league_size(p.m_league_size),
+        m_team_size(p.m_team_size),
+        m_vector_length(p.m_vector_length),
+        m_team_alloc(p.m_team_alloc),
+        m_team_iter(p.m_team_iter),
+        m_team_scratch_size(p.m_team_scratch_size),
+        m_thread_scratch_size(p.m_thread_scratch_size),
+        m_tune_team_size(p.m_tune_team_size),
+        m_tune_vector_length(p.m_tune_vector_length),
+        m_chunk_size(p.m_chunk_size) {}
+
+  /** \brief  Specify league size, request team size */
+  TeamPolicyInternal(const typename traits::execution_space&,
+                     int league_size_request, int team_size_request,
+                     int vector_length_request = 1)
+      : m_team_scratch_size{0, 0},
+        m_thread_scratch_size{0, 0},
+        m_tune_team_size(false),
+        m_tune_vector_length(false),
+        m_chunk_size(0) {
+    init(league_size_request, team_size_request, vector_length_request);
+  }
 
-template <class FunctorType, class... Properties>
-class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
-                  Kokkos::Experimental::OpenMPTarget> {
- private:
-  using Policy =
-      Kokkos::Impl::TeamPolicyInternal<Kokkos::Experimental::OpenMPTarget,
-                                       Properties...>;
-  using WorkTag = typename Policy::work_tag;
-  using Member  = typename Policy::member_type;
+  TeamPolicyInternal(const typename traits::execution_space&,
+                     int league_size_request,
+                     const Kokkos::AUTO_t& /* team_size_request */
+                     ,
+                     int vector_length_request = 1)
+      : m_team_scratch_size{0, 0},
+        m_thread_scratch_size{0, 0},
+        m_tune_team_size(true),
+        m_tune_vector_length(false),
+        m_chunk_size(0) {
+    init(league_size_request, default_team_size / vector_length_request,
+         vector_length_request);
+  }
 
-  const FunctorType m_functor;
-  const Policy m_policy;
-  const size_t m_shmem_size;
+  TeamPolicyInternal(const typename traits::execution_space&,
+                     int league_size_request,
+                     const Kokkos::AUTO_t& /* team_size_request */
+                     ,
+                     const Kokkos::AUTO_t& /* vector_length_request */)
+      : m_team_scratch_size{0, 0},
+        m_thread_scratch_size{0, 0},
+        m_tune_team_size(true),
+        m_tune_vector_length(true),
+        m_chunk_size(0) {
+    init(league_size_request, default_team_size, 1);
+  }
+  TeamPolicyInternal(const typename traits::execution_space&,
+                     int league_size_request, int team_size_request,
+                     const Kokkos::AUTO_t& /* vector_length_request */)
+      : m_team_scratch_size{0, 0},
+        m_thread_scratch_size{0, 0},
+        m_tune_team_size(false),
+        m_tune_vector_length(true),
+        m_chunk_size(0) {
+    init(league_size_request, team_size_request, 1);
+  }
 
- public:
-  void execute() const {
-    OpenMPTargetExec::verify_is_process(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    OpenMPTargetExec::verify_initialized(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    execute_impl<WorkTag>();
+  TeamPolicyInternal(int league_size_request, int team_size_request,
+                     int vector_length_request = 1)
+      : m_team_scratch_size{0, 0},
+        m_thread_scratch_size{0, 0},
+        m_tune_team_size(false),
+        m_tune_vector_length(false),
+        m_chunk_size(0) {
+    init(league_size_request, team_size_request, vector_length_request);
   }
 
- private:
-  template <class TagType>
-  void execute_impl() const {
-    OpenMPTargetExec::verify_is_process(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    OpenMPTargetExec::verify_initialized(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    const auto league_size   = m_policy.league_size();
-    const auto team_size     = m_policy.team_size();
-    const auto vector_length = m_policy.impl_vector_length();
-
-    const size_t shmem_size_L0 = m_policy.scratch_size(0, team_size);
-    const size_t shmem_size_L1 = m_policy.scratch_size(1, team_size);
-    OpenMPTargetExec::resize_scratch(team_size, shmem_size_L0, shmem_size_L1,
-                                     league_size);
-
-    void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
-    FunctorType a_functor(m_functor);
-
-    // FIXME_OPENMPTARGET - If the team_size is not a multiple of 32, the
-    // scratch implementation does not work in the Release or RelWithDebugInfo
-    // mode but works in the Debug mode.
-
-    // Maximum active teams possible.
-    int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size;
-    // nteams should not exceed the maximum in-flight teams possible.
-    const auto nteams =
-        league_size < max_active_teams ? league_size : max_active_teams;
-
-    // If the league size is <=0, do not launch the kernel.
-    if (nteams <= 0) return;
-
-// Performing our own scheduling of teams to avoid separation of code between
-// teams-distribute and parallel. Gave a 2x performance boost in test cases with
-// the clang compiler. atomic_compare_exchange can be avoided since the standard
-// guarantees that the number of teams specified in the `num_teams` clause is
-// always less than or equal to the maximum concurrently running teams.
-#pragma omp target teams num_teams(nteams) thread_limit(team_size) \
-    map(to                                                         \
-        : a_functor) is_device_ptr(scratch_ptr)
-#pragma omp parallel
-    {
-      const int blockIdx = omp_get_team_num();
-      const int gridDim  = omp_get_num_teams();
-
-      // Iterate through the number of teams until league_size and assign the
-      // league_id accordingly
-      // Guarantee that the compilers respect the `num_teams` clause
-      if (gridDim <= nteams) {
-        for (int league_id = blockIdx; league_id < league_size;
-             league_id += gridDim) {
-          typename Policy::member_type team(
-              league_id, league_size, team_size, vector_length, scratch_ptr,
-              blockIdx, shmem_size_L0, shmem_size_L1);
-          if constexpr (std::is_void<TagType>::value)
-            m_functor(team);
-          else
-            m_functor(TagType(), team);
-        }
-      } else
-        Kokkos::abort("`num_teams` clause was not respected.\n");
-    }
+  TeamPolicyInternal(int league_size_request,
+                     const Kokkos::AUTO_t& /* team_size_request */
+                     ,
+                     int vector_length_request = 1)
+      : m_team_scratch_size{0, 0},
+        m_thread_scratch_size{0, 0},
+        m_tune_team_size(true),
+        m_tune_vector_length(false),
+        m_chunk_size(0) {
+    init(league_size_request, default_team_size / vector_length_request,
+         vector_length_request);
   }
 
- public:
-  ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
-                     FunctorTeamShmemSize<FunctorType>::value(
-                         arg_functor, arg_policy.team_size())) {}
-};
+  TeamPolicyInternal(int league_size_request,
+                     const Kokkos::AUTO_t& /* team_size_request */
+                     ,
+                     const Kokkos::AUTO_t& /* vector_length_request */)
+      : m_team_scratch_size{0, 0},
+        m_thread_scratch_size{0, 0},
+        m_tune_team_size(true),
+        m_tune_vector_length(true),
+        m_chunk_size(0) {
+    init(league_size_request, default_team_size, 1);
+  }
+  TeamPolicyInternal(int league_size_request, int team_size_request,
+                     const Kokkos::AUTO_t& /* vector_length_request */)
+      : m_team_scratch_size{0, 0},
+        m_thread_scratch_size{0, 0},
+        m_tune_team_size(false),
+        m_tune_vector_length(true),
+        m_chunk_size(0) {
+    init(league_size_request, team_size_request, 1);
+  }
+  inline static size_t vector_length_max() {
+    return 32; /* TODO: this is bad. Need logic that is compiler and backend
+                  aware */
+  }
+  inline int team_alloc() const { return m_team_alloc; }
+  inline int team_iter() const { return m_team_iter; }
 
-template <class FunctorType, class ReducerType, class PointerType,
-          class ValueType, class... PolicyArgs>
-struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
-                                ReducerType, PointerType, ValueType> {
-  using PolicyType = TeamPolicyInternal<PolicyArgs...>;
-  using TagType    = typename PolicyType::work_tag;
-  using ReducerTypeFwd =
-      std::conditional_t<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using Analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
-                                         PolicyType, ReducerTypeFwd>;
-
-  using ReferenceType = typename Analysis::reference_type;
-
-  using ParReduceCommon = ParallelReduceCommon<PointerType>;
-
-  static void execute_reducer(const FunctorType& f, const PolicyType& p,
-                              PointerType result_ptr, bool ptr_on_device) {
-    OpenMPTargetExec::verify_is_process(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    OpenMPTargetExec::verify_initialized(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-
-    const int league_size   = p.league_size();
-    const int team_size     = p.team_size();
-    const int vector_length = p.impl_vector_length();
-
-    const size_t shmem_size_L0 = p.scratch_size(0, team_size);
-    const size_t shmem_size_L1 = p.scratch_size(1, team_size);
-    OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE,
-                                     shmem_size_L0, shmem_size_L1, league_size);
-    void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
-
-    ValueType result = ValueType();
-
-    // Maximum active teams possible.
-    int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size;
-    const auto nteams =
-        league_size < max_active_teams ? league_size : max_active_teams;
-
-    // If the league size is <=0, do not launch the kernel.
-    if (nteams <= 0) return;
-
-#pragma omp declare reduction(                                         \
-    custom:ValueType                                                   \
-    : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
-    initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
-
-#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to   \
-                                                                       : f) \
-    is_device_ptr(scratch_ptr) reduction(custom                             \
-                                         : result)
-#pragma omp parallel reduction(custom : result)
-    {
-      const int blockIdx = omp_get_team_num();
-      const int gridDim  = omp_get_num_teams();
-
-      // Guarantee that the compilers respect the `num_teams` clause
-      if (gridDim <= nteams) {
-        for (int league_id = blockIdx; league_id < league_size;
-             league_id += gridDim) {
-          typename PolicyType::member_type team(
-              league_id, league_size, team_size, vector_length, scratch_ptr,
-              blockIdx, shmem_size_L0, shmem_size_L1);
-          if constexpr (std::is_void<TagType>::value)
-            f(team, result);
-          else
-            f(TagType(), team, result);
-        }
-      } else
-        Kokkos::abort("`num_teams` clause was not respected.\n");
-    }
+  inline int chunk_size() const { return m_chunk_size; }
 
-    // Copy results back to device if `parallel_reduce` is on a device view.
-    ParReduceCommon::memcpy_result(result_ptr, &result, sizeof(ValueType),
-                                   ptr_on_device);
+  /** \brief set chunk_size to a discrete value*/
+  inline TeamPolicyInternal& set_chunk_size(
+      typename traits::index_type chunk_size_) {
+    m_chunk_size = chunk_size_;
+    return *this;
   }
 
-  template <int NumReductions>
-  static void execute_array(const FunctorType& f, const PolicyType& p,
-                            PointerType result_ptr, bool ptr_on_device) {
-    OpenMPTargetExec::verify_is_process(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    OpenMPTargetExec::verify_initialized(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-
-    const int league_size   = p.league_size();
-    const int team_size     = p.team_size();
-    const int vector_length = p.impl_vector_length();
-
-    const size_t shmem_size_L0 = p.scratch_size(0, team_size);
-    const size_t shmem_size_L1 = p.scratch_size(1, team_size);
-    OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE,
-                                     shmem_size_L0, shmem_size_L1, league_size);
-    void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
-
-    // Maximum active teams possible.
-    int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size;
-    const auto nteams =
-        league_size < max_active_teams ? league_size : max_active_teams;
-
-    // If the league size is <=0, do not launch the kernel.
-    if (nteams <= 0) return;
-
-    // Case where the number of reduction items is 1.
-    if constexpr (NumReductions == 1) {
-      ValueType result = ValueType();
-
-      // Case where reduction is on a native data type.
-      if constexpr (std::is_arithmetic<ValueType>::value) {
-#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to   \
-                                                                       : f) \
-    is_device_ptr(scratch_ptr) reduction(+: result)
-#pragma omp parallel reduction(+ : result)
-        {
-          const int blockIdx = omp_get_team_num();
-          const int gridDim  = omp_get_num_teams();
-
-          // Guarantee that the compilers respect the `num_teams` clause
-          if (gridDim <= nteams) {
-            for (int league_id = blockIdx; league_id < league_size;
-                 league_id += gridDim) {
-              typename PolicyType::member_type team(
-                  league_id, league_size, team_size, vector_length, scratch_ptr,
-                  blockIdx, shmem_size_L0, shmem_size_L1);
-              if constexpr (std::is_void<TagType>::value)
-                f(team, result);
-              else
-                f(TagType(), team, result);
-            }
-          } else
-            Kokkos::abort("`num_teams` clause was not respected.\n");
-        }
-      } else {
-        // Case where the reduction is on a non-native data type.
-#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
-#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to   \
-                                                                       : f) \
-    is_device_ptr(scratch_ptr) reduction(custom                             \
-                                         : result)
-#pragma omp parallel reduction(custom : result)
-        {
-          const int blockIdx = omp_get_team_num();
-          const int gridDim  = omp_get_num_teams();
-
-          // Guarantee that the compilers respect the `num_teams` clause
-          if (gridDim <= nteams) {
-            for (int league_id = blockIdx; league_id < league_size;
-                 league_id += gridDim) {
-              typename PolicyType::member_type team(
-                  league_id, league_size, team_size, vector_length, scratch_ptr,
-                  blockIdx, shmem_size_L0, shmem_size_L1);
-              if constexpr (std::is_void<TagType>::value)
-                f(team, result);
-              else
-                f(TagType(), team, result);
-            }
-          } else
-            Kokkos::abort("`num_teams` clause was not respected.\n");
-        }
-      }
+  /** \brief set per team scratch size for a specific level of the scratch
+   * hierarchy */
+  inline TeamPolicyInternal& set_scratch_size(const int& level,
+                                              const PerTeamValue& per_team) {
+    m_team_scratch_size[level] = per_team.value;
+    return *this;
+  }
 
-      // Copy results back to device if `parallel_reduce` is on a device view.
-      ParReduceCommon::memcpy_result(result_ptr, &result, sizeof(ValueType),
-                                     ptr_on_device);
-    } else {
-      ValueType result[NumReductions] = {};
-      // Case where the reduction is on an array.
-#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to   \
-                                                                       : f) \
-    is_device_ptr(scratch_ptr) reduction(+ : result[:NumReductions])
-#pragma omp parallel reduction(+ : result[:NumReductions])
-      {
-        const int blockIdx = omp_get_team_num();
-        const int gridDim  = omp_get_num_teams();
-
-        // Guarantee that the compilers respect the `num_teams` clause
-        if (gridDim <= nteams) {
-          for (int league_id = blockIdx; league_id < league_size;
-               league_id += gridDim) {
-            typename PolicyType::member_type team(
-                league_id, league_size, team_size, vector_length, scratch_ptr,
-                blockIdx, shmem_size_L0, shmem_size_L1);
-            if constexpr (std::is_void<TagType>::value)
-              f(team, result);
-            else
-              f(TagType(), team, result);
-          }
-        } else
-          Kokkos::abort("`num_teams` clause was not respected.\n");
-      }
+  /** \brief set per thread scratch size for a specific level of the scratch
+   * hierarchy */
+  inline TeamPolicyInternal& set_scratch_size(
+      const int& level, const PerThreadValue& per_thread) {
+    m_thread_scratch_size[level] = per_thread.value;
+    return *this;
+  }
 
-      // Copy results back to device if `parallel_reduce` is on a device view.
-      ParReduceCommon::memcpy_result(
-          result_ptr, result, NumReductions * sizeof(ValueType), ptr_on_device);
-    }
+  /** \brief set per thread and per team scratch size for a specific level of
+   * the scratch hierarchy */
+  inline TeamPolicyInternal& set_scratch_size(
+      const int& level, const PerTeamValue& per_team,
+      const PerThreadValue& per_thread) {
+    m_team_scratch_size[level]   = per_team.value;
+    m_thread_scratch_size[level] = per_thread.value;
+    return *this;
   }
 
-  // FIXME_OPENMPTARGET : This routine is a copy from `parallel_reduce` over
-  // RangePolicy. Need a new implementation.
-  static void execute_init_join(const FunctorType& f, const PolicyType& p,
-                                PointerType ptr, const bool ptr_on_device) {
-    using FunctorAnalysis =
-        Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, PolicyType,
-                              FunctorType>;
-    constexpr int HasInit = FunctorAnalysis::has_init_member_function;
-
-    const int league_size   = p.league_size();
-    const int team_size     = p.team_size();
-    const int vector_length = p.impl_vector_length();
-
-    auto begin = 0;
-    auto end   = league_size * team_size + team_size * vector_length;
-
-    const size_t shmem_size_L0 = p.scratch_size(0, team_size);
-    const size_t shmem_size_L1 = p.scratch_size(1, team_size);
-
-    // FIXME_OPENMPTARGET: This would oversubscribe scratch memory since we are
-    // already using the available scratch memory to create temporaries for each
-    // thread.
-    if ((shmem_size_L0 + shmem_size_L1) > 0) {
-      Kokkos::abort(
-          "OpenMPTarget: Scratch memory is not supported in `parallel_reduce` "
-          "over functors with init/join.");
-    }
+ private:
+  /** \brief finalize chunk_size if it was set to AUTO*/
+  inline void set_auto_chunk_size() {
+    int concurrency = 2048 * 128;
 
-    const auto nteams = league_size;
-
-    // Number of elements in the reduction
-    const auto value_count = FunctorAnalysis::value_count(f);
-
-    // Allocate scratch per active thread.
-    OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType),
-                                     league_size);
-    void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
-
-    // Enter this loop if the functor has an `init`
-    if constexpr (HasInit) {
-      // The `init` routine needs to be called on the device since it might need
-      // device members.
-#pragma omp target map(to : f) is_device_ptr(scratch_ptr)
-      {
-        typename FunctorAnalysis::Reducer final_reducer(&f);
-        final_reducer.init(scratch_ptr);
-        final_reducer.final(scratch_ptr);
-      }
-    } else {
-#pragma omp target map(to : f) is_device_ptr(scratch_ptr)
-      {
-        for (int i = 0; i < value_count; ++i) {
-          static_cast<ValueType*>(scratch_ptr)[i] = ValueType();
-        }
+    if (concurrency == 0) concurrency = 1;
 
-        typename FunctorAnalysis::Reducer final_reducer(&f);
-        final_reducer.final(static_cast<ValueType*>(scratch_ptr));
-      }
+    if (m_chunk_size > 0) {
+      if (!Impl::is_integral_power_of_two(m_chunk_size))
+        Kokkos::abort("TeamPolicy blocking granularity must be power of two");
     }
 
-    if (end <= begin) {
-      // If there is no work to be done, copy back the initialized values and
-      // exit.
-      if (!ptr_on_device)
-        KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(
-            ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
-            omp_get_initial_device(), omp_get_default_device()));
-      else
-        KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(
-            ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
-            omp_get_default_device(), omp_get_default_device()));
-
-      return;
+    int new_chunk_size = 1;
+    while (new_chunk_size * 100 * concurrency < m_league_size)
+      new_chunk_size *= 2;
+    if (new_chunk_size < 128) {
+      new_chunk_size = 1;
+      while ((new_chunk_size * 40 * concurrency < m_league_size) &&
+             (new_chunk_size < 128))
+        new_chunk_size *= 2;
     }
-
-#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to   \
-                                                                       : f) \
-    is_device_ptr(scratch_ptr)
-    {
-#pragma omp parallel
-      {
-        const int team_num      = omp_get_team_num();
-        const int num_teams     = omp_get_num_teams();
-        ValueType* team_scratch = static_cast<ValueType*>(scratch_ptr) +
-                                  team_num * team_size * value_count;
-        typename FunctorAnalysis::Reducer final_reducer(&f);
-        ReferenceType result = final_reducer.init(&team_scratch[0]);
-
-        for (int league_id = team_num; league_id < league_size;
-             league_id += num_teams) {
-          typename PolicyType::member_type team(
-              league_id, league_size, team_size, vector_length, scratch_ptr,
-              team_num, shmem_size_L0, shmem_size_L1);
-          if constexpr (std::is_void<TagType>::value) {
-            f(team, result);
-          } else {
-            f(TagType(), team, result);
-          }
-        }
-      }  // end parallel
-    }    // end target
-
-    int tree_neighbor_offset = 1;
-    do {
-#pragma omp target teams distribute parallel for simd map(to   \
-                                                          : f) \
-    is_device_ptr(scratch_ptr)
-      for (int i = 0; i < nteams - tree_neighbor_offset;
-           i += 2 * tree_neighbor_offset) {
-        ValueType* team_scratch = static_cast<ValueType*>(scratch_ptr);
-        const int team_offset   = team_size * value_count;
-        typename FunctorAnalysis::Reducer final_reducer(&f);
-        final_reducer.join(
-            &team_scratch[i * team_offset],
-            &team_scratch[(i + tree_neighbor_offset) * team_offset]);
-
-        // If `final` is provided by the functor.
-        // Do the final only once at the end.
-        if (tree_neighbor_offset * 2 >= nteams && omp_get_team_num() == 0 &&
-            omp_get_thread_num() == 0) {
-          final_reducer.final(scratch_ptr);
-        }
-      }
-      tree_neighbor_offset *= 2;
-    } while (tree_neighbor_offset < nteams);
-
-    // If the result view is on the host, copy back the values via memcpy.
-    if (!ptr_on_device)
-      KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(
-          ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
-          omp_get_initial_device(), omp_get_default_device()));
-    else
-      KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(
-          ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
-          omp_get_default_device(), omp_get_default_device()));
+    m_chunk_size = new_chunk_size;
   }
-};
-
-template <class FunctorType, class ReducerType, class... Properties>
-class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
-                     ReducerType, Kokkos::Experimental::OpenMPTarget> {
- private:
-  using Policy =
-      Kokkos::Impl::TeamPolicyInternal<Kokkos::Experimental::OpenMPTarget,
-                                       Properties...>;
-
-  using WorkTag = typename Policy::work_tag;
-  using Member  = typename Policy::member_type;
-  using ReducerTypeFwd =
-      std::conditional_t<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using WorkTagFwd =
-      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
-                         void>;
-  using Analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
-                                         Policy, ReducerTypeFwd>;
-
-  using pointer_type   = typename Analysis::pointer_type;
-  using reference_type = typename Analysis::reference_type;
-  using value_type     = typename Analysis::value_type;
-
-  bool m_result_ptr_on_device;
-  const int m_result_ptr_num_elems;
-
-  static constexpr int HasJoin =
-      Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, Policy,
-                            FunctorType>::has_join_member_function;
-  static constexpr int UseReducer = is_reducer<ReducerType>::value;
-  static constexpr int IsArray    = std::is_pointer<reference_type>::value;
-
-  using ParReduceSpecialize =
-      ParallelReduceSpecialize<FunctorType, Policy, ReducerType, pointer_type,
-                               typename Analysis::value_type>;
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-  const ReducerType m_reducer;
-  const pointer_type m_result_ptr;
-  const size_t m_shmem_size;
 
  public:
-  void execute() const {
-    if constexpr (HasJoin) {
-      ParReduceSpecialize::execute_init_join(m_functor, m_policy, m_result_ptr,
-                                             m_result_ptr_on_device);
-    } else if constexpr (UseReducer) {
-      ParReduceSpecialize::execute_reducer(m_functor, m_policy, m_result_ptr,
-                                           m_result_ptr_on_device);
-    } else if constexpr (IsArray) {
-      if (m_result_ptr_num_elems <= 2) {
-        ParReduceSpecialize::template execute_array<2>(
-            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
-      } else if (m_result_ptr_num_elems <= 4) {
-        ParReduceSpecialize::template execute_array<4>(
-            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
-      } else if (m_result_ptr_num_elems <= 8) {
-        ParReduceSpecialize::template execute_array<8>(
-            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
-      } else if (m_result_ptr_num_elems <= 16) {
-        ParReduceSpecialize::template execute_array<16>(
-            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
-      } else if (m_result_ptr_num_elems <= 32) {
-        ParReduceSpecialize::template execute_array<32>(
-            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
-      } else {
-        Kokkos::abort("array reduction length must be <= 32");
-      }
-    } else {
-      ParReduceSpecialize::template execute_array<1>(
-          m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
-    }
-  }
-
-  template <class ViewType>
-  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
-                 const ViewType& arg_result,
-                 std::enable_if_t<Kokkos::is_view<ViewType>::value &&
-                                      !Kokkos::is_reducer<ReducerType>::value,
-                                  void*> = nullptr)
-      : m_result_ptr_on_device(
-            MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
-                              typename ViewType::memory_space>::accessible),
-        m_result_ptr_num_elems(arg_result.size()),
-        m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(InvalidType()),
-        m_result_ptr(arg_result.data()),
-        m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
-                     FunctorTeamShmemSize<FunctorType>::value(
-                         arg_functor, arg_policy.team_size())) {}
-
-  ParallelReduce(const FunctorType& arg_functor, Policy& arg_policy,
-                 const ReducerType& reducer)
-      : m_result_ptr_on_device(
-            MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
-                              typename ReducerType::result_view_type::
-                                  memory_space>::accessible),
-        m_result_ptr_num_elems(reducer.view().size()),
-        m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()),
-        m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
-                     FunctorTeamShmemSize<FunctorType>::value(
-                         arg_functor, arg_policy.team_size())) {}
+  using member_type = Impl::OpenMPTargetExecTeamMember;
 };
 
 }  // namespace Impl
 }  // namespace Kokkos
 
+namespace Kokkos {
+
+template <typename iType>
+KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
+    iType, Impl::OpenMPTargetExecTeamMember>
+TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread,
+                const iType& count) {
+  return Impl::TeamThreadRangeBoundariesStruct<
+      iType, Impl::OpenMPTargetExecTeamMember>(thread, count);
+}
+
+template <typename iType1, typename iType2>
+KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
+    std::common_type_t<iType1, iType2>, Impl::OpenMPTargetExecTeamMember>
+TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread,
+                const iType1& begin, const iType2& end) {
+  using iType = std::common_type_t<iType1, iType2>;
+  return Impl::TeamThreadRangeBoundariesStruct<
+      iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(begin),
+                                               iType(end));
+}
+
+template <typename iType>
+KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
+    iType, Impl::OpenMPTargetExecTeamMember>
+ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread,
+                  const iType& count) {
+  return Impl::ThreadVectorRangeBoundariesStruct<
+      iType, Impl::OpenMPTargetExecTeamMember>(thread, count);
+}
+
+template <typename iType1, typename iType2>
+KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
+    std::common_type_t<iType1, iType2>, Impl::OpenMPTargetExecTeamMember>
+ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread,
+                  const iType1& arg_begin, const iType2& arg_end) {
+  using iType = std::common_type_t<iType1, iType2>;
+  return Impl::ThreadVectorRangeBoundariesStruct<
+      iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(arg_begin),
+                                               iType(arg_end));
+}
+
+template <typename iType>
+KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct<
+    iType, Impl::OpenMPTargetExecTeamMember>
+TeamVectorRange(const Impl::OpenMPTargetExecTeamMember& thread,
+                const iType& count) {
+  return Impl::TeamVectorRangeBoundariesStruct<
+      iType, Impl::OpenMPTargetExecTeamMember>(thread, count);
+}
+
+template <typename iType1, typename iType2>
+KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct<
+    std::common_type_t<iType1, iType2>, Impl::OpenMPTargetExecTeamMember>
+TeamVectorRange(const Impl::OpenMPTargetExecTeamMember& thread,
+                const iType1& arg_begin, const iType2& arg_end) {
+  using iType = std::common_type_t<iType1, iType2>;
+  return Impl::TeamVectorRangeBoundariesStruct<
+      iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(arg_begin),
+                                               iType(arg_end));
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember> PerTeam(
+    const Impl::OpenMPTargetExecTeamMember& thread) {
+  return Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember>(thread);
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember> PerThread(
+    const Impl::OpenMPTargetExecTeamMember& thread) {
+  return Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember>(thread);
+}
+}  // namespace Kokkos
+
+namespace Kokkos {
+
+template <class FunctorType>
+KOKKOS_INLINE_FUNCTION void single(
+    const Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember>&
+    /*single_struct*/,
+    const FunctorType& lambda) {
+  lambda();
+}
+
+template <class FunctorType>
+KOKKOS_INLINE_FUNCTION void single(
+    const Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember>&
+        single_struct,
+    const FunctorType& lambda) {
+  if (single_struct.team_member.team_rank() == 0) lambda();
+}
+
+template <class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION void single(
+    const Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember>&
+    /*single_struct*/,
+    const FunctorType& lambda, ValueType& val) {
+  lambda(val);
+}
+
+template <class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION void single(
+    const Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember>&
+        single_struct,
+    const FunctorType& lambda, ValueType& val) {
+  if (single_struct.team_member.team_rank() == 0) {
+    lambda(val);
+  }
+  single_struct.team_member.team_broadcast(val, 0);
+}
+}  // namespace Kokkos
+
 namespace Kokkos {
 namespace Impl {
 
@@ -1320,5 +721,43 @@ struct TeamVectorRangeBoundariesStruct<iType, OpenMPTargetExecTeamMember> {
 }  // namespace Kokkos
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+/** \brief  Data for OpenMPTarget thread execution */
+
+class OpenMPTargetExec {
+ public:
+  // FIXME_OPENMPTARGET - Currently the maximum number of
+  // teams possible is calculated based on NVIDIA's Volta GPU. In
+  // future this value should be based on the chosen architecture for the
+  // OpenMPTarget backend.
+  static int MAX_ACTIVE_THREADS;
+
+ private:
+  static void* scratch_ptr;
+
+ public:
+  static void verify_is_process(const char* const);
+  static void verify_initialized(const char* const);
+
+  static int* get_lock_array(int num_teams);
+  static void* get_scratch_ptr();
+  static void clear_scratch();
+  static void clear_lock_array();
+  static void resize_scratch(int64_t team_reduce_bytes,
+                             int64_t team_shared_bytes,
+                             int64_t thread_local_bytes, int64_t league_size);
+
+  static void* m_scratch_ptr;
+  static int64_t m_scratch_size;
+  static int* m_lock_array;
+  static uint64_t m_lock_size;
+  static uint32_t* m_uniquetoken_ptr;
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
 
 #endif /* KOKKOS_OPENMPTARGET_PARALLEL_HPP */
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a674637a3b1aca146e18e07b6b84912f5280d93f
--- /dev/null
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp
@@ -0,0 +1,71 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_OPENMPTARGET_PARALLEL_FOR_RANGE_HPP
+#define KOKKOS_OPENMPTARGET_PARALLEL_FOR_RANGE_HPP
+
+#include <omp.h>
+#include <sstream>
+#include <Kokkos_Parallel.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+template <class FunctorType, class... Traits>
+class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
+                  Kokkos::Experimental::OpenMPTarget> {
+ private:
+  using Policy  = Kokkos::RangePolicy<Traits...>;
+  using WorkTag = typename Policy::work_tag;
+  using Member  = typename Policy::member_type;
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+
+ public:
+  void execute() const { execute_impl<WorkTag>(); }
+
+  template <class TagType>
+  void execute_impl() const {
+    OpenMPTargetExec::verify_is_process(
+        "Kokkos::Experimental::OpenMPTarget parallel_for");
+    OpenMPTargetExec::verify_initialized(
+        "Kokkos::Experimental::OpenMPTarget parallel_for");
+    const auto begin = m_policy.begin();
+    const auto end   = m_policy.end();
+
+    if (end <= begin) return;
+
+    FunctorType a_functor(m_functor);
+
+#pragma omp target teams distribute parallel for map(to : a_functor)
+    for (auto i = begin; i < end; ++i) {
+      if constexpr (std::is_void<TagType>::value) {
+        a_functor(i);
+      } else {
+        a_functor(TagType(), i);
+      }
+    }
+  }
+
+  ParallelFor(const FunctorType& arg_functor, Policy arg_policy)
+      : m_functor(arg_functor), m_policy(arg_policy) {}
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1abc925caed58c9e91ef054b1abfb5111876a9d7
--- /dev/null
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp
@@ -0,0 +1,201 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_OPENMPTARGET_PARALLEL_FOR_TEAM_HPP
+#define KOKKOS_OPENMPTARGET_PARALLEL_FOR_TEAM_HPP
+
+#include <omp.h>
+#include <sstream>
+#include <Kokkos_Parallel.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp>
+
+namespace Kokkos {
+
+/** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each
+ * i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the the calling thread team.
+ */
+template <typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION void parallel_for(
+    const Impl::TeamThreadRangeBoundariesStruct<
+        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+    const Lambda& lambda) {
+#pragma omp for nowait schedule(static, 1)
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i);
+}
+
+/** \brief  Intra-thread vector parallel_for. Executes lambda(iType i) for each
+ * i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the the calling thread.
+ */
+template <typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION void parallel_for(
+    const Impl::ThreadVectorRangeBoundariesStruct<
+        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+    const Lambda& lambda) {
+#pragma omp simd
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i);
+}
+
+/** \brief  Intra-team vector parallel_for. Executes lambda(iType i) for each
+ * i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the the calling team.
+ */
+template <typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION void parallel_for(
+    const Impl::TeamVectorRangeBoundariesStruct<
+        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+    const Lambda& lambda) {
+#pragma omp for simd nowait schedule(static, 1)
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i);
+}
+
+namespace Impl {
+
+template <class FunctorType, class... Properties>
+class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
+                  Kokkos::Experimental::OpenMPTarget> {
+ private:
+  using Policy =
+      Kokkos::Impl::TeamPolicyInternal<Kokkos::Experimental::OpenMPTarget,
+                                       Properties...>;
+  using WorkTag = typename Policy::work_tag;
+  using Member  = typename Policy::member_type;
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+  const size_t m_shmem_size;
+
+ public:
+  void execute() const {
+    OpenMPTargetExec::verify_is_process(
+        "Kokkos::Experimental::OpenMPTarget parallel_for");
+    OpenMPTargetExec::verify_initialized(
+        "Kokkos::Experimental::OpenMPTarget parallel_for");
+    execute_impl<WorkTag>();
+  }
+
+ private:
+  template <class TagType>
+  void execute_impl() const {
+    OpenMPTargetExec::verify_is_process(
+        "Kokkos::Experimental::OpenMPTarget parallel_for");
+    OpenMPTargetExec::verify_initialized(
+        "Kokkos::Experimental::OpenMPTarget parallel_for");
+    const auto league_size   = m_policy.league_size();
+    const auto team_size     = m_policy.team_size();
+    const auto vector_length = m_policy.impl_vector_length();
+
+    const size_t shmem_size_L0 = m_policy.scratch_size(0, team_size);
+    const size_t shmem_size_L1 = m_policy.scratch_size(1, team_size);
+    OpenMPTargetExec::resize_scratch(team_size, shmem_size_L0, shmem_size_L1,
+                                     league_size);
+
+    void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
+    FunctorType a_functor(m_functor);
+
+    // FIXME_OPENMPTARGET - If the team_size is not a multiple of 32, the
+    // scratch implementation does not work in the Release or RelWithDebugInfo
+    // mode but works in the Debug mode.
+
+    // Maximum active teams possible.
+    // FIXME_OPENMPTARGET: Cray compiler did not yet implement
+    // omp_get_max_teams.
+#if !defined(KOKKOS_COMPILER_CRAY_LLVM)
+    int max_active_teams = omp_get_max_teams();
+#else
+    int max_active_teams =
+        std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size);
+#endif
+
+    // FIXME_OPENMPTARGET: Although the maximum number of teams is set using the
+    // omp_set_num_teams in the resize_scratch routine, the call is not
+    // respected. Hence we need to use `num_teams` routine to restrict the
+    // number of teams generated to max_active_teams. Hopefully we can avoid the
+    // num_teams clause in the future and let compiler pick the right number of
+    // teams. This is not true for Intel architectures.
+
+    // If the league size is <=0, do not launch the kernel.
+    if (max_active_teams <= 0) return;
+
+// Performing our own scheduling of teams to avoid separation of code between
+// teams-distribute and parallel. Gave a 2x performance boost in test cases with
+// the clang compiler. atomic_compare_exchange can be avoided since the standard
+// guarantees that the number of teams specified in the `num_teams` clause is
+// always less than or equal to the maximum concurrently running teams.
+#if !defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU)
+#pragma omp target teams thread_limit(team_size) firstprivate(a_functor) \
+    num_teams(max_active_teams) is_device_ptr(scratch_ptr)
+#pragma omp parallel
+    {
+      if (omp_get_num_teams() > max_active_teams)
+        Kokkos::abort("`omp_set_num_teams` call was not respected.\n");
+
+      const int blockIdx = omp_get_team_num();
+      const int gridDim  = omp_get_num_teams();
+
+      // Iterate through the number of teams until league_size and assign the
+      // league_id accordingly
+      // Guarantee that the compilers respect the `num_teams` clause
+      for (int league_id = blockIdx; league_id < league_size;
+           league_id += gridDim) {
+        typename Policy::member_type team(league_id, league_size, team_size,
+                                          vector_length, scratch_ptr, blockIdx,
+                                          shmem_size_L0, shmem_size_L1);
+        if constexpr (std::is_void_v<TagType>)
+          m_functor(team);
+        else
+          m_functor(TagType(), team);
+      }
+    }
+#else
+#pragma omp target teams distribute firstprivate(a_functor) \
+    is_device_ptr(scratch_ptr) num_teams(max_active_teams)  \
+        thread_limit(team_size)
+    for (int i = 0; i < league_size; i++) {
+#pragma omp parallel
+      {
+        if (omp_get_num_teams() > max_active_teams)
+          Kokkos::abort("`omp_set_num_teams` call was not respected.\n");
+
+        typename Policy::member_type team(i, league_size, team_size,
+                                          vector_length, scratch_ptr, i,
+                                          shmem_size_L0, shmem_size_L1);
+        if constexpr (std::is_void_v<TagType>)
+          m_functor(team);
+        else
+          m_functor(TagType(), team);
+      }
+    }
+#endif
+  }
+
+ public:
+  ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
+                     FunctorTeamShmemSize<FunctorType>::value(
+                         arg_functor, arg_policy.team_size())) {}
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4452af3846d28b96d17116236ceeb3f195cc16b8
--- /dev/null
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp
@@ -0,0 +1,114 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_OPENMPTARGET_PARALLELREDUCE_RANGE_HPP
+#define KOKKOS_OPENMPTARGET_PARALLELREDUCE_RANGE_HPP
+
+#include <omp.h>
+#include <sstream>
+#include <Kokkos_Parallel.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+template <class CombinedFunctorReducerType, class... Traits>
+class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
+                     Kokkos::Experimental::OpenMPTarget> {
+ private:
+  using Policy      = Kokkos::RangePolicy<Traits...>;
+  using FunctorType = typename CombinedFunctorReducerType::functor_type;
+  using ReducerType = typename CombinedFunctorReducerType::reducer_type;
+
+  using WorkTag = typename Policy::work_tag;
+
+  using pointer_type   = typename ReducerType::pointer_type;
+  using reference_type = typename ReducerType::reference_type;
+
+  static constexpr bool FunctorHasJoin = Impl::FunctorAnalysis<
+      Impl::FunctorPatternInterface::REDUCE, Policy, FunctorType,
+      typename ReducerType::value_type>::Reducer::has_join_member_function();
+  static constexpr bool UseReducer =
+      !std::is_same_v<FunctorType, typename ReducerType::functor_type>;
+  static constexpr bool IsArray = std::is_pointer_v<reference_type>;
+
+  using ParReduceSpecialize =
+      ParallelReduceSpecialize<FunctorType, Policy,
+                               typename ReducerType::functor_type, pointer_type,
+                               typename ReducerType::value_type>;
+
+  const CombinedFunctorReducerType m_functor_reducer;
+  const Policy m_policy;
+  const pointer_type m_result_ptr;
+  bool m_result_ptr_on_device;
+  const int m_result_ptr_num_elems;
+  using TagType = typename Policy::work_tag;
+
+ public:
+  void execute() const {
+    const FunctorType& functor = m_functor_reducer.get_functor();
+    if constexpr (FunctorHasJoin) {
+      // Enter this loop if the Functor has a init-join.
+      ParReduceSpecialize::execute_init_join(functor, m_policy, m_result_ptr,
+                                             m_result_ptr_on_device);
+    } else if constexpr (UseReducer) {
+      // Enter this loop if the Functor is a reducer type.
+      ParReduceSpecialize::execute_reducer(functor, m_policy, m_result_ptr,
+                                           m_result_ptr_on_device);
+    } else if constexpr (IsArray) {
+      // Enter this loop if the reduction is on an array and the routine is
+      // templated over the size of the array.
+      if (m_result_ptr_num_elems <= 2) {
+        ParReduceSpecialize::template execute_array<TagType, 2>(
+            functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 4) {
+        ParReduceSpecialize::template execute_array<TagType, 4>(
+            functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 8) {
+        ParReduceSpecialize::template execute_array<TagType, 8>(
+            functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 16) {
+        ParReduceSpecialize::template execute_array<TagType, 16>(
+            functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 32) {
+        ParReduceSpecialize::template execute_array<TagType, 32>(
+            functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else {
+        Kokkos::abort("array reduction length must be <= 32");
+      }
+    } else {
+      // This loop handles the basic scalar reduction.
+      ParReduceSpecialize::template execute_array<TagType, 1>(
+          functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+    }
+  }
+
+  template <class ViewType>
+  ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer,
+                 const Policy& arg_policy, const ViewType& arg_result_view)
+      : m_functor_reducer(arg_functor_reducer),
+        m_policy(arg_policy),
+        m_result_ptr(arg_result_view.data()),
+        m_result_ptr_on_device(
+            MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
+                              typename ViewType::memory_space>::accessible),
+        m_result_ptr_num_elems(arg_result_view.size()) {}
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a302fa7151152d6c4199d189cfa68b491076a430
--- /dev/null
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp
@@ -0,0 +1,530 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_OPENMPTARGET_PARALLELREDUCE_TEAM_HPP
+#define KOKKOS_OPENMPTARGET_PARALLELREDUCE_TEAM_HPP
+
+#include <omp.h>
+#include <sstream>
+#include <Kokkos_Parallel.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp>
+
+// FIXME_OPENMPTARGET - Using this macro to implement a workaround for
+// hierarchical reducers. It avoids hitting the code path which we wanted to
+// write but doesn't work. undef'ed at the end.
+// Intel compilers prefer the non-workaround version.
+#ifndef KOKKOS_ARCH_INTEL_GPU
+#define KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
+#endif
+
+namespace Kokkos {
+
+/** \brief  Inter-thread vector parallel_reduce. Executes lambda(iType i,
+ * ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the the calling thread team
+ * and a summation of val is performed and put into result.
+ */
+
+template <typename iType, class Lambda, typename ValueType>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<!Kokkos::is_reducer<ValueType>::value>
+parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
+                    iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+                const Lambda& lambda, ValueType& result) {
+  // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
+  // elements in the array <= 32. For reduction we allocate, 16 bytes per
+  // element in the scratch space, hence, 16*32 = 512.
+  static_assert(sizeof(ValueType) <=
+                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
+
+  ValueType* TeamThread_scratch =
+      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
+
+#pragma omp barrier
+  TeamThread_scratch[0] = ValueType();
+#pragma omp barrier
+
+  if constexpr (std::is_arithmetic<ValueType>::value) {
+#pragma omp for reduction(+ : TeamThread_scratch[:1])
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+      ValueType tmp = ValueType();
+      lambda(i, tmp);
+      TeamThread_scratch[0] += tmp;
+    }
+  } else {
+#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
+
+#pragma omp for reduction(custom : TeamThread_scratch[:1])
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+      ValueType tmp = ValueType();
+      lambda(i, tmp);
+      TeamThread_scratch[0] += tmp;
+    }
+  }
+
+  result = TeamThread_scratch[0];
+}
+
+#if !defined(KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND)
+// For some reason the actual version we wanted to write doesn't work
+// and crashes. We should try this with every new compiler
+// This is the variant we actually wanted to write
+template <typename iType, class Lambda, typename ReducerType>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
+parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
+                    iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+                const Lambda& lambda, ReducerType result) {
+  using ValueType = typename ReducerType::value_type;
+
+#pragma omp declare reduction(                                               \
+    custominner:ValueType                                                    \
+    : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(                                                             \
+        Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+
+  // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
+  // elements in the array <= 32. For reduction we allocate, 16 bytes per
+  // element in the scratch space, hence, 16*32 = 512.
+  static_assert(sizeof(ValueType) <=
+                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
+
+  ValueType* TeamThread_scratch =
+      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
+
+#pragma omp barrier
+  Impl::OpenMPTargetReducerWrapper<ReducerType>::init(TeamThread_scratch[0]);
+#pragma omp barrier
+
+#pragma omp for reduction(custominner : TeamThread_scratch[:1])
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+    lambda(i, TeamThread_scratch[0]);
+  }
+  result.reference() = TeamThread_scratch[0];
+}
+#else
+template <typename iType, class Lambda, typename ReducerType>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
+parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
+                    iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+                const Lambda& lambda, ReducerType result) {
+  using ValueType = typename ReducerType::value_type;
+
+  // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
+  // elements in the array <= 32. For reduction we allocate, 16 bytes per
+  // element in the scratch space, hence, 16*32 = 512.
+  static_assert(sizeof(ValueType) <=
+                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
+
+  ValueType* TeamThread_scratch =
+      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
+
+#pragma omp declare reduction(                                               \
+    omp_red_teamthread_reducer:ValueType                                     \
+    : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(                                                             \
+        Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+
+#pragma omp barrier
+  ValueType tmp;
+  result.init(tmp);
+  TeamThread_scratch[0] = tmp;
+#pragma omp barrier
+
+  iType team_size = iType(omp_get_num_threads());
+#pragma omp for reduction(omp_red_teamthread_reducer \
+                          : TeamThread_scratch[:1]) schedule(static, 1)
+  for (iType t = 0; t < team_size; t++) {
+    ValueType tmp2;
+    result.init(tmp2);
+
+    for (iType i = loop_boundaries.start + t; i < loop_boundaries.end;
+         i += team_size) {
+      lambda(i, tmp2);
+    }
+
+    // FIXME_OPENMPTARGET: Join should work but doesn't. Every threads gets a
+    // private TeamThread_scratch[0] and at the end of the for-loop the `join`
+    // operation is performed by OpenMP itself and hence the simple assignment
+    // works.
+    //    result.join(TeamThread_scratch[0], tmp2);
+    TeamThread_scratch[0] = tmp2;
+  }
+
+  result.reference() = TeamThread_scratch[0];
+}
+#endif  // KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i,
+ * ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the the calling thread
+ * and a reduction of val is performed using JoinType(ValueType& val, const
+ * ValueType& update) and put into init_result. The input value of init_result
+ * is used as initializer for temporary variables of ValueType. Therefore the
+ * input value should be the neutral element with respect to the join operation
+ * (e.g. '0 for +-' or '1 for *').
+ */
+template <typename iType, class Lambda, typename ValueType, class JoinType>
+KOKKOS_INLINE_FUNCTION void parallel_reduce(
+    const Impl::TeamThreadRangeBoundariesStruct<
+        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+    const Lambda& lambda, const JoinType& join, ValueType& init_result) {
+  ValueType* TeamThread_scratch =
+      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
+
+  // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
+  // elements in the array <= 32. For reduction we allocate, 16 bytes per
+  // element in the scratch space, hence, 16*32 = 512.
+  static_assert(sizeof(ValueType) <=
+                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
+
+  // FIXME_OPENMPTARGET: Still need to figure out how to get value_count here.
+  const int value_count = 1;
+
+#pragma omp barrier
+  TeamThread_scratch[0] = init_result;
+#pragma omp barrier
+
+#pragma omp for
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+    lambda(i, TeamThread_scratch[omp_get_num_threads() * value_count]);
+  }
+
+  // Reduce all partial results within a team.
+  const int team_size      = omp_get_num_threads();
+  int tree_neighbor_offset = 1;
+  do {
+#pragma omp for
+    for (int i = 0; i < team_size - tree_neighbor_offset;
+         i += 2 * tree_neighbor_offset) {
+      const int neighbor = i + tree_neighbor_offset;
+      join(lambda, &TeamThread_scratch[i * value_count],
+           &TeamThread_scratch[neighbor * value_count]);
+    }
+    tree_neighbor_offset *= 2;
+  } while (tree_neighbor_offset < team_size);
+  init_result = TeamThread_scratch[0];
+}
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i,
+ * ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the the calling thread
+ * and a summation of val is performed and put into result.
+ */
+template <typename iType, class Lambda, typename ValueType>
+KOKKOS_INLINE_FUNCTION void parallel_reduce(
+    const Impl::ThreadVectorRangeBoundariesStruct<
+        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+    const Lambda& lambda, ValueType& result) {
+  ValueType vector_reduce = ValueType();
+
+  if constexpr (std::is_arithmetic<ValueType>::value) {
+#pragma omp simd reduction(+ : vector_reduce)
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+      ValueType tmp = ValueType();
+      lambda(i, tmp);
+      vector_reduce += tmp;
+    }
+  } else {
+#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
+
+#pragma omp simd reduction(custom : vector_reduce)
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+      lambda(i, vector_reduce);
+    }
+  }
+
+  result = vector_reduce;
+}
+
+template <typename iType, class Lambda, typename ReducerType>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
+parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<
+                    iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+                const Lambda& lambda, ReducerType const& result) {
+  using ValueType = typename ReducerType::value_type;
+
+#pragma omp declare reduction(                                               \
+    custom:ValueType                                                         \
+    : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(                                                             \
+        Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+
+  ValueType vector_reduce;
+  Impl::OpenMPTargetReducerWrapper<ReducerType>::init(vector_reduce);
+
+#pragma omp simd reduction(custom : vector_reduce)
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+    lambda(i, vector_reduce);
+  }
+
+  result.reference() = vector_reduce;
+}
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i,
+ * ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the the calling thread
+ * and a reduction of val is performed using JoinType(ValueType& val, const
+ * ValueType& update) and put into init_result. The input value of init_result
+ * is used as initializer for temporary variables of ValueType. Therefore the
+ * input value should be the neutral element with respect to the join operation
+ * (e.g. '0 for +-' or '1 for *').
+ */
+template <typename iType, class Lambda, typename ValueType, class JoinType>
+KOKKOS_INLINE_FUNCTION void parallel_reduce(
+    const Impl::ThreadVectorRangeBoundariesStruct<
+        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+    const Lambda& lambda, const JoinType& join, ValueType& init_result) {
+  ValueType result = init_result;
+
+  // FIXME_OPENMPTARGET think about omp simd
+  // join does not work with omp reduction clause
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+    ValueType tmp = ValueType();
+    lambda(i, tmp);
+    join(result, tmp);
+  }
+
+  init_result = result;
+}
+
+/** \brief  Intra-team vector parallel_reduce. Executes lambda(iType i,
+ * ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the the calling team
+ * and a summation of val is performed and put into result.
+ */
+template <typename iType, class Lambda, typename ValueType>
+KOKKOS_INLINE_FUNCTION void parallel_reduce(
+    const Impl::TeamVectorRangeBoundariesStruct<
+        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+    const Lambda& lambda, ValueType& result) {
+  // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
+  // elements in the array <= 32. For reduction we allocate, 16 bytes per
+  // element in the scratch space, hence, 16*32 = 512.
+  static_assert(sizeof(ValueType) <=
+                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
+
+  ValueType* TeamVector_scratch =
+      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
+
+#pragma omp barrier
+  TeamVector_scratch[0] = ValueType();
+#pragma omp barrier
+
+  if constexpr (std::is_arithmetic<ValueType>::value) {
+#pragma omp for simd reduction(+ : TeamVector_scratch[:1])
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+      ValueType tmp = ValueType();
+      lambda(i, tmp);
+      TeamVector_scratch[0] += tmp;
+    }
+  } else {
+#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
+
+#pragma omp for simd reduction(custom : TeamVector_scratch[:1])
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+      ValueType tmp = ValueType();
+      lambda(i, tmp);
+      TeamVector_scratch[0] += tmp;
+    }
+  }
+
+  result = TeamVector_scratch[0];
+}
+
+#if !defined(KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND)
+template <typename iType, class Lambda, typename ReducerType>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
+parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
+                    iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+                const Lambda& lambda, ReducerType const& result) {
+  using ValueType = typename ReducerType::value_type;
+
+  // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
+  // elements in the array <= 32. For reduction we allocate, 16 bytes per
+  // element in the scratch space, hence, 16*32 = 512.
+  static_assert(sizeof(ValueType) <=
+                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
+
+#pragma omp declare reduction(                                               \
+    custom:ValueType                                                         \
+    : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(                                                             \
+        Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+
+  ValueType* TeamVector_scratch =
+      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
+
+#pragma omp barrier
+  Impl::OpenMPTargetReducerWrapper<ReducerType>::init(TeamVector_scratch[0]);
+#pragma omp barrier
+
+#pragma omp for simd reduction(custom : TeamVector_scratch[:1])
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+    lambda(i, TeamVector_scratch[0]);
+  }
+
+  result.reference() = TeamVector_scratch[0];
+}
+#else
+template <typename iType, class Lambda, typename ReducerType>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
+parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
+                    iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+                const Lambda& lambda, ReducerType const& result) {
+  using ValueType = typename ReducerType::value_type;
+
+  // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
+  // elements in the array <= 32. For reduction we allocate, 16 bytes per
+  // element in the scratch space, hence, 16*32 = 512.
+  static_assert(sizeof(ValueType) <=
+                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
+
+  ValueType* TeamVector_scratch =
+      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
+
+#pragma omp declare reduction(                                               \
+    omp_red_teamthread_reducer:ValueType                                     \
+    : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(                                                             \
+        Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+
+#pragma omp barrier
+  ValueType tmp;
+  result.init(tmp);
+  TeamVector_scratch[0] = tmp;
+#pragma omp barrier
+
+  iType team_size = iType(omp_get_num_threads());
+#pragma omp for simd reduction(omp_red_teamthread_reducer \
+                               : TeamVector_scratch[:1]) schedule(static, 1)
+  for (iType t = 0; t < team_size; t++) {
+    ValueType tmp2;
+    result.init(tmp2);
+
+    for (iType i = loop_boundaries.start + t; i < loop_boundaries.end;
+         i += team_size) {
+      lambda(i, tmp2);
+    }
+    TeamVector_scratch[0] = tmp2;
+  }
+
+  result.reference() = TeamVector_scratch[0];
+}
+#endif  // KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
+
+namespace Impl {
+
+template <class CombinedFunctorReducerType, class... Properties>
+class ParallelReduce<CombinedFunctorReducerType,
+                     Kokkos::TeamPolicy<Properties...>,
+                     Kokkos::Experimental::OpenMPTarget> {
+ private:
+  using Policy =
+      Kokkos::Impl::TeamPolicyInternal<Kokkos::Experimental::OpenMPTarget,
+                                       Properties...>;
+  using FunctorType = typename CombinedFunctorReducerType::functor_type;
+  using ReducerType = typename CombinedFunctorReducerType::reducer_type;
+
+  using WorkTag = typename Policy::work_tag;
+  using Member  = typename Policy::member_type;
+
+  using pointer_type   = typename ReducerType::pointer_type;
+  using reference_type = typename ReducerType::reference_type;
+  using value_type     = typename ReducerType::value_type;
+
+  bool m_result_ptr_on_device;
+  const int m_result_ptr_num_elems;
+
+  static constexpr bool FunctorHasJoin = Impl::FunctorAnalysis<
+      Impl::FunctorPatternInterface::REDUCE, Policy, FunctorType,
+      typename ReducerType::value_type>::Reducer::has_join_member_function();
+  static constexpr bool UseReducer =
+      !std::is_same_v<FunctorType, typename ReducerType::functor_type>;
+  static constexpr bool IsArray = std::is_pointer_v<reference_type>;
+
+  using ParReduceSpecialize =
+      ParallelReduceSpecialize<FunctorType, Policy,
+                               typename ReducerType::functor_type, pointer_type,
+                               typename ReducerType::value_type>;
+
+  const CombinedFunctorReducerType m_functor_reducer;
+  const Policy m_policy;
+  const pointer_type m_result_ptr;
+  const size_t m_shmem_size;
+
+ public:
+  void execute() const {
+    const FunctorType& functor = m_functor_reducer.get_functor();
+    if constexpr (FunctorHasJoin) {
+      ParReduceSpecialize::execute_init_join(functor, m_policy, m_result_ptr,
+                                             m_result_ptr_on_device);
+    } else if constexpr (UseReducer) {
+      ParReduceSpecialize::execute_reducer(functor, m_policy, m_result_ptr,
+                                           m_result_ptr_on_device);
+    } else if constexpr (IsArray) {
+      if (m_result_ptr_num_elems <= 2) {
+        ParReduceSpecialize::template execute_array<2>(
+            functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 4) {
+        ParReduceSpecialize::template execute_array<4>(
+            functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 8) {
+        ParReduceSpecialize::template execute_array<8>(
+            functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 16) {
+        ParReduceSpecialize::template execute_array<16>(
+            functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 32) {
+        ParReduceSpecialize::template execute_array<32>(
+            functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else {
+        Kokkos::abort("array reduction length must be <= 32");
+      }
+    } else {
+      ParReduceSpecialize::template execute_array<1>(
+          functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+    }
+  }
+
+  template <class ViewType>
+  ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer,
+                 const Policy& arg_policy, const ViewType& arg_result)
+      : m_result_ptr_on_device(
+            MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
+                              typename ViewType::memory_space>::accessible),
+        m_result_ptr_num_elems(arg_result.size()),
+        m_functor_reducer(arg_functor_reducer),
+        m_policy(arg_policy),
+        m_result_ptr(arg_result.data()),
+        m_shmem_size(
+            arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
+            FunctorTeamShmemSize<FunctorType>::value(
+                arg_functor_reducer.get_functor(), arg_policy.team_size())) {}
+};
+
+}  // namespace Impl
+
+#ifdef KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
+#undef KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
+#endif
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1d6677a1df6ba49cd3e720f8bcf67696779dda32
--- /dev/null
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp
@@ -0,0 +1,261 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_OPENMPTARGET_PARALLELSCAN_RANGE_HPP
+#define KOKKOS_OPENMPTARGET_PARALLELSCAN_RANGE_HPP
+
+#include <omp.h>
+#include <sstream>
+#include <Kokkos_Parallel.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+template <class FunctorType, class... Traits>
+class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
+                   Kokkos::Experimental::OpenMPTarget> {
+ protected:
+  using Policy = Kokkos::RangePolicy<Traits...>;
+
+  using WorkTag  = typename Policy::work_tag;
+  using Member   = typename Policy::member_type;
+  using idx_type = typename Policy::index_type;
+
+  using Analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
+                                         Policy, FunctorType, void>;
+
+  using value_type     = typename Analysis::value_type;
+  using pointer_type   = typename Analysis::pointer_type;
+  using reference_type = typename Analysis::reference_type;
+
+  const CombinedFunctorReducer<FunctorType, typename Analysis::Reducer>
+      m_functor_reducer;
+  const Policy m_policy;
+
+  value_type* m_result_ptr;
+  const bool m_result_ptr_device_accessible;
+
+  template <class TagType>
+  std::enable_if_t<std::is_void<TagType>::value> call_with_tag(
+      const FunctorType& f, const idx_type& idx, value_type& val,
+      const bool& is_final) const {
+    f(idx, val, is_final);
+  }
+  template <class TagType>
+  std::enable_if_t<!std::is_void<TagType>::value> call_with_tag(
+      const FunctorType& f, const idx_type& idx, value_type& val,
+      const bool& is_final) const {
+    f(WorkTag(), idx, val, is_final);
+  }
+
+ public:
+  void impl_execute(
+      Kokkos::View<value_type**, Kokkos::LayoutRight,
+                   Kokkos::Experimental::OpenMPTargetSpace>
+          element_values,
+      Kokkos::View<value_type*, Kokkos::Experimental::OpenMPTargetSpace>
+          chunk_values,
+      Kokkos::View<int64_t, Kokkos::Experimental::OpenMPTargetSpace> count)
+      const {
+    const idx_type N          = m_policy.end() - m_policy.begin();
+    const idx_type chunk_size = 128;
+    const idx_type n_chunks   = (N + chunk_size - 1) / chunk_size;
+    idx_type nteams           = n_chunks > 512 ? 512 : n_chunks;
+    idx_type team_size        = 128;
+
+    auto a_functor_reducer = m_functor_reducer;
+#pragma omp target teams distribute map(to \
+                                        : a_functor_reducer) num_teams(nteams)
+    for (idx_type team_id = 0; team_id < n_chunks; ++team_id) {
+      const typename Analysis::Reducer& reducer =
+          a_functor_reducer.get_reducer();
+#pragma omp parallel num_threads(team_size)
+      {
+        const idx_type local_offset = team_id * chunk_size;
+
+#pragma omp for
+        for (idx_type i = 0; i < chunk_size; ++i) {
+          const idx_type idx = local_offset + i;
+          value_type val;
+          reducer.init(&val);
+          if (idx < N)
+            call_with_tag<WorkTag>(a_functor_reducer.get_functor(), idx, val,
+                                   false);
+          element_values(team_id, i) = val;
+        }
+#pragma omp barrier
+        if (omp_get_thread_num() == 0) {
+          value_type sum;
+          reducer.init(&sum);
+          for (idx_type i = 0; i < chunk_size; ++i) {
+            reducer.join(&sum, &element_values(team_id, i));
+            element_values(team_id, i) = sum;
+          }
+          chunk_values(team_id) = sum;
+        }
+#pragma omp barrier
+        if (omp_get_thread_num() == 0) {
+          if (Kokkos::atomic_fetch_add(&count(), 1) == n_chunks - 1) {
+            value_type sum;
+            reducer.init(&sum);
+            for (idx_type i = 0; i < n_chunks; ++i) {
+              reducer.join(&sum, &chunk_values(i));
+              chunk_values(i) = sum;
+            }
+          }
+        }
+      }
+    }
+
+#pragma omp target teams distribute map(to                                     \
+                                        : a_functor_reducer) num_teams(nteams) \
+    thread_limit(team_size)
+    for (idx_type team_id = 0; team_id < n_chunks; ++team_id) {
+      const typename Analysis::Reducer& reducer =
+          a_functor_reducer.get_reducer();
+#pragma omp parallel num_threads(team_size)
+      {
+        const idx_type local_offset = team_id * chunk_size;
+        value_type offset_value;
+        if (team_id > 0)
+          offset_value = chunk_values(team_id - 1);
+        else
+          reducer.init(&offset_value);
+
+#pragma omp for
+        for (idx_type i = 0; i < chunk_size; ++i) {
+          const idx_type idx = local_offset + i;
+          value_type local_offset_value;
+          if (i > 0) {
+            local_offset_value = element_values(team_id, i - 1);
+            // FIXME_OPENMPTARGET We seem to access memory illegaly on AMD GPUs
+#if defined(KOKKOS_ARCH_AMD_GPU) && !defined(KOKKOS_ARCH_AMD_GFX1030) && \
+    !defined(KOKKOS_ARCH_AMD_GFX1100)
+            if constexpr (Analysis::Reducer::has_join_member_function()) {
+              if constexpr (std::is_void_v<WorkTag>)
+                a_functor_reducer.get_functor().join(local_offset_value,
+                                                     offset_value);
+              else
+                a_functor_reducer.get_functor().join(
+                    WorkTag{}, local_offset_value, offset_value);
+            } else
+              local_offset_value += offset_value;
+#else
+            reducer.join(&local_offset_value, &offset_value);
+#endif
+          } else
+            local_offset_value = offset_value;
+          if (idx < N)
+            call_with_tag<WorkTag>(a_functor_reducer.get_functor(), idx,
+                                   local_offset_value, true);
+          if (idx == N - 1 && m_result_ptr_device_accessible)
+            *m_result_ptr = local_offset_value;
+        }
+      }
+    }
+  }
+
+  void execute() const {
+    OpenMPTargetExec::verify_is_process(
+        "Kokkos::Experimental::OpenMPTarget parallel_for");
+    OpenMPTargetExec::verify_initialized(
+        "Kokkos::Experimental::OpenMPTarget parallel_for");
+    const idx_type N          = m_policy.end() - m_policy.begin();
+    const idx_type chunk_size = 128;
+    const idx_type n_chunks   = (N + chunk_size - 1) / chunk_size;
+
+    // This could be scratch memory per team
+    Kokkos::View<value_type**, Kokkos::LayoutRight,
+                 Kokkos::Experimental::OpenMPTargetSpace>
+        element_values("element_values", n_chunks, chunk_size);
+    Kokkos::View<value_type*, Kokkos::Experimental::OpenMPTargetSpace>
+        chunk_values("chunk_values", n_chunks);
+    Kokkos::View<int64_t, Kokkos::Experimental::OpenMPTargetSpace> count(
+        "Count");
+
+    impl_execute(element_values, chunk_values, count);
+  }
+
+  //----------------------------------------
+
+  ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy,
+               pointer_type arg_result_ptr           = nullptr,
+               bool arg_result_ptr_device_accessible = false)
+      : m_functor_reducer(arg_functor, typename Analysis::Reducer{arg_functor}),
+        m_policy(arg_policy),
+        m_result_ptr(arg_result_ptr),
+        m_result_ptr_device_accessible(arg_result_ptr_device_accessible) {}
+
+  //----------------------------------------
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+template <class FunctorType, class ReturnType, class... Traits>
+class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
+                            ReturnType, Kokkos::Experimental::OpenMPTarget>
+    : public ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
+                          Kokkos::Experimental::OpenMPTarget> {
+  using base_t     = ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
+                              Kokkos::Experimental::OpenMPTarget>;
+  using value_type = typename base_t::value_type;
+
+ public:
+  void execute() const {
+    OpenMPTargetExec::verify_is_process(
+        "Kokkos::Experimental::OpenMPTarget parallel_for");
+    OpenMPTargetExec::verify_initialized(
+        "Kokkos::Experimental::OpenMPTarget parallel_for");
+    const int64_t N        = base_t::m_policy.end() - base_t::m_policy.begin();
+    const int chunk_size   = 128;
+    const int64_t n_chunks = (N + chunk_size - 1) / chunk_size;
+
+    if (N > 0) {
+      // This could be scratch memory per team
+      Kokkos::View<value_type**, Kokkos::LayoutRight,
+                   Kokkos::Experimental::OpenMPTargetSpace>
+          element_values("element_values", n_chunks, chunk_size);
+      Kokkos::View<value_type*, Kokkos::Experimental::OpenMPTargetSpace>
+          chunk_values("chunk_values", n_chunks);
+      Kokkos::View<int64_t, Kokkos::Experimental::OpenMPTargetSpace> count(
+          "Count");
+
+      base_t::impl_execute(element_values, chunk_values, count);
+
+      if (!base_t::m_result_ptr_device_accessible) {
+        const int size = base_t::m_functor_reducer.get_reducer().value_size();
+        DeepCopy<HostSpace, Kokkos::Experimental::OpenMPTargetSpace>(
+            base_t::m_result_ptr, chunk_values.data() + (n_chunks - 1), size);
+      }
+    } else if (!base_t::m_result_ptr_device_accessible) {
+      base_t::m_functor_reducer.get_reducer().init(base_t::m_result_ptr);
+    }
+  }
+
+  template <class ViewType>
+  ParallelScanWithTotal(const FunctorType& arg_functor,
+                        const typename base_t::Policy& arg_policy,
+                        const ViewType& arg_result_view)
+      : base_t(arg_functor, arg_policy, arg_result_view.data(),
+               MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
+                                 typename ViewType::memory_space>::accessible) {
+  }
+};
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ad0b3c4e7f607ef14b0ef68d190ffd3f6186a257
--- /dev/null
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp
@@ -0,0 +1,166 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_OPENMPTARGET_PARALLELSCAN_TEAM_HPP
+#define KOKKOS_OPENMPTARGET_PARALLELSCAN_TEAM_HPP
+
+#include <omp.h>
+#include <sstream>
+#include <Kokkos_Parallel.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp>
+
+// FIXME_OPENMPTARGET - Using this macro to implement a workaround for
+// hierarchical scan. It avoids hitting the code path which we wanted to
+// write but doesn't work. undef'ed at the end.
+#ifndef KOKKOS_ARCH_INTEL_GPU
+#define KOKKOS_IMPL_TEAM_SCAN_WORKAROUND
+#endif
+
+namespace Kokkos {
+
+// This is largely the same code as in HIP and CUDA except for the member name
+template <typename iType, class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION void parallel_scan(
+    const Impl::TeamThreadRangeBoundariesStruct<
+        iType, Impl::OpenMPTargetExecTeamMember>& loop_bounds,
+    const FunctorType& lambda, ValueType& return_val) {
+  using Analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
+                                         TeamPolicy<Experimental::OpenMPTarget>,
+                                         FunctorType, void>;
+  using analysis_value_type = typename Analysis::value_type;
+  static_assert(std::is_same_v<analysis_value_type, ValueType>,
+                "Non-matching value types of functor and return type");
+
+  const auto start = loop_bounds.start;
+  const auto end   = loop_bounds.end;
+  //   Note this thing is called .member in the CUDA specialization of
+  //   TeamThreadRangeBoundariesStruct
+  auto& member         = loop_bounds.team;
+  const auto team_rank = member.team_rank();
+
+#if defined(KOKKOS_IMPL_TEAM_SCAN_WORKAROUND)
+  ValueType scan_val = {};
+
+  if (team_rank == 0) {
+    for (iType i = start; i < end; ++i) {
+      lambda(i, scan_val, true);
+    }
+  }
+  member.team_broadcast(scan_val, 0);
+  return_val = scan_val;
+
+#pragma omp barrier
+#else
+  const auto team_size = member.team_size();
+  const auto nchunk    = (end - start + team_size - 1) / team_size;
+  ValueType accum      = {};
+  // each team has to process one or
+  //      more chunks of the prefix scan
+  for (iType i = 0; i < nchunk; ++i) {
+    auto ii = start + i * team_size + team_rank;
+    // local accumulation for this chunk
+    ValueType local_accum = {};
+    // user updates value with prefix value
+    if (ii < loop_bounds.end) lambda(ii, local_accum, false);
+    // perform team scan
+    local_accum = member.team_scan(local_accum);
+    // add this blocks accum to total accumulation
+    auto val = accum + local_accum;
+    // user updates their data with total accumulation
+    if (ii < loop_bounds.end) lambda(ii, val, true);
+    // the last value needs to be propogated to next chunk
+    if (team_rank == team_size - 1) accum = val;
+    // broadcast last value to rest of the team
+    member.team_broadcast(accum, team_size - 1);
+  }
+  return_val = accum;
+
+#endif
+}
+
+template <typename iType, class FunctorType>
+KOKKOS_INLINE_FUNCTION void parallel_scan(
+    const Impl::TeamThreadRangeBoundariesStruct<
+        iType, Impl::OpenMPTargetExecTeamMember>& loop_bounds,
+    const FunctorType& lambda) {
+  using Analysis   = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
+                                         TeamPolicy<Experimental::OpenMPTarget>,
+                                         FunctorType, void>;
+  using value_type = typename Analysis::value_type;
+  value_type scan_val;
+  parallel_scan(loop_bounds, lambda, scan_val);
+}
+}  // namespace Kokkos
+
+namespace Kokkos {
+
+/** \brief  Intra-thread vector parallel exclusive prefix sum. Executes
+ * lambda(iType i, ValueType & val, bool final) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan
+ * operation is performed. Depending on the target execution space the operator
+ * might be called twice: once with final=false and once with final=true. When
+ * final==true val contains the prefix sum value. The contribution of this "i"
+ * needs to be added to val no matter whether final==true or not. In a serial
+ * execution (i.e. team_size==1) the operator is only called once with
+ * final==true. Scan_val will be set to the final sum value over all vector
+ * lanes.
+ */
+template <typename iType, class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION void parallel_scan(
+    const Impl::ThreadVectorRangeBoundariesStruct<
+        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+    const FunctorType& lambda, ValueType& return_val) {
+  using Analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
+                                         TeamPolicy<Experimental::OpenMPTarget>,
+                                         FunctorType, void>;
+  using analysis_value_type = typename Analysis::value_type;
+  static_assert(std::is_same_v<analysis_value_type, ValueType>,
+                "Non-matching value types of functor and return type");
+
+  ValueType scan_val = {};
+
+#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end; ++i) {
+    lambda(i, scan_val, true);
+  }
+
+  return_val = scan_val;
+}
+
+template <typename iType, class FunctorType>
+KOKKOS_INLINE_FUNCTION void parallel_scan(
+    const Impl::ThreadVectorRangeBoundariesStruct<
+        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+    const FunctorType& lambda) {
+  using Analysis   = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
+                                         TeamPolicy<Experimental::OpenMPTarget>,
+                                         FunctorType, void>;
+  using value_type = typename Analysis::value_type;
+
+  value_type scan_val = value_type();
+  parallel_scan(loop_boundaries, lambda, scan_val);
+}
+
+}  // namespace Kokkos
+
+#ifdef KOKKOS_IMPL_TEAM_SCAN_WORKAROUND
+#undef KOKKOS_IMPL_TEAM_SCAN_WORKAROUND
+#endif
+
+#endif
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..fb75f05f270104f6764a39977106562b7cdd226a
--- /dev/null
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp
@@ -0,0 +1,706 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_OPENMPTARGET_PARALLEL_COMMON_HPP
+#define KOKKOS_OPENMPTARGET_PARALLEL_COMMON_HPP
+
+#include <omp.h>
+#include <sstream>
+#include <Kokkos_Parallel.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+// This class has the memcpy routine that is commonly used by ParallelReduce
+// over RangePolicy and TeamPolicy.
+template <class PointerType>
+struct ParallelReduceCopy {
+  // Copy the result back to device if the view is on the device.
+  static void memcpy_result(PointerType dest, PointerType src, size_t size,
+                            bool ptr_on_device) {
+    if (ptr_on_device) {
+      if (0 < size) {
+        KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(dest, src, size, 0, 0,
+                                                     omp_get_default_device(),
+                                                     omp_get_initial_device()));
+      }
+
+    } else {
+      *dest = *src;
+    }
+  }
+};
+
+// template <class FunctorType, class PolicyType, class ReducerType,
+// class PointerType, class ValueType>
+template <class FunctorType, class ReducerType, class PointerType,
+          class ValueType, class PolicyType>
+struct ParallelReduceSpecialize {
+  inline static void execute(const FunctorType& /*f*/, const PolicyType& /*p*/,
+                             PointerType /*result_ptr*/) {
+    constexpr int FunctorHasJoin =
+        Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, PolicyType,
+                              FunctorType,
+                              ValueType>::Reducer::has_join_member_function();
+    constexpr int UseReducerType = is_reducer_v<ReducerType>;
+
+    std::stringstream error_message;
+    error_message << "Error: Invalid Specialization " << FunctorHasJoin << ' '
+                  << UseReducerType << '\n';
+    // FIXME_OPENMPTARGET
+    OpenMPTarget_abort(error_message.str().c_str());
+  }
+};
+
+template <class FunctorType, class ReducerType, class PointerType,
+          class ValueType, class... PolicyArgs>
+struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>,
+                                ReducerType, PointerType, ValueType> {
+  using PolicyType = Kokkos::RangePolicy<PolicyArgs...>;
+  using TagType    = typename PolicyType::work_tag;
+  using ReducerTypeFwd =
+      std::conditional_t<std::is_same<InvalidType, ReducerType>::value,
+                         FunctorType, ReducerType>;
+  using Analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
+                                         PolicyType, ReducerTypeFwd, ValueType>;
+  using ReferenceType = typename Analysis::reference_type;
+
+  using ParReduceCopy = ParallelReduceCopy<PointerType>;
+
+  static void execute_reducer(const FunctorType& f, const PolicyType& p,
+                              PointerType result_ptr, bool ptr_on_device) {
+    OpenMPTargetExec::verify_is_process(
+        "Kokkos::Experimental::OpenMPTarget RangePolicy "
+        "parallel_reduce:reducer");
+    OpenMPTargetExec::verify_initialized(
+        "Kokkos::Experimental::OpenMPTarget RangePolicy "
+        "parallel_reduce:reducer");
+    const auto begin = p.begin();
+    const auto end   = p.end();
+
+    ValueType result;
+    OpenMPTargetReducerWrapper<ReducerType>::init(result);
+
+    // Initialize and copy back the result even if it is a zero length
+    // reduction.
+    if (end <= begin) {
+      ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType),
+                                   ptr_on_device);
+      return;
+    }
+
+#pragma omp declare reduction(                                         \
+    custom:ValueType                                                   \
+    : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+
+#pragma omp target teams distribute parallel for map(to                    \
+                                                     : f) reduction(custom \
+                                                                    : result)
+    for (auto i = begin; i < end; ++i) {
+      if constexpr (std::is_void_v<TagType>) {
+        f(i, result);
+      } else {
+        f(TagType(), i, result);
+      }
+    }
+
+    ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType),
+                                 ptr_on_device);
+  }
+
+  template <class TagType, int NumReductions>
+  static void execute_array(const FunctorType& f, const PolicyType& p,
+                            PointerType result_ptr, bool ptr_on_device) {
+    OpenMPTargetExec::verify_is_process(
+        "Kokkos::Experimental::OpenMPTarget RangePolicy "
+        "parallel_reduce:array_reduction");
+    OpenMPTargetExec::verify_initialized(
+        "Kokkos::Experimental::OpenMPTarget RangePolicy "
+        "parallel_reduce:array_reduction");
+    const auto begin = p.begin();
+    const auto end   = p.end();
+
+    // Enter the loop if the reduction is on a scalar type.
+    if constexpr (NumReductions == 1) {
+      ValueType result = ValueType();
+
+      // Initialize and copy back the result even if it is a zero length
+      // reduction.
+      if (end <= begin) {
+        ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType),
+                                     ptr_on_device);
+        return;
+      }
+
+      // Case where reduction is on a native data type.
+      if constexpr (std::is_arithmetic<ValueType>::value) {
+#pragma omp target teams distribute parallel for \
+         map(to:f) reduction(+: result)
+        for (auto i = begin; i < end; ++i)
+
+          if constexpr (std::is_void_v<TagType>) {
+            f(i, result);
+          } else {
+            f(TagType(), i, result);
+          }
+      } else {
+#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
+#pragma omp target teams distribute parallel for map(to                    \
+                                                     : f) reduction(custom \
+                                                                    : result)
+        for (auto i = begin; i < end; ++i)
+
+          if constexpr (std::is_void_v<TagType>) {
+            f(i, result);
+          } else {
+            f(TagType(), i, result);
+          }
+      }
+
+      ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType),
+                                   ptr_on_device);
+    } else {
+      ValueType result[NumReductions] = {};
+
+      // Initialize and copy back the result even if it is a zero length
+      // reduction.
+      if (end <= begin) {
+        ParReduceCopy::memcpy_result(result_ptr, result,
+                                     NumReductions * sizeof(ValueType),
+                                     ptr_on_device);
+        return;
+      }
+#pragma omp target teams distribute parallel for map(to:f) reduction(+:result[:NumReductions])
+      for (auto i = begin; i < end; ++i) {
+        if constexpr (std::is_void_v<TagType>) {
+          f(i, result);
+        } else {
+          f(TagType(), i, result);
+        }
+      }
+
+      ParReduceCopy::memcpy_result(
+          result_ptr, result, NumReductions * sizeof(ValueType), ptr_on_device);
+    }
+  }
+
+  static void execute_init_join(const FunctorType& f, const PolicyType& p,
+                                PointerType ptr, const bool ptr_on_device) {
+    OpenMPTargetExec::verify_is_process(
+        "Kokkos::Experimental::OpenMPTarget RangePolicy "
+        "parallel_reduce:init_join");
+    OpenMPTargetExec::verify_initialized(
+        "Kokkos::Experimental::OpenMPTarget RangePolicy "
+        "parallel_reduce:init_join");
+    const auto begin = p.begin();
+    const auto end   = p.end();
+
+    using FunctorAnalysis =
+        Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, PolicyType,
+                              FunctorType, ValueType>;
+
+    // Initialize the result pointer.
+
+    const auto size = end - begin;
+
+    // FIXME_OPENMPTARGET: The team size and MAX_ACTIVE_THREADS are currently
+    // based on NVIDIA-V100 and should be modifid to be based on the
+    // architecture in the future.
+    const int max_team_threads = 32;
+    const int max_teams =
+        OpenMPTargetExec::MAX_ACTIVE_THREADS / max_team_threads;
+    // Number of elements in the reduction
+    const auto value_count = FunctorAnalysis::value_count(f);
+
+    // Allocate scratch per active thread. Achieved by setting the first
+    // parameter of `resize_scratch=1`.
+    OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType),
+                                     std::numeric_limits<int64_t>::max());
+    ValueType* scratch_ptr =
+        static_cast<ValueType*>(OpenMPTargetExec::get_scratch_ptr());
+
+    typename FunctorAnalysis::Reducer final_reducer(f);
+
+    if (end <= begin) {
+#pragma omp target map(to : final_reducer) is_device_ptr(scratch_ptr)
+      {
+        // If there is no work to be done, copy back the initialized values and
+        // exit.
+        final_reducer.init(scratch_ptr);
+        final_reducer.final(scratch_ptr);
+      }
+      if (0 < value_count) {
+        if (!ptr_on_device)
+          KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(
+              ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+              omp_get_initial_device(), omp_get_default_device()));
+        else
+          KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(
+              ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+              omp_get_default_device(), omp_get_default_device()));
+      }
+
+      return;
+    }
+
+#pragma omp target teams num_teams(max_teams) thread_limit(max_team_threads) \
+    map(to                                                                   \
+        : final_reducer) is_device_ptr(scratch_ptr)
+    {
+#pragma omp parallel
+      {
+        const int team_num    = omp_get_team_num();
+        const int num_teams   = omp_get_num_teams();
+        const auto chunk_size = size / num_teams;
+        const auto team_begin = begin + team_num * chunk_size;
+        const auto team_end =
+            (team_num == num_teams - 1) ? end : (team_begin + chunk_size);
+        ValueType* team_scratch =
+            scratch_ptr + team_num * max_team_threads * value_count;
+        ReferenceType result = final_reducer.init(
+            &team_scratch[omp_get_thread_num() * value_count]);
+
+        // Accumulate partial results in thread specific storage.
+#pragma omp for simd
+        for (auto i = team_begin; i < team_end; ++i) {
+          if constexpr (std::is_void_v<TagType>) {
+            f(i, result);
+          } else {
+            f(TagType(), i, result);
+          }
+        }
+
+        // Reduce all paritial results within a team.
+        const int team_size      = max_team_threads;
+        int tree_neighbor_offset = 1;
+        do {
+#pragma omp for simd
+          for (int i = 0; i < team_size - tree_neighbor_offset;
+               i += 2 * tree_neighbor_offset) {
+            const int neighbor = i + tree_neighbor_offset;
+            final_reducer.join(&team_scratch[i * value_count],
+                               &team_scratch[neighbor * value_count]);
+          }
+          tree_neighbor_offset *= 2;
+        } while (tree_neighbor_offset < team_size);
+      }  // end parallel
+    }    // end target
+
+    int tree_neighbor_offset = 1;
+    do {
+#pragma omp target teams distribute parallel for simd map(to   \
+                                                          : f) \
+    is_device_ptr(scratch_ptr)
+      for (int i = 0; i < max_teams - tree_neighbor_offset;
+           i += 2 * tree_neighbor_offset) {
+        ValueType* team_scratch = scratch_ptr;
+        const int team_offset   = max_team_threads * value_count;
+        final_reducer.join(
+            &team_scratch[i * team_offset],
+            &team_scratch[(i + tree_neighbor_offset) * team_offset]);
+
+        // If `final` is provided by the functor.
+        // Do the final only once at the end.
+        if (tree_neighbor_offset * 2 >= max_teams && omp_get_team_num() == 0 &&
+            omp_get_thread_num() == 0) {
+          final_reducer.final(scratch_ptr);
+        }
+      }
+      tree_neighbor_offset *= 2;
+    } while (tree_neighbor_offset < max_teams);
+
+    // If the result view is on the host, copy back the values via memcpy.
+    if (0 < value_count) {
+      if (!ptr_on_device)
+        KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(
+            ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+            omp_get_initial_device(), omp_get_default_device()));
+      else
+        KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(
+            ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+            omp_get_default_device(), omp_get_default_device()));
+    }
+  }
+};
+
+template <class FunctorType, class ReducerType, class PointerType,
+          class ValueType, class... PolicyArgs>
+struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
+                                ReducerType, PointerType, ValueType> {
+  using PolicyType = TeamPolicyInternal<PolicyArgs...>;
+  using TagType    = typename PolicyType::work_tag;
+  using ReducerTypeFwd =
+      std::conditional_t<std::is_same<InvalidType, ReducerType>::value,
+                         FunctorType, ReducerType>;
+  using Analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
+                                         PolicyType, ReducerTypeFwd, ValueType>;
+
+  using ReferenceType = typename Analysis::reference_type;
+
+  using ParReduceCopy = ParallelReduceCopy<PointerType>;
+
+  static void execute_reducer(const FunctorType& f, const PolicyType& p,
+                              PointerType result_ptr, bool ptr_on_device) {
+    OpenMPTargetExec::verify_is_process(
+        "Kokkos::Experimental::OpenMPTarget TeamPolicy "
+        "parallel_reduce:reducer");
+    OpenMPTargetExec::verify_initialized(
+        "Kokkos::Experimental::OpenMPTarget TeamPolicy "
+        "parallel_reduce:reducer");
+
+    const int league_size   = p.league_size();
+    const int team_size     = p.team_size();
+    const int vector_length = p.impl_vector_length();
+
+    const size_t shmem_size_L0 = p.scratch_size(0, team_size);
+    const size_t shmem_size_L1 = p.scratch_size(1, team_size);
+    OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE,
+                                     shmem_size_L0, shmem_size_L1, league_size);
+    void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
+
+    ValueType result = ValueType();
+
+    // Maximum active teams possible.
+    // FIXME_OPENMPTARGET: Cray compiler did not yet implement
+    // omp_get_max_teams.
+#if !defined(KOKKOS_COMPILER_CRAY_LLVM)
+    int max_active_teams = omp_get_max_teams();
+#else
+    int max_active_teams =
+        std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size);
+#endif
+
+    // If the league size is <=0, do not launch the kernel.
+    if (max_active_teams <= 0) return;
+
+#pragma omp declare reduction(                                         \
+    custom:ValueType                                                   \
+    : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+
+#if !defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU)
+#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) \
+    firstprivate(f) is_device_ptr(scratch_ptr) reduction(custom              \
+                                                         : result)
+#pragma omp parallel reduction(custom : result)
+    {
+      if (omp_get_num_teams() > max_active_teams)
+        Kokkos::abort("`omp_set_num_teams` call was not respected.\n");
+
+      const int blockIdx = omp_get_team_num();
+      const int gridDim  = omp_get_num_teams();
+
+      // Guarantee that the compilers respect the `num_teams` clause
+      for (int league_id = blockIdx; league_id < league_size;
+           league_id += gridDim) {
+        typename PolicyType::member_type team(
+            league_id, league_size, team_size, vector_length, scratch_ptr,
+            blockIdx, shmem_size_L0, shmem_size_L1);
+        if constexpr (std::is_void_v<TagType>)
+          f(team, result);
+        else
+          f(TagType(), team, result);
+      }
+    }
+#else
+#pragma omp target teams distribute firstprivate(f) is_device_ptr(scratch_ptr) \
+    num_teams(max_active_teams) thread_limit(team_size) reduction(custom       \
+                                                                  : result)
+    for (int i = 0; i < league_size; i++) {
+#pragma omp parallel reduction(custom : result)
+      {
+        if (omp_get_num_teams() > max_active_teams)
+          Kokkos::abort("`omp_set_num_teams` call was not respected.\n");
+
+        typename PolicyType::member_type team(i, league_size, team_size,
+                                              vector_length, scratch_ptr, i,
+                                              shmem_size_L0, shmem_size_L1);
+        if constexpr (std::is_void_v<TagType>)
+          f(team, result);
+        else
+          f(TagType(), team, result);
+      }
+    }
+#endif
+
+    // Copy results back to device if `parallel_reduce` is on a device view.
+    ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType),
+                                 ptr_on_device);
+  }
+
+  template <int NumReductions>
+  static void execute_array(const FunctorType& f, const PolicyType& p,
+                            PointerType result_ptr, bool ptr_on_device) {
+    OpenMPTargetExec::verify_is_process(
+        "Kokkos::Experimental::OpenMPTarget TeamPolicy "
+        "parallel_reduce:array_reduction");
+    OpenMPTargetExec::verify_initialized(
+        "Kokkos::Experimental::OpenMPTarget TeamPolicy "
+        "parallel_reduce:array_reduction");
+
+    const int league_size   = p.league_size();
+    const int team_size     = p.team_size();
+    const int vector_length = p.impl_vector_length();
+
+    const size_t shmem_size_L0 = p.scratch_size(0, team_size);
+    const size_t shmem_size_L1 = p.scratch_size(1, team_size);
+    OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE,
+                                     shmem_size_L0, shmem_size_L1, league_size);
+    void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
+
+    // Maximum active teams possible.
+    // FIXME_OPENMPTARGET: Cray compiler did not yet implement
+    // omp_get_max_teams.
+#if !defined(KOKKOS_COMPILER_CRAY_LLVM)
+    int max_active_teams = omp_get_max_teams();
+#else
+    int max_active_teams =
+        std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size);
+#endif
+
+    // If the league size is <=0, do not launch the kernel.
+    if (max_active_teams <= 0) return;
+
+    // Case where the number of reduction items is 1.
+    if constexpr (NumReductions == 1) {
+      ValueType result = ValueType();
+
+      // Case where reduction is on a native data type.
+      if constexpr (std::is_arithmetic<ValueType>::value) {
+#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) map(to   \
+                                                                       : f) \
+    is_device_ptr(scratch_ptr) reduction(+: result)
+#pragma omp parallel reduction(+ : result)
+        {
+          if (omp_get_num_teams() > max_active_teams)
+            Kokkos::abort("`omp_set_num_teams` call was not respected.\n");
+
+          const int blockIdx = omp_get_team_num();
+          const int gridDim  = omp_get_num_teams();
+
+          // Guarantee that the compilers respect the `num_teams` clause
+          for (int league_id = blockIdx; league_id < league_size;
+               league_id += gridDim) {
+            typename PolicyType::member_type team(
+                league_id, league_size, team_size, vector_length, scratch_ptr,
+                blockIdx, shmem_size_L0, shmem_size_L1);
+            if constexpr (std::is_void_v<TagType>)
+              f(team, result);
+            else
+              f(TagType(), team, result);
+          }
+        }
+      } else {
+        // Case where the reduction is on a non-native data type.
+#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
+#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) \
+    map(to                                                                   \
+        : f) is_device_ptr(scratch_ptr) reduction(custom                     \
+                                                  : result)
+#pragma omp parallel reduction(custom : result)
+        {
+          if (omp_get_num_teams() > max_active_teams)
+            Kokkos::abort("`omp_set_num_teams` call was not respected.\n");
+
+          const int blockIdx = omp_get_team_num();
+          const int gridDim  = omp_get_num_teams();
+
+          // Guarantee that the compilers respect the `num_teams` clause
+          for (int league_id = blockIdx; league_id < league_size;
+               league_id += gridDim) {
+            typename PolicyType::member_type team(
+                league_id, league_size, team_size, vector_length, scratch_ptr,
+                blockIdx, shmem_size_L0, shmem_size_L1);
+            if constexpr (std::is_void_v<TagType>)
+              f(team, result);
+            else
+              f(TagType(), team, result);
+          }
+        }
+      }
+
+      // Copy results back to device if `parallel_reduce` is on a device view.
+      ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType),
+                                   ptr_on_device);
+    } else {
+      ValueType result[NumReductions] = {};
+      // Case where the reduction is on an array.
+#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) map(to   \
+                                                                       : f) \
+    is_device_ptr(scratch_ptr) reduction(+ : result[:NumReductions])
+#pragma omp parallel reduction(+ : result[:NumReductions])
+      {
+        if (omp_get_num_teams() > max_active_teams)
+          Kokkos::abort("`omp_set_num_teams` call was not respected.\n");
+
+        const int blockIdx = omp_get_team_num();
+        const int gridDim  = omp_get_num_teams();
+
+        // Guarantee that the compilers respect the `num_teams` clause
+        for (int league_id = blockIdx; league_id < league_size;
+             league_id += gridDim) {
+          typename PolicyType::member_type team(
+              league_id, league_size, team_size, vector_length, scratch_ptr,
+              blockIdx, shmem_size_L0, shmem_size_L1);
+          if constexpr (std::is_void_v<TagType>)
+            f(team, result);
+          else
+            f(TagType(), team, result);
+        }
+      }
+
+      // Copy results back to device if `parallel_reduce` is on a device view.
+      ParReduceCopy::memcpy_result(
+          result_ptr, result, NumReductions * sizeof(ValueType), ptr_on_device);
+    }
+  }
+
+  // FIXME_OPENMPTARGET : This routine is a copy from `parallel_reduce` over
+  // RangePolicy. Need a new implementation.
+  static void execute_init_join(const FunctorType& f, const PolicyType& p,
+                                PointerType ptr, const bool ptr_on_device) {
+    OpenMPTargetExec::verify_is_process(
+        "Kokkos::Experimental::OpenMPTarget TeamPolicy "
+        "parallel_reduce:init_join ");
+    OpenMPTargetExec::verify_initialized(
+        "Kokkos::Experimental::OpenMPTarget TeamPolicy "
+        "parallel_reduce:init_join");
+    using FunctorAnalysis =
+        Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, PolicyType,
+                              FunctorType, ValueType>;
+
+    const int league_size   = p.league_size();
+    const int team_size     = p.team_size();
+    const int vector_length = p.impl_vector_length();
+
+    auto begin = 0;
+    auto end   = league_size * team_size + team_size * vector_length;
+
+    const size_t shmem_size_L0 = p.scratch_size(0, team_size);
+    const size_t shmem_size_L1 = p.scratch_size(1, team_size);
+
+    // FIXME_OPENMPTARGET: This would oversubscribe scratch memory since we are
+    // already using the available scratch memory to create temporaries for each
+    // thread.
+    if ((shmem_size_L0 + shmem_size_L1) > 0) {
+      Kokkos::abort(
+          "OpenMPTarget: Scratch memory is not supported in `parallel_reduce` "
+          "over functors with init/join.");
+    }
+
+    const auto nteams = league_size;
+
+    // Number of elements in the reduction
+    const auto value_count = FunctorAnalysis::value_count(f);
+
+    // Allocate scratch per active thread.
+    OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType),
+                                     league_size);
+    void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
+    typename FunctorAnalysis::Reducer final_reducer(f);
+
+    if (end <= begin) {
+// If there is no work to be done, copy back the initialized values and
+// exit.
+#pragma omp target map(to : final_reducer) is_device_ptr(scratch_ptr)
+      {
+        final_reducer.init(scratch_ptr);
+        final_reducer.final(scratch_ptr);
+      }
+
+      if (0 < value_count) {
+        if (!ptr_on_device)
+          KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(
+              ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+              omp_get_initial_device(), omp_get_default_device()));
+        else
+          KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(
+              ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+              omp_get_default_device(), omp_get_default_device()));
+      }
+
+      return;
+    }
+
+#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to   \
+                                                                       : f) \
+    is_device_ptr(scratch_ptr)
+    {
+#pragma omp parallel
+      {
+        const int team_num      = omp_get_team_num();
+        const int num_teams     = omp_get_num_teams();
+        ValueType* team_scratch = static_cast<ValueType*>(scratch_ptr) +
+                                  team_num * team_size * value_count;
+        ReferenceType result = final_reducer.init(&team_scratch[0]);
+
+        for (int league_id = team_num; league_id < league_size;
+             league_id += num_teams) {
+          typename PolicyType::member_type team(
+              league_id, league_size, team_size, vector_length, scratch_ptr,
+              team_num, shmem_size_L0, shmem_size_L1);
+          if constexpr (std::is_void_v<TagType>) {
+            f(team, result);
+          } else {
+            f(TagType(), team, result);
+          }
+        }
+      }  // end parallel
+    }    // end target
+
+    int tree_neighbor_offset = 1;
+    do {
+#pragma omp target teams distribute parallel for simd map(to               \
+                                                          : final_reducer) \
+    is_device_ptr(scratch_ptr)
+      for (int i = 0; i < nteams - tree_neighbor_offset;
+           i += 2 * tree_neighbor_offset) {
+        ValueType* team_scratch = static_cast<ValueType*>(scratch_ptr);
+        const int team_offset   = team_size * value_count;
+        final_reducer.join(
+            &team_scratch[i * team_offset],
+            &team_scratch[(i + tree_neighbor_offset) * team_offset]);
+
+        // If `final` is provided by the functor.
+        // Do the final only once at the end.
+        if (tree_neighbor_offset * 2 >= nteams && omp_get_team_num() == 0 &&
+            omp_get_thread_num() == 0) {
+          final_reducer.final(scratch_ptr);
+        }
+      }
+      tree_neighbor_offset *= 2;
+    } while (tree_neighbor_offset < nteams);
+
+    // If the result view is on the host, copy back the values via memcpy.
+    if (0 < value_count) {
+      if (!ptr_on_device)
+        KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(
+            ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+            omp_get_initial_device(), omp_get_default_device()));
+      else
+        KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(
+            ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+            omp_get_default_device(), omp_get_default_device()));
+    }
+  }
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp
index 21bdb67e34f5432e315d0450fa28ff5188ca543f..41e62ce6e6b32e155f8fe886fdfd9132afe6247c 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp
@@ -19,7 +19,8 @@
 
 #include <omp.h>
 #include <Kokkos_Parallel.hpp>
-#include <OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp>
 
 // WORKAROUND OPENMPTARGET: sometimes tile sizes don't make it correctly,
 // this was tracked down to a bug in clang with regards of mapping structs
@@ -410,69 +411,49 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
 namespace Kokkos {
 namespace Impl {
 
-template <class FunctorType, class ReducerType, class... Traits>
-class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
+template <class CombinedFunctorReducerType, class... Traits>
+class ParallelReduce<CombinedFunctorReducerType,
+                     Kokkos::MDRangePolicy<Traits...>,
                      Kokkos::Experimental::OpenMPTarget> {
  private:
-  using Policy = Kokkos::MDRangePolicy<Traits...>;
+  using Policy      = Kokkos::MDRangePolicy<Traits...>;
+  using FunctorType = typename CombinedFunctorReducerType::functor_type;
+  using ReducerType = typename CombinedFunctorReducerType::reducer_type;
 
   using WorkTag = typename Policy::work_tag;
   using Member  = typename Policy::member_type;
   using Index   = typename Policy::index_type;
 
-  using ReducerConditional =
-      std::conditional<std::is_same<InvalidType, ReducerType>::value,
-                       FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using Analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
-                                         Policy, ReducerTypeFwd>;
+  using pointer_type   = typename ReducerType::pointer_type;
+  using reference_type = typename ReducerType::reference_type;
 
-  using pointer_type   = typename Analysis::pointer_type;
-  using reference_type = typename Analysis::reference_type;
-
-  static constexpr bool UseReducer = is_reducer<ReducerType>::value;
+  static constexpr bool UseReducer =
+      !std::is_same_v<FunctorType, typename ReducerType::functor_type>;
 
   const pointer_type m_result_ptr;
-  const FunctorType m_functor;
+  const CombinedFunctorReducerType m_functor_reducer;
   const Policy m_policy;
-  const ReducerType m_reducer;
 
-  using ParReduceCommon = ParallelReduceCommon<pointer_type>;
+  using ParReduceCopy = ParallelReduceCopy<pointer_type>;
 
   bool m_result_ptr_on_device;
 
  public:
   inline void execute() const {
-    execute_tile<Policy::rank, typename Analysis::value_type>(
-        m_functor, m_policy, m_result_ptr);
+    execute_tile<Policy::rank, typename ReducerType::value_type>(
+        m_functor_reducer.get_functor(), m_policy, m_result_ptr);
   }
 
   template <class ViewType>
-  inline ParallelReduce(
-      const FunctorType& arg_functor, Policy arg_policy,
-      const ViewType& arg_result_view,
-      std::enable_if_t<Kokkos::is_view<ViewType>::value &&
-                           !Kokkos::is_reducer<ReducerType>::value,
-                       void*> = NULL)
+  inline ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer,
+                        Policy arg_policy, const ViewType& arg_result_view)
       : m_result_ptr(arg_result_view.data()),
-        m_functor(arg_functor),
+        m_functor_reducer(arg_functor_reducer),
         m_policy(arg_policy),
-        m_reducer(InvalidType()),
         m_result_ptr_on_device(
             MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
                               typename ViewType::memory_space>::accessible) {}
 
-  inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy,
-                        const ReducerType& reducer)
-      : m_result_ptr(reducer.view().data()),
-        m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(reducer),
-        m_result_ptr_on_device(
-            MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
-                              typename ReducerType::result_view_type::
-                                  memory_space>::accessible) {}
-
   template <int Rank, class ValueType>
   inline std::enable_if_t<Rank == 2> execute_tile(const FunctorType& functor,
                                                   const Policy& policy,
@@ -518,8 +499,8 @@ reduction(+:result)
       }
     }
 
-    ParReduceCommon::memcpy_result(ptr, &result, sizeof(ValueType),
-                                   m_result_ptr_on_device);
+    ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType),
+                                 m_result_ptr_on_device);
   }
 
   template <int Rank, class ValueType>
@@ -539,10 +520,13 @@ reduction(+:result)
     // FIXME_OPENMPTARGET: Unable to separate directives and their companion
     // loops which leads to code duplication for different reduction types.
     if constexpr (UseReducer) {
-#pragma omp declare reduction(                                         \
-    custom:ValueType                                                   \
-    : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
-    initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+#pragma omp declare reduction(                                                 \
+    custom:ValueType                                                           \
+    : OpenMPTargetReducerWrapper <typename ReducerType::functor_type>::join(   \
+        omp_out, omp_in))                                                      \
+    initializer(                                                               \
+        OpenMPTargetReducerWrapper <typename ReducerType::functor_type>::init( \
+            omp_priv))
 
 #pragma omp target teams distribute parallel for collapse(3) map(to         \
                                                                  : functor) \
@@ -573,8 +557,8 @@ reduction(+:result)
       }
     }
 
-    ParReduceCommon::memcpy_result(ptr, &result, sizeof(ValueType),
-                                   m_result_ptr_on_device);
+    ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType),
+                                 m_result_ptr_on_device);
   }
 
   template <int Rank, class ValueType>
@@ -636,8 +620,8 @@ reduction(+:result)
       }
     }
 
-    ParReduceCommon::memcpy_result(ptr, &result, sizeof(ValueType),
-                                   m_result_ptr_on_device);
+    ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType),
+                                 m_result_ptr_on_device);
   }
 
   template <int Rank, class ValueType>
@@ -707,8 +691,8 @@ reduction(+:result)
       }
     }
 
-    ParReduceCommon::memcpy_result(ptr, &result, sizeof(ValueType),
-                                   m_result_ptr_on_device);
+    ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType),
+                                 m_result_ptr_on_device);
   }
 
   template <int Rank, class ValueType>
@@ -784,8 +768,8 @@ reduction(+:result)
       }
     }
 
-    ParReduceCommon::memcpy_result(ptr, &result, sizeof(ValueType),
-                                   m_result_ptr_on_device);
+    ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType),
+                                 m_result_ptr_on_device);
   }
 
   template <typename Policy, typename Functor>
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..672271ed6b909a6f05b9397b3b3a600e9000b895
--- /dev/null
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp
@@ -0,0 +1,694 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_OPENMPTARGETREDUCER_HPP
+#define KOKKOS_OPENMPTARGETREDUCER_HPP
+
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Spinwait.hpp>
+
+#include <Kokkos_Atomic.hpp>
+#include "Kokkos_OpenMPTarget_Abort.hpp"
+
+namespace Kokkos {
+namespace Impl {
+
+template <class Reducer>
+struct OpenMPTargetReducerWrapper {
+  using value_type = typename Reducer::value_type;
+
+  // Using a generic unknown Reducer for the OpenMPTarget backend is not
+  // implemented.
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type&, const value_type&) = delete;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type&, const volatile value_type&) = delete;
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type&) = delete;
+};
+
+template <class Scalar, class Space>
+struct OpenMPTargetReducerWrapper<Sum<Scalar, Space>> {
+ public:
+  // Required
+  using value_type = std::remove_cv_t<Scalar>;
+
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) { dest += src; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    dest += src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val = reduction_identity<value_type>::sum();
+  }
+};
+
+template <class Scalar, class Space>
+struct OpenMPTargetReducerWrapper<Prod<Scalar, Space>> {
+ public:
+  // Required
+  using value_type = std::remove_cv_t<Scalar>;
+
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) { dest *= src; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    dest *= src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val = reduction_identity<value_type>::prod();
+  }
+};
+
+template <class Scalar, class Space>
+struct OpenMPTargetReducerWrapper<Min<Scalar, Space>> {
+ public:
+  // Required
+  using value_type = std::remove_cv_t<Scalar>;
+
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    if (src < dest) dest = src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    if (src < dest) dest = src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val = reduction_identity<value_type>::min();
+  }
+};
+
+template <class Scalar, class Space>
+struct OpenMPTargetReducerWrapper<Max<Scalar, Space>> {
+ public:
+  // Required
+  using value_type = std::remove_cv_t<Scalar>;
+
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    if (src > dest) dest = src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    if (src > dest) dest = src;
+  }
+
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val = reduction_identity<value_type>::max();
+  }
+};
+
+template <class Scalar, class Space>
+struct OpenMPTargetReducerWrapper<LAnd<Scalar, Space>> {
+ public:
+  // Required
+  using value_type = std::remove_cv_t<Scalar>;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    dest = dest && src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    dest = dest && src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val = reduction_identity<value_type>::land();
+  }
+};
+
+template <class Scalar, class Space>
+struct OpenMPTargetReducerWrapper<LOr<Scalar, Space>> {
+ public:
+  // Required
+  using value_type = std::remove_cv_t<Scalar>;
+
+  using result_view_type = Kokkos::View<value_type, Space>;
+
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    dest = dest || src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    dest = dest || src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val = reduction_identity<value_type>::lor();
+  }
+};
+
+template <class Scalar, class Space>
+struct OpenMPTargetReducerWrapper<BAnd<Scalar, Space>> {
+ public:
+  // Required
+  using value_type = std::remove_cv_t<Scalar>;
+
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    dest = dest & src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    dest = dest & src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val = reduction_identity<value_type>::band();
+  }
+};
+
+template <class Scalar, class Space>
+struct OpenMPTargetReducerWrapper<BOr<Scalar, Space>> {
+ public:
+  // Required
+  using value_type = std::remove_cv_t<Scalar>;
+
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    dest = dest | src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    dest = dest | src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val = reduction_identity<value_type>::bor();
+  }
+};
+
+template <class Scalar, class Index, class Space>
+struct OpenMPTargetReducerWrapper<MinLoc<Scalar, Index, Space>> {
+ private:
+  using scalar_type = std::remove_cv_t<Scalar>;
+  using index_type  = std::remove_cv_t<Index>;
+
+ public:
+  // Required
+  using value_type = ValLocScalar<scalar_type, index_type>;
+
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    if (src.val < dest.val) dest = src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    if (src.val < dest.val) dest = src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val.val = reduction_identity<scalar_type>::min();
+    val.loc = reduction_identity<index_type>::min();
+  }
+};
+
+template <class Scalar, class Index, class Space>
+struct OpenMPTargetReducerWrapper<MaxLoc<Scalar, Index, Space>> {
+ private:
+  using scalar_type = std::remove_cv_t<Scalar>;
+  using index_type  = std::remove_cv_t<Index>;
+
+ public:
+  // Required
+  using value_type = ValLocScalar<scalar_type, index_type>;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    if (src.val > dest.val) dest = src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    if (src.val > dest.val) dest = src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val.val = reduction_identity<scalar_type>::max();
+    val.loc = reduction_identity<index_type>::min();
+  }
+};
+
+template <class Scalar, class Space>
+struct OpenMPTargetReducerWrapper<MinMax<Scalar, Space>> {
+ private:
+  using scalar_type = std::remove_cv_t<Scalar>;
+
+ public:
+  // Required
+  using value_type = MinMaxScalar<scalar_type>;
+
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    if (src.min_val < dest.min_val) {
+      dest.min_val = src.min_val;
+    }
+    if (src.max_val > dest.max_val) {
+      dest.max_val = src.max_val;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    if (src.min_val < dest.min_val) {
+      dest.min_val = src.min_val;
+    }
+    if (src.max_val > dest.max_val) {
+      dest.max_val = src.max_val;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val.max_val = reduction_identity<scalar_type>::max();
+    val.min_val = reduction_identity<scalar_type>::min();
+  }
+};
+
+template <class Scalar, class Index, class Space>
+struct OpenMPTargetReducerWrapper<MinMaxLoc<Scalar, Index, Space>> {
+ private:
+  using scalar_type = std::remove_cv_t<Scalar>;
+  using index_type  = std::remove_cv_t<Index>;
+
+ public:
+  // Required
+  using value_type = MinMaxLocScalar<scalar_type, index_type>;
+
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    if (src.min_val < dest.min_val) {
+      dest.min_val = src.min_val;
+      dest.min_loc = src.min_loc;
+    }
+    if (src.max_val > dest.max_val) {
+      dest.max_val = src.max_val;
+      dest.max_loc = src.max_loc;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    if (src.min_val < dest.min_val) {
+      dest.min_val = src.min_val;
+      dest.min_loc = src.min_loc;
+    }
+    if (src.max_val > dest.max_val) {
+      dest.max_val = src.max_val;
+      dest.max_loc = src.max_loc;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val.max_val = reduction_identity<scalar_type>::max();
+    val.min_val = reduction_identity<scalar_type>::min();
+    val.max_loc = reduction_identity<index_type>::min();
+    val.min_loc = reduction_identity<index_type>::min();
+  }
+};
+
+//
+// specialize for MaxFirstLoc
+//
+template <class Scalar, class Index, class Space>
+struct OpenMPTargetReducerWrapper<MaxFirstLoc<Scalar, Index, Space>> {
+ private:
+  using scalar_type = std::remove_cv_t<Scalar>;
+  using index_type  = std::remove_cv_t<Index>;
+
+ public:
+  // Required
+  using value_type = ValLocScalar<scalar_type, index_type>;
+
+// WORKAROUND OPENMPTARGET
+// This pragma omp declare target should not be necessary, but Intel compiler
+// fails without it
+#pragma omp declare target
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    if (dest.val < src.val) {
+      dest = src;
+    } else if (!(src.val < dest.val)) {
+      dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    if (dest.val < src.val) {
+      dest = src;
+    } else if (!(src.val < dest.val)) {
+      dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val.val = reduction_identity<scalar_type>::max();
+    val.loc = reduction_identity<index_type>::min();
+  }
+#pragma omp end declare target
+};
+
+//
+// specialize for MinFirstLoc
+//
+template <class Scalar, class Index, class Space>
+struct OpenMPTargetReducerWrapper<MinFirstLoc<Scalar, Index, Space>> {
+ private:
+  using scalar_type = std::remove_cv_t<Scalar>;
+  using index_type  = std::remove_cv_t<Index>;
+
+ public:
+  // Required
+  using value_type = ValLocScalar<scalar_type, index_type>;
+
+// WORKAROUND OPENMPTARGET
+// This pragma omp declare target should not be necessary, but Intel compiler
+// fails without it
+#pragma omp declare target
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    if (src.val < dest.val) {
+      dest = src;
+    } else if (!(dest.val < src.val)) {
+      dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    if (src.val < dest.val) {
+      dest = src;
+    } else if (!(dest.val < src.val)) {
+      dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val.val = reduction_identity<scalar_type>::min();
+    val.loc = reduction_identity<index_type>::min();
+  }
+#pragma omp end declare target
+};
+
+//
+// specialize for MinMaxFirstLastLoc
+//
+template <class Scalar, class Index, class Space>
+struct OpenMPTargetReducerWrapper<MinMaxFirstLastLoc<Scalar, Index, Space>> {
+ private:
+  using scalar_type = std::remove_cv_t<Scalar>;
+  using index_type  = std::remove_cv_t<Index>;
+
+ public:
+  // Required
+  using value_type = MinMaxLocScalar<scalar_type, index_type>;
+
+// WORKAROUND OPENMPTARGET
+// This pragma omp declare target should not be necessary, but Intel compiler
+// fails without it
+#pragma omp declare target
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    if (src.min_val < dest.min_val) {
+      dest.min_val = src.min_val;
+      dest.min_loc = src.min_loc;
+    } else if (!(dest.min_val < src.min_val)) {
+      dest.min_loc = (src.min_loc < dest.min_loc) ? src.min_loc : dest.min_loc;
+    }
+
+    if (dest.max_val < src.max_val) {
+      dest.max_val = src.max_val;
+      dest.max_loc = src.max_loc;
+    } else if (!(src.max_val < dest.max_val)) {
+      dest.max_loc = (src.max_loc > dest.max_loc) ? src.max_loc : dest.max_loc;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    if (src.min_val < dest.min_val) {
+      dest.min_val = src.min_val;
+      dest.min_loc = src.min_loc;
+    } else if (!(dest.min_val < src.min_val)) {
+      dest.min_loc = (src.min_loc < dest.min_loc) ? src.min_loc : dest.min_loc;
+    }
+
+    if (dest.max_val < src.max_val) {
+      dest.max_val = src.max_val;
+      dest.max_loc = src.max_loc;
+    } else if (!(src.max_val < dest.max_val)) {
+      dest.max_loc = (src.max_loc > dest.max_loc) ? src.max_loc : dest.max_loc;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val.max_val = reduction_identity<scalar_type>::max();
+    val.min_val = reduction_identity<scalar_type>::min();
+    val.max_loc = reduction_identity<index_type>::max();
+    val.min_loc = reduction_identity<index_type>::min();
+  }
+#pragma omp end declare target
+};
+
+//
+// specialize for FirstLoc
+//
+template <class Index, class Space>
+struct OpenMPTargetReducerWrapper<FirstLoc<Index, Space>> {
+ private:
+  using index_type = std::remove_cv_t<Index>;
+
+ public:
+  // Required
+  using value_type = FirstLocScalar<index_type>;
+
+// WORKAROUND OPENMPTARGET
+// This pragma omp declare target should not be necessary, but Intel compiler
+// fails without it
+#pragma omp declare target
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    dest.min_loc_true = (src.min_loc_true < dest.min_loc_true)
+                            ? src.min_loc_true
+                            : dest.min_loc_true;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    dest.min_loc_true = (src.min_loc_true < dest.min_loc_true)
+                            ? src.min_loc_true
+                            : dest.min_loc_true;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val.min_loc_true = reduction_identity<index_type>::min();
+  }
+#pragma omp end declare target
+};
+
+//
+// specialize for LastLoc
+//
+template <class Index, class Space>
+struct OpenMPTargetReducerWrapper<LastLoc<Index, Space>> {
+ private:
+  using index_type = std::remove_cv_t<Index>;
+
+ public:
+  // Required
+  using value_type = LastLocScalar<index_type>;
+
+// WORKAROUND OPENMPTARGET
+// This pragma omp declare target should not be necessary, but Intel compiler
+// fails without it
+#pragma omp declare target
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    dest.max_loc_true = (src.max_loc_true > dest.max_loc_true)
+                            ? src.max_loc_true
+                            : dest.max_loc_true;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    dest.max_loc_true = (src.max_loc_true > dest.max_loc_true)
+                            ? src.max_loc_true
+                            : dest.max_loc_true;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val.max_loc_true = reduction_identity<index_type>::max();
+  }
+#pragma omp end declare target
+};
+
+//
+// specialize for StdIsPartitioned
+//
+template <class Index, class Space>
+struct OpenMPTargetReducerWrapper<StdIsPartitioned<Index, Space>> {
+ private:
+  using index_type = std::remove_cv_t<Index>;
+
+ public:
+  // Required
+  using value_type = StdIsPartScalar<index_type>;
+
+// WORKAROUND OPENMPTARGET
+// This pragma omp declare target should not be necessary, but Intel compiler
+// fails without it
+#pragma omp declare target
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    dest.max_loc_true = (dest.max_loc_true < src.max_loc_true)
+                            ? src.max_loc_true
+                            : dest.max_loc_true;
+
+    dest.min_loc_false = (dest.min_loc_false < src.min_loc_false)
+                             ? dest.min_loc_false
+                             : src.min_loc_false;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    dest.max_loc_true = (dest.max_loc_true < src.max_loc_true)
+                            ? src.max_loc_true
+                            : dest.max_loc_true;
+
+    dest.min_loc_false = (dest.min_loc_false < src.min_loc_false)
+                             ? dest.min_loc_false
+                             : src.min_loc_false;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val.max_loc_true  = ::Kokkos::reduction_identity<index_type>::max();
+    val.min_loc_false = ::Kokkos::reduction_identity<index_type>::min();
+  }
+#pragma omp end declare target
+};
+
+//
+// specialize for StdPartitionPoint
+//
+template <class Index, class Space>
+struct OpenMPTargetReducerWrapper<StdPartitionPoint<Index, Space>> {
+ private:
+  using index_type = std::remove_cv_t<Index>;
+
+ public:
+  // Required
+  using value_type = StdPartPointScalar<index_type>;
+
+// WORKAROUND OPENMPTARGET
+// This pragma omp declare target should not be necessary, but Intel compiler
+// fails without it
+#pragma omp declare target
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    dest.min_loc_false = (dest.min_loc_false < src.min_loc_false)
+                             ? dest.min_loc_false
+                             : src.min_loc_false;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    dest.min_loc_false = (dest.min_loc_false < src.min_loc_false)
+                             ? dest.min_loc_false
+                             : src.min_loc_false;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val.min_loc_false = ::Kokkos::reduction_identity<index_type>::min();
+  }
+#pragma omp end declare target
+};
+
+/*
+template<class ReducerType>
+class OpenMPTargetReducerWrapper {
+  public:
+    const ReducerType& reducer;
+    using value_type = typename ReducerType::value_type;
+    value_type& value;
+
+    KOKKOS_INLINE_FUNCTION
+    void join(const value_type& upd) {
+      reducer.join(value,upd);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void init(const value_type& upd) {
+      reducer.init(value,upd);
+    }
+};*/
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp
index c7f146871bd5ab1518cb28e2d4000feaf71698a6..d9ea555055a35e02c3727144b91bba0d04f82564 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp
@@ -20,7 +20,7 @@
 #include <Kokkos_Macros.hpp>
 #ifdef KOKKOS_ENABLE_OPENMPTARGET
 
-#include <Kokkos_OpenMPTargetSpace.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp>
 #include <Kokkos_UniqueToken.hpp>
 #include <impl/Kokkos_SharedAlloc.hpp>
 #include <impl/Kokkos_ConcurrentBitset.hpp>
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
index e38b011c89b5987707a61aa3b2f7709260bdd25d..7fa935f693a431f1cfc116cddd5208f289ca5840 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
@@ -20,9 +20,8 @@
 
 #include <Kokkos_Concepts.hpp>
 #include <SYCL/Kokkos_SYCL_Instance.hpp>
-#include <Kokkos_SYCL.hpp>
+#include <SYCL/Kokkos_SYCL.hpp>
 #include <Kokkos_HostSpace.hpp>
-#include <Kokkos_Serial.hpp>
 #include <Kokkos_Core.hpp>
 #include <impl/Kokkos_Error.hpp>
 #include <impl/Kokkos_DeviceManagement.hpp>
@@ -60,6 +59,13 @@ SYCL::SYCL(const sycl::queue& stream)
         ptr->finalize();
         delete ptr;
       }) {
+  // In principle could be guarded with
+  // #ifdef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
+  // but we chose to require user-provided queues to be in-order
+  // unconditionally so that code downstream does not break
+  // when the backend setting changes.
+  if (!stream.is_in_order())
+    Kokkos::abort("User provided sycl::queues must be in-order!");
   Impl::SYCLInternal::singleton().verify_is_initialized(
       "SYCL instance constructor");
   m_space_instance->initialize(stream);
@@ -88,6 +94,18 @@ void SYCL::print_configuration(std::ostream& os, bool verbose) const {
   os << "\nRuntime Configuration:\n";
 
   os << "macro  KOKKOS_ENABLE_SYCL : defined\n";
+#ifdef KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED
+  os << "macro  KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED : defined\n";
+#else
+  os << "macro  KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED : undefined\n";
+#endif
+
+#ifdef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
+  os << "macro  KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES : defined\n";
+#else
+  os << "macro  KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES : undefined\n";
+#endif
+
   if (verbose)
     SYCL::impl_sycl_info(os, m_space_instance->m_queue->get_device());
 }
@@ -124,10 +142,7 @@ void SYCL::impl_initialize(InitializationSettings const& settings) {
   // If the device id is not specified and there are no GPUs, sidestep Kokkos
   // device selection and use whatever is available (if no GPU architecture is
   // specified).
-#if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_ARCH_KEPLER) && \
-    !defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL) &&   \
-    !defined(KOKKOS_ARCH_VOLTA) && !defined(KOKKOS_ARCH_TURING75) &&   \
-    !defined(KOKKOS_ARCH_AMPERE) && !defined(KOKKOS_ARCH_HOPPER)
+#if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU)
   if (!settings.has_device_id() && gpu_devices.empty()) {
     Impl::SYCLInternal::singleton().initialize(sycl::device());
     Impl::SYCLInternal::m_syclDev = 0;
@@ -144,7 +159,6 @@ std::ostream& SYCL::impl_sycl_info(std::ostream& os,
   using namespace sycl::info;
   return os << "Name: " << device.get_info<device::name>()
             << "\nDriver Version: " << device.get_info<device::driver_version>()
-            << "\nIs Host: " << device.is_host()
             << "\nIs CPU: " << device.is_cpu()
             << "\nIs GPU: " << device.is_gpu()
             << "\nIs Accelerator: " << device.is_accelerator()
@@ -184,7 +198,6 @@ std::ostream& SYCL::impl_sycl_info(std::ostream& os,
             << "\nNative Vector Width Half: "
             << device.get_info<device::native_vector_width_half>()
             << "\nAddress Bits: " << device.get_info<device::address_bits>()
-            << "\nImage Support: " << device.get_info<device::image_support>()
             << "\nMax Mem Alloc Size: "
             << device.get_info<device::max_mem_alloc_size>()
             << "\nMax Read Image Args: "
@@ -217,26 +230,11 @@ std::ostream& SYCL::impl_sycl_info(std::ostream& os,
             << "\nLocal Mem Size: " << device.get_info<device::local_mem_size>()
             << "\nError Correction Support: "
             << device.get_info<device::error_correction_support>()
-            << "\nHost Unified Memory: "
-            << device.get_info<device::host_unified_memory>()
             << "\nProfiling Timer Resolution: "
             << device.get_info<device::profiling_timer_resolution>()
-            << "\nIs Endian Little: "
-            << device.get_info<device::is_endian_little>()
             << "\nIs Available: " << device.get_info<device::is_available>()
-            << "\nIs Compiler Available: "
-            << device.get_info<device::is_compiler_available>()
-            << "\nIs Linker Available: "
-            << device.get_info<device::is_linker_available>()
-            << "\nQueue Profiling: "
-            << device.get_info<device::queue_profiling>()
             << "\nVendor: " << device.get_info<device::vendor>()
-            << "\nProfile: " << device.get_info<device::profile>()
             << "\nVersion: " << device.get_info<device::version>()
-            << "\nPrintf Buffer Size: "
-            << device.get_info<device::printf_buffer_size>()
-            << "\nPreferred Interop User Sync: "
-            << device.get_info<device::preferred_interop_user_sync>()
             << "\nPartition Max Sub Devices: "
             << device.get_info<device::partition_max_sub_devices>()
             << "\nReference Count: "
diff --git a/packages/kokkos/core/src/Kokkos_SYCL.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.hpp
similarity index 93%
rename from packages/kokkos/core/src/Kokkos_SYCL.hpp
rename to packages/kokkos/core/src/SYCL/Kokkos_SYCL.hpp
index 0f8e744eb632a2701c6cc53702e4425285e468b3..be6b4b8930283aaa0e6ac67cb4b873af9bfae17d 100644
--- a/packages/kokkos/core/src/Kokkos_SYCL.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.hpp
@@ -31,7 +31,7 @@ static_assert(false,
 #else
 #include <CL/sycl.hpp>
 #endif
-#include <Kokkos_SYCL_Space.hpp>
+#include <SYCL/Kokkos_SYCL_Space.hpp>
 #include <Kokkos_Layout.hpp>
 #include <Kokkos_ScratchSpace.hpp>
 #include <impl/Kokkos_Profiling_Interface.hpp>
@@ -163,13 +163,14 @@ std::vector<SYCL> partition_space(const SYCL& sycl_space, Args...) {
   std::vector<SYCL> instances;
   instances.reserve(sizeof...(Args));
   for (unsigned int i = 0; i < sizeof...(Args); ++i)
-    instances.emplace_back(sycl::queue(context, device));
+    instances.emplace_back(
+        sycl::queue(context, device, sycl::property::queue::in_order()));
   return instances;
 }
 
 template <class T>
 std::vector<SYCL> partition_space(const SYCL& sycl_space,
-                                  std::vector<T>& weights) {
+                                  std::vector<T> const& weights) {
   static_assert(
       std::is_arithmetic<T>::value,
       "Kokkos Error: partitioning arguments must be integers or floats");
@@ -178,9 +179,13 @@ std::vector<SYCL> partition_space(const SYCL& sycl_space,
   sycl::device device =
       sycl_space.impl_internal_space_instance()->m_queue->get_device();
   std::vector<SYCL> instances;
+
+  // We only care about the number of instances to create and ignore weights
+  // otherwise.
   instances.reserve(weights.size());
   for (unsigned int i = 0; i < weights.size(); ++i)
-    instances.emplace_back(sycl::queue(context, device));
+    instances.emplace_back(
+        sycl::queue(context, device, sycl::property::queue::in_order()));
   return instances;
 }
 }  // namespace Experimental
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Abort.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Abort.hpp
index 9e6d9fd7e23ff779b03fd5786ac4b15986abd0dd..4b0a142fe6cde92be990964a05e645f4d8108f35 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Abort.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Abort.hpp
@@ -17,7 +17,7 @@
 #ifndef KOKKOS_SYCL_ABORT_HPP
 #define KOKKOS_SYCL_ABORT_HPP
 
-#include <Kokkos_Macros.hpp>
+#include <Kokkos_Printf.hpp>
 #if defined(KOKKOS_ENABLE_SYCL)
 // FIXME_SYCL
 #if __has_include(<sycl/sycl.hpp>)
@@ -31,7 +31,7 @@ namespace Impl {
 
 inline void sycl_abort(char const* msg) {
 #ifdef NDEBUG
-  KOKKOS_IMPL_DO_NOT_USE_PRINTF("Aborting with message %s.\n", msg);
+  Kokkos::printf("Aborting with message %s.\n", msg);
 #else
   // Choosing "" here causes problems but a single whitespace character works.
   const char* empty = " ";
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp
index 62b7977fcc0325410ea876b95a6f55cbfe3109e7..afc7eebd38817c0f25fcc64b1201aa9ce28ea58d 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp
@@ -18,7 +18,7 @@
 #define KOKKOS_SYCLDEEPCOPY_HPP
 
 #include <Kokkos_Core_fwd.hpp>
-#include <Kokkos_SYCL.hpp>
+#include <SYCL/Kokkos_SYCL.hpp>
 
 #include <vector>
 
@@ -27,26 +27,6 @@
 namespace Kokkos {
 namespace Impl {
 
-template <class DT, class... DP>
-struct ZeroMemset<Kokkos::Experimental::SYCL, DT, DP...> {
-  ZeroMemset(const Kokkos::Experimental::SYCL& exec_space,
-             const View<DT, DP...>& dst,
-             typename View<DT, DP...>::const_value_type&) {
-    auto event = exec_space.impl_internal_space_instance()->m_queue->memset(
-        dst.data(), 0,
-        dst.size() * sizeof(typename View<DT, DP...>::value_type));
-    exec_space.impl_internal_space_instance()
-        ->m_queue->ext_oneapi_submit_barrier(std::vector<sycl::event>{event});
-  }
-
-  ZeroMemset(const View<DT, DP...>& dst,
-             typename View<DT, DP...>::const_value_type&) {
-    Experimental::Impl::SYCLInternal::singleton().m_queue->memset(
-        dst.data(), 0,
-        dst.size() * sizeof(typename View<DT, DP...>::value_type));
-  }
-};
-
 void DeepCopySYCL(void* dst, const void* src, size_t n);
 void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst,
                        const void* src, size_t n);
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Half_Conversion.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Half_Conversion.hpp
index d7b0271e15d10893e8f0ed233c5a5033769fad54..14a39fc5dad392fb955ff99d029e35ccac42f2d2 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Half_Conversion.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Half_Conversion.hpp
@@ -53,55 +53,55 @@ half_t cast_to_half(unsigned long val) { return half_t::impl_type(val); }
 template <class T>
 KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, float>::value, T>
 cast_from_half(half_t val) {
-  return half_t::impl_type(val);
+  return static_cast<T>(half_t::impl_type(val));
 }
 template <class T>
 KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, double>::value, T>
 cast_from_half(half_t val) {
-  return half_t::impl_type(val);
+  return static_cast<T>(half_t::impl_type(val));
 }
 template <class T>
 KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, short>::value, T>
 cast_from_half(half_t val) {
-  return half_t::impl_type(val);
+  return static_cast<T>(half_t::impl_type(val));
 }
 template <class T>
 KOKKOS_INLINE_FUNCTION
     std::enable_if_t<std::is_same<T, unsigned short>::value, T>
     cast_from_half(half_t val) {
-  return half_t::impl_type(val);
+  return static_cast<T>(half_t::impl_type(val));
 }
 template <class T>
 KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, int>::value, T>
 cast_from_half(half_t val) {
-  return half_t::impl_type(val);
+  return static_cast<T>(half_t::impl_type(val));
 }
 template <class T>
 KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, unsigned int>::value, T>
 cast_from_half(half_t val) {
-  return half_t::impl_type(val);
+  return static_cast<T>(half_t::impl_type(val));
 }
 template <class T>
 KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, long long>::value, T>
 cast_from_half(half_t val) {
-  return half_t::impl_type(val);
+  return static_cast<T>(half_t::impl_type(val));
 }
 template <class T>
 KOKKOS_INLINE_FUNCTION
     std::enable_if_t<std::is_same<T, unsigned long long>::value, T>
     cast_from_half(half_t val) {
-  return half_t::impl_type(val);
+  return static_cast<T>(half_t::impl_type(val));
 }
 template <class T>
 KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, long>::value, T>
 cast_from_half(half_t val) {
-  return half_t::impl_type(val);
+  return static_cast<T>(half_t::impl_type(val));
 }
 template <class T>
 KOKKOS_INLINE_FUNCTION
     std::enable_if_t<std::is_same<T, unsigned long>::value, T>
     cast_from_half(half_t val) {
-  return half_t::impl_type(val);
+  return static_cast<T>(half_t::impl_type(val));
 }
 }  // namespace Experimental
 
@@ -128,4 +128,112 @@ struct reduction_identity<Kokkos::Experimental::half_t> {
 
 }  // namespace Kokkos
 #endif  // KOKKOS_IMPL_SYCL_HALF_TYPE_DEFINED
+
+#ifdef KOKKOS_IMPL_SYCL_BHALF_TYPE_DEFINED
+
+namespace Kokkos {
+namespace Experimental {
+
+/************************** bhalf conversions *********************************/
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(bhalf_t val) { return val; }
+
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(float val) { return bhalf_t::impl_type(val); }
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(double val) { return bhalf_t::impl_type(val); }
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(short val) { return bhalf_t::impl_type(val); }
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(unsigned short val) { return bhalf_t::impl_type(val); }
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(int val) { return bhalf_t::impl_type(val); }
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(unsigned int val) { return bhalf_t::impl_type(val); }
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(long long val) { return bhalf_t::impl_type(val); }
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(unsigned long long val) {
+  return bhalf_t::impl_type(val);
+}
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(long val) { return bhalf_t::impl_type(val); }
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(unsigned long val) { return bhalf_t::impl_type(val); }
+
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, float>::value, T>
+cast_from_bhalf(bhalf_t val) {
+  return static_cast<T>(bhalf_t::impl_type(val));
+}
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, double>::value, T>
+cast_from_bhalf(bhalf_t val) {
+  return static_cast<T>(bhalf_t::impl_type(val));
+}
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, short>::value, T>
+cast_from_bhalf(bhalf_t val) {
+  return static_cast<T>(bhalf_t::impl_type(val));
+}
+template <class T>
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<std::is_same<T, unsigned short>::value, T>
+    cast_from_bhalf(bhalf_t val) {
+  return static_cast<T>(bhalf_t::impl_type(val));
+}
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, int>::value, T>
+cast_from_bhalf(bhalf_t val) {
+  return static_cast<T>(bhalf_t::impl_type(val));
+}
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, unsigned int>::value, T>
+cast_from_bhalf(bhalf_t val) {
+  return static_cast<T>(bhalf_t::impl_type(val));
+}
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, long long>::value, T>
+cast_from_bhalf(bhalf_t val) {
+  return static_cast<T>(bhalf_t::impl_type(val));
+}
+template <class T>
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<std::is_same<T, unsigned long long>::value, T>
+    cast_from_bhalf(bhalf_t val) {
+  return static_cast<T>(bhalf_t::impl_type(val));
+}
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, long>::value, T>
+cast_from_bhalf(bhalf_t val) {
+  return static_cast<T>(bhalf_t::impl_type(val));
+}
+template <class T>
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<std::is_same<T, unsigned long>::value, T>
+    cast_from_bhalf(bhalf_t val) {
+  return static_cast<T>(bhalf_t::impl_type(val));
+}
+}  // namespace Experimental
+
+// sycl::bfloat16 doesn't have constexpr constructors so we return float
+template <>
+struct reduction_identity<Kokkos::Experimental::bhalf_t> {
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float sum() noexcept {
+    return 0.f;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float prod() noexcept {
+    return 1.0f;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float max() noexcept {
+    return -0x7f7f;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float min() noexcept {
+    return 0x7f7f;
+  }
+};
+
+}  // namespace Kokkos
+#endif  // KOKKOS_IMPL_SYCL_BHALF_TYPE_DEFINED
+
 #endif
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Half_Impl_Type.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Half_Impl_Type.hpp
index 8932c15883044c1c7341955490cb5c6dad70e2f7..1c0a0d94dfcc285728d11af31dec3adb60a6c067 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Half_Impl_Type.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Half_Impl_Type.hpp
@@ -18,7 +18,6 @@
 #define KOKKOS_SYCL_HALF_IMPL_TYPE_HPP_
 
 #include <Kokkos_Macros.hpp>
-#ifdef KOKKOS_ENABLE_SYCL
 
 // FIXME_SYCL
 #if __has_include(<sycl/sycl.hpp>)
@@ -27,18 +26,40 @@
 #include <CL/sycl.hpp>
 #endif
 
-#ifndef KOKKOS_IMPL_HALF_TYPE_DEFINED
 // Make sure no one else tries to define half_t
+#ifndef KOKKOS_IMPL_HALF_TYPE_DEFINED
 #define KOKKOS_IMPL_HALF_TYPE_DEFINED
 #define KOKKOS_IMPL_SYCL_HALF_TYPE_DEFINED
 
-namespace Kokkos {
-namespace Impl {
+namespace Kokkos::Impl {
 struct half_impl_t {
   using type = sycl::half;
 };
-}  // namespace Impl
-}  // namespace Kokkos
+}  // namespace Kokkos::Impl
 #endif  // KOKKOS_IMPL_HALF_TYPE_DEFINED
-#endif  // KOKKOS_ENABLE_SYCL
-#endif
+
+// Make sure no one else tries to define bhalf_t
+#ifndef KOKKOS_IMPL_BHALF_TYPE_DEFINED
+// FIXME_SYCL Evaluate when to drop the check
+#if __has_include(<sycl/ext/oneapi/bfloat16.hpp>)
+#define KOKKOS_IMPL_BHALF_TYPE_DEFINED
+#define KOKKOS_IMPL_SYCL_BHALF_TYPE_DEFINED
+namespace Kokkos::Impl {
+struct bhalf_impl_t {
+  using type = sycl::ext::oneapi::bfloat16;
+};
+}  // namespace Kokkos::Impl
+#elif defined(SYCL_EXT_ONEAPI_BFLOAT16) && defined(KOKKOS_ARCH_INTEL_GPU)
+// FIXME_SYCL bfloat16 is only supported for compute capability 8.0 or higher
+// on Nvidia GPUs but SYCL_EXT_ONEAPI_BFLOAT16 is defined even for lower compute
+// capability.
+#define KOKKOS_IMPL_BHALF_TYPE_DEFINED
+#define KOKKOS_IMPL_SYCL_BHALF_TYPE_DEFINED
+namespace Kokkos::Impl {
+struct bhalf_impl_t {
+  using type = sycl::ext::oneapi::experimental::bfloat16;
+};
+}  // namespace Kokkos::Impl
+#endif  // test for bfloat16 support
+#endif  // KOKKOS_IMPL_BHALF_TYPE_DEFINED
+#endif  // KOKKOS_SYCL_HALF_IMPL_TYPE_HPP_
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp
index 0e1738d6acbc7b0833418f10f3d6b1a0ea31f1e0..080369770d75d9b755e5fa34bab00b7e52aae755 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp
@@ -20,10 +20,25 @@
 
 #include <Kokkos_Core.hpp>  //kokkos_malloc
 
+#include <impl/Kokkos_CheckedIntegerOps.hpp>
+#include <impl/Kokkos_Error.hpp>
+
 namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
+namespace {
+
+// FIXME_SYCL Should be a multiple of the maximum subgroup size.
+static constexpr auto sizeScratchGrain =
+    sizeof(Kokkos::Experimental::SYCL::size_type[32]);
+
+std::size_t scratch_count(const std::size_t size) {
+  return (size + sizeScratchGrain - 1) / sizeScratchGrain;
+}
+
+}  // namespace
+
 std::vector<std::optional<sycl::queue>*> SYCLInternal::all_queues;
 std::mutex SYCLInternal::mutex;
 
@@ -75,79 +90,92 @@ void SYCLInternal::initialize(const sycl::device& d) {
       Kokkos::Impl::throw_runtime_exception(
           "There was an asynchronous SYCL error!\n");
   };
+#ifdef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
+  initialize(
+      sycl::queue{d, exception_handler, sycl::property::queue::in_order()});
+#else
   initialize(sycl::queue{d, exception_handler});
+#endif
 }
 
 // FIXME_SYCL
 void SYCLInternal::initialize(const sycl::queue& q) {
+  KOKKOS_EXPECTS(!is_initialized());
+
   if (was_finalized)
     Kokkos::abort("Calling SYCL::initialize after SYCL::finalize is illegal\n");
 
-  if (is_initialized()) return;
-
-  if (!HostSpace::execution_space::impl_is_initialized()) {
-    const std::string msg(
-        "SYCL::initialize ERROR : HostSpace::execution_space is not "
-        "initialized");
-    Kokkos::Impl::throw_runtime_exception(msg);
+  m_queue = q;
+  // guard pushing to all_queues
+  {
+    std::scoped_lock lock(mutex);
+    all_queues.push_back(&m_queue);
   }
+  const sycl::device& d = m_queue->get_device();
 
-  const bool ok_init = nullptr == m_scratchSpace || nullptr == m_scratchFlags;
-  const bool ok_dev  = true;
-  if (ok_init && ok_dev) {
-    m_queue = q;
-    // guard pushing to all_queues
-    {
-      std::scoped_lock lock(mutex);
-      all_queues.push_back(&m_queue);
-    }
-    const sycl::device& d = m_queue->get_device();
+  m_maxWorkgroupSize =
+      d.template get_info<sycl::info::device::max_work_group_size>();
+  // FIXME_SYCL this should give the correct value for NVIDIA GPUs
+  m_maxConcurrency =
+      m_maxWorkgroupSize * 2 *
+      d.template get_info<sycl::info::device::max_compute_units>();
 
-    m_maxWorkgroupSize =
-        d.template get_info<sycl::info::device::max_work_group_size>();
-    // FIXME_SYCL this should give the correct value for NVIDIA GPUs
-    m_maxConcurrency =
-        m_maxWorkgroupSize * 2 *
-        d.template get_info<sycl::info::device::max_compute_units>();
+  m_maxShmemPerBlock =
+      d.template get_info<sycl::info::device::local_mem_size>();
 
-    m_maxShmemPerBlock =
-        d.template get_info<sycl::info::device::local_mem_size>();
+  for (auto& usm_mem : m_indirectKernelMem) {
+    usm_mem.reset(*m_queue, m_instance_id);
+  }
 
-    for (auto& usm_mem : m_indirectKernelMem) {
-      usm_mem.reset(*m_queue, m_instance_id);
-    }
+#ifdef KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED
+  // Init the array for used for arbitrarily sized atomics
+  if (this == &singleton()) {
+    desul::Impl::init_lock_arrays();
+    desul::Impl::init_lock_arrays_sycl(*m_queue);
+  }
+#endif
+}
 
-  } else {
-    std::ostringstream msg;
-    msg << "Kokkos::Experimental::SYCL::initialize(...) FAILED";
+int SYCLInternal::acquire_team_scratch_space() {
+  // Grab the next scratch memory allocation. We must make sure that the last
+  // kernel using the allocation has completed, so we wait for the event that
+  // was registered with that kernel.
+  int current_team_scratch = desul::atomic_fetch_inc_mod(
+      &m_current_team_scratch, m_n_team_scratch - 1,
+      desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
 
-    if (!ok_init) {
-      msg << " : Already initialized";
-    }
-    Kokkos::Impl::throw_runtime_exception(msg.str());
-  }
+  m_team_scratch_event[current_team_scratch].wait_and_throw();
 
-  m_team_scratch_current_size = 0;
-  m_team_scratch_ptr          = nullptr;
+  return current_team_scratch;
 }
 
 sycl::device_ptr<void> SYCLInternal::resize_team_scratch_space(
-    std::int64_t bytes, bool force_shrink) {
-  if (m_team_scratch_current_size == 0) {
-    m_team_scratch_current_size = bytes;
-    m_team_scratch_ptr =
+    int scratch_pool_id, std::int64_t bytes, bool force_shrink) {
+  // Multiple ParallelFor/Reduce Teams can call this function at the same time
+  // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race
+  // condition.
+  if (m_team_scratch_current_size[scratch_pool_id] == 0) {
+    m_team_scratch_current_size[scratch_pool_id] = bytes;
+    m_team_scratch_ptr[scratch_pool_id] =
         Kokkos::kokkos_malloc<Experimental::SYCLDeviceUSMSpace>(
             "Kokkos::Experimental::SYCLDeviceUSMSpace::TeamScratchMemory",
-            m_team_scratch_current_size);
+            m_team_scratch_current_size[scratch_pool_id]);
   }
-  if ((bytes > m_team_scratch_current_size) ||
-      ((bytes < m_team_scratch_current_size) && (force_shrink))) {
-    m_team_scratch_current_size = bytes;
-    m_team_scratch_ptr =
+  if ((bytes > m_team_scratch_current_size[scratch_pool_id]) ||
+      ((bytes < m_team_scratch_current_size[scratch_pool_id]) &&
+       (force_shrink))) {
+    m_team_scratch_current_size[scratch_pool_id] = bytes;
+    m_team_scratch_ptr[scratch_pool_id] =
         Kokkos::kokkos_realloc<Experimental::SYCLDeviceUSMSpace>(
-            m_team_scratch_ptr, m_team_scratch_current_size);
+            m_team_scratch_ptr[scratch_pool_id],
+            m_team_scratch_current_size[scratch_pool_id]);
   }
-  return m_team_scratch_ptr;
+  return m_team_scratch_ptr[scratch_pool_id];
+}
+
+void SYCLInternal::register_team_scratch_event(int scratch_pool_id,
+                                               sycl::event event) {
+  m_team_scratch_event[scratch_pool_id] = event;
 }
 
 uint32_t SYCLInternal::impl_get_instance_id() const { return m_instance_id; }
@@ -160,7 +188,13 @@ void SYCLInternal::finalize() {
 
   // The global_unique_token_locks array is static and should only be
   // deallocated once by the defualt instance
-  if (this == &singleton()) Impl::sycl_global_unique_token_locks(true);
+  if (this == &singleton()) {
+    Impl::sycl_global_unique_token_locks(true);
+#ifdef KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED
+    desul::Impl::finalize_lock_arrays();
+    desul::Impl::finalize_lock_arrays_sycl(*m_queue);
+#endif
+  }
 
   using RecordSYCL = Kokkos::Impl::SharedAllocationRecord<SYCLDeviceUSMSpace>;
   if (nullptr != m_scratchSpace)
@@ -173,11 +207,14 @@ void SYCLInternal::finalize() {
   m_scratchFlagsCount = 0;
   m_scratchFlags      = nullptr;
 
-  if (m_team_scratch_current_size > 0)
-    Kokkos::kokkos_free<Kokkos::Experimental::SYCLDeviceUSMSpace>(
-        m_team_scratch_ptr);
-  m_team_scratch_current_size = 0;
-  m_team_scratch_ptr          = nullptr;
+  for (int i = 0; i < m_n_team_scratch; ++i) {
+    if (m_team_scratch_current_size[i] > 0) {
+      Kokkos::kokkos_free<Kokkos::Experimental::SYCLDeviceUSMSpace>(
+          m_team_scratch_ptr[i]);
+      m_team_scratch_current_size[i] = 0;
+      m_team_scratch_ptr[i]          = nullptr;
+    }
+  }
 
   for (auto& usm_mem : m_indirectKernelMem) usm_mem.reset();
   // guard erasing from all_queues
@@ -189,11 +226,9 @@ void SYCLInternal::finalize() {
 }
 
 sycl::device_ptr<void> SYCLInternal::scratch_space(const std::size_t size) {
-  const size_type sizeScratchGrain =
-      sizeof(Kokkos::Experimental::SYCL::size_type);
   if (verify_is_initialized("scratch_space") &&
-      m_scratchSpaceCount * sizeScratchGrain < size) {
-    m_scratchSpaceCount = (size + sizeScratchGrain - 1) / sizeScratchGrain;
+      m_scratchSpaceCount < scratch_count(size)) {
+    m_scratchSpaceCount = scratch_count(size);
 
     using Record = Kokkos::Impl::SharedAllocationRecord<
         Kokkos::Experimental::SYCLDeviceUSMSpace, void>;
@@ -201,10 +236,11 @@ sycl::device_ptr<void> SYCLInternal::scratch_space(const std::size_t size) {
     if (nullptr != m_scratchSpace)
       Record::decrement(Record::get_record(m_scratchSpace));
 
-    Record* const r =
-        Record::allocate(Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue),
-                         "Kokkos::Experimental::SYCL::InternalScratchSpace",
-                         (sizeScratchGrain * m_scratchSpaceCount));
+    std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort(
+        m_scratchSpaceCount, sizeScratchGrain);
+    Record* const r = Record::allocate(
+        Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue),
+        "Kokkos::Experimental::SYCL::InternalScratchSpace", alloc_size);
 
     Record::increment(r);
 
@@ -215,11 +251,9 @@ sycl::device_ptr<void> SYCLInternal::scratch_space(const std::size_t size) {
 }
 
 sycl::device_ptr<void> SYCLInternal::scratch_flags(const std::size_t size) {
-  const size_type sizeScratchGrain =
-      sizeof(Kokkos::Experimental::SYCL::size_type);
   if (verify_is_initialized("scratch_flags") &&
-      m_scratchFlagsCount * sizeScratchGrain < size) {
-    m_scratchFlagsCount = (size + sizeScratchGrain - 1) / sizeScratchGrain;
+      m_scratchFlagsCount < scratch_count(size)) {
+    m_scratchFlagsCount = scratch_count(size);
 
     using Record = Kokkos::Impl::SharedAllocationRecord<
         Kokkos::Experimental::SYCLDeviceUSMSpace, void>;
@@ -227,20 +261,21 @@ sycl::device_ptr<void> SYCLInternal::scratch_flags(const std::size_t size) {
     if (nullptr != m_scratchFlags)
       Record::decrement(Record::get_record(m_scratchFlags));
 
-    Record* const r =
-        Record::allocate(Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue),
-                         "Kokkos::Experimental::SYCL::InternalScratchFlags",
-                         (sizeScratchGrain * m_scratchFlagsCount));
+    std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort(
+        m_scratchFlagsCount, sizeScratchGrain);
+    Record* const r = Record::allocate(
+        Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue),
+        "Kokkos::Experimental::SYCL::InternalScratchFlags", alloc_size);
 
     Record::increment(r);
 
     m_scratchFlags = reinterpret_cast<size_type*>(r->data());
   }
-  m_queue->memset(m_scratchFlags, 0, m_scratchFlagsCount * sizeScratchGrain);
-  fence(*m_queue,
-        "Kokkos::Experimental::SYCLInternal::scratch_flags fence after "
-        "initializing m_scratchFlags",
-        m_instance_id);
+  auto memset_event = m_queue->memset(m_scratchFlags, 0,
+                                      m_scratchFlagsCount * sizeScratchGrain);
+#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
+  m_queue->ext_oneapi_submit_barrier(std::vector{memset_event});
+#endif
 
   return m_scratchFlags;
 }
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp
index 58775647f0c7ef52d1ab55b41a1cc05493310579..51a617054d6db8ca15dce2bf7d7d236f33dc2425 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp
@@ -45,8 +45,11 @@ class SYCLInternal {
 
   sycl::device_ptr<void> scratch_space(const std::size_t size);
   sycl::device_ptr<void> scratch_flags(const std::size_t size);
-  sycl::device_ptr<void> resize_team_scratch_space(std::int64_t bytes,
+  int acquire_team_scratch_space();
+  sycl::device_ptr<void> resize_team_scratch_space(int scratch_pool_id,
+                                                   std::int64_t bytes,
                                                    bool force_shrink = false);
+  void register_team_scratch_event(int scratch_pool_id, sycl::event event);
 
   uint32_t impl_get_instance_id() const;
   static int m_syclDev;
@@ -62,8 +65,12 @@ class SYCLInternal {
   // mutex to access shared memory
   mutable std::mutex m_mutexScratchSpace;
 
-  int64_t m_team_scratch_current_size       = 0;
-  sycl::device_ptr<void> m_team_scratch_ptr = nullptr;
+  // Team Scratch Level 1 Space
+  static constexpr int m_n_team_scratch                               = 10;
+  mutable int64_t m_team_scratch_current_size[m_n_team_scratch]       = {};
+  mutable sycl::device_ptr<void> m_team_scratch_ptr[m_n_team_scratch] = {};
+  mutable int m_current_team_scratch                                  = 0;
+  mutable sycl::event m_team_scratch_event[m_n_team_scratch]          = {};
   mutable std::mutex m_team_scratch_mutex;
 
   uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance<
@@ -323,10 +330,29 @@ struct sycl::is_device_copyable<
     Kokkos::Experimental::Impl::SYCLFunctionWrapper<Functor, Storage, false>>
     : std::true_type {};
 
+// FIXME_SYCL Remove when this specialization when specializations for
+// sycl::device_copyable also apply to const-qualified types.
+template <typename>
+struct NonTriviallyCopyableAndDeviceCopyable {
+  NonTriviallyCopyableAndDeviceCopyable(
+      const NonTriviallyCopyableAndDeviceCopyable&) {}
+};
+
+template <typename T>
+struct sycl::is_device_copyable<NonTriviallyCopyableAndDeviceCopyable<T>>
+    : std::true_type {};
+
+static_assert(
+    !std::is_trivially_copyable_v<
+        NonTriviallyCopyableAndDeviceCopyable<void>> &&
+    sycl::is_device_copyable_v<NonTriviallyCopyableAndDeviceCopyable<void>>);
+
 template <typename Functor, typename Storage>
 struct sycl::is_device_copyable<
     const Kokkos::Experimental::Impl::SYCLFunctionWrapper<Functor, Storage,
-                                                          false>>
+                                                          false>,
+    std::enable_if_t<!sycl::is_device_copyable_v<
+        const NonTriviallyCopyableAndDeviceCopyable<Functor>>>>
     : std::true_type {};
 #endif
 #endif
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Range.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp
similarity index 61%
rename from packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Range.hpp
rename to packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp
index 73475f92a6c10e546a0d283a361172c412bf6ec7..f4fada570b0e6839f0ba20de9d50b4db274d724d 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Range.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp
@@ -14,132 +14,15 @@
 //
 //@HEADER
 
-#ifndef KOKKOS_SYCL_PARALLEL_RANGE_HPP_
-#define KOKKOS_SYCL_PARALLEL_RANGE_HPP_
+#ifndef KOKKOS_SYCL_PARALLEL_FOR_MDRANGE_HPP_
+#define KOKKOS_SYCL_PARALLEL_FOR_MDRANGE_HPP_
 
 #include <impl/KokkosExp_IterateTileGPU.hpp>
 
+#ifdef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
 #include <vector>
+#endif
 
-namespace Kokkos::Impl {
-template <typename FunctorWrapper, typename Policy>
-struct FunctorWrapperRangePolicyParallelFor {
-  using WorkTag = typename Policy::work_tag;
-
-  void operator()(sycl::item<1> item) const {
-    const typename Policy::index_type id = item.get_linear_id() + m_begin;
-    if constexpr (std::is_void_v<WorkTag>)
-      m_functor_wrapper.get_functor()(id);
-    else
-      m_functor_wrapper.get_functor()(WorkTag(), id);
-  }
-
-  typename Policy::index_type m_begin;
-  FunctorWrapper m_functor_wrapper;
-};
-
-// Same as above but for a user-provided workgroup size
-template <typename FunctorWrapper, typename Policy>
-struct FunctorWrapperRangePolicyParallelForCustom {
-  using WorkTag = typename Policy::work_tag;
-
-  void operator()(sycl::item<1> item) const {
-    const typename Policy::index_type id = item.get_linear_id();
-    if (id < m_work_size) {
-      const auto shifted_id = id + m_begin;
-      if constexpr (std::is_void_v<WorkTag>)
-        m_functor_wrapper.get_functor()(shifted_id);
-      else
-        m_functor_wrapper.get_functor()(WorkTag(), shifted_id);
-    }
-  }
-
-  typename Policy::index_type m_begin;
-  FunctorWrapper m_functor_wrapper;
-  typename Policy::index_type m_work_size;
-};
-}  // namespace Kokkos::Impl
-
-template <class FunctorType, class... Traits>
-class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
-                                Kokkos::Experimental::SYCL> {
- public:
-  using Policy = Kokkos::RangePolicy<Traits...>;
-
- private:
-  using Member  = typename Policy::member_type;
-  using WorkTag = typename Policy::work_tag;
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-
-  template <typename Functor>
-  static sycl::event sycl_direct_launch(const Policy& policy,
-                                        const Functor& functor,
-                                        const sycl::event& memcpy_event) {
-    // Convenience references
-    const Kokkos::Experimental::SYCL& space = policy.space();
-    sycl::queue& q                          = space.sycl_queue();
-
-    auto parallel_for_event = q.submit([&](sycl::handler& cgh) {
-      cgh.depends_on(memcpy_event);
-      if (policy.chunk_size() <= 1) {
-        FunctorWrapperRangePolicyParallelFor<Functor, Policy> f{policy.begin(),
-                                                                functor};
-        sycl::range<1> range(policy.end() - policy.begin());
-        cgh.parallel_for<FunctorWrapperRangePolicyParallelFor<Functor, Policy>>(
-            range, f);
-      } else {
-        // Use the chunk size as workgroup size. We need to make sure that the
-        // range the kernel is launched with is a multiple of the workgroup
-        // size. Hence, we need to restrict the execution of the functor in the
-        // kernel to the actual range.
-        const auto actual_range = policy.end() - policy.begin();
-        const auto wgroup_size  = policy.chunk_size();
-        const auto launch_range =
-            (actual_range + wgroup_size - 1) / wgroup_size * wgroup_size;
-        FunctorWrapperRangePolicyParallelForCustom<Functor, Policy> f{
-            policy.begin(), functor, actual_range};
-        sycl::nd_range<1> range(launch_range, wgroup_size);
-        cgh.parallel_for<
-            FunctorWrapperRangePolicyParallelForCustom<Functor, Policy>>(range,
-                                                                         f);
-      }
-    });
-    q.ext_oneapi_submit_barrier(std::vector<sycl::event>{parallel_for_event});
-
-    return parallel_for_event;
-  }
-
- public:
-  using functor_type = FunctorType;
-
-  void execute() const {
-    if (m_policy.begin() == m_policy.end()) return;
-
-    Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem&
-        indirectKernelMem = m_policy.space()
-                                .impl_internal_space_instance()
-                                ->get_indirect_kernel_mem();
-
-    auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
-        m_functor, indirectKernelMem);
-    sycl::event event = sycl_direct_launch(m_policy, functor_wrapper,
-                                           functor_wrapper.get_copy_event());
-    functor_wrapper.register_event(event);
-  }
-
-  ParallelFor(const ParallelFor&) = delete;
-  ParallelFor(ParallelFor&&)      = delete;
-  ParallelFor& operator=(const ParallelFor&) = delete;
-  ParallelFor& operator=(ParallelFor&&) = delete;
-  ~ParallelFor()                        = default;
-
-  ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
-      : m_functor(arg_functor), m_policy(arg_policy) {}
-};
-
-// ParallelFor
 template <class FunctorType, class... Traits>
 class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
                                 Kokkos::Experimental::SYCL> {
@@ -243,7 +126,11 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
           sycl::range<3>{global_range[2], global_range[1], global_range[0]},
           sycl::range<3>{local_range[2], local_range[1], local_range[0]}};
 
+#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
       cgh.depends_on(memcpy_event);
+#else
+      (void)memcpy_event;
+#endif
       cgh.parallel_for(sycl_swapped_range, [functor_wrapper, bare_policy](
                                                sycl::nd_item<3> item) {
         // swap back for correct index calculations in DeviceIterateTile
@@ -265,7 +152,9 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
             .exec_range();
       });
     });
+#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
     q.ext_oneapi_submit_barrier(std::vector<sycl::event>{parallel_for_event});
+#endif
 
     return parallel_for_event;
   }
@@ -302,4 +191,4 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
         m_space(arg_policy.space()) {}
 };
 
-#endif  // KOKKOS_SYCL_PARALLEL_RANGE_HPP_
+#endif  // KOKKOS_SYCL_PARALLEL_FOR_MDRANGE_HPP_
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9c5767d209ff19d8798cec1f8f93bbdbf0a820d4
--- /dev/null
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp
@@ -0,0 +1,148 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_SYCL_PARALLEL_FOR_RANGE_HPP_
+#define KOKKOS_SYCL_PARALLEL_FOR_RANGE_HPP_
+
+#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
+#include <vector>
+#endif
+
+namespace Kokkos::Impl {
+template <typename FunctorWrapper, typename Policy>
+struct FunctorWrapperRangePolicyParallelFor {
+  using WorkTag = typename Policy::work_tag;
+
+  void operator()(sycl::item<1> item) const {
+    const typename Policy::index_type id = item.get_linear_id() + m_begin;
+    if constexpr (std::is_void_v<WorkTag>)
+      m_functor_wrapper.get_functor()(id);
+    else
+      m_functor_wrapper.get_functor()(WorkTag(), id);
+  }
+
+  typename Policy::index_type m_begin;
+  FunctorWrapper m_functor_wrapper;
+};
+
+// Same as above but for a user-provided workgroup size
+template <typename FunctorWrapper, typename Policy>
+struct FunctorWrapperRangePolicyParallelForCustom {
+  using WorkTag = typename Policy::work_tag;
+
+  void operator()(sycl::item<1> item) const {
+    const typename Policy::index_type id = item.get_linear_id();
+    if (id < m_work_size) {
+      const auto shifted_id = id + m_begin;
+      if constexpr (std::is_void_v<WorkTag>)
+        m_functor_wrapper.get_functor()(shifted_id);
+      else
+        m_functor_wrapper.get_functor()(WorkTag(), shifted_id);
+    }
+  }
+
+  typename Policy::index_type m_begin;
+  FunctorWrapper m_functor_wrapper;
+  typename Policy::index_type m_work_size;
+};
+}  // namespace Kokkos::Impl
+
+template <class FunctorType, class... Traits>
+class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
+                                Kokkos::Experimental::SYCL> {
+ public:
+  using Policy = Kokkos::RangePolicy<Traits...>;
+
+ private:
+  using Member  = typename Policy::member_type;
+  using WorkTag = typename Policy::work_tag;
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+
+  template <typename Functor>
+  static sycl::event sycl_direct_launch(const Policy& policy,
+                                        const Functor& functor,
+                                        const sycl::event& memcpy_event) {
+    // Convenience references
+    const Kokkos::Experimental::SYCL& space = policy.space();
+    sycl::queue& q                          = space.sycl_queue();
+
+    auto parallel_for_event = q.submit([&](sycl::handler& cgh) {
+#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
+      cgh.depends_on(memcpy_event);
+#else
+      (void)memcpy_event;
+#endif
+      if (policy.chunk_size() <= 1) {
+        FunctorWrapperRangePolicyParallelFor<Functor, Policy> f{policy.begin(),
+                                                                functor};
+        sycl::range<1> range(policy.end() - policy.begin());
+        cgh.parallel_for<FunctorWrapperRangePolicyParallelFor<Functor, Policy>>(
+            range, f);
+      } else {
+        // Use the chunk size as workgroup size. We need to make sure that the
+        // range the kernel is launched with is a multiple of the workgroup
+        // size. Hence, we need to restrict the execution of the functor in the
+        // kernel to the actual range.
+        const auto actual_range = policy.end() - policy.begin();
+        const auto wgroup_size  = policy.chunk_size();
+        const auto launch_range =
+            (actual_range + wgroup_size - 1) / wgroup_size * wgroup_size;
+        FunctorWrapperRangePolicyParallelForCustom<Functor, Policy> f{
+            policy.begin(), functor, actual_range};
+        sycl::nd_range<1> range(launch_range, wgroup_size);
+        cgh.parallel_for<
+            FunctorWrapperRangePolicyParallelForCustom<Functor, Policy>>(range,
+                                                                         f);
+      }
+    });
+#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
+    q.ext_oneapi_submit_barrier(std::vector<sycl::event>{parallel_for_event});
+#endif
+
+    return parallel_for_event;
+  }
+
+ public:
+  using functor_type = FunctorType;
+
+  void execute() const {
+    if (m_policy.begin() == m_policy.end()) return;
+
+    Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem&
+        indirectKernelMem = m_policy.space()
+                                .impl_internal_space_instance()
+                                ->get_indirect_kernel_mem();
+
+    auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
+        m_functor, indirectKernelMem);
+    sycl::event event = sycl_direct_launch(m_policy, functor_wrapper,
+                                           functor_wrapper.get_copy_event());
+    functor_wrapper.register_event(event);
+  }
+
+  ParallelFor(const ParallelFor&) = delete;
+  ParallelFor(ParallelFor&&)      = delete;
+  ParallelFor& operator=(const ParallelFor&) = delete;
+  ParallelFor& operator=(ParallelFor&&) = delete;
+  ~ParallelFor()                        = default;
+
+  ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
+      : m_functor(arg_functor), m_policy(arg_policy) {}
+};
+
+#endif  // KOKKOS_SYCL_PARALLEL_FOR_RANGE_HPP_
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4fc5818ce9bdcdd5092473eb5339a428ee4754c8
--- /dev/null
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp
@@ -0,0 +1,188 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_SYCL_PARALLEL_FOR_TEAM_HPP
+#define KOKKOS_SYCL_PARALLEL_FOR_TEAM_HPP
+
+#include <Kokkos_Parallel.hpp>
+
+#include <SYCL/Kokkos_SYCL_Team.hpp>
+#include <SYCL/Kokkos_SYCL_TeamPolicy.hpp>
+
+#include <vector>
+
+template <typename FunctorType, typename... Properties>
+class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
+                                Kokkos::Experimental::SYCL> {
+ public:
+  using Policy = TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>;
+  using functor_type = FunctorType;
+  using size_type    = ::Kokkos::Experimental::SYCL::size_type;
+
+ private:
+  using member_type   = typename Policy::member_type;
+  using work_tag      = typename Policy::work_tag;
+  using launch_bounds = typename Policy::launch_bounds;
+
+  FunctorType const m_functor;
+  Policy const m_policy;
+  size_type const m_league_size;
+  int m_team_size;
+  size_type const m_vector_size;
+  int m_shmem_begin;
+  int m_shmem_size;
+  sycl::device_ptr<char> m_global_scratch_ptr;
+  size_t m_scratch_size[2];
+  // Only let one ParallelFor/Reduce modify the team scratch memory. The
+  // constructor acquires the mutex which is released in the destructor.
+  std::scoped_lock<std::mutex> m_scratch_lock;
+  int m_scratch_pool_id = -1;
+
+  template <typename FunctorWrapper>
+  sycl::event sycl_direct_launch(const Policy& policy,
+                                 const FunctorWrapper& functor_wrapper,
+                                 const sycl::event& memcpy_event) const {
+    // Convenience references
+    const Kokkos::Experimental::SYCL& space = policy.space();
+    sycl::queue& q                          = space.sycl_queue();
+
+    auto parallel_for_event = q.submit([&](sycl::handler& cgh) {
+      // FIXME_SYCL accessors seem to need a size greater than zero at least for
+      // host queues
+      sycl::local_accessor<char, 1> team_scratch_memory_L0(
+          sycl::range<1>(
+              std::max(m_scratch_size[0] + m_shmem_begin, size_t(1))),
+          cgh);
+
+      // Avoid capturing *this since it might not be trivially copyable
+      const auto shmem_begin       = m_shmem_begin;
+      const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]};
+      sycl::device_ptr<char> const global_scratch_ptr = m_global_scratch_ptr;
+
+      auto lambda = [=](sycl::nd_item<2> item) {
+        const member_type team_member(
+            team_scratch_memory_L0.get_pointer(), shmem_begin, scratch_size[0],
+            global_scratch_ptr + item.get_group(1) * scratch_size[1],
+            scratch_size[1], item, item.get_group_linear_id(),
+            item.get_group_range(1));
+        if constexpr (std::is_void<work_tag>::value)
+          functor_wrapper.get_functor()(team_member);
+        else
+          functor_wrapper.get_functor()(work_tag(), team_member);
+      };
+
+      static sycl::kernel kernel = [&] {
+        sycl::kernel_id functor_kernel_id =
+            sycl::get_kernel_id<decltype(lambda)>();
+        auto kernel_bundle =
+            sycl::get_kernel_bundle<sycl::bundle_state::executable>(
+                q.get_context(), std::vector{functor_kernel_id});
+        return kernel_bundle.get_kernel(functor_kernel_id);
+      }();
+      auto max_sg_size =
+          kernel
+              .get_info<sycl::info::kernel_device_specific::max_sub_group_size>(
+                  q.get_device());
+      auto final_vector_size = std::min<int>(m_vector_size, max_sg_size);
+      // FIXME_SYCL For some reason, explicitly enforcing the kernel bundle to
+      // be used gives a runtime error.
+      // cgh.use_kernel_bundle(kernel_bundle);
+
+#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
+      cgh.depends_on(memcpy_event);
+#else
+      (void)memcpy_event;
+#endif
+      cgh.parallel_for(
+          sycl::nd_range<2>(
+              sycl::range<2>(m_team_size, m_league_size * final_vector_size),
+              sycl::range<2>(m_team_size, final_vector_size)),
+          lambda);
+    });
+#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
+    q.ext_oneapi_submit_barrier(std::vector<sycl::event>{parallel_for_event});
+#endif
+    return parallel_for_event;
+  }
+
+ public:
+  inline void execute() const {
+    if (m_league_size == 0) return;
+
+    auto& space = *m_policy.space().impl_internal_space_instance();
+    Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem&
+        indirectKernelMem = space.get_indirect_kernel_mem();
+
+    auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
+        m_functor, indirectKernelMem);
+
+    sycl::event event = sycl_direct_launch(m_policy, functor_wrapper,
+                                           functor_wrapper.get_copy_event());
+    functor_wrapper.register_event(event);
+    space.register_team_scratch_event(m_scratch_pool_id, event);
+  }
+
+  ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_league_size(arg_policy.league_size()),
+        m_team_size(arg_policy.team_size()),
+        m_vector_size(arg_policy.impl_vector_length()),
+        m_scratch_lock(arg_policy.space()
+                           .impl_internal_space_instance()
+                           ->m_team_scratch_mutex) {
+    // FIXME_SYCL optimize
+    if (m_team_size < 0)
+      m_team_size =
+          m_policy.team_size_recommended(arg_functor, ParallelForTag{});
+
+    m_shmem_begin = (sizeof(double) * (m_team_size + 2));
+    m_shmem_size =
+        (m_policy.scratch_size(0, m_team_size) +
+         FunctorTeamShmemSize<FunctorType>::value(m_functor, m_team_size));
+    m_scratch_size[0] = m_shmem_size;
+    m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
+
+    // Functor's reduce memory, team scan memory, and team shared memory depend
+    // upon team size.
+    auto& space       = *m_policy.space().impl_internal_space_instance();
+    m_scratch_pool_id = space.acquire_team_scratch_space();
+    m_global_scratch_ptr =
+        static_cast<sycl::device_ptr<char>>(space.resize_team_scratch_space(
+            m_scratch_pool_id,
+            static_cast<ptrdiff_t>(m_scratch_size[1]) * m_league_size));
+
+    if (static_cast<int>(space.m_maxShmemPerBlock) <
+        m_shmem_size - m_shmem_begin) {
+      std::stringstream out;
+      out << "Kokkos::Impl::ParallelFor<SYCL> insufficient shared memory! "
+             "Requested "
+          << m_shmem_size - m_shmem_begin << " bytes but maximum is "
+          << space.m_maxShmemPerBlock << '\n';
+      Kokkos::Impl::throw_runtime_exception(out.str());
+    }
+
+    const auto max_team_size =
+        m_policy.team_size_max(arg_functor, ParallelForTag{});
+    if (m_team_size > m_policy.team_size_max(arg_functor, ParallelForTag{}))
+      Kokkos::Impl::throw_runtime_exception(
+          "Kokkos::Impl::ParallelFor<SYCL> requested too large team size. The "
+          "maximal team_size is " +
+          std::to_string(max_team_size) + '!');
+  }
+};
+
+#endif
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..6964c2dbcf0daf087275969f90b5a8037b78cf49
--- /dev/null
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp
@@ -0,0 +1,343 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_SYCL_PARALLEL_REDUCE_MDRANGE_HPP
+#define KOKKOS_SYCL_PARALLEL_REDUCE_MDRANGE_HPP
+
+#include <Kokkos_Macros.hpp>
+
+#include <Kokkos_Parallel_Reduce.hpp>
+#include <SYCL/Kokkos_SYCL_WorkgroupReduction.hpp>
+#include <Kokkos_BitManipulation.hpp>
+
+#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
+#include <vector>
+#endif
+
+template <class CombinedFunctorReducerType, class... Traits>
+class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
+                                   Kokkos::MDRangePolicy<Traits...>,
+                                   Kokkos::Experimental::SYCL> {
+ public:
+  using Policy      = Kokkos::MDRangePolicy<Traits...>;
+  using FunctorType = typename CombinedFunctorReducerType::functor_type;
+  using ReducerType = typename CombinedFunctorReducerType::reducer_type;
+
+ private:
+  using value_type     = typename ReducerType::value_type;
+  using pointer_type   = typename ReducerType::pointer_type;
+  using reference_type = typename ReducerType::reference_type;
+
+  using WorkTag = typename Policy::work_tag;
+
+  // MDRangePolicy is not trivially copyable. Hence, replicate the data we
+  // really need in DeviceIterateTile in a trivially copyable struct.
+  struct BarePolicy {
+    using index_type = typename Policy::index_type;
+
+    BarePolicy(const Policy& policy)
+        : m_lower(policy.m_lower),
+          m_upper(policy.m_upper),
+          m_tile(policy.m_tile),
+          m_tile_end(policy.m_tile_end),
+          m_num_tiles(policy.m_num_tiles),
+          m_prod_tile_dims(policy.m_prod_tile_dims) {}
+
+    const typename Policy::point_type m_lower;
+    const typename Policy::point_type m_upper;
+    const typename Policy::tile_type m_tile;
+    const typename Policy::point_type m_tile_end;
+    const typename Policy::index_type m_num_tiles;
+    const typename Policy::index_type m_prod_tile_dims;
+    static constexpr Iterate inner_direction = Policy::inner_direction;
+    static constexpr int rank                = Policy::rank;
+  };
+
+ public:
+  // V - View
+  template <typename View>
+  ParallelReduce(const CombinedFunctorReducerType& f, const Policy& p,
+                 const View& v)
+      : m_functor_reducer(f),
+        m_policy(p),
+        m_space(p.space()),
+        m_result_ptr(v.data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                              typename View::memory_space>::accessible),
+        m_shared_memory_lock(
+            m_space.impl_internal_space_instance()->m_mutexScratchSpace) {}
+
+ private:
+  template <typename CombinedFunctorReducerWrapper>
+  sycl::event sycl_direct_launch(
+      const CombinedFunctorReducerWrapper& functor_reducer_wrapper,
+      const sycl::event& memcpy_event) const {
+    // Convenience references
+    Kokkos::Experimental::Impl::SYCLInternal& instance =
+        *m_space.impl_internal_space_instance();
+    sycl::queue& q = m_space.sycl_queue();
+
+    const typename Policy::index_type n_tiles = m_policy.m_num_tiles;
+    const unsigned int value_count =
+        m_functor_reducer.get_reducer().value_count();
+    sycl::device_ptr<value_type> results_ptr;
+
+    sycl::event last_reduction_event;
+
+    // If n_tiles==0 we only call init() and final() working with the global
+    // scratch memory but don't copy back to m_result_ptr yet.
+    if (n_tiles == 0) {
+      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
+#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
+        cgh.depends_on(memcpy_event);
+#else
+        (void)memcpy_event;
+#endif
+        results_ptr = static_cast<sycl::device_ptr<value_type>>(
+            instance.scratch_space(sizeof(value_type) * value_count));
+        sycl::global_ptr<value_type> device_accessible_result_ptr =
+            m_result_ptr_device_accessible ? m_result_ptr : nullptr;
+        cgh.single_task([=]() {
+          const CombinedFunctorReducerType& functor_reducer =
+              functor_reducer_wrapper.get_functor();
+          const ReducerType& reducer = functor_reducer.get_reducer();
+          reducer.init(results_ptr);
+          reducer.final(results_ptr);
+          if (device_accessible_result_ptr)
+            reducer.copy(device_accessible_result_ptr.get(), results_ptr.get());
+        });
+      });
+#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
+      q.ext_oneapi_submit_barrier(
+          std::vector<sycl::event>{parallel_reduce_event});
+#endif
+      last_reduction_event = parallel_reduce_event;
+    } else {
+      // Otherwise (when n_tiles is not zero), we perform a reduction on the
+      // values in all workgroups separately, write the workgroup results back
+      // to global memory and recurse until only one workgroup does the
+      // reduction and thus gets the final value.
+      const int wgroup_size = Kokkos::bit_ceil(
+          static_cast<unsigned int>(m_policy.m_prod_tile_dims));
+
+      // FIXME_SYCL Find a better way to determine a good limit for the
+      // maximum number of work groups, also see
+      // https://github.com/intel/llvm/blob/756ba2616111235bba073e481b7f1c8004b34ee6/sycl/source/detail/reduction.cpp#L51-L62
+      size_t max_work_groups =
+          2 * q.get_device().get_info<sycl::info::device::max_compute_units>();
+      int values_per_thread = 1;
+      size_t n_wgroups      = n_tiles;
+      while (n_wgroups > max_work_groups) {
+        values_per_thread *= 2;
+        n_wgroups = (n_tiles + values_per_thread - 1) / values_per_thread;
+      }
+
+      results_ptr = static_cast<sycl::device_ptr<value_type>>(
+          instance.scratch_space(sizeof(value_type) * value_count * n_wgroups));
+      sycl::global_ptr<value_type> device_accessible_result_ptr =
+          m_result_ptr_device_accessible ? m_result_ptr : nullptr;
+      auto scratch_flags = static_cast<sycl::device_ptr<unsigned int>>(
+          instance.scratch_flags(sizeof(unsigned int)));
+
+      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
+        sycl::local_accessor<value_type> local_mem(
+            sycl::range<1>(wgroup_size) * value_count, cgh);
+        sycl::local_accessor<unsigned int> num_teams_done(1, cgh);
+
+        const BarePolicy bare_policy = m_policy;
+
+#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
+        cgh.depends_on(memcpy_event);
+#else
+        (void)memcpy_event;
+#endif
+
+        // REMEMBER swap local x<->y to be conforming with Cuda/HIP
+        // implementation
+        cgh.parallel_for(
+            sycl::nd_range<1>{n_wgroups * wgroup_size, wgroup_size},
+            [=](sycl::nd_item<1> item) {
+              const int local_id = item.get_local_linear_id();
+              const CombinedFunctorReducerType& functor_reducer =
+                  functor_reducer_wrapper.get_functor();
+              const FunctorType& functor = functor_reducer.get_functor();
+              const ReducerType& reducer = functor_reducer.get_reducer();
+
+              // In the first iteration, we call functor to initialize the local
+              // memory. Otherwise, the local memory is initialized with the
+              // results from the previous iteration that are stored in global
+              // memory.
+              using index_type = typename Policy::index_type;
+
+              // SWAPPED here to be conforming with CUDA implementation
+              const index_type local_x    = 0;
+              const index_type local_y    = item.get_local_id(0);
+              const index_type local_z    = 0;
+              const index_type global_y   = 0;
+              const index_type global_z   = 0;
+              const index_type n_global_x = n_tiles;
+              const index_type n_global_y = 1;
+              const index_type n_global_z = 1;
+
+              if constexpr (!SYCLReduction::use_shuffle_based_algorithm<
+                                ReducerType>) {
+                reference_type update =
+                    reducer.init(&local_mem[local_id * value_count]);
+
+                for (index_type global_x = item.get_group(0);
+                     global_x < n_tiles; global_x += item.get_group_range(0))
+                  Kokkos::Impl::Reduce::DeviceIterateTile<
+                      Policy::rank, BarePolicy, FunctorType,
+                      typename Policy::work_tag, reference_type>(
+                      bare_policy, functor, update,
+                      {n_global_x, n_global_y, n_global_z},
+                      {global_x, global_y, global_z},
+                      {local_x, local_y, local_z})
+                      .exec_range();
+                item.barrier(sycl::access::fence_space::local_space);
+
+                SYCLReduction::workgroup_reduction<>(
+                    item, local_mem, results_ptr, device_accessible_result_ptr,
+                    value_count, reducer, false, wgroup_size);
+
+                if (local_id == 0) {
+                  sycl::atomic_ref<unsigned, sycl::memory_order::acq_rel,
+                                   sycl::memory_scope::device,
+                                   sycl::access::address_space::global_space>
+                      scratch_flags_ref(*scratch_flags);
+                  num_teams_done[0] = ++scratch_flags_ref;
+                }
+                item.barrier(sycl::access::fence_space::local_space);
+                if (num_teams_done[0] == n_wgroups) {
+                  if (local_id >= static_cast<int>(n_wgroups))
+                    reducer.init(&local_mem[local_id * value_count]);
+                  else {
+                    reducer.copy(&local_mem[local_id * value_count],
+                                 &results_ptr[local_id * value_count]);
+                    for (unsigned int id = local_id + wgroup_size;
+                         id < n_wgroups; id += wgroup_size) {
+                      reducer.join(&local_mem[local_id * value_count],
+                                   &results_ptr[id * value_count]);
+                    }
+                  }
+
+                  SYCLReduction::workgroup_reduction<>(
+                      item, local_mem, results_ptr,
+                      device_accessible_result_ptr, value_count, reducer, true,
+                      std::min<int>(n_wgroups, wgroup_size));
+                }
+              } else {
+                value_type local_value;
+                reference_type update = reducer.init(&local_value);
+
+                for (index_type global_x = item.get_group(0);
+                     global_x < n_tiles; global_x += item.get_group_range(0))
+                  Kokkos::Impl::Reduce::DeviceIterateTile<
+                      Policy::rank, BarePolicy, FunctorType,
+                      typename Policy::work_tag, reference_type>(
+                      bare_policy, functor, update,
+                      {n_global_x, n_global_y, n_global_z},
+                      {global_x, global_y, global_z},
+                      {local_x, local_y, local_z})
+                      .exec_range();
+
+                SYCLReduction::workgroup_reduction<>(
+                    item, local_mem, local_value, results_ptr,
+                    device_accessible_result_ptr, reducer, false, wgroup_size);
+
+                if (local_id == 0) {
+                  sycl::atomic_ref<unsigned, sycl::memory_order::acq_rel,
+                                   sycl::memory_scope::device,
+                                   sycl::access::address_space::global_space>
+                      scratch_flags_ref(*scratch_flags);
+                  num_teams_done[0] = ++scratch_flags_ref;
+                }
+                item.barrier(sycl::access::fence_space::local_space);
+                if (num_teams_done[0] == n_wgroups) {
+                  if (local_id >= static_cast<int>(n_wgroups))
+                    reducer.init(&local_value);
+                  else {
+                    local_value = results_ptr[local_id];
+                    for (unsigned int id = local_id + wgroup_size;
+                         id < n_wgroups; id += wgroup_size) {
+                      reducer.join(&local_value, &results_ptr[id]);
+                    }
+                  }
+
+                  SYCLReduction::workgroup_reduction<>(
+                      item, local_mem, local_value, results_ptr,
+                      device_accessible_result_ptr, reducer, true,
+                      std::min<int>(n_wgroups, wgroup_size));
+                }
+              }
+            });
+      });
+#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
+      q.ext_oneapi_submit_barrier(
+          std::vector<sycl::event>{parallel_reduce_event});
+#endif
+      last_reduction_event = parallel_reduce_event;
+    }
+
+    // At this point, the reduced value is written to the entry in results_ptr
+    // and all that is left is to copy it back to the given result pointer if
+    // necessary.
+    if (m_result_ptr && !m_result_ptr_device_accessible) {
+      Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                             Kokkos::Experimental::SYCLDeviceUSMSpace>(
+          m_space, m_result_ptr, results_ptr,
+          sizeof(*m_result_ptr) * value_count);
+    }
+
+    return last_reduction_event;
+  }
+
+ public:
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy& policy, const Functor&) {
+    return policy.space().impl_internal_space_instance()->m_maxWorkgroupSize;
+  }
+
+  void execute() const {
+    Kokkos::Experimental::Impl::SYCLInternal& instance =
+        *m_space.impl_internal_space_instance();
+    using IndirectKernelMem =
+        Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem;
+    IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem();
+
+    auto functor_reducer_wrapper =
+        Experimental::Impl::make_sycl_function_wrapper(m_functor_reducer,
+                                                       indirectKernelMem);
+
+    sycl::event event = sycl_direct_launch(
+        functor_reducer_wrapper, functor_reducer_wrapper.get_copy_event());
+    functor_reducer_wrapper.register_event(event);
+  }
+
+ private:
+  const CombinedFunctorReducerType m_functor_reducer;
+  const BarePolicy m_policy;
+  const Kokkos::Experimental::SYCL& m_space;
+  const pointer_type m_result_ptr;
+  const bool m_result_ptr_device_accessible;
+
+  // Only let one Parallel/Scan modify the shared memory. The
+  // constructor acquires the mutex which is released in the destructor.
+  std::scoped_lock<std::mutex> m_shared_memory_lock;
+};
+
+#endif /* KOKKOS_SYCL_PARALLEL_REDUCE_MDRANGE_HPP */
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8c900cfa42806716d857c19b7beb49fa70c83d5a
--- /dev/null
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp
@@ -0,0 +1,362 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_SYCL_PARALLEL_REDUCE_RANGE_HPP
+#define KOKKOS_SYCL_PARALLEL_REDUCE_RANGE_HPP
+
+#include <Kokkos_Macros.hpp>
+
+#include <Kokkos_BitManipulation.hpp>
+#include <Kokkos_Parallel_Reduce.hpp>
+#include <SYCL/Kokkos_SYCL_WorkgroupReduction.hpp>
+#include <vector>
+
+template <class CombinedFunctorReducerType, class... Traits>
+class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
+                                   Kokkos::RangePolicy<Traits...>,
+                                   Kokkos::Experimental::SYCL> {
+ public:
+  using Policy      = Kokkos::RangePolicy<Traits...>;
+  using FunctorType = typename CombinedFunctorReducerType::functor_type;
+  using ReducerType = typename CombinedFunctorReducerType::reducer_type;
+
+ private:
+  using value_type     = typename ReducerType::value_type;
+  using pointer_type   = typename ReducerType::pointer_type;
+  using reference_type = typename ReducerType::reference_type;
+
+  using WorkTag = typename Policy::work_tag;
+
+ public:
+  // V - View
+  template <typename View>
+  ParallelReduce(const CombinedFunctorReducerType& f, const Policy& p,
+                 const View& v)
+      : m_functor_reducer(f),
+        m_policy(p),
+        m_result_ptr(v.data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                              typename View::memory_space>::accessible),
+        m_shared_memory_lock(
+            p.space().impl_internal_space_instance()->m_mutexScratchSpace) {}
+
+ private:
+  template <typename PolicyType, typename CombinedFunctorReducerWrapper>
+  sycl::event sycl_direct_launch(
+      const PolicyType& policy,
+      const CombinedFunctorReducerWrapper& functor_reducer_wrapper,
+      const sycl::event& memcpy_event) const {
+    // Convenience references
+    const Kokkos::Experimental::SYCL& space = policy.space();
+    Kokkos::Experimental::Impl::SYCLInternal& instance =
+        *space.impl_internal_space_instance();
+    sycl::queue& q = space.sycl_queue();
+
+    std::size_t size = policy.end() - policy.begin();
+    const unsigned int value_count =
+        m_functor_reducer.get_reducer().value_count();
+    sycl::device_ptr<value_type> results_ptr = nullptr;
+    sycl::global_ptr<value_type> device_accessible_result_ptr =
+        m_result_ptr_device_accessible ? m_result_ptr : nullptr;
+
+    sycl::event last_reduction_event;
+
+    // If size<=1 we only call init(), the functor and possibly final once
+    // working with the global scratch memory but don't copy back to
+    // m_result_ptr yet.
+    if (size <= 1) {
+      results_ptr = static_cast<sycl::device_ptr<value_type>>(
+          instance.scratch_space(sizeof(value_type) * value_count));
+
+      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
+        const auto begin = policy.begin();
+#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
+        cgh.depends_on(memcpy_event);
+#else
+        (void)memcpy_event;
+#endif
+        cgh.single_task([=]() {
+          const CombinedFunctorReducerType& functor_reducer =
+              functor_reducer_wrapper.get_functor();
+          const FunctorType& functor = functor_reducer.get_functor();
+          const ReducerType& reducer = functor_reducer.get_reducer();
+          reference_type update      = reducer.init(results_ptr);
+          if (size == 1) {
+            if constexpr (std::is_void_v<WorkTag>)
+              functor(begin, update);
+            else
+              functor(WorkTag(), begin, update);
+          }
+          reducer.final(results_ptr);
+          if (device_accessible_result_ptr != nullptr)
+            reducer.copy(device_accessible_result_ptr.get(), results_ptr.get());
+        });
+      });
+#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
+      q.ext_oneapi_submit_barrier(
+          std::vector<sycl::event>{parallel_reduce_event});
+#endif
+      last_reduction_event = parallel_reduce_event;
+    } else {
+      // Otherwise (when size > 1), we perform a reduction on the values in all
+      // workgroups separately, write the workgroup results back to global
+      // memory and recurse until only one workgroup does the reduction and thus
+      // gets the final value.
+      auto scratch_flags = static_cast<sycl::device_ptr<unsigned int>>(
+          instance.scratch_flags(sizeof(unsigned int)));
+
+      auto reduction_lambda_factory =
+          [&](sycl::local_accessor<value_type> local_mem,
+              sycl::local_accessor<unsigned int> num_teams_done,
+              sycl::device_ptr<value_type> results_ptr, int values_per_thread) {
+            const auto begin = policy.begin();
+
+            auto lambda = [=](sycl::nd_item<1> item) {
+              const auto n_wgroups   = item.get_group_range()[0];
+              const auto wgroup_size = item.get_local_range()[0];
+
+              const auto local_id = item.get_local_linear_id();
+              const auto global_id =
+                  wgroup_size * item.get_group_linear_id() * values_per_thread +
+                  local_id;
+              const CombinedFunctorReducerType& functor_reducer =
+                  functor_reducer_wrapper.get_functor();
+              const FunctorType& functor = functor_reducer.get_functor();
+              const ReducerType& reducer = functor_reducer.get_reducer();
+
+              using index_type       = typename Policy::index_type;
+              const auto upper_bound = std::min<index_type>(
+                  global_id + values_per_thread * wgroup_size, size);
+
+              if constexpr (!SYCLReduction::use_shuffle_based_algorithm<
+                                ReducerType>) {
+                reference_type update =
+                    reducer.init(&local_mem[local_id * value_count]);
+                for (index_type id = global_id; id < upper_bound;
+                     id += wgroup_size) {
+                  if constexpr (std::is_void_v<WorkTag>)
+                    functor(id + begin, update);
+                  else
+                    functor(WorkTag(), id + begin, update);
+                }
+                item.barrier(sycl::access::fence_space::local_space);
+
+                SYCLReduction::workgroup_reduction<>(
+                    item, local_mem, results_ptr, device_accessible_result_ptr,
+                    value_count, reducer, false, std::min(size, wgroup_size));
+
+                if (local_id == 0) {
+                  sycl::atomic_ref<unsigned, sycl::memory_order::acq_rel,
+                                   sycl::memory_scope::device,
+                                   sycl::access::address_space::global_space>
+                      scratch_flags_ref(*scratch_flags);
+                  num_teams_done[0] = ++scratch_flags_ref;
+                }
+                item.barrier(sycl::access::fence_space::local_space);
+                if (num_teams_done[0] == n_wgroups) {
+                  if (local_id >= n_wgroups)
+                    reducer.init(&local_mem[local_id * value_count]);
+                  else {
+                    reducer.copy(&local_mem[local_id * value_count],
+                                 &results_ptr[local_id * value_count]);
+                    for (unsigned int id = local_id + wgroup_size;
+                         id < n_wgroups; id += wgroup_size) {
+                      reducer.join(&local_mem[local_id * value_count],
+                                   &results_ptr[id * value_count]);
+                    }
+                  }
+
+                  SYCLReduction::workgroup_reduction<>(
+                      item, local_mem, results_ptr,
+                      device_accessible_result_ptr, value_count, reducer, true,
+                      std::min(n_wgroups, wgroup_size));
+                }
+              } else {
+                value_type local_value;
+                reference_type update = reducer.init(&local_value);
+                for (index_type id = global_id; id < upper_bound;
+                     id += wgroup_size) {
+                  if constexpr (std::is_void_v<WorkTag>)
+                    functor(id + begin, update);
+                  else
+                    functor(WorkTag(), id + begin, update);
+                }
+
+                SYCLReduction::workgroup_reduction<>(
+                    item, local_mem, local_value, results_ptr,
+                    device_accessible_result_ptr, reducer, false,
+                    std::min(size, wgroup_size));
+
+                if (local_id == 0) {
+                  sycl::atomic_ref<unsigned, sycl::memory_order::acq_rel,
+                                   sycl::memory_scope::device,
+                                   sycl::access::address_space::global_space>
+                      scratch_flags_ref(*scratch_flags);
+                  num_teams_done[0] = ++scratch_flags_ref;
+                }
+                item.barrier(sycl::access::fence_space::local_space);
+                if (num_teams_done[0] == n_wgroups) {
+                  if (local_id >= n_wgroups)
+                    reducer.init(&local_value);
+                  else {
+                    local_value = results_ptr[local_id];
+                    for (unsigned int id = local_id + wgroup_size;
+                         id < n_wgroups; id += wgroup_size) {
+                      reducer.join(&local_value, &results_ptr[id]);
+                    }
+                  }
+
+                  SYCLReduction::workgroup_reduction<>(
+                      item, local_mem, local_value, results_ptr,
+                      device_accessible_result_ptr, reducer, true,
+                      std::min(n_wgroups, wgroup_size));
+                }
+              }
+            };
+            return lambda;
+          };
+
+      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
+        sycl::local_accessor<unsigned int> num_teams_done(1, cgh);
+
+        auto dummy_reduction_lambda =
+            reduction_lambda_factory({1, cgh}, num_teams_done, nullptr, 1);
+
+        static sycl::kernel kernel = [&] {
+          sycl::kernel_id functor_kernel_id =
+              sycl::get_kernel_id<decltype(dummy_reduction_lambda)>();
+          auto kernel_bundle =
+              sycl::get_kernel_bundle<sycl::bundle_state::executable>(
+                  q.get_context(), std::vector{functor_kernel_id});
+          return kernel_bundle.get_kernel(functor_kernel_id);
+        }();
+        auto multiple = kernel.get_info<sycl::info::kernel_device_specific::
+                                            preferred_work_group_size_multiple>(
+            q.get_device());
+        // FIXME_SYCL The code below queries the kernel for the maximum subgroup
+        // size but it turns out that this is not accurate and choosing a larger
+        // subgroup size gives better peformance (and is what the oneAPI
+        // reduction algorithm does).
+#ifndef KOKKOS_ARCH_INTEL_GPU
+        auto max =
+            kernel
+                .get_info<sycl::info::kernel_device_specific::work_group_size>(
+                    q.get_device());
+#else
+        auto max =
+            q.get_device().get_info<sycl::info::device::max_work_group_size>();
+#endif
+
+        auto max_local_memory =
+            q.get_device().get_info<sycl::info::device::local_mem_size>();
+        // The workgroup size is computed as the minimum of
+        // - the smallest power of two not less than the total work size
+        // - the largest power of two not exceeding the largest multiple of the
+        //   recommended workgroup size not exceeding the maximum workgroup size
+        // - the largest power of two such that we don't use more than 99% (as a
+        //   safe-guard) of the available local memory.
+        const auto wgroup_size = std::min(
+            {Kokkos::bit_ceil(size),
+             Kokkos::bit_floor(static_cast<size_t>(max / multiple) * multiple),
+             Kokkos::bit_floor(static_cast<size_t>(max_local_memory * .99) /
+                               (sizeof(value_type) * value_count))});
+
+        // FIXME_SYCL Find a better way to determine a good limit for the
+        // maximum number of work groups, also see
+        // https://github.com/intel/llvm/blob/756ba2616111235bba073e481b7f1c8004b34ee6/sycl/source/detail/reduction.cpp#L51-L62
+        size_t max_work_groups =
+            2 *
+            q.get_device().get_info<sycl::info::device::max_compute_units>();
+        int values_per_thread = 1;
+        size_t n_wgroups      = (size + wgroup_size - 1) / wgroup_size;
+        while (n_wgroups > max_work_groups) {
+          values_per_thread *= 2;
+          n_wgroups = ((size + values_per_thread - 1) / values_per_thread +
+                       wgroup_size - 1) /
+                      wgroup_size;
+        }
+
+        results_ptr =
+            static_cast<sycl::device_ptr<value_type>>(instance.scratch_space(
+                sizeof(value_type) * value_count * n_wgroups));
+
+        sycl::local_accessor<value_type> local_mem(
+            sycl::range<1>(wgroup_size) * value_count, cgh);
+
+#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
+        cgh.depends_on(memcpy_event);
+#else
+        (void)memcpy_event;
+#endif
+
+        auto reduction_lambda = reduction_lambda_factory(
+            local_mem, num_teams_done, results_ptr, values_per_thread);
+
+        cgh.parallel_for(
+            sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size),
+            reduction_lambda);
+      });
+#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
+      q.ext_oneapi_submit_barrier(
+          std::vector<sycl::event>{parallel_reduce_event});
+#endif
+      last_reduction_event = parallel_reduce_event;
+    }
+
+    // At this point, the reduced value is written to the entry in results_ptr
+    // and all that is left is to copy it back to the given result pointer if
+    // necessary.
+    if (m_result_ptr && !m_result_ptr_device_accessible) {
+      Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                             Kokkos::Experimental::SYCLDeviceUSMSpace>(
+          space, m_result_ptr, results_ptr,
+          sizeof(*m_result_ptr) * value_count);
+    }
+
+    return last_reduction_event;
+  }
+
+ public:
+  void execute() const {
+    Kokkos::Experimental::Impl::SYCLInternal& instance =
+        *m_policy.space().impl_internal_space_instance();
+    using IndirectKernelMem =
+        Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem;
+    IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem();
+
+    auto functor_reducer_wrapper =
+        Experimental::Impl::make_sycl_function_wrapper(m_functor_reducer,
+                                                       indirectKernelMem);
+
+    sycl::event event =
+        sycl_direct_launch(m_policy, functor_reducer_wrapper,
+                           functor_reducer_wrapper.get_copy_event());
+    functor_reducer_wrapper.register_event(event);
+  }
+
+ private:
+  const CombinedFunctorReducerType m_functor_reducer;
+  const Policy m_policy;
+  const pointer_type m_result_ptr;
+  const bool m_result_ptr_device_accessible;
+
+  // Only let one Parallel/Scan modify the shared memory. The
+  // constructor acquires the mutex which is released in the destructor.
+  std::scoped_lock<std::mutex> m_shared_memory_lock;
+};
+
+#endif /* KOKKOS_SYCL_PARALLEL_REDUCE_RANGE_HPP */
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..07145b0fb93c4c76ad91129b3e96d24d8ed7833b
--- /dev/null
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp
@@ -0,0 +1,458 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_SYCL_PARALLEL_REDUCE_TEAM_HPP
+#define KOKKOS_SYCL_PARALLEL_REDUCE_TEAM_HPP
+
+#include <Kokkos_Parallel.hpp>
+
+#include <SYCL/Kokkos_SYCL_Team.hpp>
+#include <SYCL/Kokkos_SYCL_TeamPolicy.hpp>
+#include <SYCL/Kokkos_SYCL_WorkgroupReduction.hpp>
+
+#include <vector>
+
+template <class CombinedFunctorReducerType, class... Properties>
+class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
+                                   Kokkos::TeamPolicy<Properties...>,
+                                   Kokkos::Experimental::SYCL> {
+ public:
+  using Policy = TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>;
+  using FunctorType = typename CombinedFunctorReducerType::functor_type;
+  using ReducerType = typename CombinedFunctorReducerType::reducer_type;
+
+ private:
+  using member_type   = typename Policy::member_type;
+  using WorkTag       = typename Policy::work_tag;
+  using launch_bounds = typename Policy::launch_bounds;
+
+  using pointer_type   = typename ReducerType::pointer_type;
+  using reference_type = typename ReducerType::reference_type;
+  using value_type     = typename ReducerType::value_type;
+
+ public:
+  using functor_type = FunctorType;
+  using size_type    = Kokkos::Experimental::SYCL::size_type;
+
+ private:
+  const CombinedFunctorReducerType m_functor_reducer;
+  const Policy m_policy;
+  const pointer_type m_result_ptr;
+  const bool m_result_ptr_device_accessible;
+  size_type m_shmem_begin;
+  size_type m_shmem_size;
+  sycl::device_ptr<char> m_global_scratch_ptr;
+  size_t m_scratch_size[2];
+  const size_type m_league_size;
+  int m_team_size;
+  const size_type m_vector_size;
+  // Only let one ParallelFor/Reduce modify the team scratch memory. The
+  // constructor acquires the mutex which is released in the destructor.
+  std::scoped_lock<std::mutex> m_scratch_lock;
+  int m_scratch_pool_id = -1;
+
+  template <typename PolicyType, typename CombinedFunctorReducerWrapper>
+  sycl::event sycl_direct_launch(
+      const PolicyType& policy,
+      const CombinedFunctorReducerWrapper& functor_reducer_wrapper,
+      const sycl::event& memcpy_event) const {
+    // Convenience references
+    const Kokkos::Experimental::SYCL& space = policy.space();
+    Kokkos::Experimental::Impl::SYCLInternal& instance =
+        *space.impl_internal_space_instance();
+    sycl::queue& q = space.sycl_queue();
+
+    const unsigned int value_count =
+        m_functor_reducer.get_reducer().value_count();
+    std::size_t size = std::size_t(m_league_size) * m_team_size * m_vector_size;
+    value_type* results_ptr = nullptr;
+
+    sycl::event last_reduction_event;
+
+    // If size<=1 we only call init(), the functor and possibly final once
+    // working with the global scratch memory but don't copy back to
+    // m_result_ptr yet.
+    if (size <= 1) {
+      results_ptr =
+          static_cast<sycl::device_ptr<value_type>>(instance.scratch_space(
+              sizeof(value_type) * std::max(value_count, 1u)));
+      sycl::global_ptr<value_type> device_accessible_result_ptr =
+          m_result_ptr_device_accessible ? m_result_ptr : nullptr;
+
+      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
+        // FIXME_SYCL accessors seem to need a size greater than zero at least
+        // for host queues
+        sycl::local_accessor<char, 1> team_scratch_memory_L0(
+            sycl::range<1>(
+                std::max(m_scratch_size[0] + m_shmem_begin, size_t(1))),
+            cgh);
+
+        // Avoid capturing *this since it might not be trivially copyable
+        const auto shmem_begin       = m_shmem_begin;
+        const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]};
+        sycl::device_ptr<char> const global_scratch_ptr = m_global_scratch_ptr;
+
+#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
+        cgh.depends_on(memcpy_event);
+#else
+        (void)memcpy_event;
+#endif
+        cgh.parallel_for(
+            sycl::nd_range<2>(sycl::range<2>(1, 1), sycl::range<2>(1, 1)),
+            [=](sycl::nd_item<2> item) {
+              const CombinedFunctorReducerType& functor_reducer =
+                  functor_reducer_wrapper.get_functor();
+              const FunctorType& functor = functor_reducer.get_functor();
+              const ReducerType& reducer = functor_reducer.get_reducer();
+
+              reference_type update = reducer.init(results_ptr);
+              if (size == 1) {
+                const member_type team_member(
+                    team_scratch_memory_L0.get_pointer(), shmem_begin,
+                    scratch_size[0], global_scratch_ptr, scratch_size[1], item,
+                    item.get_group_linear_id(), item.get_group_range(1));
+                if constexpr (std::is_void_v<WorkTag>)
+                  functor(team_member, update);
+                else
+                  functor(WorkTag(), team_member, update);
+              }
+              reducer.final(results_ptr);
+              if (device_accessible_result_ptr)
+                reducer.copy(device_accessible_result_ptr, &results_ptr[0]);
+            });
+      });
+#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
+      q.ext_oneapi_submit_barrier(
+          std::vector<sycl::event>{parallel_reduce_event});
+#endif
+      last_reduction_event = parallel_reduce_event;
+    } else {
+      // Otherwise, (if the total range has more than one element) we perform a
+      // reduction on the values in all workgroups separately, write the
+      // workgroup results back to global memory and recurse until only one
+      // workgroup does the reduction and thus gets the final value.
+      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
+        auto scratch_flags = static_cast<sycl::device_ptr<unsigned int>>(
+            instance.scratch_flags(sizeof(unsigned int)));
+
+        // FIXME_SYCL accessors seem to need a size greater than zero at least
+        // for host queues
+        sycl::local_accessor<char, 1> team_scratch_memory_L0(
+            sycl::range<1>(
+                std::max(m_scratch_size[0] + m_shmem_begin, size_t(1))),
+            cgh);
+
+        // Avoid capturing *this since it might not be trivially copyable
+        const auto shmem_begin       = m_shmem_begin;
+        const auto league_size       = m_league_size;
+        const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]};
+        sycl::device_ptr<char> const global_scratch_ptr = m_global_scratch_ptr;
+
+        auto team_reduction_factory =
+            [&](sycl::local_accessor<value_type, 1> local_mem,
+                sycl::device_ptr<value_type> results_ptr) {
+              sycl::global_ptr<value_type> device_accessible_result_ptr =
+                  m_result_ptr_device_accessible ? m_result_ptr : nullptr;
+              auto lambda = [=](sycl::nd_item<2> item) {
+                auto n_wgroups = item.get_group_range()[1];
+                int wgroup_size =
+                    item.get_local_range()[0] * item.get_local_range()[1];
+                auto group_id = item.get_group_linear_id();
+                auto size     = n_wgroups * wgroup_size;
+
+                auto& num_teams_done = reinterpret_cast<unsigned int&>(
+                    local_mem[wgroup_size * std::max(value_count, 1u)]);
+                const auto local_id = item.get_local_linear_id();
+                const CombinedFunctorReducerType& functor_reducer =
+                    functor_reducer_wrapper.get_functor();
+                const FunctorType& functor = functor_reducer.get_functor();
+                const ReducerType& reducer = functor_reducer.get_reducer();
+
+                if constexpr (!SYCLReduction::use_shuffle_based_algorithm<
+                                  ReducerType>) {
+                  reference_type update =
+                      reducer.init(&local_mem[local_id * value_count]);
+                  for (int league_rank = group_id; league_rank < league_size;
+                       league_rank += n_wgroups) {
+                    const member_type team_member(
+                        team_scratch_memory_L0.get_pointer(), shmem_begin,
+                        scratch_size[0],
+                        global_scratch_ptr +
+                            item.get_group(1) * scratch_size[1],
+                        scratch_size[1], item, league_rank, league_size);
+                    if constexpr (std::is_void_v<WorkTag>)
+                      functor(team_member, update);
+                    else
+                      functor(WorkTag(), team_member, update);
+                  }
+                  item.barrier(sycl::access::fence_space::local_space);
+
+                  SYCLReduction::workgroup_reduction<>(
+                      item, local_mem, results_ptr,
+                      device_accessible_result_ptr, value_count, reducer, false,
+                      std::min<std::size_t>(size,
+                                            item.get_local_range()[0] *
+                                                item.get_local_range()[1]));
+
+                  if (local_id == 0) {
+                    sycl::atomic_ref<unsigned, sycl::memory_order::acq_rel,
+                                     sycl::memory_scope::device,
+                                     sycl::access::address_space::global_space>
+                        scratch_flags_ref(*scratch_flags);
+                    num_teams_done = ++scratch_flags_ref;
+                  }
+                  sycl::group_barrier(item.get_group());
+                  if (num_teams_done == n_wgroups) {
+                    if (local_id >= n_wgroups)
+                      reducer.init(&local_mem[local_id * value_count]);
+                    else {
+                      reducer.copy(&local_mem[local_id * value_count],
+                                   &results_ptr[local_id * value_count]);
+                      for (unsigned int id = local_id + wgroup_size;
+                           id < n_wgroups; id += wgroup_size) {
+                        reducer.join(&local_mem[local_id * value_count],
+                                     &results_ptr[id * value_count]);
+                      }
+                    }
+
+                    SYCLReduction::workgroup_reduction<>(
+                        item, local_mem, results_ptr,
+                        device_accessible_result_ptr, value_count, reducer,
+                        true,
+                        std::min(n_wgroups, item.get_local_range()[0] *
+                                                item.get_local_range()[1]));
+                  }
+                } else {
+                  value_type local_value;
+                  reference_type update = reducer.init(&local_value);
+                  for (int league_rank = group_id; league_rank < league_size;
+                       league_rank += n_wgroups) {
+                    const member_type team_member(
+                        team_scratch_memory_L0.get_pointer(), shmem_begin,
+                        scratch_size[0],
+                        global_scratch_ptr +
+                            item.get_group(1) * scratch_size[1],
+                        scratch_size[1], item, league_rank, league_size);
+                    if constexpr (std::is_void_v<WorkTag>)
+                      functor(team_member, update);
+                    else
+                      functor(WorkTag(), team_member, update);
+                  }
+
+                  SYCLReduction::workgroup_reduction<>(
+                      item, local_mem, local_value, results_ptr,
+                      device_accessible_result_ptr, reducer, false,
+                      std::min<std::size_t>(size,
+                                            item.get_local_range()[0] *
+                                                item.get_local_range()[1]));
+
+                  if (local_id == 0) {
+                    sycl::atomic_ref<unsigned, sycl::memory_order::acq_rel,
+                                     sycl::memory_scope::device,
+                                     sycl::access::address_space::global_space>
+                        scratch_flags_ref(*scratch_flags);
+                    num_teams_done = ++scratch_flags_ref;
+                  }
+                  item.barrier(sycl::access::fence_space::local_space);
+                  if (num_teams_done == n_wgroups) {
+                    if (local_id >= n_wgroups)
+                      reducer.init(&local_value);
+                    else {
+                      local_value = results_ptr[local_id];
+                      for (unsigned int id = local_id + wgroup_size;
+                           id < n_wgroups; id += wgroup_size) {
+                        reducer.join(&local_value, &results_ptr[id]);
+                      }
+                    }
+
+                    SYCLReduction::workgroup_reduction<>(
+                        item, local_mem, local_value, results_ptr,
+                        device_accessible_result_ptr, reducer, true,
+                        std::min(n_wgroups, item.get_local_range()[0] *
+                                                item.get_local_range()[1]));
+                  }
+                }
+              };
+              return lambda;
+            };
+
+        auto dummy_reduction_lambda = team_reduction_factory({1, cgh}, nullptr);
+
+        static sycl::kernel kernel = [&] {
+          sycl::kernel_id functor_kernel_id =
+              sycl::get_kernel_id<decltype(dummy_reduction_lambda)>();
+          auto kernel_bundle =
+              sycl::get_kernel_bundle<sycl::bundle_state::executable>(
+                  q.get_context(), std::vector{functor_kernel_id});
+          return kernel_bundle.get_kernel(functor_kernel_id);
+        }();
+        auto max_sg_size = kernel.get_info<
+            sycl::info::kernel_device_specific::max_sub_group_size>(
+            q.get_device());
+        auto final_vector_size = std::min<int>(m_vector_size, max_sg_size);
+        // FIXME_SYCL For some reason, explicitly enforcing the kernel bundle to
+        // be used gives a runtime error.
+
+        //     cgh.use_kernel_bundle(kernel_bundle);
+
+        auto wgroup_size = m_team_size * final_vector_size;
+        std::size_t size = std::size_t(m_league_size) * wgroup_size;
+        sycl::local_accessor<value_type, 1> local_mem(
+            sycl::range<1>(wgroup_size) * std::max(value_count, 1u) +
+                (sizeof(unsigned int) + sizeof(value_type) - 1) /
+                    sizeof(value_type),
+            cgh);
+
+        const auto init_size =
+            std::max<std::size_t>((size + wgroup_size - 1) / wgroup_size, 1);
+        results_ptr =
+            static_cast<sycl::device_ptr<value_type>>(instance.scratch_space(
+                sizeof(value_type) * std::max(value_count, 1u) * init_size));
+
+        size_t max_work_groups =
+            2 *
+            q.get_device().get_info<sycl::info::device::max_compute_units>();
+        int values_per_thread = 1;
+        size_t n_wgroups      = m_league_size;
+        while (n_wgroups > max_work_groups) {
+          values_per_thread *= 2;
+          n_wgroups =
+              ((size_t(m_league_size) * wgroup_size + values_per_thread - 1) /
+                   values_per_thread +
+               wgroup_size - 1) /
+              wgroup_size;
+        }
+
+        auto reduction_lambda = team_reduction_factory(local_mem, results_ptr);
+
+#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
+        cgh.depends_on(memcpy_event);
+#endif
+
+        cgh.parallel_for(
+            sycl::nd_range<2>(
+                sycl::range<2>(m_team_size, n_wgroups * m_vector_size),
+                sycl::range<2>(m_team_size, m_vector_size)),
+            reduction_lambda);
+      });
+#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
+      q.ext_oneapi_submit_barrier(
+          std::vector<sycl::event>{parallel_reduce_event});
+#endif
+      last_reduction_event = parallel_reduce_event;
+    }
+
+    // At this point, the reduced value is written to the entry in results_ptr
+    // and all that is left is to copy it back to the given result pointer if
+    // necessary.
+    if (m_result_ptr && !m_result_ptr_device_accessible) {
+      Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                             Kokkos::Experimental::SYCLDeviceUSMSpace>(
+          space, m_result_ptr, results_ptr,
+          sizeof(*m_result_ptr) * value_count);
+    }
+
+    return last_reduction_event;
+  }
+
+ public:
+  inline void execute() {
+    Kokkos::Experimental::Impl::SYCLInternal& instance =
+        *m_policy.space().impl_internal_space_instance();
+    using IndirectKernelMem =
+        Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem;
+    IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem();
+
+    auto functor_reducer_wrapper =
+        Experimental::Impl::make_sycl_function_wrapper(m_functor_reducer,
+                                                       indirectKernelMem);
+
+    sycl::event event =
+        sycl_direct_launch(m_policy, functor_reducer_wrapper,
+                           functor_reducer_wrapper.get_copy_event());
+    functor_reducer_wrapper.register_event(event);
+    instance.register_team_scratch_event(m_scratch_pool_id, event);
+  }
+
+ private:
+  void initialize() {
+    // FIXME_SYCL optimize
+    if (m_team_size < 0)
+      m_team_size = m_policy.team_size_recommended(
+          m_functor_reducer.get_functor(), m_functor_reducer.get_reducer(),
+          ParallelReduceTag{});
+    // Must be a power of two greater than two, get the one not bigger than the
+    // requested one.
+    if ((m_team_size & m_team_size - 1) || m_team_size < 2) {
+      int temp_team_size = 2;
+      while ((temp_team_size << 1) < m_team_size) temp_team_size <<= 1;
+      m_team_size = temp_team_size;
+    }
+
+    m_shmem_begin     = (sizeof(double) * (m_team_size + 2));
+    m_shmem_size      = (m_policy.scratch_size(0, m_team_size) +
+                    FunctorTeamShmemSize<FunctorType>::value(
+                        m_functor_reducer.get_functor(), m_team_size));
+    m_scratch_size[0] = m_shmem_size;
+    m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
+
+    // Functor's reduce memory, team scan memory, and team shared memory depend
+    // upon team size.
+    auto& space       = *m_policy.space().impl_internal_space_instance();
+    m_scratch_pool_id = space.acquire_team_scratch_space();
+    m_global_scratch_ptr =
+        static_cast<sycl::device_ptr<char>>(space.resize_team_scratch_space(
+            m_scratch_pool_id,
+            static_cast<ptrdiff_t>(m_scratch_size[1]) * m_league_size));
+
+    if (static_cast<int>(space.m_maxShmemPerBlock) <
+        m_shmem_size - m_shmem_begin) {
+      std::stringstream out;
+      out << "Kokkos::Impl::ParallelFor<SYCL> insufficient shared memory! "
+             "Requested "
+          << m_shmem_size - m_shmem_begin << " bytes but maximum is "
+          << space.m_maxShmemPerBlock << '\n';
+      Kokkos::Impl::throw_runtime_exception(out.str());
+    }
+
+    if (m_team_size > m_policy.team_size_max(m_functor_reducer.get_functor(),
+                                             m_functor_reducer.get_reducer(),
+                                             ParallelReduceTag{}))
+      Kokkos::Impl::throw_runtime_exception(
+          "Kokkos::Impl::ParallelFor<SYCL> requested too large team size.");
+  }
+
+ public:
+  template <class ViewType>
+  ParallelReduce(CombinedFunctorReducerType const& arg_functor_reducer,
+                 Policy const& arg_policy, ViewType const& arg_result)
+      : m_functor_reducer(arg_functor_reducer),
+        m_policy(arg_policy),
+        m_result_ptr(arg_result.data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                              typename ViewType::memory_space>::accessible),
+        m_league_size(arg_policy.league_size()),
+        m_team_size(arg_policy.team_size()),
+        m_vector_size(arg_policy.impl_vector_length()),
+        m_scratch_lock(arg_policy.space()
+                           .impl_internal_space_instance()
+                           ->m_team_scratch_mutex) {
+    initialize();
+  }
+};
+
+#endif
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..04425723e1987586277ca6322e2066653e97c250
--- /dev/null
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp
@@ -0,0 +1,412 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKO_SYCL_PARALLEL_SCAN_RANGE_HPP
+#define KOKKO_SYCL_PARALLEL_SCAN_RANGE_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <memory>
+#include <vector>
+
+namespace Kokkos::Impl {
+
+// Perform a scan over a workgroup.
+// At the end of this function, the subgroup scans are stored in the local array
+// such that the last value (at position n_active_subgroups-1) contains the
+// total sum.
+template <int dim, typename ValueType, typename FunctorType>
+void workgroup_scan(sycl::nd_item<dim> item, const FunctorType& final_reducer,
+                    sycl::local_accessor<ValueType> local_mem,
+                    ValueType& local_value, int global_range) {
+  // subgroup scans
+  auto sg               = item.get_sub_group();
+  const int sg_group_id = sg.get_group_id()[0];
+  const int id_in_sg    = sg.get_local_id()[0];
+
+  for (int stride = 1; stride < global_range; stride <<= 1) {
+    auto tmp = sg.shuffle_up(local_value, stride);
+    if (id_in_sg >= stride) final_reducer.join(&local_value, &tmp);
+  }
+
+  const int max_subgroup_size = sg.get_max_local_range()[0];
+  const int n_active_subgroups =
+      (global_range + max_subgroup_size - 1) / max_subgroup_size;
+
+  const int local_range = sg.get_local_range()[0];
+  if (id_in_sg == local_range - 1 && sg_group_id < n_active_subgroups)
+    local_mem[sg_group_id] = local_value;
+  local_value = sg.shuffle_up(local_value, 1);
+  if (id_in_sg == 0) final_reducer.init(&local_value);
+  sycl::group_barrier(item.get_group());
+
+  // scan subgroup results using the first subgroup
+  if (n_active_subgroups > 1) {
+    if (sg_group_id == 0) {
+      const int n_rounds = (n_active_subgroups + local_range - 1) / local_range;
+      for (int round = 0; round < n_rounds; ++round) {
+        const int idx = id_in_sg + round * local_range;
+        const auto upper_bound =
+            std::min(local_range, n_active_subgroups - round * local_range);
+        auto local_sg_value = local_mem[idx < n_active_subgroups ? idx : 0];
+        for (int stride = 1; stride < upper_bound; stride <<= 1) {
+          auto tmp = sg.shuffle_up(local_sg_value, stride);
+          if (id_in_sg >= stride) {
+            if (idx < n_active_subgroups)
+              final_reducer.join(&local_sg_value, &tmp);
+            else
+              local_sg_value = tmp;
+          }
+        }
+        if (idx < n_active_subgroups) {
+          local_mem[idx] = local_sg_value;
+          if (round > 0)
+            final_reducer.join(&local_mem[idx],
+                               &local_mem[round * local_range - 1]);
+        }
+        if (round + 1 < n_rounds) sycl::group_barrier(sg);
+      }
+    }
+    sycl::group_barrier(item.get_group());
+  }
+
+  // add results to all subgroups
+  if (sg_group_id > 0)
+    final_reducer.join(&local_value, &local_mem[sg_group_id - 1]);
+}
+
+template <class FunctorType, class ValueType, class... Traits>
+class ParallelScanSYCLBase {
+ public:
+  using Policy = Kokkos::RangePolicy<Traits...>;
+
+ protected:
+  using Member       = typename Policy::member_type;
+  using WorkTag      = typename Policy::work_tag;
+  using LaunchBounds = typename Policy::launch_bounds;
+
+ public:
+  using Analysis       = FunctorAnalysis<FunctorPatternInterface::SCAN, Policy,
+                                   FunctorType, ValueType>;
+  using pointer_type   = typename Analysis::pointer_type;
+  using value_type     = typename Analysis::value_type;
+  using reference_type = typename Analysis::reference_type;
+  using functor_type   = FunctorType;
+  using size_type      = Kokkos::Experimental::SYCL::size_type;
+  using index_type     = typename Policy::index_type;
+
+ protected:
+  const CombinedFunctorReducer<FunctorType, typename Analysis::Reducer>
+      m_functor_reducer;
+  const Policy m_policy;
+  pointer_type m_scratch_space = nullptr;
+  const pointer_type m_result_ptr;
+  const bool m_result_ptr_device_accessible;
+
+  // Only let one Parallel/Scan modify the shared memory. The
+  // constructor acquires the mutex which is released in the destructor.
+  std::scoped_lock<std::mutex> m_shared_memory_lock;
+
+ private:
+  template <typename FunctorWrapper>
+  sycl::event sycl_direct_launch(const FunctorWrapper& functor_wrapper,
+                                 sycl::event memcpy_event) {
+    // Convenience references
+    const Kokkos::Experimental::SYCL& space = m_policy.space();
+    Kokkos::Experimental::Impl::SYCLInternal& instance =
+        *space.impl_internal_space_instance();
+    sycl::queue& q = space.sycl_queue();
+
+    const auto size = m_policy.end() - m_policy.begin();
+
+    auto scratch_flags = static_cast<sycl::device_ptr<unsigned int>>(
+        instance.scratch_flags(sizeof(unsigned int)));
+
+    const auto begin = m_policy.begin();
+
+    // Initialize global memory
+    auto scan_lambda_factory =
+        [&](sycl::local_accessor<value_type> local_mem,
+            sycl::local_accessor<unsigned int> num_teams_done,
+            sycl::device_ptr<value_type> global_mem_,
+            sycl::device_ptr<value_type> group_results_) {
+          auto lambda = [=](sycl::nd_item<1> item) {
+            auto global_mem    = global_mem_;
+            auto group_results = group_results_;
+
+            const CombinedFunctorReducer<
+                FunctorType, typename Analysis::Reducer>& functor_reducer =
+                functor_wrapper.get_functor();
+            const FunctorType& functor = functor_reducer.get_functor();
+            const typename Analysis::Reducer& reducer =
+                functor_reducer.get_reducer();
+
+            const auto n_wgroups  = item.get_group_range()[0];
+            const int wgroup_size = item.get_local_range()[0];
+
+            const int local_id         = item.get_local_linear_id();
+            const index_type global_id = item.get_global_linear_id();
+
+            // Initialize local memory
+            value_type local_value;
+            reducer.init(&local_value);
+            if (global_id < size) {
+              if constexpr (std::is_void<WorkTag>::value)
+                functor(global_id + begin, local_value, false);
+              else
+                functor(WorkTag(), global_id + begin, local_value, false);
+            }
+
+            workgroup_scan<>(item, reducer, local_mem, local_value,
+                             wgroup_size);
+
+            // Write results to global memory
+            if (global_id < size) global_mem[global_id] = local_value;
+
+            if (local_id == wgroup_size - 1) {
+              group_results[item.get_group_linear_id()] =
+                  local_mem[item.get_sub_group().get_group_range()[0] - 1];
+
+              sycl::atomic_ref<unsigned, sycl::memory_order::acq_rel,
+                               sycl::memory_scope::device,
+                               sycl::access::address_space::global_space>
+                  scratch_flags_ref(*scratch_flags);
+              num_teams_done[0] = ++scratch_flags_ref;
+            }
+            item.barrier(sycl::access::fence_space::global_space);
+            if (num_teams_done[0] == n_wgroups) {
+              value_type total;
+              reducer.init(&total);
+
+              for (unsigned int offset = 0; offset < n_wgroups;
+                   offset += wgroup_size) {
+                index_type id = local_id + offset;
+                if (id < static_cast<index_type>(n_wgroups))
+                  local_value = group_results[id];
+                else
+                  reducer.init(&local_value);
+                workgroup_scan<>(
+                    item, reducer, local_mem, local_value,
+                    std::min<index_type>(n_wgroups - offset, wgroup_size));
+                if (id < static_cast<index_type>(n_wgroups)) {
+                  reducer.join(&local_value, &total);
+                  group_results[id] = local_value;
+                }
+                reducer.join(
+                    &total,
+                    &local_mem[item.get_sub_group().get_group_range()[0] - 1]);
+                if (offset + wgroup_size < n_wgroups)
+                  item.barrier(sycl::access::fence_space::global_space);
+              }
+            }
+          };
+          return lambda;
+        };
+
+    size_t wgroup_size;
+    size_t n_wgroups;
+    sycl::device_ptr<value_type> global_mem;
+    sycl::device_ptr<value_type> group_results;
+
+    auto perform_work_group_scans = q.submit([&](sycl::handler& cgh) {
+      sycl::local_accessor<unsigned int> num_teams_done(1, cgh);
+
+      auto dummy_scan_lambda =
+          scan_lambda_factory({1, cgh}, num_teams_done, nullptr, nullptr);
+
+      static sycl::kernel kernel = [&] {
+        sycl::kernel_id functor_kernel_id =
+            sycl::get_kernel_id<decltype(dummy_scan_lambda)>();
+        auto kernel_bundle =
+            sycl::get_kernel_bundle<sycl::bundle_state::executable>(
+                q.get_context(), std::vector{functor_kernel_id});
+        return kernel_bundle.get_kernel(functor_kernel_id);
+      }();
+      auto multiple = kernel.get_info<sycl::info::kernel_device_specific::
+                                          preferred_work_group_size_multiple>(
+          q.get_device());
+      auto max =
+          kernel.get_info<sycl::info::kernel_device_specific::work_group_size>(
+              q.get_device());
+
+      wgroup_size = static_cast<size_t>(max / multiple) * multiple;
+      n_wgroups   = (size + wgroup_size - 1) / wgroup_size;
+
+      // Compute the total amount of memory we will need.
+      // We need to allocate memory for the whole range (rounded towards the
+      // next multiple of the workgroup size) and for one element per workgroup
+      // that will contain the sum of the previous workgroups totals.
+      // FIXME_SYCL consider only storing one value per block and recreate
+      // initial results in the end before doing the final pass
+      global_mem =
+          static_cast<sycl::device_ptr<value_type>>(instance.scratch_space(
+              n_wgroups * (wgroup_size + 1) * sizeof(value_type)));
+      m_scratch_space = global_mem;
+
+      group_results = global_mem + n_wgroups * wgroup_size;
+
+      // Store subgroup totals in local space
+      const auto min_subgroup_size =
+          q.get_device()
+              .template get_info<sycl::info::device::sub_group_sizes>()
+              .front();
+      sycl::local_accessor<value_type> local_mem(
+          sycl::range<1>((wgroup_size + min_subgroup_size - 1) /
+                         min_subgroup_size),
+          cgh);
+
+#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
+      cgh.depends_on(memcpy_event);
+#else
+      (void)memcpy_event;
+#endif
+
+      auto scan_lambda = scan_lambda_factory(local_mem, num_teams_done,
+                                             global_mem, group_results);
+      cgh.parallel_for(sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size),
+                       scan_lambda);
+    });
+
+    // Write results to global memory
+    auto update_global_results = q.submit([&](sycl::handler& cgh) {
+      auto result_ptr_device_accessible = m_result_ptr_device_accessible;
+      // The compiler failed with CL_INVALID_ARG_VALUE if using m_result_ptr
+      // directly.
+      auto result_ptr = m_result_ptr_device_accessible ? m_result_ptr : nullptr;
+
+#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
+      cgh.depends_on(perform_work_group_scans);
+#endif
+
+      cgh.parallel_for(
+          sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size),
+          [=](sycl::nd_item<1> item) {
+            auto global_mem_copy       = global_mem;
+            const index_type global_id = item.get_global_linear_id();
+            const CombinedFunctorReducer<
+                FunctorType, typename Analysis::Reducer>& functor_reducer =
+                functor_wrapper.get_functor();
+            const FunctorType& functor = functor_reducer.get_functor();
+            const typename Analysis::Reducer& reducer =
+                functor_reducer.get_reducer();
+
+            if (global_id < size) {
+              value_type update = global_mem[global_id];
+
+              reducer.join(&update, &group_results[item.get_group_linear_id()]);
+
+              if constexpr (std::is_void<WorkTag>::value)
+                functor(global_id + begin, update, true);
+              else
+                functor(WorkTag(), global_id + begin, update, true);
+
+              global_mem_copy[global_id] = update;
+              if (global_id == size - 1 && result_ptr_device_accessible)
+                *result_ptr = update;
+            }
+          });
+    });
+#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
+    q.ext_oneapi_submit_barrier(
+        std::vector<sycl::event>{update_global_results});
+#endif
+    return update_global_results;
+  }
+
+ public:
+  template <typename PostFunctor>
+  void impl_execute(const PostFunctor& post_functor) {
+    if (m_policy.begin() == m_policy.end()) return;
+
+    auto& instance = *m_policy.space().impl_internal_space_instance();
+
+    Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem&
+        indirectKernelMem = instance.get_indirect_kernel_mem();
+
+    auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
+        m_functor_reducer, indirectKernelMem);
+
+    sycl::event event =
+        sycl_direct_launch(functor_wrapper, functor_wrapper.get_copy_event());
+    functor_wrapper.register_event(event);
+    post_functor();
+  }
+
+  ParallelScanSYCLBase(const FunctorType& arg_functor, const Policy& arg_policy,
+                       pointer_type arg_result_ptr,
+                       bool arg_result_ptr_device_accessible)
+      : m_functor_reducer(arg_functor, typename Analysis::Reducer{arg_functor}),
+        m_policy(arg_policy),
+        m_result_ptr(arg_result_ptr),
+        m_result_ptr_device_accessible(arg_result_ptr_device_accessible),
+        m_shared_memory_lock(m_policy.space()
+                                 .impl_internal_space_instance()
+                                 ->m_mutexScratchSpace) {}
+};
+
+}  // namespace Kokkos::Impl
+
+template <class FunctorType, class... Traits>
+class Kokkos::Impl::ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
+                                 Kokkos::Experimental::SYCL>
+    : private ParallelScanSYCLBase<FunctorType, void, Traits...> {
+ public:
+  using Base = ParallelScanSYCLBase<FunctorType, void, Traits...>;
+
+  inline void execute() {
+    Base::impl_execute([]() {});
+  }
+
+  ParallelScan(const FunctorType& arg_functor,
+               const typename Base::Policy& arg_policy)
+      : Base(arg_functor, arg_policy, nullptr, false) {}
+};
+
+//----------------------------------------------------------------------------
+
+template <class FunctorType, class ReturnType, class... Traits>
+class Kokkos::Impl::ParallelScanWithTotal<
+    FunctorType, Kokkos::RangePolicy<Traits...>, ReturnType,
+    Kokkos::Experimental::SYCL>
+    : public ParallelScanSYCLBase<FunctorType, ReturnType, Traits...> {
+ public:
+  using Base = ParallelScanSYCLBase<FunctorType, ReturnType, Traits...>;
+
+  const Kokkos::Experimental::SYCL& m_exec;
+
+  inline void execute() {
+    Base::impl_execute([&]() {
+      const long long nwork = Base::m_policy.end() - Base::m_policy.begin();
+      if (nwork > 0 && !Base::m_result_ptr_device_accessible) {
+        const int size = Base::m_functor_reducer.get_reducer().value_size();
+        DeepCopy<HostSpace, Kokkos::Experimental::SYCLDeviceUSMSpace,
+                 Kokkos::Experimental::SYCL>(m_exec, Base::m_result_ptr,
+                                             Base::m_scratch_space + nwork - 1,
+                                             size);
+      }
+    });
+  }
+
+  template <class ViewType>
+  ParallelScanWithTotal(const FunctorType& arg_functor,
+                        const typename Base::Policy& arg_policy,
+                        const ViewType& arg_result_view)
+      : Base(arg_functor, arg_policy, arg_result_view.data(),
+             MemorySpaceAccess<Experimental::SYCLDeviceUSMSpace,
+                               typename ViewType::memory_space>::accessible),
+        m_exec(arg_policy.space()) {}
+};
+
+#endif
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp
deleted file mode 100644
index c7959c1c1c5eb04b9c4523ed2e4bbddedd7d5db2..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp
+++ /dev/null
@@ -1,856 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#ifndef KOKKOS_SYCL_PARALLEL_REDUCE_HPP
-#define KOKKOS_SYCL_PARALLEL_REDUCE_HPP
-
-#include <Kokkos_Macros.hpp>
-
-#include <vector>
-#if defined(KOKKOS_ENABLE_SYCL)
-#include <Kokkos_Parallel_Reduce.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-
-namespace Impl {
-
-template <class ReducerType>
-inline constexpr bool use_shuffle_based_algorithm =
-    std::is_reference_v<typename ReducerType::reference_type>;
-
-namespace SYCLReduction {
-template <typename ValueType, typename ReducerType, int dim>
-std::enable_if_t<!use_shuffle_based_algorithm<ReducerType>> workgroup_reduction(
-    sycl::nd_item<dim>& item, sycl::local_ptr<ValueType> local_mem,
-    sycl::device_ptr<ValueType> results_ptr,
-    sycl::global_ptr<ValueType> device_accessible_result_ptr,
-    const unsigned int value_count, const ReducerType& final_reducer,
-    bool final, unsigned int max_size) {
-  const auto local_id = item.get_local_linear_id();
-
-  // Perform the actual workgroup reduction in each subgroup
-  // separately.
-  auto sg             = item.get_sub_group();
-  auto* result        = &local_mem[local_id * value_count];
-  const auto id_in_sg = sg.get_local_id()[0];
-  const auto local_range =
-      std::min<unsigned int>(sg.get_local_range()[0], max_size);
-  const auto upper_stride_bound =
-      std::min(local_range - id_in_sg, max_size - local_id);
-  for (unsigned int stride = 1; stride < local_range; stride <<= 1) {
-    if (stride < upper_stride_bound)
-      final_reducer.join(result, &local_mem[(local_id + stride) * value_count]);
-    sycl::group_barrier(sg);
-  }
-  sycl::group_barrier(item.get_group());
-
-  // Copy the subgroup results into the first positions of the
-  // reduction array.
-  if (id_in_sg == 0)
-    final_reducer.copy(&local_mem[sg.get_group_id()[0] * value_count], result);
-  sycl::group_barrier(item.get_group());
-
-  // Do the final reduction only using the first subgroup.
-  if (sg.get_group_id()[0] == 0) {
-    const auto n_subgroups = sg.get_group_range()[0];
-    auto* result_          = &local_mem[id_in_sg * value_count];
-    // In case the number of subgroups is larger than the range of
-    // the first subgroup, we first combine the items with a higher
-    // index.
-    for (unsigned int offset = local_range; offset < n_subgroups;
-         offset += local_range)
-      if (id_in_sg + offset < n_subgroups)
-        final_reducer.join(result_,
-                           &local_mem[(id_in_sg + offset) * value_count]);
-    sycl::group_barrier(sg);
-
-    // Then, we proceed as before.
-    for (unsigned int stride = 1; stride < local_range; stride <<= 1) {
-      if (id_in_sg + stride < n_subgroups)
-        final_reducer.join(result_,
-                           &local_mem[(id_in_sg + stride) * value_count]);
-      sycl::group_barrier(sg);
-    }
-
-    // Finally, we copy the workgroup results back to global memory
-    // to be used in the next iteration. If this is the last
-    // iteration, i.e., there is only one workgroup also call
-    // final() if necessary.
-    if (id_in_sg == 0) {
-      if (final) {
-        final_reducer.final(&local_mem[0]);
-        if (device_accessible_result_ptr != nullptr)
-          final_reducer.copy(&device_accessible_result_ptr[0], &local_mem[0]);
-        else
-          final_reducer.copy(&results_ptr[0], &local_mem[0]);
-      } else
-        final_reducer.copy(
-            &results_ptr[(item.get_group_linear_id()) * value_count],
-            &local_mem[0]);
-    }
-  }
-}
-
-template <typename ValueType, typename ReducerType, int dim>
-std::enable_if_t<use_shuffle_based_algorithm<ReducerType>> workgroup_reduction(
-    sycl::nd_item<dim>& item, sycl::local_ptr<ValueType> local_mem,
-    ValueType local_value, sycl::device_ptr<ValueType> results_ptr,
-    sycl::global_ptr<ValueType> device_accessible_result_ptr,
-    const ReducerType& final_reducer, bool final, unsigned int max_size) {
-  const auto local_id = item.get_local_linear_id();
-
-  // Perform the actual workgroup reduction in each subgroup
-  // separately.
-  auto sg             = item.get_sub_group();
-  const auto id_in_sg = sg.get_local_id()[0];
-  const auto local_range =
-      std::min<unsigned int>(sg.get_local_range()[0], max_size);
-  const auto upper_stride_bound =
-      std::min(local_range - id_in_sg, max_size - local_id);
-  for (unsigned int stride = 1; stride < local_range; stride <<= 1) {
-    auto tmp = sg.shuffle_down(local_value, stride);
-    if (stride < upper_stride_bound) final_reducer.join(&local_value, &tmp);
-  }
-
-  // Copy the subgroup results into the first positions of the
-  // reduction array.
-  const auto max_subgroup_size = sg.get_max_local_range()[0];
-  const auto n_active_subgroups =
-      (max_size + max_subgroup_size - 1) / max_subgroup_size;
-  if (id_in_sg == 0 && sg.get_group_id()[0] <= n_active_subgroups)
-    local_mem[sg.get_group_id()[0]] = local_value;
-  item.barrier(sycl::access::fence_space::local_space);
-
-  // Do the final reduction only using the first subgroup.
-  if (sg.get_group_id()[0] == 0) {
-    auto sg_value = local_mem[id_in_sg < n_active_subgroups ? id_in_sg : 0];
-
-    // In case the number of subgroups is larger than the range of
-    // the first subgroup, we first combine the items with a higher
-    // index.
-    if (n_active_subgroups > local_range) {
-      for (unsigned int offset = local_range; offset < n_active_subgroups;
-           offset += local_range)
-        if (id_in_sg + offset < n_active_subgroups) {
-          final_reducer.join(&sg_value, &local_mem[(id_in_sg + offset)]);
-        }
-      sg.barrier();
-    }
-
-    // Then, we proceed as before.
-    for (unsigned int stride = 1; stride < local_range; stride <<= 1) {
-      auto tmp = sg.shuffle_down(sg_value, stride);
-      if (id_in_sg + stride < n_active_subgroups)
-        final_reducer.join(&sg_value, &tmp);
-    }
-
-    // Finally, we copy the workgroup results back to global memory
-    // to be used in the next iteration. If this is the last
-    // iteration, i.e., there is only one workgroup also call
-    // final() if necessary.
-    if (id_in_sg == 0) {
-      if (final) {
-        final_reducer.final(&sg_value);
-        if (device_accessible_result_ptr != nullptr)
-          device_accessible_result_ptr[0] = sg_value;
-        else
-          results_ptr[0] = sg_value;
-      } else
-        results_ptr[(item.get_group_linear_id())] = sg_value;
-    }
-  }
-}
-
-}  // namespace SYCLReduction
-
-template <class FunctorType, class ReducerType, class... Traits>
-class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
-                     Kokkos::Experimental::SYCL> {
- public:
-  using Policy = Kokkos::RangePolicy<Traits...>;
-
- private:
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using Analysis =
-      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, ReducerTypeFwd>;
-  using execution_space = typename Analysis::execution_space;
-  using value_type      = typename Analysis::value_type;
-  using pointer_type    = typename Analysis::pointer_type;
-  using reference_type  = typename Analysis::reference_type;
-
-  using WorkTag = typename Policy::work_tag;
-
- public:
-  // V - View
-  template <typename V>
-  ParallelReduce(const FunctorType& f, const Policy& p, const V& v,
-                 std::enable_if_t<Kokkos::is_view<V>::value, void*> = nullptr)
-      : m_functor(f),
-        m_policy(p),
-        m_result_ptr(v.data()),
-        m_result_ptr_device_accessible(
-            MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                              typename V::memory_space>::accessible),
-        m_shared_memory_lock(
-            p.space().impl_internal_space_instance()->m_mutexScratchSpace) {}
-
-  ParallelReduce(const FunctorType& f, const Policy& p,
-                 const ReducerType& reducer)
-      : m_functor(f),
-        m_policy(p),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()),
-        m_result_ptr_device_accessible(
-            MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                              typename ReducerType::result_view_type::
-                                  memory_space>::accessible),
-        m_shared_memory_lock(
-            p.space().impl_internal_space_instance()->m_mutexScratchSpace) {}
-
- private:
-  template <typename PolicyType, typename FunctorWrapper,
-            typename ReducerWrapper>
-  sycl::event sycl_direct_launch(
-      const PolicyType& policy, const FunctorWrapper& functor_wrapper,
-      const ReducerWrapper& reducer_wrapper,
-      const std::vector<sycl::event>& memcpy_events) const {
-    // Convenience references
-    const Kokkos::Experimental::SYCL& space = policy.space();
-    Kokkos::Experimental::Impl::SYCLInternal& instance =
-        *space.impl_internal_space_instance();
-    sycl::queue& q = space.sycl_queue();
-
-    constexpr size_t values_per_thread = 2;
-    std::size_t size                   = policy.end() - policy.begin();
-    const unsigned int value_count =
-        Analysis::value_count(ReducerConditional::select(m_functor, m_reducer));
-    sycl::device_ptr<value_type> results_ptr = nullptr;
-    sycl::global_ptr<value_type> device_accessible_result_ptr =
-        m_result_ptr_device_accessible ? m_result_ptr : nullptr;
-
-    sycl::event last_reduction_event;
-
-    // If size<=1 we only call init(), the functor and possibly final once
-    // working with the global scratch memory but don't copy back to
-    // m_result_ptr yet.
-    if (size <= 1) {
-      results_ptr =
-          static_cast<sycl::device_ptr<value_type>>(instance.scratch_space(
-              sizeof(value_type) * std::max(value_count, 1u)));
-
-      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
-        const auto begin = policy.begin();
-        cgh.depends_on(memcpy_events);
-        cgh.single_task([=]() {
-          const auto& functor          = functor_wrapper.get_functor();
-          const auto& selected_reducer = ReducerConditional::select(
-              static_cast<const FunctorType&>(functor),
-              static_cast<const ReducerType&>(reducer_wrapper.get_functor()));
-          typename Analysis::Reducer final_reducer(&selected_reducer);
-          reference_type update = final_reducer.init(results_ptr);
-          if (size == 1) {
-            if constexpr (std::is_void<WorkTag>::value)
-              functor(begin, update);
-            else
-              functor(WorkTag(), begin, update);
-          }
-          final_reducer.final(results_ptr);
-          if (device_accessible_result_ptr != nullptr)
-            final_reducer.copy(device_accessible_result_ptr.get(),
-                               results_ptr.get());
-        });
-      });
-      q.ext_oneapi_submit_barrier(
-          std::vector<sycl::event>{parallel_reduce_event});
-      last_reduction_event = parallel_reduce_event;
-    }
-
-    // Otherwise, we perform a reduction on the values in all workgroups
-    // separately, write the workgroup results back to global memory and recurse
-    // until only one workgroup does the reduction and thus gets the final
-    // value.
-    if (size > 1) {
-      auto scratch_flags = static_cast<sycl::device_ptr<unsigned int>>(
-          instance.scratch_flags(sizeof(unsigned int)));
-
-      auto reduction_lambda_factory =
-          [&](sycl::accessor<value_type, 1, sycl::access::mode::read_write,
-                             sycl::access::target::local>
-                  local_mem,
-              sycl::accessor<unsigned int, 1, sycl::access::mode::read_write,
-                             sycl::access::target::local>
-                  num_teams_done,
-              sycl::device_ptr<value_type> results_ptr) {
-            const auto begin = policy.begin();
-
-            auto lambda = [=](sycl::nd_item<1> item) {
-              const auto n_wgroups   = item.get_group_range()[0];
-              const auto wgroup_size = item.get_local_range()[0];
-
-              const auto local_id = item.get_local_linear_id();
-              const auto global_id =
-                  wgroup_size * item.get_group_linear_id() * values_per_thread +
-                  local_id;
-              const auto& functor          = functor_wrapper.get_functor();
-              const auto& selected_reducer = ReducerConditional::select(
-                  static_cast<const FunctorType&>(functor),
-                  static_cast<const ReducerType&>(
-                      reducer_wrapper.get_functor()));
-              typename Analysis::Reducer final_reducer(&selected_reducer);
-
-              using index_type       = typename Policy::index_type;
-              const auto upper_bound = std::min<index_type>(
-                  global_id + values_per_thread * wgroup_size, size);
-
-              if constexpr (Analysis::StaticValueSize == 0) {
-                reference_type update =
-                    final_reducer.init(&local_mem[local_id * value_count]);
-                for (index_type id = global_id; id < upper_bound;
-                     id += wgroup_size) {
-                  if constexpr (std::is_void<WorkTag>::value)
-                    functor(id + begin, update);
-                  else
-                    functor(WorkTag(), id + begin, update);
-                }
-                item.barrier(sycl::access::fence_space::local_space);
-
-                SYCLReduction::workgroup_reduction<>(
-                    item, local_mem.get_pointer(), results_ptr,
-                    device_accessible_result_ptr, value_count, final_reducer,
-                    false, std::min(size, wgroup_size));
-
-                if (local_id == 0) {
-                  sycl::atomic_ref<unsigned, sycl::memory_order::relaxed,
-                                   sycl::memory_scope::device,
-                                   sycl::access::address_space::global_space>
-                      scratch_flags_ref(*scratch_flags);
-                  num_teams_done[0] = ++scratch_flags_ref;
-                }
-                item.barrier(sycl::access::fence_space::local_space);
-                if (num_teams_done[0] == n_wgroups) {
-                  if (local_id >= n_wgroups)
-                    final_reducer.init(&local_mem[local_id * value_count]);
-                  else {
-                    final_reducer.copy(&local_mem[local_id * value_count],
-                                       &results_ptr[local_id * value_count]);
-                    for (unsigned int id = local_id + wgroup_size;
-                         id < n_wgroups; id += wgroup_size) {
-                      final_reducer.join(&local_mem[local_id * value_count],
-                                         &results_ptr[id * value_count]);
-                    }
-                  }
-
-                  SYCLReduction::workgroup_reduction<>(
-                      item, local_mem.get_pointer(), results_ptr,
-                      device_accessible_result_ptr, value_count, final_reducer,
-                      true, std::min(n_wgroups, wgroup_size));
-                }
-              } else {
-                value_type local_value;
-                reference_type update = final_reducer.init(&local_value);
-                for (index_type id = global_id; id < upper_bound;
-                     id += wgroup_size) {
-                  if constexpr (std::is_void<WorkTag>::value)
-                    functor(id + begin, update);
-                  else
-                    functor(WorkTag(), id + begin, update);
-                }
-
-                SYCLReduction::workgroup_reduction<>(
-                    item, local_mem.get_pointer(), local_value, results_ptr,
-                    device_accessible_result_ptr, final_reducer, false,
-                    std::min(size, wgroup_size));
-
-                if (local_id == 0) {
-                  sycl::atomic_ref<unsigned, sycl::memory_order::relaxed,
-                                   sycl::memory_scope::device,
-                                   sycl::access::address_space::global_space>
-                      scratch_flags_ref(*scratch_flags);
-                  num_teams_done[0] = ++scratch_flags_ref;
-                }
-                item.barrier(sycl::access::fence_space::local_space);
-                if (num_teams_done[0] == n_wgroups) {
-                  if (local_id >= n_wgroups)
-                    final_reducer.init(&local_value);
-                  else {
-                    local_value = results_ptr[local_id];
-                    for (unsigned int id = local_id + wgroup_size;
-                         id < n_wgroups; id += wgroup_size) {
-                      final_reducer.join(&local_value, &results_ptr[id]);
-                    }
-                  }
-
-                  SYCLReduction::workgroup_reduction<>(
-                      item, local_mem.get_pointer(), local_value, results_ptr,
-                      device_accessible_result_ptr, final_reducer, true,
-                      std::min(n_wgroups, wgroup_size));
-                }
-              }
-            };
-            return lambda;
-          };
-
-      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
-        sycl::accessor<unsigned int, 1, sycl::access::mode::read_write,
-                       sycl::access::target::local>
-            num_teams_done(1, cgh);
-
-        auto dummy_reduction_lambda =
-            reduction_lambda_factory({1, cgh}, num_teams_done, nullptr);
-
-        static sycl::kernel kernel = [&] {
-          sycl::kernel_id functor_kernel_id =
-              sycl::get_kernel_id<decltype(dummy_reduction_lambda)>();
-          auto kernel_bundle =
-              sycl::get_kernel_bundle<sycl::bundle_state::executable>(
-                  q.get_context(), std::vector{functor_kernel_id});
-          return kernel_bundle.get_kernel(functor_kernel_id);
-        }();
-        auto multiple = kernel.get_info<sycl::info::kernel_device_specific::
-                                            preferred_work_group_size_multiple>(
-            q.get_device());
-        auto max =
-            kernel
-                .get_info<sycl::info::kernel_device_specific::work_group_size>(
-                    q.get_device());
-
-// FIXME_SYCL 1024 seems to be invalid when running on a Volta70.
-#ifndef KOKKOS_ARCH_INTEL_GPU
-        if (max > 512) max = 512;
-#endif
-
-        const size_t wgroup_size =
-            static_cast<size_t>(max / multiple) * multiple;
-
-        const std::size_t init_size =
-            ((size + values_per_thread - 1) / values_per_thread + wgroup_size -
-             1) /
-            wgroup_size;
-        results_ptr =
-            static_cast<sycl::device_ptr<value_type>>(instance.scratch_space(
-                sizeof(value_type) * std::max(value_count, 1u) * init_size));
-
-        auto n_wgroups = ((size + values_per_thread - 1) / values_per_thread +
-                          wgroup_size - 1) /
-                         wgroup_size;
-
-        sycl::accessor<value_type, 1, sycl::access::mode::read_write,
-                       sycl::access::target::local>
-            local_mem(sycl::range<1>(wgroup_size) * std::max(value_count, 1u),
-                      cgh);
-
-        cgh.depends_on(memcpy_events);
-
-        auto reduction_lambda =
-            reduction_lambda_factory(local_mem, num_teams_done, results_ptr);
-        cgh.parallel_for(
-            sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size),
-            reduction_lambda);
-      });
-
-      last_reduction_event = q.ext_oneapi_submit_barrier(
-          std::vector<sycl::event>{parallel_reduce_event});
-    }
-
-    // At this point, the reduced value is written to the entry in results_ptr
-    // and all that is left is to copy it back to the given result pointer if
-    // necessary.
-    if (m_result_ptr && !m_result_ptr_device_accessible) {
-      Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                             Kokkos::Experimental::SYCLDeviceUSMSpace>(
-          space, m_result_ptr, results_ptr,
-          sizeof(*m_result_ptr) * value_count);
-    }
-
-    return last_reduction_event;
-  }
-
- public:
-  void execute() const {
-    Kokkos::Experimental::Impl::SYCLInternal& instance =
-        *m_policy.space().impl_internal_space_instance();
-    using IndirectKernelMem =
-        Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem;
-    IndirectKernelMem& indirectKernelMem  = instance.get_indirect_kernel_mem();
-    IndirectKernelMem& indirectReducerMem = instance.get_indirect_kernel_mem();
-
-    auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
-        m_functor, indirectKernelMem);
-    auto reducer_wrapper = Experimental::Impl::make_sycl_function_wrapper(
-        m_reducer, indirectReducerMem);
-
-    sycl::event event = sycl_direct_launch(
-        m_policy, functor_wrapper, reducer_wrapper,
-        {functor_wrapper.get_copy_event(), reducer_wrapper.get_copy_event()});
-    functor_wrapper.register_event(event);
-    reducer_wrapper.register_event(event);
-  }
-
- private:
-  const FunctorType m_functor;
-  const Policy m_policy;
-  const ReducerType m_reducer;
-  const pointer_type m_result_ptr;
-  const bool m_result_ptr_device_accessible;
-
-  // Only let one Parallel/Scan modify the shared memory. The
-  // constructor acquires the mutex which is released in the destructor.
-  std::scoped_lock<std::mutex> m_shared_memory_lock;
-};
-
-template <class FunctorType, class ReducerType, class... Traits>
-class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
-                     Kokkos::Experimental::SYCL> {
- public:
-  using Policy = Kokkos::MDRangePolicy<Traits...>;
-
- private:
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using Analysis =
-      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, ReducerTypeFwd>;
-  using execution_space = typename Analysis::execution_space;
-  using value_type      = typename Analysis::value_type;
-  using pointer_type    = typename Analysis::pointer_type;
-  using reference_type  = typename Analysis::reference_type;
-
-  using WorkTag = typename Policy::work_tag;
-
-  // MDRangePolicy is not trivially copyable. Hence, replicate the data we
-  // really need in DeviceIterateTile in a trivially copyable struct.
-  struct BarePolicy {
-    using index_type = typename Policy::index_type;
-
-    BarePolicy(const Policy& policy)
-        : m_lower(policy.m_lower),
-          m_upper(policy.m_upper),
-          m_tile(policy.m_tile),
-          m_tile_end(policy.m_tile_end),
-          m_num_tiles(policy.m_num_tiles),
-          m_prod_tile_dims(policy.m_prod_tile_dims) {}
-
-    const typename Policy::point_type m_lower;
-    const typename Policy::point_type m_upper;
-    const typename Policy::tile_type m_tile;
-    const typename Policy::point_type m_tile_end;
-    const typename Policy::index_type m_num_tiles;
-    const typename Policy::index_type m_prod_tile_dims;
-    static constexpr Iterate inner_direction = Policy::inner_direction;
-    static constexpr int rank                = Policy::rank;
-  };
-
- public:
-  // V - View
-  template <typename V>
-  ParallelReduce(const FunctorType& f, const Policy& p, const V& v,
-                 std::enable_if_t<Kokkos::is_view<V>::value, void*> = nullptr)
-      : m_functor(f),
-        m_policy(p),
-        m_space(p.space()),
-        m_result_ptr(v.data()),
-        m_result_ptr_device_accessible(
-            MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                              typename V::memory_space>::accessible),
-        m_shared_memory_lock(
-            m_space.impl_internal_space_instance()->m_mutexScratchSpace) {}
-
-  ParallelReduce(const FunctorType& f, const Policy& p,
-                 const ReducerType& reducer)
-      : m_functor(f),
-        m_policy(p),
-        m_space(p.space()),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()),
-        m_result_ptr_device_accessible(
-            MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                              typename ReducerType::result_view_type::
-                                  memory_space>::accessible),
-        m_shared_memory_lock(
-            m_space.impl_internal_space_instance()->m_mutexScratchSpace) {}
-
- private:
-  template <typename PolicyType, typename FunctorWrapper,
-            typename ReducerWrapper>
-  sycl::event sycl_direct_launch(
-      const PolicyType& policy, const FunctorWrapper& functor_wrapper,
-      const ReducerWrapper& reducer_wrapper,
-      const std::vector<sycl::event>& memcpy_events) const {
-    // Convenience references
-    Kokkos::Experimental::Impl::SYCLInternal& instance =
-        *m_space.impl_internal_space_instance();
-    sycl::queue& q = m_space.sycl_queue();
-
-    const typename Policy::index_type nwork = m_policy.m_num_tiles;
-    const typename Policy::index_type block_size =
-        std::pow(2, std::ceil(std::log2(m_policy.m_prod_tile_dims)));
-
-    const sycl::range<1> local_range(block_size);
-    // REMEMBER swap local x<->y to be conforming with Cuda/HIP implementation
-    const sycl::range<1> global_range(nwork * block_size);
-    const sycl::nd_range<1> range{global_range, local_range};
-
-    const size_t wgroup_size = range.get_local_range().size();
-    size_t size              = range.get_global_range().size();
-    const auto init_size =
-        std::max<std::size_t>((size + wgroup_size - 1) / wgroup_size, 1);
-    const unsigned int value_count =
-        Analysis::value_count(ReducerConditional::select(m_functor, m_reducer));
-    const auto results_ptr =
-        static_cast<sycl::device_ptr<value_type>>(instance.scratch_space(
-            sizeof(value_type) * std::max(value_count, 1u) * init_size));
-    sycl::global_ptr<value_type> device_accessible_result_ptr =
-        m_result_ptr_device_accessible ? m_result_ptr : nullptr;
-    auto scratch_flags = static_cast<sycl::device_ptr<unsigned int>>(
-        instance.scratch_flags(sizeof(unsigned int)));
-
-    sycl::event last_reduction_event;
-
-    // If size<=1 we only call init(), the functor and possibly final once
-    // working with the global scratch memory but don't copy back to
-    // m_result_ptr yet.
-    if (size <= 1) {
-      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
-        cgh.depends_on(memcpy_events);
-        cgh.single_task([=]() {
-          const auto& functor          = functor_wrapper.get_functor();
-          const auto& selected_reducer = ReducerConditional::select(
-              static_cast<const FunctorType&>(functor),
-              static_cast<const ReducerType&>(reducer_wrapper.get_functor()));
-          typename Analysis::Reducer final_reducer(&selected_reducer);
-
-          reference_type update = final_reducer.init(results_ptr);
-          if (size == 1) {
-            Kokkos::Impl::Reduce::DeviceIterateTile<
-                Policy::rank, BarePolicy, FunctorType,
-                typename Policy::work_tag, reference_type>(
-                policy, functor, update, {1, 1, 1}, {0, 0, 0}, {0, 0, 0})
-                .exec_range();
-          }
-          final_reducer.final(results_ptr);
-          if (device_accessible_result_ptr)
-            final_reducer.copy(device_accessible_result_ptr.get(),
-                               results_ptr.get());
-        });
-      });
-      q.ext_oneapi_submit_barrier(
-          std::vector<sycl::event>{parallel_reduce_event});
-      last_reduction_event = parallel_reduce_event;
-    }
-
-    // Otherwise, we perform a reduction on the values in all workgroups
-    // separately, write the workgroup results back to global memory and recurse
-    // until only one workgroup does the reduction and thus gets the final
-    // value.
-    if (size > 1) {
-      auto n_wgroups             = (size + wgroup_size - 1) / wgroup_size;
-      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
-        sycl::accessor<value_type, 1, sycl::access::mode::read_write,
-                       sycl::access::target::local>
-            local_mem(sycl::range<1>(wgroup_size) * std::max(value_count, 1u),
-                      cgh);
-        sycl::accessor<unsigned int, 1, sycl::access::mode::read_write,
-                       sycl::access::target::local>
-            num_teams_done(1, cgh);
-
-        const BarePolicy bare_policy = m_policy;
-
-        cgh.depends_on(memcpy_events);
-
-        cgh.parallel_for(range, [=](sycl::nd_item<1> item) {
-          const auto local_id          = item.get_local_linear_id();
-          const auto& functor          = functor_wrapper.get_functor();
-          const auto& selected_reducer = ReducerConditional::select(
-              static_cast<const FunctorType&>(functor),
-              static_cast<const ReducerType&>(reducer_wrapper.get_functor()));
-          typename Analysis::Reducer final_reducer(&selected_reducer);
-
-          // In the first iteration, we call functor to initialize the local
-          // memory. Otherwise, the local memory is initialized with the
-          // results from the previous iteration that are stored in global
-          // memory.
-          using index_type = typename Policy::index_type;
-
-          // SWAPPED here to be conforming with CUDA implementation
-          const index_type local_x    = 0;
-          const index_type local_y    = item.get_local_id(0);
-          const index_type local_z    = 0;
-          const index_type global_x   = item.get_group(0);
-          const index_type global_y   = 0;
-          const index_type global_z   = 0;
-          const index_type n_global_x = item.get_group_range(0);
-          const index_type n_global_y = 1;
-          const index_type n_global_z = 1;
-
-          if constexpr (Analysis::StaticValueSize == 0) {
-            reference_type update =
-                final_reducer.init(&local_mem[local_id * value_count]);
-
-            Kokkos::Impl::Reduce::DeviceIterateTile<
-                Policy::rank, BarePolicy, FunctorType,
-                typename Policy::work_tag, reference_type>(
-                bare_policy, functor, update,
-                {n_global_x, n_global_y, n_global_z},
-                {global_x, global_y, global_z}, {local_x, local_y, local_z})
-                .exec_range();
-            item.barrier(sycl::access::fence_space::local_space);
-
-            SYCLReduction::workgroup_reduction<>(
-                item, local_mem.get_pointer(), results_ptr,
-                device_accessible_result_ptr, value_count, final_reducer, false,
-                std::min(size, wgroup_size));
-
-            if (local_id == 0) {
-              sycl::atomic_ref<unsigned, sycl::memory_order::relaxed,
-                               sycl::memory_scope::device,
-                               sycl::access::address_space::global_space>
-                  scratch_flags_ref(*scratch_flags);
-              num_teams_done[0] = ++scratch_flags_ref;
-            }
-            item.barrier(sycl::access::fence_space::local_space);
-            if (num_teams_done[0] == n_wgroups) {
-              if (local_id >= n_wgroups)
-                final_reducer.init(&local_mem[local_id * value_count]);
-              else {
-                final_reducer.copy(&local_mem[local_id * value_count],
-                                   &results_ptr[local_id * value_count]);
-                for (unsigned int id = local_id + wgroup_size; id < n_wgroups;
-                     id += wgroup_size) {
-                  final_reducer.join(&local_mem[local_id * value_count],
-                                     &results_ptr[id * value_count]);
-                }
-              }
-
-              SYCLReduction::workgroup_reduction<>(
-                  item, local_mem.get_pointer(), results_ptr,
-                  device_accessible_result_ptr, value_count, final_reducer,
-                  true, std::min(n_wgroups, wgroup_size));
-            }
-          } else {
-            value_type local_value;
-            reference_type update = final_reducer.init(&local_value);
-
-            Kokkos::Impl::Reduce::DeviceIterateTile<
-                Policy::rank, BarePolicy, FunctorType,
-                typename Policy::work_tag, reference_type>(
-                bare_policy, functor, update,
-                {n_global_x, n_global_y, n_global_z},
-                {global_x, global_y, global_z}, {local_x, local_y, local_z})
-                .exec_range();
-
-            SYCLReduction::workgroup_reduction<>(
-                item, local_mem.get_pointer(), local_value, results_ptr,
-                device_accessible_result_ptr, final_reducer, false,
-                std::min(size, wgroup_size));
-
-            if (local_id == 0) {
-              sycl::atomic_ref<unsigned, sycl::memory_order::relaxed,
-                               sycl::memory_scope::device,
-                               sycl::access::address_space::global_space>
-                  scratch_flags_ref(*scratch_flags);
-              num_teams_done[0] = ++scratch_flags_ref;
-            }
-            item.barrier(sycl::access::fence_space::local_space);
-            if (num_teams_done[0] == n_wgroups) {
-              if (local_id >= n_wgroups)
-                final_reducer.init(&local_value);
-              else {
-                local_value = results_ptr[local_id];
-                for (unsigned int id = local_id + wgroup_size; id < n_wgroups;
-                     id += wgroup_size) {
-                  final_reducer.join(&local_value, &results_ptr[id]);
-                }
-              }
-
-              SYCLReduction::workgroup_reduction<>(
-                  item, local_mem.get_pointer(), local_value, results_ptr,
-                  device_accessible_result_ptr, final_reducer, true,
-                  std::min(n_wgroups, wgroup_size));
-            }
-          }
-        });
-      });
-      last_reduction_event       = q.ext_oneapi_submit_barrier(
-          std::vector<sycl::event>{parallel_reduce_event});
-    }
-
-    // At this point, the reduced value is written to the entry in results_ptr
-    // and all that is left is to copy it back to the given result pointer if
-    // necessary.
-    if (m_result_ptr && !m_result_ptr_device_accessible) {
-      Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                             Kokkos::Experimental::SYCLDeviceUSMSpace>(
-          m_space, m_result_ptr, results_ptr,
-          sizeof(*m_result_ptr) * value_count);
-    }
-
-    return last_reduction_event;
-  }
-
- public:
-  template <typename Policy, typename Functor>
-  static int max_tile_size_product(const Policy& policy, const Functor&) {
-    return policy.space().impl_internal_space_instance()->m_maxWorkgroupSize;
-  }
-
-  void execute() const {
-    Kokkos::Experimental::Impl::SYCLInternal& instance =
-        *m_space.impl_internal_space_instance();
-    using IndirectKernelMem =
-        Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem;
-    IndirectKernelMem& indirectKernelMem  = instance.get_indirect_kernel_mem();
-    IndirectKernelMem& indirectReducerMem = instance.get_indirect_kernel_mem();
-
-    auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
-        m_functor, indirectKernelMem);
-    auto reducer_wrapper = Experimental::Impl::make_sycl_function_wrapper(
-        m_reducer, indirectReducerMem);
-
-    sycl::event event = sycl_direct_launch(
-        m_policy, functor_wrapper, reducer_wrapper,
-        {functor_wrapper.get_copy_event(), reducer_wrapper.get_copy_event()});
-    functor_wrapper.register_event(event);
-    reducer_wrapper.register_event(event);
-  }
-
- private:
-  const FunctorType m_functor;
-  const BarePolicy m_policy;
-  const Kokkos::Experimental::SYCL& m_space;
-  const ReducerType m_reducer;
-  const pointer_type m_result_ptr;
-  const bool m_result_ptr_device_accessible;
-
-  // Only let one Parallel/Scan modify the shared memory. The
-  // constructor acquires the mutex which is released in the destructor.
-  std::scoped_lock<std::mutex> m_shared_memory_lock;
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif
-#endif /* KOKKOS_SYCL_PARALLEL_REDUCE_HPP */
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp
deleted file mode 100644
index cf651ced95e5c1f34ad8705189b6cd15f8c526ba..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp
+++ /dev/null
@@ -1,363 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#ifndef KOKKO_SYCL_PARALLEL_SCAN_HPP
-#define KOKKO_SYCL_PARALLEL_SCAN_HPP
-
-#include <Kokkos_Macros.hpp>
-#include <memory>
-#include <vector>
-#if defined(KOKKOS_ENABLE_SYCL)
-
-namespace Kokkos {
-namespace Impl {
-
-// Perform a scan over a workgroup.
-// At the end of this function, the subgroup scans are stored in the local array
-// such that the last value (at position n_active_subgroups-1) contains the
-// total sum.
-template <int dim, typename ValueType, typename FunctorType>
-void workgroup_scan(sycl::nd_item<dim> item, const FunctorType& final_reducer,
-                    sycl::local_ptr<ValueType> local_mem,
-                    ValueType& local_value, unsigned int global_range) {
-  // subgroup scans
-  auto sg                = item.get_sub_group();
-  const auto sg_group_id = sg.get_group_id()[0];
-  const auto id_in_sg    = sg.get_local_id()[0];
-  for (unsigned int stride = 1; stride < global_range; stride <<= 1) {
-    auto tmp = sg.shuffle_up(local_value, stride);
-    if (id_in_sg >= stride) final_reducer.join(&local_value, &tmp);
-  }
-
-  const auto max_subgroup_size = sg.get_max_local_range()[0];
-  const auto n_active_subgroups =
-      (global_range + max_subgroup_size - 1) / max_subgroup_size;
-
-  const auto local_range = sg.get_local_range()[0];
-  if (id_in_sg == local_range - 1 && sg_group_id < n_active_subgroups)
-    local_mem[sg_group_id] = local_value;
-  local_value = sg.shuffle_up(local_value, 1);
-  if (id_in_sg == 0) final_reducer.init(&local_value);
-  sycl::group_barrier(item.get_group());
-
-  // scan subgroup results using the first subgroup
-  if (n_active_subgroups > 1) {
-    if (sg_group_id == 0) {
-      const auto n_rounds =
-          (n_active_subgroups + local_range - 1) / local_range;
-      for (unsigned int round = 0; round < n_rounds; ++round) {
-        const unsigned int idx = id_in_sg + round * local_range;
-        const auto upper_bound =
-            std::min(local_range, n_active_subgroups - round * local_range);
-        auto local_sg_value = local_mem[idx < n_active_subgroups ? idx : 0];
-        for (unsigned int stride = 1; stride < upper_bound; stride <<= 1) {
-          auto tmp = sg.shuffle_up(local_sg_value, stride);
-          if (id_in_sg >= stride) {
-            if (idx < n_active_subgroups)
-              final_reducer.join(&local_sg_value, &tmp);
-            else
-              local_sg_value = tmp;
-          }
-        }
-        if (idx < n_active_subgroups) {
-          local_mem[idx] = local_sg_value;
-          if (round > 0)
-            final_reducer.join(&local_mem[idx],
-                               &local_mem[round * local_range - 1]);
-        }
-        if (round + 1 < n_rounds) sycl::group_barrier(sg);
-      }
-    }
-    sycl::group_barrier(item.get_group());
-  }
-
-  // add results to all subgroups
-  if (sg_group_id > 0)
-    final_reducer.join(&local_value, &local_mem[sg_group_id - 1]);
-}
-
-template <class FunctorType, class... Traits>
-class ParallelScanSYCLBase {
- public:
-  using Policy = Kokkos::RangePolicy<Traits...>;
-
- protected:
-  using Member       = typename Policy::member_type;
-  using WorkTag      = typename Policy::work_tag;
-  using WorkRange    = typename Policy::WorkRange;
-  using LaunchBounds = typename Policy::launch_bounds;
-
- public:
-  using Analysis =
-      FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType>;
-  using pointer_type   = typename Analysis::pointer_type;
-  using value_type     = typename Analysis::value_type;
-  using reference_type = typename Analysis::reference_type;
-  using functor_type   = FunctorType;
-  using size_type      = Kokkos::Experimental::SYCL::size_type;
-  using index_type     = typename Policy::index_type;
-
- protected:
-  const FunctorType m_functor;
-  const Policy m_policy;
-  pointer_type m_scratch_space = nullptr;
-  const pointer_type m_result_ptr;
-  const bool m_result_ptr_device_accessible;
-
-  // Only let one Parallel/Scan modify the shared memory. The
-  // constructor acquires the mutex which is released in the destructor.
-  std::scoped_lock<std::mutex> m_shared_memory_lock;
-
- private:
-  template <typename FunctorWrapper>
-  void scan_internal(sycl::queue& q, const FunctorWrapper& functor_wrapper,
-                     pointer_type global_mem, std::size_t size) const {
-    // FIXME_SYCL optimize
-    constexpr size_t wgroup_size = 128;
-    auto n_wgroups               = (size + wgroup_size - 1) / wgroup_size;
-    pointer_type group_results   = global_mem + n_wgroups * wgroup_size;
-
-    auto local_scans = q.submit([&](sycl::handler& cgh) {
-      // Store subgroup totals
-      const auto min_subgroup_size =
-          q.get_device()
-              .template get_info<sycl::info::device::sub_group_sizes>()
-              .front();
-      sycl::accessor<value_type, 1, sycl::access::mode::read_write,
-                     sycl::access::target::local>
-          local_mem(sycl::range<1>((wgroup_size + min_subgroup_size - 1) /
-                                   min_subgroup_size),
-                    cgh);
-
-      cgh.parallel_for(
-          sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size),
-          [=](sycl::nd_item<1> item) {
-            const FunctorType& functor = functor_wrapper.get_functor();
-            typename Analysis::Reducer final_reducer(&functor);
-
-            const auto local_id  = item.get_local_linear_id();
-            const auto global_id = item.get_global_linear_id();
-
-            // Initialize local memory
-            value_type local_value;
-            if (global_id < size)
-              local_value = global_mem[global_id];
-            else
-              final_reducer.init(&local_value);
-
-            workgroup_scan<>(item, final_reducer, local_mem.get_pointer(),
-                             local_value, wgroup_size);
-
-            if (n_wgroups > 1 && local_id == wgroup_size - 1)
-              group_results[item.get_group_linear_id()] =
-                  local_mem[item.get_sub_group().get_group_range()[0] - 1];
-
-            // Write results to global memory
-            if (global_id < size) global_mem[global_id] = local_value;
-          });
-    });
-    q.ext_oneapi_submit_barrier(std::vector<sycl::event>{local_scans});
-
-    if (n_wgroups > 1) {
-      scan_internal(q, functor_wrapper, group_results, n_wgroups);
-      auto update_with_group_results = q.submit([&](sycl::handler& cgh) {
-        cgh.parallel_for(
-            sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size),
-            [=](sycl::nd_item<1> item) {
-              const auto global_id       = item.get_global_linear_id();
-              const FunctorType& functor = functor_wrapper.get_functor();
-              typename Analysis::Reducer final_reducer(&functor);
-              if (global_id < size)
-                final_reducer.join(&global_mem[global_id],
-                                   &group_results[item.get_group_linear_id()]);
-            });
-      });
-      q.ext_oneapi_submit_barrier(
-          std::vector<sycl::event>{update_with_group_results});
-    }
-  }
-
-  template <typename FunctorWrapper>
-  sycl::event sycl_direct_launch(const FunctorWrapper& functor_wrapper,
-                                 sycl::event memcpy_event) const {
-    // Convenience references
-    const Kokkos::Experimental::SYCL& space = m_policy.space();
-    sycl::queue& q                          = space.sycl_queue();
-
-    const std::size_t len = m_policy.end() - m_policy.begin();
-
-    // Initialize global memory
-    auto initialize_global_memory = q.submit([&](sycl::handler& cgh) {
-      auto global_mem = m_scratch_space;
-      auto begin      = m_policy.begin();
-
-      cgh.depends_on(memcpy_event);
-      cgh.parallel_for(sycl::range<1>(len), [=](sycl::item<1> item) {
-        const typename Policy::index_type id =
-            static_cast<typename Policy::index_type>(item.get_id()) + begin;
-        const FunctorType& functor = functor_wrapper.get_functor();
-        typename Analysis::Reducer final_reducer(&functor);
-
-        value_type update{};
-        final_reducer.init(&update);
-        if constexpr (std::is_void<WorkTag>::value)
-          functor_wrapper.get_functor()(id, update, false);
-        else
-          functor_wrapper.get_functor()(WorkTag(), id, update, false);
-        global_mem[id] = update;
-      });
-    });
-    q.ext_oneapi_submit_barrier(
-        std::vector<sycl::event>{initialize_global_memory});
-
-    // Perform the actual exclusive scan
-    scan_internal(q, functor_wrapper, m_scratch_space, len);
-
-    // Write results to global memory
-    auto update_global_results = q.submit([&](sycl::handler& cgh) {
-      auto global_mem                   = m_scratch_space;
-      auto result_ptr_device_accessible = m_result_ptr_device_accessible;
-      // The compiler failed with CL_INVALID_ARG_VALUE if using m_result_ptr
-      // directly.
-      auto result_ptr = m_result_ptr_device_accessible ? m_result_ptr : nullptr;
-      cgh.parallel_for(sycl::range<1>(len), [=](sycl::item<1> item) {
-        auto global_id = item.get_id(0);
-
-        value_type update = global_mem[global_id];
-        if constexpr (std::is_void<WorkTag>::value)
-          functor_wrapper.get_functor()(global_id, update, true);
-        else
-          functor_wrapper.get_functor()(WorkTag(), global_id, update, true);
-        global_mem[global_id] = update;
-        if (global_id == len - 1 && result_ptr_device_accessible)
-          *result_ptr = update;
-      });
-    });
-    q.ext_oneapi_submit_barrier(
-        std::vector<sycl::event>{update_global_results});
-    return update_global_results;
-  }
-
- public:
-  template <typename PostFunctor>
-  void impl_execute(const PostFunctor& post_functor) {
-    if (m_policy.begin() == m_policy.end()) return;
-
-    auto& instance        = *m_policy.space().impl_internal_space_instance();
-    const std::size_t len = m_policy.end() - m_policy.begin();
-
-    // Compute the total amount of memory we will need. We emulate the recursive
-    // structure that is used to do the actual scan. Essentially, we need to
-    // allocate memory for the whole range and then recursively for the reduced
-    // group results until only one group is left.
-    std::size_t total_memory = 0;
-    {
-      size_t wgroup_size   = 128;
-      size_t n_nested_size = len;
-      size_t n_nested_wgroups;
-      do {
-        n_nested_wgroups = (n_nested_size + wgroup_size - 1) / wgroup_size;
-        n_nested_size    = n_nested_wgroups;
-        total_memory += sizeof(value_type) * n_nested_wgroups * wgroup_size;
-      } while (n_nested_wgroups > 1);
-      total_memory += sizeof(value_type) * wgroup_size;
-    }
-
-    // FIXME_SYCL consider only storing one value per block and recreate initial
-    // results in the end before doing the final pass
-    m_scratch_space = static_cast<sycl::device_ptr<value_type>>(
-        instance.scratch_space(total_memory));
-
-    Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem&
-        indirectKernelMem = instance.get_indirect_kernel_mem();
-
-    auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
-        m_functor, indirectKernelMem);
-
-    sycl::event event =
-        sycl_direct_launch(functor_wrapper, functor_wrapper.get_copy_event());
-    functor_wrapper.register_event(event);
-    post_functor();
-  }
-
-  ParallelScanSYCLBase(const FunctorType& arg_functor, const Policy& arg_policy,
-                       pointer_type arg_result_ptr,
-                       bool arg_result_ptr_device_accessible)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_result_ptr(arg_result_ptr),
-        m_result_ptr_device_accessible(arg_result_ptr_device_accessible),
-        m_shared_memory_lock(m_policy.space()
-                                 .impl_internal_space_instance()
-                                 ->m_mutexScratchSpace) {}
-};
-
-template <class FunctorType, class... Traits>
-class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
-                   Kokkos::Experimental::SYCL>
-    : private ParallelScanSYCLBase<FunctorType, Traits...> {
- public:
-  using Base = ParallelScanSYCLBase<FunctorType, Traits...>;
-
-  inline void execute() {
-    Base::impl_execute([]() {});
-  }
-
-  ParallelScan(const FunctorType& arg_functor,
-               const typename Base::Policy& arg_policy)
-      : Base(arg_functor, arg_policy, nullptr, false) {}
-};
-
-//----------------------------------------------------------------------------
-
-template <class FunctorType, class ReturnType, class... Traits>
-class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
-                            ReturnType, Kokkos::Experimental::SYCL>
-    : public ParallelScanSYCLBase<FunctorType, Traits...> {
- public:
-  using Base = ParallelScanSYCLBase<FunctorType, Traits...>;
-
-  const Kokkos::Experimental::SYCL& m_exec;
-
-  inline void execute() {
-    Base::impl_execute([&]() {
-      const long long nwork = Base::m_policy.end() - Base::m_policy.begin();
-      if (nwork > 0 && !Base::m_result_ptr_device_accessible) {
-        const int size = Base::Analysis::value_size(Base::m_functor);
-        DeepCopy<HostSpace, Kokkos::Experimental::SYCLDeviceUSMSpace,
-                 Kokkos::Experimental::SYCL>(m_exec, Base::m_result_ptr,
-                                             Base::m_scratch_space + nwork - 1,
-                                             size);
-      }
-    });
-  }
-
-  template <class ViewType>
-  ParallelScanWithTotal(const FunctorType& arg_functor,
-                        const typename Base::Policy& arg_policy,
-                        const ViewType& arg_result_view)
-      : Base(arg_functor, arg_policy, arg_result_view.data(),
-             MemorySpaceAccess<Experimental::SYCLDeviceUSMSpace,
-                               typename ViewType::memory_space>::accessible),
-        m_exec(arg_policy.space()) {}
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-#endif
-
-#endif
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
deleted file mode 100644
index 601580b2d8b7b591844bd102fd2165c324e1ab60..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
+++ /dev/null
@@ -1,953 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#ifndef KOKKOS_SYCL_PARALLEL_TEAM_HPP
-#define KOKKOS_SYCL_PARALLEL_TEAM_HPP
-
-#include <Kokkos_Parallel.hpp>
-
-#include <SYCL/Kokkos_SYCL_Parallel_Reduce.hpp>  // workgroup_reduction
-#include <SYCL/Kokkos_SYCL_Team.hpp>
-
-#include <vector>
-
-namespace Kokkos {
-namespace Impl {
-template <typename... Properties>
-class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
-    : public PolicyTraits<Properties...> {
- public:
-  using execution_policy = TeamPolicyInternal;
-
-  using traits = PolicyTraits<Properties...>;
-
-  template <typename ExecSpace, typename... OtherProperties>
-  friend class TeamPolicyInternal;
-
- private:
-  typename traits::execution_space m_space;
-  int m_league_size;
-  int m_team_size;
-  int m_vector_length;
-  size_t m_team_scratch_size[2];
-  size_t m_thread_scratch_size[2];
-  int m_chunk_size;
-  bool m_tune_team_size;
-  bool m_tune_vector_length;
-
- public:
-  using execution_space = Kokkos::Experimental::SYCL;
-
-  template <class... OtherProperties>
-  TeamPolicyInternal(TeamPolicyInternal<OtherProperties...> const& p) {
-    m_league_size            = p.m_league_size;
-    m_team_size              = p.m_team_size;
-    m_vector_length          = p.m_vector_length;
-    m_team_scratch_size[0]   = p.m_team_scratch_size[0];
-    m_team_scratch_size[1]   = p.m_team_scratch_size[1];
-    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
-    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
-    m_chunk_size             = p.m_chunk_size;
-    m_space                  = p.m_space;
-    m_tune_team_size         = p.m_tune_team_size;
-    m_tune_vector_length     = p.m_tune_vector_length;
-  }
-
-  template <typename FunctorType>
-  int team_size_max(FunctorType const& f, ParallelForTag const&) const {
-    return internal_team_size_max_for(f);
-  }
-
-  template <class FunctorType>
-  inline int team_size_max(const FunctorType& f,
-                           const ParallelReduceTag&) const {
-    return internal_team_size_max_reduce(f);
-  }
-
-  template <class FunctorType, class ReducerType>
-  inline int team_size_max(const FunctorType& f, const ReducerType& /*r*/,
-                           const ParallelReduceTag&) const {
-    return internal_team_size_max_reduce(f);
-  }
-
-  template <typename FunctorType>
-  int team_size_recommended(FunctorType const& f, ParallelForTag const&) const {
-    return internal_team_size_recommended_for(f);
-  }
-
-  template <typename FunctorType>
-  inline int team_size_recommended(FunctorType const& f,
-                                   ParallelReduceTag const&) const {
-    return internal_team_size_recommended_reduce(f);
-  }
-
-  template <class FunctorType, class ReducerType>
-  int team_size_recommended(FunctorType const& f, ReducerType const&,
-                            ParallelReduceTag const&) const {
-    return internal_team_size_recommended_reduce(f);
-  }
-  inline bool impl_auto_vector_length() const { return m_tune_vector_length; }
-  inline bool impl_auto_team_size() const { return m_tune_team_size; }
-  // FIXME_SYCL This is correct in most cases, but not necessarily in case a
-  // custom sycl::queue is used to initialize the execution space.
-  static int vector_length_max() {
-    std::vector<size_t> sub_group_sizes =
-        execution_space{}
-            .impl_internal_space_instance()
-            ->m_queue->get_device()
-            .template get_info<sycl::info::device::sub_group_sizes>();
-    return *std::max_element(sub_group_sizes.begin(), sub_group_sizes.end());
-  }
-
- private:
-  static int verify_requested_vector_length(int requested_vector_length) {
-    int test_vector_length =
-        std::min(requested_vector_length, vector_length_max());
-
-    // Allow only power-of-two vector_length
-    if (!(is_integral_power_of_two(test_vector_length))) {
-      int test_pow2 = 1;
-      while (test_pow2 < test_vector_length) test_pow2 <<= 1;
-      test_vector_length = test_pow2 >> 1;
-    }
-
-    return test_vector_length;
-  }
-
- public:
-  static int scratch_size_max(int level) {
-    return level == 0 ? 1024 * 32
-                      :           // FIXME_SYCL arbitrarily setting this to 32kB
-               20 * 1024 * 1024;  // FIXME_SYCL arbitrarily setting this to 20MB
-  }
-  inline void impl_set_vector_length(size_t size) { m_vector_length = size; }
-  inline void impl_set_team_size(size_t size) { m_team_size = size; }
-  int impl_vector_length() const { return m_vector_length; }
-
-  int team_size() const { return m_team_size; }
-
-  int league_size() const { return m_league_size; }
-
-  size_t scratch_size(int level, int team_size_ = -1) const {
-    if (team_size_ < 0) team_size_ = m_team_size;
-    return m_team_scratch_size[level] +
-           team_size_ * m_thread_scratch_size[level];
-  }
-
-  size_t team_scratch_size(int level) const {
-    return m_team_scratch_size[level];
-  }
-
-  size_t thread_scratch_size(int level) const {
-    return m_thread_scratch_size[level];
-  }
-
-  typename traits::execution_space space() const { return m_space; }
-
-  TeamPolicyInternal()
-      : m_space(typename traits::execution_space()),
-        m_league_size(0),
-        m_team_size(-1),
-        m_vector_length(0),
-        m_team_scratch_size{0, 0},
-        m_thread_scratch_size{0, 0},
-        m_chunk_size(vector_length_max()),
-        m_tune_team_size(false),
-        m_tune_vector_length(false) {}
-
-  /** \brief  Specify league size, request team size */
-  TeamPolicyInternal(const execution_space space_, int league_size_,
-                     int team_size_request, int vector_length_request = 1)
-      : m_space(space_),
-        m_league_size(league_size_),
-        m_team_size(team_size_request),
-        m_vector_length(
-            (vector_length_request > 0)
-                ? verify_requested_vector_length(vector_length_request)
-                : (verify_requested_vector_length(1))),
-        m_team_scratch_size{0, 0},
-        m_thread_scratch_size{0, 0},
-        m_chunk_size(vector_length_max()),
-        m_tune_team_size(bool(team_size_request <= 0)),
-        m_tune_vector_length(bool(vector_length_request <= 0)) {
-    // FIXME_SYCL Check that league size is permissible,
-    // https://github.com/intel/llvm/pull/4064
-
-    // Make sure total block size is permissible
-    if (m_team_size * m_vector_length >
-        static_cast<int>(
-            m_space.impl_internal_space_instance()->m_maxWorkgroupSize)) {
-      Impl::throw_runtime_exception(
-          std::string("Kokkos::TeamPolicy<SYCL> the team size is too large. "
-                      "Team size x vector length is " +
-                      std::to_string(m_team_size * m_vector_length) +
-                      " but must be smaller than ") +
-          std::to_string(
-              m_space.impl_internal_space_instance()->m_maxWorkgroupSize));
-    }
-  }
-
-  /** \brief  Specify league size, request team size */
-  TeamPolicyInternal(const execution_space space_, int league_size_,
-                     const Kokkos::AUTO_t& /* team_size_request */,
-                     int vector_length_request = 1)
-      : TeamPolicyInternal(space_, league_size_, -1, vector_length_request) {}
-  // FLAG
-  /** \brief  Specify league size and team size, request vector length*/
-  TeamPolicyInternal(const execution_space space_, int league_size_,
-                     int team_size_request,
-                     const Kokkos::AUTO_t& /* vector_length_request */
-                     )
-      : TeamPolicyInternal(space_, league_size_, team_size_request, -1)
-
-  {}
-
-  /** \brief  Specify league size, request team size and vector length*/
-  TeamPolicyInternal(const execution_space space_, int league_size_,
-                     const Kokkos::AUTO_t& /* team_size_request */,
-                     const Kokkos::AUTO_t& /* vector_length_request */
-
-                     )
-      : TeamPolicyInternal(space_, league_size_, -1, -1)
-
-  {}
-
-  TeamPolicyInternal(int league_size_, int team_size_request,
-                     int vector_length_request = 1)
-      : TeamPolicyInternal(typename traits::execution_space(), league_size_,
-                           team_size_request, vector_length_request) {}
-
-  TeamPolicyInternal(int league_size_,
-                     const Kokkos::AUTO_t& /* team_size_request */,
-                     int vector_length_request = 1)
-      : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1,
-                           vector_length_request) {}
-
-  /** \brief  Specify league size and team size, request vector length*/
-  TeamPolicyInternal(int league_size_, int team_size_request,
-                     const Kokkos::AUTO_t& /* vector_length_request */
-
-                     )
-      : TeamPolicyInternal(typename traits::execution_space(), league_size_,
-                           team_size_request, -1)
-
-  {}
-
-  /** \brief  Specify league size, request team size and vector length*/
-  TeamPolicyInternal(int league_size_,
-                     const Kokkos::AUTO_t& /* team_size_request */,
-                     const Kokkos::AUTO_t& /* vector_length_request */
-
-                     )
-      : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1,
-                           -1) {}
-
-  int chunk_size() const { return m_chunk_size; }
-
-  TeamPolicyInternal& set_chunk_size(typename traits::index_type chunk_size_) {
-    m_chunk_size = chunk_size_;
-    return *this;
-  }
-
-  /** \brief set per team scratch size for a specific level of the scratch
-   * hierarchy */
-  TeamPolicyInternal& set_scratch_size(int level,
-                                       PerTeamValue const& per_team) {
-    m_team_scratch_size[level] = per_team.value;
-    return *this;
-  }
-
-  /** \brief set per thread scratch size for a specific level of the scratch
-   * hierarchy */
-  TeamPolicyInternal& set_scratch_size(int level,
-                                       PerThreadValue const& per_thread) {
-    m_thread_scratch_size[level] = per_thread.value;
-    return *this;
-  }
-
-  /** \brief set per thread and per team scratch size for a specific level of
-   * the scratch hierarchy */
-  TeamPolicyInternal& set_scratch_size(int level, PerTeamValue const& per_team,
-                                       PerThreadValue const& per_thread) {
-    m_team_scratch_size[level]   = per_team.value;
-    m_thread_scratch_size[level] = per_thread.value;
-    return *this;
-  }
-
-  using member_type = Kokkos::Impl::SYCLTeamMember;
-
- protected:
-  template <class FunctorType>
-  int internal_team_size_max_for(const FunctorType& /*f*/) const {
-    // nested_reducer_memsize = (sizeof(double) * (m_team_size + 2)
-    // custom: m_team_scratch_size[0] + m_thread_scratch_size[0] * m_team_size
-    // total:
-    // 2*sizeof(double)+m_team_scratch_size[0]
-    // + m_team_size(sizeof(double)+m_thread_scratch_size[0])
-    const int max_threads_for_memory =
-        (space().impl_internal_space_instance()->m_maxShmemPerBlock -
-         2 * sizeof(double) - m_team_scratch_size[0]) /
-        (sizeof(double) + m_thread_scratch_size[0]);
-    return std::min({
-             int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
-      // FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
-#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) ||  \
-    defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) ||    \
-    defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
-    defined(KOKKOS_ARCH_HOPPER)
-                 256,
-#endif
-                 max_threads_for_memory
-           }) /
-           impl_vector_length();
-  }
-
-  template <class FunctorType>
-  int internal_team_size_max_reduce(const FunctorType& f) const {
-    using Analysis        = FunctorAnalysis<FunctorPatternInterface::REDUCE,
-                                     TeamPolicyInternal, FunctorType>;
-    using value_type      = typename Analysis::value_type;
-    const int value_count = Analysis::value_count(f);
-
-    // nested_reducer_memsize = (sizeof(double) * (m_team_size + 2)
-    // reducer_memsize = sizeof(value_type) * m_team_size * value_count
-    // custom: m_team_scratch_size[0] + m_thread_scratch_size[0] * m_team_size
-    // total:
-    // 2*sizeof(double)+m_team_scratch_size[0]
-    // + m_team_size(sizeof(double)+sizeof(value_type)*value_count
-    //               +m_thread_scratch_size[0])
-    const int max_threads_for_memory =
-        (space().impl_internal_space_instance()->m_maxShmemPerBlock -
-         2 * sizeof(double) - m_team_scratch_size[0]) /
-        (sizeof(double) + sizeof(value_type) * value_count +
-         m_thread_scratch_size[0]);
-    return std::min<int>({
-             int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
-      // FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
-#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) ||  \
-    defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) ||    \
-    defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
-    defined(KOKKOS_ARCH_HOPPER)
-                 256,
-#endif
-                 max_threads_for_memory
-           }) /
-           impl_vector_length();
-  }
-
-  template <class FunctorType>
-  int internal_team_size_recommended_for(const FunctorType& f) const {
-    // FIXME_SYCL improve
-    return 1 << Kokkos::Impl::int_log2(internal_team_size_max_for(f));
-  }
-
-  template <class FunctorType>
-  int internal_team_size_recommended_reduce(const FunctorType& f) const {
-    // FIXME_SYCL improve
-    return 1 << Kokkos::Impl::int_log2(internal_team_size_max_reduce(f));
-  }
-};
-
-template <typename FunctorType, typename... Properties>
-class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
-                  Kokkos::Experimental::SYCL> {
- public:
-  using Policy = TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>;
-  using functor_type = FunctorType;
-  using size_type    = ::Kokkos::Experimental::SYCL::size_type;
-
- private:
-  using member_type   = typename Policy::member_type;
-  using work_tag      = typename Policy::work_tag;
-  using launch_bounds = typename Policy::launch_bounds;
-
-  FunctorType const m_functor;
-  Policy const m_policy;
-  size_type const m_league_size;
-  int m_team_size;
-  size_type const m_vector_size;
-  int m_shmem_begin;
-  int m_shmem_size;
-  sycl::device_ptr<char> m_global_scratch_ptr;
-  size_t m_scratch_size[2];
-  // Only let one ParallelFor/Reduce modify the team scratch memory. The
-  // constructor acquires the mutex which is released in the destructor.
-  std::scoped_lock<std::mutex> m_scratch_lock;
-
-  template <typename FunctorWrapper>
-  sycl::event sycl_direct_launch(const Policy& policy,
-                                 const FunctorWrapper& functor_wrapper,
-                                 const sycl::event& memcpy_events) const {
-    // Convenience references
-    const Kokkos::Experimental::SYCL& space = policy.space();
-    sycl::queue& q                          = space.sycl_queue();
-
-    auto parallel_for_event = q.submit([&](sycl::handler& cgh) {
-      // FIXME_SYCL accessors seem to need a size greater than zero at least for
-      // host queues
-      sycl::accessor<char, 1, sycl::access::mode::read_write,
-                     sycl::access::target::local>
-          team_scratch_memory_L0(
-              sycl::range<1>(
-                  std::max(m_scratch_size[0] + m_shmem_begin, size_t(1))),
-              cgh);
-
-      // Avoid capturing *this since it might not be trivially copyable
-      const auto shmem_begin       = m_shmem_begin;
-      const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]};
-      sycl::device_ptr<char> const global_scratch_ptr = m_global_scratch_ptr;
-
-      auto lambda = [=](sycl::nd_item<2> item) {
-        const member_type team_member(
-            team_scratch_memory_L0.get_pointer(), shmem_begin, scratch_size[0],
-            global_scratch_ptr + item.get_group(1) * scratch_size[1],
-            scratch_size[1], item);
-        if constexpr (std::is_void<work_tag>::value)
-          functor_wrapper.get_functor()(team_member);
-        else
-          functor_wrapper.get_functor()(work_tag(), team_member);
-      };
-
-      static sycl::kernel kernel = [&] {
-        sycl::kernel_id functor_kernel_id =
-            sycl::get_kernel_id<decltype(lambda)>();
-        auto kernel_bundle =
-            sycl::get_kernel_bundle<sycl::bundle_state::executable>(
-                q.get_context(), std::vector{functor_kernel_id});
-        return kernel_bundle.get_kernel(functor_kernel_id);
-      }();
-      auto max_sg_size =
-          kernel
-              .get_info<sycl::info::kernel_device_specific::max_sub_group_size>(
-                  q.get_device(),
-                  sycl::range<3>(m_team_size, m_vector_size, 1));
-      auto final_vector_size = std::min<int>(m_vector_size, max_sg_size);
-      // FIXME_SYCL For some reason, explicitly enforcing the kernel bundle to
-      // be used gives a runtime error.
-      // cgh.use_kernel_bundle(kernel_bundle);
-
-      cgh.depends_on(memcpy_events);
-      cgh.parallel_for(
-          sycl::nd_range<2>(
-              sycl::range<2>(m_team_size, m_league_size * final_vector_size),
-              sycl::range<2>(m_team_size, final_vector_size)),
-          lambda);
-    });
-    q.ext_oneapi_submit_barrier(std::vector<sycl::event>{parallel_for_event});
-    return parallel_for_event;
-  }
-
- public:
-  inline void execute() const {
-    if (m_league_size == 0) return;
-
-    Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem&
-        indirectKernelMem = m_policy.space()
-                                .impl_internal_space_instance()
-                                ->get_indirect_kernel_mem();
-
-    auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
-        m_functor, indirectKernelMem);
-
-    sycl::event event = sycl_direct_launch(m_policy, functor_wrapper,
-                                           functor_wrapper.get_copy_event());
-    functor_wrapper.register_event(event);
-  }
-
-  ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_league_size(arg_policy.league_size()),
-        m_team_size(arg_policy.team_size()),
-        m_vector_size(arg_policy.impl_vector_length()),
-        m_scratch_lock(arg_policy.space()
-                           .impl_internal_space_instance()
-                           ->m_team_scratch_mutex) {
-    // FIXME_SYCL optimize
-    if (m_team_size < 0)
-      m_team_size =
-          m_policy.team_size_recommended(arg_functor, ParallelForTag{});
-
-    m_shmem_begin = (sizeof(double) * (m_team_size + 2));
-    m_shmem_size =
-        (m_policy.scratch_size(0, m_team_size) +
-         FunctorTeamShmemSize<FunctorType>::value(m_functor, m_team_size));
-    m_scratch_size[0] = m_shmem_size;
-    m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
-
-    // Functor's reduce memory, team scan memory, and team shared memory depend
-    // upon team size.
-    auto& space = *m_policy.space().impl_internal_space_instance();
-    m_global_scratch_ptr =
-        static_cast<sycl::device_ptr<char>>(space.resize_team_scratch_space(
-            static_cast<ptrdiff_t>(m_scratch_size[1]) * m_league_size));
-
-    if (static_cast<int>(space.m_maxShmemPerBlock) <
-        m_shmem_size - m_shmem_begin) {
-      std::stringstream out;
-      out << "Kokkos::Impl::ParallelFor<SYCL> insufficient shared memory! "
-             "Requested "
-          << m_shmem_size - m_shmem_begin << " bytes but maximum is "
-          << space.m_maxShmemPerBlock << '\n';
-      Kokkos::Impl::throw_runtime_exception(out.str());
-    }
-
-    const auto max_team_size =
-        m_policy.team_size_max(arg_functor, ParallelForTag{});
-    if (m_team_size > m_policy.team_size_max(arg_functor, ParallelForTag{}))
-      Kokkos::Impl::throw_runtime_exception(
-          "Kokkos::Impl::ParallelFor<SYCL> requested too large team size. The "
-          "maximal team_size is " +
-          std::to_string(max_team_size) + '!');
-  }
-};
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-template <class FunctorType, class ReducerType, class... Properties>
-class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
-                     ReducerType, Kokkos::Experimental::SYCL> {
- public:
-  using Policy = TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>;
-
- private:
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using Analysis =
-      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, ReducerTypeFwd>;
-  using member_type   = typename Policy::member_type;
-  using WorkTag       = typename Policy::work_tag;
-  using launch_bounds = typename Policy::launch_bounds;
-
-  using pointer_type   = typename Analysis::pointer_type;
-  using reference_type = typename Analysis::reference_type;
-  using value_type     = typename Analysis::value_type;
-
- public:
-  using functor_type = FunctorType;
-  using size_type    = Kokkos::Experimental::SYCL::size_type;
-
- private:
-  const FunctorType m_functor;
-  const Policy m_policy;
-  const ReducerType m_reducer;
-  const pointer_type m_result_ptr;
-  const bool m_result_ptr_device_accessible;
-  size_type m_shmem_begin;
-  size_type m_shmem_size;
-  sycl::device_ptr<char> m_global_scratch_ptr;
-  size_t m_scratch_size[2];
-  const size_type m_league_size;
-  int m_team_size;
-  const size_type m_vector_size;
-  // Only let one ParallelFor/Reduce modify the team scratch memory. The
-  // constructor acquires the mutex which is released in the destructor.
-  std::scoped_lock<std::mutex> m_scratch_lock;
-
-  template <typename PolicyType, typename FunctorWrapper,
-            typename ReducerWrapper>
-  sycl::event sycl_direct_launch(
-      const PolicyType& policy, const FunctorWrapper& functor_wrapper,
-      const ReducerWrapper& reducer_wrapper,
-      const std::vector<sycl::event>& memcpy_events) const {
-    // Convenience references
-    const Kokkos::Experimental::SYCL& space = policy.space();
-    Kokkos::Experimental::Impl::SYCLInternal& instance =
-        *space.impl_internal_space_instance();
-    sycl::queue& q = space.sycl_queue();
-
-    const unsigned int value_count =
-        Analysis::value_count(ReducerConditional::select(m_functor, m_reducer));
-    std::size_t size = std::size_t(m_league_size) * m_team_size * m_vector_size;
-    value_type* results_ptr = nullptr;
-
-    sycl::event last_reduction_event;
-
-    // If size<=1 we only call init(), the functor and possibly final once
-    // working with the global scratch memory but don't copy back to
-    // m_result_ptr yet.
-    if (size <= 1) {
-      results_ptr =
-          static_cast<sycl::device_ptr<value_type>>(instance.scratch_space(
-              sizeof(value_type) * std::max(value_count, 1u)));
-      sycl::global_ptr<value_type> device_accessible_result_ptr =
-          m_result_ptr_device_accessible ? m_result_ptr : nullptr;
-
-      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
-        // FIXME_SYCL accessors seem to need a size greater than zero at least
-        // for host queues
-        sycl::accessor<char, 1, sycl::access::mode::read_write,
-                       sycl::access::target::local>
-            team_scratch_memory_L0(
-                sycl::range<1>(
-                    std::max(m_scratch_size[0] + m_shmem_begin, size_t(1))),
-                cgh);
-
-        // Avoid capturing *this since it might not be trivially copyable
-        const auto shmem_begin       = m_shmem_begin;
-        const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]};
-        sycl::device_ptr<char> const global_scratch_ptr = m_global_scratch_ptr;
-
-        cgh.depends_on(memcpy_events);
-        cgh.parallel_for(
-            sycl::nd_range<2>(sycl::range<2>(1, 1), sycl::range<2>(1, 1)),
-            [=](sycl::nd_item<2> item) {
-              const auto& functor          = functor_wrapper.get_functor();
-              const auto& selected_reducer = ReducerConditional::select(
-                  static_cast<const FunctorType&>(functor),
-                  static_cast<const ReducerType&>(
-                      reducer_wrapper.get_functor()));
-              typename Analysis::Reducer final_reducer(&selected_reducer);
-
-              reference_type update = final_reducer.init(results_ptr);
-              if (size == 1) {
-                const member_type team_member(
-                    team_scratch_memory_L0.get_pointer(), shmem_begin,
-                    scratch_size[0], global_scratch_ptr, scratch_size[1], item);
-                if constexpr (std::is_void<WorkTag>::value)
-                  functor(team_member, update);
-                else
-                  functor(WorkTag(), team_member, update);
-              }
-              final_reducer.final(results_ptr);
-              if (device_accessible_result_ptr)
-                final_reducer.copy(device_accessible_result_ptr,
-                                   &results_ptr[0]);
-            });
-      });
-      q.ext_oneapi_submit_barrier(
-          std::vector<sycl::event>{parallel_reduce_event});
-      last_reduction_event = parallel_reduce_event;
-    } else {
-      // Otherwise, (if the total range has more than one element) we perform a
-      // reduction on the values in all workgroups separately, write the
-      // workgroup results back to global memory and recurse until only one
-      // workgroup does the reduction and thus gets the final value.
-      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
-        auto scratch_flags = static_cast<sycl::device_ptr<unsigned int>>(
-            instance.scratch_flags(sizeof(unsigned int)));
-
-        // FIXME_SYCL accessors seem to need a size greater than zero at least
-        // for host queues
-        sycl::accessor<char, 1, sycl::access::mode::read_write,
-                       sycl::access::target::local>
-            team_scratch_memory_L0(
-                sycl::range<1>(
-                    std::max(m_scratch_size[0] + m_shmem_begin, size_t(1))),
-                cgh);
-
-        // Avoid capturing *this since it might not be trivially copyable
-        const auto shmem_begin       = m_shmem_begin;
-        const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]};
-        sycl::device_ptr<char> const global_scratch_ptr = m_global_scratch_ptr;
-
-        auto team_reduction_factory =
-            [&](sycl::accessor<value_type, 1, sycl::access::mode::read_write,
-                               sycl::access::target::local>
-                    local_mem,
-                sycl::device_ptr<value_type> results_ptr) {
-              sycl::global_ptr<value_type> device_accessible_result_ptr =
-                  m_result_ptr_device_accessible ? m_result_ptr : nullptr;
-              auto lambda = [=](sycl::nd_item<2> item) {
-                auto n_wgroups =
-                    item.get_group_range()[0] * item.get_group_range()[1];
-                auto wgroup_size =
-                    item.get_local_range()[0] * item.get_local_range()[1];
-                auto size = n_wgroups * wgroup_size;
-
-                auto& num_teams_done = reinterpret_cast<unsigned int&>(
-                    local_mem[wgroup_size * std::max(value_count, 1u)]);
-                const auto local_id          = item.get_local_linear_id();
-                const auto& functor          = functor_wrapper.get_functor();
-                const auto& selected_reducer = ReducerConditional::select(
-                    static_cast<const FunctorType&>(functor),
-                    static_cast<const ReducerType&>(
-                        reducer_wrapper.get_functor()));
-                typename Analysis::Reducer final_reducer(&selected_reducer);
-
-                if constexpr (Analysis::StaticValueSize == 0) {
-                  reference_type update =
-                      final_reducer.init(&local_mem[local_id * value_count]);
-                  const member_type team_member(
-                      team_scratch_memory_L0.get_pointer(), shmem_begin,
-                      scratch_size[0],
-                      global_scratch_ptr + item.get_group(1) * scratch_size[1],
-                      scratch_size[1], item);
-                  if constexpr (std::is_void<WorkTag>::value)
-                    functor(team_member, update);
-                  else
-                    functor(WorkTag(), team_member, update);
-                  item.barrier(sycl::access::fence_space::local_space);
-
-                  SYCLReduction::workgroup_reduction<>(
-                      item, local_mem.get_pointer(), results_ptr,
-                      device_accessible_result_ptr, value_count,
-                      selected_reducer, false,
-                      std::min<std::size_t>(size,
-                                            item.get_local_range()[0] *
-                                                item.get_local_range()[1]));
-
-                  if (local_id == 0) {
-                    sycl::atomic_ref<unsigned, sycl::memory_order::relaxed,
-                                     sycl::memory_scope::device,
-                                     sycl::access::address_space::global_space>
-                        scratch_flags_ref(*scratch_flags);
-                    num_teams_done = ++scratch_flags_ref;
-                  }
-                  sycl::group_barrier(item.get_group());
-                  if (num_teams_done == n_wgroups) {
-                    if (local_id >= n_wgroups)
-                      final_reducer.init(&local_mem[local_id * value_count]);
-                    else {
-                      final_reducer.copy(&local_mem[local_id * value_count],
-                                         &results_ptr[local_id * value_count]);
-                      for (unsigned int id = local_id + wgroup_size;
-                           id < n_wgroups; id += wgroup_size) {
-                        final_reducer.join(&local_mem[local_id * value_count],
-                                           &results_ptr[id * value_count]);
-                      }
-                    }
-
-                    SYCLReduction::workgroup_reduction<>(
-                        item, local_mem.get_pointer(), results_ptr,
-                        device_accessible_result_ptr, value_count,
-                        selected_reducer, true,
-                        std::min(n_wgroups, item.get_local_range()[0] *
-                                                item.get_local_range()[1]));
-                  }
-                } else {
-                  value_type local_value;
-                  reference_type update = final_reducer.init(&local_value);
-                  const member_type team_member(
-                      team_scratch_memory_L0.get_pointer(), shmem_begin,
-                      scratch_size[0],
-                      global_scratch_ptr + item.get_group(1) * scratch_size[1],
-                      scratch_size[1], item);
-                  if constexpr (std::is_void<WorkTag>::value)
-                    functor(team_member, update);
-                  else
-                    functor(WorkTag(), team_member, update);
-
-                  SYCLReduction::workgroup_reduction<>(
-                      item, local_mem.get_pointer(), local_value, results_ptr,
-                      device_accessible_result_ptr, final_reducer, false,
-                      std::min<std::size_t>(size,
-                                            item.get_local_range()[0] *
-                                                item.get_local_range()[1]));
-
-                  if (local_id == 0) {
-                    sycl::atomic_ref<unsigned, sycl::memory_order::relaxed,
-                                     sycl::memory_scope::device,
-                                     sycl::access::address_space::global_space>
-                        scratch_flags_ref(*scratch_flags);
-                    num_teams_done = ++scratch_flags_ref;
-                  }
-                  item.barrier(sycl::access::fence_space::local_space);
-                  if (num_teams_done == n_wgroups) {
-                    if (local_id >= n_wgroups)
-                      final_reducer.init(&local_value);
-                    else {
-                      local_value = results_ptr[local_id];
-                      for (unsigned int id = local_id + wgroup_size;
-                           id < n_wgroups; id += wgroup_size) {
-                        final_reducer.join(&local_value, &results_ptr[id]);
-                      }
-                    }
-
-                    SYCLReduction::workgroup_reduction<>(
-                        item, local_mem.get_pointer(), local_value, results_ptr,
-                        device_accessible_result_ptr, final_reducer, true,
-                        std::min(n_wgroups, item.get_local_range()[0] *
-                                                item.get_local_range()[1]));
-                  }
-                }
-              };
-              return lambda;
-            };
-
-        auto dummy_reduction_lambda = team_reduction_factory({1, cgh}, nullptr);
-
-        static sycl::kernel kernel = [&] {
-          sycl::kernel_id functor_kernel_id =
-              sycl::get_kernel_id<decltype(dummy_reduction_lambda)>();
-          auto kernel_bundle =
-              sycl::get_kernel_bundle<sycl::bundle_state::executable>(
-                  q.get_context(), std::vector{functor_kernel_id});
-          return kernel_bundle.get_kernel(functor_kernel_id);
-        }();
-        auto max_sg_size = kernel.get_info<
-            sycl::info::kernel_device_specific::max_sub_group_size>(
-            q.get_device(), sycl::range<3>(m_team_size, m_vector_size, 1));
-        auto final_vector_size = std::min<int>(m_vector_size, max_sg_size);
-        // FIXME_SYCL For some reason, explicitly enforcing the kernel bundle to
-        // be used gives a runtime error.
-
-        //     cgh.use_kernel_bundle(kernel_bundle);
-
-        auto wgroup_size = m_team_size * final_vector_size;
-        std::size_t size = std::size_t(m_league_size) * wgroup_size;
-        sycl::accessor<value_type, 1, sycl::access::mode::read_write,
-                       sycl::access::target::local>
-            local_mem(sycl::range<1>(wgroup_size) * std::max(value_count, 1u) +
-                          (sizeof(unsigned int) + sizeof(value_type) - 1) /
-                              sizeof(value_type),
-                      cgh);
-
-        const auto init_size =
-            std::max<std::size_t>((size + wgroup_size - 1) / wgroup_size, 1);
-        results_ptr =
-            static_cast<sycl::device_ptr<value_type>>(instance.scratch_space(
-                sizeof(value_type) * std::max(value_count, 1u) * init_size));
-
-        auto reduction_lambda = team_reduction_factory(local_mem, results_ptr);
-
-        cgh.depends_on(memcpy_events);
-
-        cgh.parallel_for(
-            sycl::nd_range<2>(
-                sycl::range<2>(m_team_size, m_league_size * m_vector_size),
-                sycl::range<2>(m_team_size, m_vector_size)),
-            reduction_lambda);
-      });
-      last_reduction_event       = q.ext_oneapi_submit_barrier(
-          std::vector<sycl::event>{parallel_reduce_event});
-    }
-
-    // At this point, the reduced value is written to the entry in results_ptr
-    // and all that is left is to copy it back to the given result pointer if
-    // necessary.
-    if (m_result_ptr && !m_result_ptr_device_accessible) {
-      Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                             Kokkos::Experimental::SYCLDeviceUSMSpace>(
-          space, m_result_ptr, results_ptr,
-          sizeof(*m_result_ptr) * value_count);
-    }
-
-    return last_reduction_event;
-  }
-
- public:
-  inline void execute() {
-    Kokkos::Experimental::Impl::SYCLInternal& instance =
-        *m_policy.space().impl_internal_space_instance();
-    using IndirectKernelMem =
-        Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem;
-    IndirectKernelMem& indirectKernelMem  = instance.get_indirect_kernel_mem();
-    IndirectKernelMem& indirectReducerMem = instance.get_indirect_kernel_mem();
-
-    auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
-        m_functor, indirectKernelMem);
-    auto reducer_wrapper = Experimental::Impl::make_sycl_function_wrapper(
-        m_reducer, indirectReducerMem);
-
-    sycl::event event = sycl_direct_launch(
-        m_policy, functor_wrapper, reducer_wrapper,
-        {functor_wrapper.get_copy_event(), reducer_wrapper.get_copy_event()});
-    functor_wrapper.register_event(event);
-    reducer_wrapper.register_event(event);
-  }
-
- private:
-  void initialize() {
-    // FIXME_SYCL optimize
-    if (m_team_size < 0)
-      m_team_size =
-          m_policy.team_size_recommended(m_functor, ParallelReduceTag{});
-    // Must be a power of two greater than two, get the one not bigger than the
-    // requested one.
-    if ((m_team_size & m_team_size - 1) || m_team_size < 2) {
-      int temp_team_size = 2;
-      while ((temp_team_size << 1) < m_team_size) temp_team_size <<= 1;
-      m_team_size = temp_team_size;
-    }
-
-    m_shmem_begin = (sizeof(double) * (m_team_size + 2));
-    m_shmem_size =
-        (m_policy.scratch_size(0, m_team_size) +
-         FunctorTeamShmemSize<FunctorType>::value(m_functor, m_team_size));
-    m_scratch_size[0] = m_shmem_size;
-    m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
-
-    // Functor's reduce memory, team scan memory, and team shared memory depend
-    // upon team size.
-    auto& space = *m_policy.space().impl_internal_space_instance();
-    m_global_scratch_ptr =
-        static_cast<sycl::device_ptr<char>>(space.resize_team_scratch_space(
-            static_cast<ptrdiff_t>(m_scratch_size[1]) * m_league_size));
-
-    if (static_cast<int>(space.m_maxShmemPerBlock) <
-        m_shmem_size - m_shmem_begin) {
-      std::stringstream out;
-      out << "Kokkos::Impl::ParallelFor<SYCL> insufficient shared memory! "
-             "Requested "
-          << m_shmem_size - m_shmem_begin << " bytes but maximum is "
-          << space.m_maxShmemPerBlock << '\n';
-      Kokkos::Impl::throw_runtime_exception(out.str());
-    }
-
-    if (m_team_size > m_policy.team_size_max(m_functor, ParallelReduceTag{}))
-      Kokkos::Impl::throw_runtime_exception(
-          "Kokkos::Impl::ParallelFor<SYCL> requested too large team size.");
-  }
-
- public:
-  template <class ViewType>
-  ParallelReduce(
-      FunctorType const& arg_functor, Policy const& arg_policy,
-      ViewType const& arg_result,
-      std::enable_if_t<Kokkos::is_view<ViewType>::value, void*> = nullptr)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(InvalidType()),
-        m_result_ptr(arg_result.data()),
-        m_result_ptr_device_accessible(
-            MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                              typename ViewType::memory_space>::accessible),
-        m_league_size(arg_policy.league_size()),
-        m_team_size(arg_policy.team_size()),
-        m_vector_size(arg_policy.impl_vector_length()),
-        m_scratch_lock(arg_policy.space()
-                           .impl_internal_space_instance()
-                           ->m_team_scratch_mutex) {
-    initialize();
-  }
-
-  ParallelReduce(FunctorType const& arg_functor, Policy const& arg_policy,
-                 ReducerType const& reducer)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()),
-        m_result_ptr_device_accessible(
-            MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                              typename ReducerType::result_view_type::
-                                  memory_space>::accessible),
-        m_league_size(arg_policy.league_size()),
-        m_team_size(arg_policy.team_size()),
-        m_vector_size(arg_policy.impl_vector_length()),
-        m_scratch_lock(arg_policy.space()
-                           .impl_internal_space_instance()
-                           ->m_team_scratch_mutex) {
-    initialize();
-  }
-};
-}  // namespace Impl
-}  // namespace Kokkos
-
-#endif
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp
index 6151b3eaafbd4217d046f42d00895e24c5f6cb29..64b7f56796a527a73546f248030f43bf73541fc6 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp
@@ -21,8 +21,8 @@
 #include <Kokkos_Macros.hpp>
 
 #include <Kokkos_HostSpace.hpp>
-#include <Kokkos_SYCL.hpp>
-#include <Kokkos_SYCL_Space.hpp>
+#include <SYCL/Kokkos_SYCL.hpp>
+#include <SYCL/Kokkos_SYCL_Space.hpp>
 #include <SYCL/Kokkos_SYCL_DeepCopy.hpp>
 #include <SYCL/Kokkos_SYCL_Instance.hpp>
 #include <impl/Kokkos_MemorySpace.hpp>
@@ -39,13 +39,11 @@ void DeepCopySYCL(void* dst, const void* src, size_t n) {
 
 void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst,
                        const void* src, size_t n) {
-  // FIXME_SYCL memcpy doesn't respect submit_barrier which means that we need
-  // to actually fence the execution space to make sure the memcpy is properly
-  // enqueued when using out-of-order queues.
   sycl::queue& q = *instance.impl_internal_space_instance()->m_queue;
-  q.wait_and_throw();
-  auto event = q.memcpy(dst, src, n);
+  auto event     = q.memcpy(dst, src, n);
+#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
   q.ext_oneapi_submit_barrier(std::vector<sycl::event>{event});
+#endif
 }
 
 void DeepCopyAsyncSYCL(void* dst, const void* src, size_t n) {
diff --git a/packages/kokkos/core/src/Kokkos_SYCL_Space.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp
similarity index 100%
rename from packages/kokkos/core/src/Kokkos_SYCL_Space.hpp
rename to packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp
index 674037ed9590fecc7a8ded10a3b1cf43a9324e28..89c09c3195feeb8c7c6d6ba7cce353ab7a05c5b6 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp
@@ -42,6 +42,8 @@ class SYCLTeamMember {
   scratch_memory_space m_team_shared;
   int m_team_reduce_size;
   sycl::nd_item<2> m_item;
+  int m_league_rank;
+  int m_league_size;
 
  public:
   KOKKOS_INLINE_FUNCTION
@@ -61,12 +63,8 @@ class SYCLTeamMember {
     return m_team_shared.set_team_thread_mode(level, team_size(), team_rank());
   }
 
-  KOKKOS_INLINE_FUNCTION int league_rank() const {
-    return m_item.get_group_linear_id();
-  }
-  KOKKOS_INLINE_FUNCTION int league_size() const {
-    return m_item.get_group_range(1);
-  }
+  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank; }
+  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size; }
   KOKKOS_INLINE_FUNCTION int team_rank() const {
     return m_item.get_local_id(0);
   }
@@ -341,12 +339,15 @@ class SYCLTeamMember {
                  const std::size_t shared_size,
                  sycl::device_ptr<void> scratch_level_1_ptr,
                  const std::size_t scratch_level_1_size,
-                 const sycl::nd_item<2> item)
+                 const sycl::nd_item<2> item, const int arg_league_rank,
+                 const int arg_league_size)
       : m_team_reduce(shared),
         m_team_shared(static_cast<sycl::local_ptr<char>>(shared) + shared_begin,
                       shared_size, scratch_level_1_ptr, scratch_level_1_size),
         m_team_reduce_size(shared_begin),
-        m_item(item) {}
+        m_item(item),
+        m_league_rank(arg_league_rank),
+        m_league_size(arg_league_size) {}
 
  public:
   // Declare to avoid unused private member warnings which are trigger
@@ -572,15 +573,17 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
  *  final == true.
  */
 // This is the same code as in CUDA and largely the same as in OpenMPTarget
-template <typename iType, typename FunctorType>
+template <typename iType, typename FunctorType, typename ValueType>
 KOKKOS_INLINE_FUNCTION void parallel_scan(
     const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::SYCLTeamMember>&
         loop_bounds,
-    const FunctorType& lambda) {
-  // Extract value_type from lambda
-  using value_type = typename Kokkos::Impl::FunctorAnalysis<
-      Kokkos::Impl::FunctorPatternInterface::SCAN, void,
-      FunctorType>::value_type;
+    const FunctorType& lambda, ValueType& return_val) {
+  // Extract ValueType from the Closure
+  using closure_value_type = typename Kokkos::Impl::FunctorAnalysis<
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void, FunctorType,
+      void>::value_type;
+  static_assert(std::is_same_v<closure_value_type, ValueType>,
+                "Non-matching value types of closure and return type");
 
   const auto start     = loop_bounds.start;
   const auto end       = loop_bounds.end;
@@ -588,12 +591,12 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
   const auto team_size = member.team_size();
   const auto team_rank = member.team_rank();
   const auto nchunk    = (end - start + team_size - 1) / team_size;
-  value_type accum     = 0;
+  ValueType accum      = 0;
   // each team has to process one or more chunks of the prefix scan
   for (iType i = 0; i < nchunk; ++i) {
     auto ii = start + i * team_size + team_rank;
     // local accumulation for this chunk
-    value_type local_accum = 0;
+    ValueType local_accum = 0;
     // user updates value with prefix value
     if (ii < loop_bounds.end) lambda(ii, local_accum, false);
     // perform team scan
@@ -607,6 +610,21 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
     // broadcast last value to rest of the team
     member.team_broadcast(accum, team_size - 1);
   }
+
+  return_val = accum;
+}
+
+template <typename iType, class FunctorType>
+KOKKOS_INLINE_FUNCTION void parallel_scan(
+    const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::SYCLTeamMember>&
+        loop_bounds,
+    const FunctorType& lambda) {
+  using value_type = typename Kokkos::Impl::FunctorAnalysis<
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void, FunctorType,
+      void>::value_type;
+
+  value_type scan_val;
+  parallel_scan(loop_bounds, lambda, scan_val);
 }
 
 template <typename iType, class Closure>
@@ -775,7 +793,8 @@ parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<
                   iType, Impl::SYCLTeamMember>& loop_boundaries,
               const Closure& closure, const ReducerType& reducer) {
   using value_type = typename Kokkos::Impl::FunctorAnalysis<
-      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure,
+      void>::value_type;
 
   value_type accum;
   reducer.init(accum);
@@ -806,7 +825,7 @@ parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<
     // This sets i's val to i-1's contribution to make the latter shfl_up an
     // exclusive scan -- the final accumulation of i's val will be included in
     // the second closure call later.
-    if (i < loop_boundaries.end && tidx1 > 0) closure(i - 1, val, false);
+    if (i - 1 < loop_boundaries.end && tidx1 > 0) closure(i - 1, val, false);
 
     // Bottom up exclusive scan in triangular pattern where each SYCL thread is
     // the root of a reduction tree from the zeroth "lane" to itself.
@@ -828,6 +847,7 @@ parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<
     if (i < loop_boundaries.end) closure(i, val, true);
     accum = sg.shuffle(val, mask + vector_offset);
   }
+  reducer.reference() = accum;
 }
 
 /** \brief  Intra-thread vector parallel exclusive prefix sum.
@@ -844,11 +864,38 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
         loop_boundaries,
     const Closure& closure) {
   using value_type = typename Kokkos::Impl::FunctorAnalysis<
-      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure,
+      void>::value_type;
   value_type dummy;
   parallel_scan(loop_boundaries, closure, Kokkos::Sum<value_type>{dummy});
 }
 
+/** \brief  Intra-thread vector parallel exclusive prefix sum.
+ *
+ *  Executes closure(iType i, ValueType & val, bool final) for each i=[0..N)
+ *
+ *  The range [0..N) is mapped to all vector lanes in the
+ *  thread and a scan operation is performed.
+ *  The last call to closure has final == true.
+ */
+template <typename iType, class Closure, typename ValueType>
+KOKKOS_INLINE_FUNCTION void parallel_scan(
+    const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::SYCLTeamMember>&
+        loop_boundaries,
+    const Closure& closure, ValueType& return_val) {
+  // Extract ValueType from the Closure
+  using closure_value_type = typename Kokkos::Impl::FunctorAnalysis<
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure,
+      void>::value_type;
+  static_assert(std::is_same<closure_value_type, ValueType>::value,
+                "Non-matching value types of closure and return type");
+
+  ValueType accum;
+  parallel_scan(loop_boundaries, closure, Kokkos::Sum<ValueType>{accum});
+
+  return_val = accum;
+}
+
 }  // namespace Kokkos
 
 namespace Kokkos {
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..17ce59058bdda81124f67d7ca580604d3fd35ba5
--- /dev/null
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp
@@ -0,0 +1,357 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_SYCL_TEAM_POLICY_HPP
+#define KOKKOS_SYCL_TEAM_POLICY_HPP
+
+#include <SYCL/Kokkos_SYCL_Team.hpp>
+
+#include <vector>
+
+template <typename... Properties>
+class Kokkos::Impl::TeamPolicyInternal<Kokkos::Experimental::SYCL,
+                                       Properties...>
+    : public PolicyTraits<Properties...> {
+ public:
+  using execution_policy = TeamPolicyInternal;
+
+  using traits = PolicyTraits<Properties...>;
+
+  template <typename ExecSpace, typename... OtherProperties>
+  friend class TeamPolicyInternal;
+
+ private:
+  typename traits::execution_space m_space;
+  int m_league_size;
+  int m_team_size;
+  int m_vector_length;
+  size_t m_team_scratch_size[2];
+  size_t m_thread_scratch_size[2];
+  int m_chunk_size;
+  bool m_tune_team_size;
+  bool m_tune_vector_length;
+
+ public:
+  using execution_space = Kokkos::Experimental::SYCL;
+
+  template <class... OtherProperties>
+  TeamPolicyInternal(TeamPolicyInternal<OtherProperties...> const& p) {
+    m_league_size            = p.m_league_size;
+    m_team_size              = p.m_team_size;
+    m_vector_length          = p.m_vector_length;
+    m_team_scratch_size[0]   = p.m_team_scratch_size[0];
+    m_team_scratch_size[1]   = p.m_team_scratch_size[1];
+    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
+    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
+    m_chunk_size             = p.m_chunk_size;
+    m_space                  = p.m_space;
+    m_tune_team_size         = p.m_tune_team_size;
+    m_tune_vector_length     = p.m_tune_vector_length;
+  }
+
+  template <typename FunctorType>
+  int team_size_max(FunctorType const& f, ParallelForTag const&) const {
+    return internal_team_size_max_for(f);
+  }
+
+  template <class FunctorType>
+  inline int team_size_max(const FunctorType& f,
+                           const ParallelReduceTag&) const {
+    return internal_team_size_max_reduce<void>(f);
+  }
+
+  template <class FunctorType, class ReducerType>
+  inline int team_size_max(const FunctorType& f, const ReducerType& /*r*/,
+                           const ParallelReduceTag&) const {
+    return internal_team_size_max_reduce<typename ReducerType::value_type>(f);
+  }
+
+  template <typename FunctorType>
+  int team_size_recommended(FunctorType const& f, ParallelForTag const&) const {
+    return internal_team_size_recommended_for(f);
+  }
+
+  template <typename FunctorType>
+  inline int team_size_recommended(FunctorType const& f,
+                                   ParallelReduceTag const&) const {
+    return internal_team_size_recommended_reduce<void>(f);
+  }
+
+  template <class FunctorType, class ReducerType>
+  int team_size_recommended(FunctorType const& f, ReducerType const&,
+                            ParallelReduceTag const&) const {
+    return internal_team_size_recommended_reduce<
+        typename ReducerType::value_type>(f);
+  }
+  inline bool impl_auto_vector_length() const { return m_tune_vector_length; }
+  inline bool impl_auto_team_size() const { return m_tune_team_size; }
+  // FIXME_SYCL This is correct in most cases, but not necessarily in case a
+  // custom sycl::queue is used to initialize the execution space.
+  static int vector_length_max() {
+    std::vector<size_t> sub_group_sizes =
+        execution_space{}
+            .impl_internal_space_instance()
+            ->m_queue->get_device()
+            .template get_info<sycl::info::device::sub_group_sizes>();
+    return *std::max_element(sub_group_sizes.begin(), sub_group_sizes.end());
+  }
+
+ private:
+  static int verify_requested_vector_length(int requested_vector_length) {
+    int test_vector_length =
+        std::min(requested_vector_length, vector_length_max());
+
+    // Allow only power-of-two vector_length
+    if (!(is_integral_power_of_two(test_vector_length))) {
+      int test_pow2 = 1;
+      while (test_pow2 < test_vector_length) test_pow2 <<= 1;
+      test_vector_length = test_pow2 >> 1;
+    }
+
+    return test_vector_length;
+  }
+
+ public:
+  static int scratch_size_max(int level) {
+    return level == 0 ? 1024 * 32
+                      :           // FIXME_SYCL arbitrarily setting this to 32kB
+               20 * 1024 * 1024;  // FIXME_SYCL arbitrarily setting this to 20MB
+  }
+  inline void impl_set_vector_length(size_t size) { m_vector_length = size; }
+  inline void impl_set_team_size(size_t size) { m_team_size = size; }
+  int impl_vector_length() const { return m_vector_length; }
+
+  int team_size() const { return m_team_size; }
+
+  int league_size() const { return m_league_size; }
+
+  size_t scratch_size(int level, int team_size_ = -1) const {
+    if (team_size_ < 0) team_size_ = m_team_size;
+    return m_team_scratch_size[level] +
+           team_size_ * m_thread_scratch_size[level];
+  }
+
+  size_t team_scratch_size(int level) const {
+    return m_team_scratch_size[level];
+  }
+
+  size_t thread_scratch_size(int level) const {
+    return m_thread_scratch_size[level];
+  }
+
+  typename traits::execution_space space() const { return m_space; }
+
+  TeamPolicyInternal()
+      : m_space(typename traits::execution_space()),
+        m_league_size(0),
+        m_team_size(-1),
+        m_vector_length(0),
+        m_team_scratch_size{0, 0},
+        m_thread_scratch_size{0, 0},
+        m_chunk_size(vector_length_max()),
+        m_tune_team_size(false),
+        m_tune_vector_length(false) {}
+
+  /** \brief  Specify league size, request team size */
+  TeamPolicyInternal(const execution_space space_, int league_size_,
+                     int team_size_request, int vector_length_request = 1)
+      : m_space(space_),
+        m_league_size(league_size_),
+        m_team_size(team_size_request),
+        m_vector_length(
+            (vector_length_request > 0)
+                ? verify_requested_vector_length(vector_length_request)
+                : (verify_requested_vector_length(1))),
+        m_team_scratch_size{0, 0},
+        m_thread_scratch_size{0, 0},
+        m_chunk_size(vector_length_max()),
+        m_tune_team_size(bool(team_size_request <= 0)),
+        m_tune_vector_length(bool(vector_length_request <= 0)) {
+    // FIXME_SYCL Check that league size is permissible,
+    // https://github.com/intel/llvm/pull/4064
+
+    // Make sure total block size is permissible
+    if (m_team_size * m_vector_length >
+        static_cast<int>(
+            m_space.impl_internal_space_instance()->m_maxWorkgroupSize)) {
+      Impl::throw_runtime_exception(
+          std::string("Kokkos::TeamPolicy<SYCL> the team size is too large. "
+                      "Team size x vector length is " +
+                      std::to_string(m_team_size * m_vector_length) +
+                      " but must be smaller than ") +
+          std::to_string(
+              m_space.impl_internal_space_instance()->m_maxWorkgroupSize));
+    }
+  }
+
+  /** \brief  Specify league size, request team size */
+  TeamPolicyInternal(const execution_space space_, int league_size_,
+                     const Kokkos::AUTO_t& /* team_size_request */,
+                     int vector_length_request = 1)
+      : TeamPolicyInternal(space_, league_size_, -1, vector_length_request) {}
+  // FLAG
+  /** \brief  Specify league size and team size, request vector length*/
+  TeamPolicyInternal(const execution_space space_, int league_size_,
+                     int team_size_request,
+                     const Kokkos::AUTO_t& /* vector_length_request */
+                     )
+      : TeamPolicyInternal(space_, league_size_, team_size_request, -1)
+
+  {}
+
+  /** \brief  Specify league size, request team size and vector length*/
+  TeamPolicyInternal(const execution_space space_, int league_size_,
+                     const Kokkos::AUTO_t& /* team_size_request */,
+                     const Kokkos::AUTO_t& /* vector_length_request */
+
+                     )
+      : TeamPolicyInternal(space_, league_size_, -1, -1)
+
+  {}
+
+  TeamPolicyInternal(int league_size_, int team_size_request,
+                     int vector_length_request = 1)
+      : TeamPolicyInternal(typename traits::execution_space(), league_size_,
+                           team_size_request, vector_length_request) {}
+
+  TeamPolicyInternal(int league_size_,
+                     const Kokkos::AUTO_t& /* team_size_request */,
+                     int vector_length_request = 1)
+      : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1,
+                           vector_length_request) {}
+
+  /** \brief  Specify league size and team size, request vector length*/
+  TeamPolicyInternal(int league_size_, int team_size_request,
+                     const Kokkos::AUTO_t& /* vector_length_request */
+
+                     )
+      : TeamPolicyInternal(typename traits::execution_space(), league_size_,
+                           team_size_request, -1)
+
+  {}
+
+  /** \brief  Specify league size, request team size and vector length*/
+  TeamPolicyInternal(int league_size_,
+                     const Kokkos::AUTO_t& /* team_size_request */,
+                     const Kokkos::AUTO_t& /* vector_length_request */
+
+                     )
+      : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1,
+                           -1) {}
+
+  int chunk_size() const { return m_chunk_size; }
+
+  TeamPolicyInternal& set_chunk_size(typename traits::index_type chunk_size_) {
+    m_chunk_size = chunk_size_;
+    return *this;
+  }
+
+  /** \brief set per team scratch size for a specific level of the scratch
+   * hierarchy */
+  TeamPolicyInternal& set_scratch_size(int level,
+                                       PerTeamValue const& per_team) {
+    m_team_scratch_size[level] = per_team.value;
+    return *this;
+  }
+
+  /** \brief set per thread scratch size for a specific level of the scratch
+   * hierarchy */
+  TeamPolicyInternal& set_scratch_size(int level,
+                                       PerThreadValue const& per_thread) {
+    m_thread_scratch_size[level] = per_thread.value;
+    return *this;
+  }
+
+  /** \brief set per thread and per team scratch size for a specific level of
+   * the scratch hierarchy */
+  TeamPolicyInternal& set_scratch_size(int level, PerTeamValue const& per_team,
+                                       PerThreadValue const& per_thread) {
+    m_team_scratch_size[level]   = per_team.value;
+    m_thread_scratch_size[level] = per_thread.value;
+    return *this;
+  }
+
+  using member_type = Kokkos::Impl::SYCLTeamMember;
+
+ protected:
+  template <class FunctorType>
+  int internal_team_size_max_for(const FunctorType& /*f*/) const {
+    // nested_reducer_memsize = (sizeof(double) * (m_team_size + 2)
+    // custom: m_team_scratch_size[0] + m_thread_scratch_size[0] * m_team_size
+    // total:
+    // 2*sizeof(double)+m_team_scratch_size[0]
+    // + m_team_size(sizeof(double)+m_thread_scratch_size[0])
+    const int max_threads_for_memory =
+        (space().impl_internal_space_instance()->m_maxShmemPerBlock -
+         2 * sizeof(double) - m_team_scratch_size[0]) /
+        (sizeof(double) + m_thread_scratch_size[0]);
+    return std::min({
+             int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
+      // FIXME_SYCL Avoid requesting too many registers on NVIDIA GPUs.
+#if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU)
+                 256,
+#endif
+                 max_threads_for_memory
+           }) /
+           impl_vector_length();
+  }
+
+  template <class ValueType, class FunctorType>
+  int internal_team_size_max_reduce(const FunctorType& f) const {
+    using Analysis =
+        FunctorAnalysis<FunctorPatternInterface::REDUCE, TeamPolicyInternal,
+                        FunctorType, ValueType>;
+    using value_type      = typename Analysis::value_type;
+    const int value_count = Analysis::value_count(f);
+
+    // nested_reducer_memsize = (sizeof(double) * (m_team_size + 2)
+    // reducer_memsize = sizeof(value_type) * m_team_size * value_count
+    // custom: m_team_scratch_size[0] + m_thread_scratch_size[0] * m_team_size
+    // total:
+    // 2*sizeof(double)+m_team_scratch_size[0]
+    // + m_team_size(sizeof(double)+sizeof(value_type)*value_count
+    //               +m_thread_scratch_size[0])
+    const int max_threads_for_memory =
+        (space().impl_internal_space_instance()->m_maxShmemPerBlock -
+         2 * sizeof(double) - m_team_scratch_size[0]) /
+        (sizeof(double) + sizeof(value_type) * value_count +
+         m_thread_scratch_size[0]);
+    return std::min<int>({
+             int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
+      // FIXME_SYCL Avoid requesting too many registers on NVIDIA GPUs.
+#if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU)
+                 256,
+#endif
+                 max_threads_for_memory
+           }) /
+           impl_vector_length();
+  }
+
+  template <class FunctorType>
+  int internal_team_size_recommended_for(const FunctorType& f) const {
+    // FIXME_SYCL improve
+    return 1 << Kokkos::Impl::int_log2(internal_team_size_max_for(f));
+  }
+
+  template <class ValueType, class FunctorType>
+  int internal_team_size_recommended_reduce(const FunctorType& f) const {
+    // FIXME_SYCL improve
+    return 1 << Kokkos::Impl::int_log2(
+               internal_team_size_max_reduce<ValueType>(f));
+  }
+};
+
+#endif
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp
index 2f0a67b3dd06a71cdd95b20000931cb047f222dd..d55fc6a84ba4b05e292770919037af471c99001b 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp
@@ -18,7 +18,7 @@
 #define KOKKOS_SYCL_UNIQUE_TOKEN_HPP
 
 #include <impl/Kokkos_ConcurrentBitset.hpp>
-#include <Kokkos_SYCL_Space.hpp>
+#include <SYCL/Kokkos_SYCL_Space.hpp>
 #include <Kokkos_UniqueToken.hpp>
 
 namespace Kokkos {
@@ -93,12 +93,8 @@ class UniqueToken<SYCL, UniqueTokenScope::Global> {
     }
 
     // Make sure that all writes in the previous lock owner are visible to me
-#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
     desul::atomic_thread_fence(desul::MemoryOrderAcquire(),
                                desul::MemoryScopeDevice());
-#else
-    Kokkos::memory_fence();
-#endif
     return idx;
   }
 
@@ -114,12 +110,8 @@ class UniqueToken<SYCL, UniqueTokenScope::Global> {
   KOKKOS_INLINE_FUNCTION
   void release(size_type idx) const noexcept {
     // Make sure my writes are visible to the next lock owner
-#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
     desul::atomic_thread_fence(desul::MemoryOrderRelease(),
                                desul::MemoryScopeDevice());
-#else
-    Kokkos::memory_fence();
-#endif
     (void)Kokkos::atomic_exchange(&m_locks(idx), 0);
   }
 };
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c308384af090f1f395480ef49232f8987762cd07
--- /dev/null
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp
@@ -0,0 +1,177 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_SYCL_WORKGROUP_REDUCTION_HPP
+#define KOKKOS_SYCL_WORKGROUP_REDUCTION_HPP
+
+#include <Kokkos_Macros.hpp>
+
+namespace Kokkos::Impl::SYCLReduction {
+
+// FIXME_SYCL It appears that using shuffles is slower than going through local
+// memory.
+template <class ReducerType>
+inline constexpr bool use_shuffle_based_algorithm = false;
+// std::is_reference_v<typename ReducerType::reference_type>;
+
+template <typename ValueType, typename ReducerType, int dim>
+std::enable_if_t<!use_shuffle_based_algorithm<ReducerType>> workgroup_reduction(
+    sycl::nd_item<dim>& item, sycl::local_accessor<ValueType> local_mem,
+    sycl::device_ptr<ValueType> results_ptr,
+    sycl::global_ptr<ValueType> device_accessible_result_ptr,
+    const unsigned int value_count_, const ReducerType& final_reducer,
+    bool final, unsigned int max_size) {
+  const unsigned int value_count =
+      std::is_reference_v<typename ReducerType::reference_type> ? 1
+                                                                : value_count_;
+  const int local_id = item.get_local_linear_id();
+
+  // Perform the actual workgroup reduction in each subgroup
+  // separately.
+  auto sg            = item.get_sub_group();
+  auto* result       = &local_mem[local_id * value_count];
+  const int id_in_sg = sg.get_local_id()[0];
+  const auto local_range =
+      std::min<unsigned int>(sg.get_local_range()[0], max_size);
+  const auto upper_stride_bound =
+      std::min<unsigned int>(local_range - id_in_sg, max_size - local_id);
+  for (unsigned int stride = 1; stride < local_range; stride <<= 1) {
+    if (stride < upper_stride_bound)
+      final_reducer.join(result, &local_mem[(local_id + stride) * value_count]);
+    sycl::group_barrier(sg);
+  }
+  sycl::group_barrier(item.get_group());
+
+  // Do the final reduction only using the first subgroup.
+  if (sg.get_group_id()[0] == 0) {
+    const unsigned int n_subgroups = sg.get_group_range()[0];
+    const int max_subgroup_size    = sg.get_max_local_range()[0];
+    auto* result_ = &local_mem[id_in_sg * max_subgroup_size * value_count];
+    // In case the number of subgroups is larger than the range of
+    // the first subgroup, we first combine the items with a higher
+    // index.
+    for (unsigned int offset = local_range; offset < n_subgroups;
+         offset += local_range)
+      if (id_in_sg + offset < n_subgroups)
+        final_reducer.join(
+            result_,
+            &local_mem[(id_in_sg + offset) * max_subgroup_size * value_count]);
+    sycl::group_barrier(sg);
+
+    // Then, we proceed as before.
+    for (unsigned int stride = 1; stride < local_range; stride <<= 1) {
+      if (id_in_sg + stride < n_subgroups)
+        final_reducer.join(
+            result_,
+            &local_mem[(id_in_sg + stride) * max_subgroup_size * value_count]);
+      sycl::group_barrier(sg);
+    }
+
+    // Finally, we copy the workgroup results back to global memory
+    // to be used in the next iteration. If this is the last
+    // iteration, i.e., there is only one workgroup also call
+    // final() if necessary.
+    if (id_in_sg == 0) {
+      if (final) {
+        final_reducer.final(&local_mem[0]);
+        if (device_accessible_result_ptr != nullptr)
+          final_reducer.copy(&device_accessible_result_ptr[0], &local_mem[0]);
+        else
+          final_reducer.copy(&results_ptr[0], &local_mem[0]);
+      } else
+        final_reducer.copy(
+            &results_ptr[(item.get_group_linear_id()) * value_count],
+            &local_mem[0]);
+    }
+  }
+}
+
+template <typename ValueType, typename ReducerType, int dim>
+std::enable_if_t<use_shuffle_based_algorithm<ReducerType>> workgroup_reduction(
+    sycl::nd_item<dim>& item, sycl::local_accessor<ValueType> local_mem,
+    ValueType local_value, sycl::device_ptr<ValueType> results_ptr,
+    sycl::global_ptr<ValueType> device_accessible_result_ptr,
+    const ReducerType& final_reducer, bool final, unsigned int max_size) {
+  const auto local_id = item.get_local_linear_id();
+
+  // Perform the actual workgroup reduction in each subgroup
+  // separately.
+  auto sg            = item.get_sub_group();
+  const int id_in_sg = sg.get_local_id()[0];
+  const auto local_range =
+      std::min<unsigned int>(sg.get_local_range()[0], max_size);
+
+  const auto upper_stride_bound =
+      std::min<unsigned int>(local_range - id_in_sg, max_size - local_id);
+  for (unsigned int stride = 1; stride < local_range; stride <<= 1) {
+    auto tmp = sg.shuffle_down(local_value, stride);
+    if (stride < upper_stride_bound) final_reducer.join(&local_value, &tmp);
+  }
+
+  // Copy the subgroup results into the first positions of the
+  // reduction array.
+  const int max_subgroup_size = sg.get_max_local_range()[0];
+  const int n_active_subgroups =
+      (max_size + max_subgroup_size - 1) / max_subgroup_size;
+  const int sg_group_id = sg.get_group_id()[0];
+  if (id_in_sg == 0 && sg_group_id <= n_active_subgroups)
+    local_mem[sg_group_id] = local_value;
+
+  item.barrier(sycl::access::fence_space::local_space);
+
+  // Do the final reduction only using the first subgroup.
+  if (sg.get_group_id()[0] == 0) {
+    auto sg_value = local_mem[id_in_sg < n_active_subgroups ? id_in_sg : 0];
+
+    // In case the number of subgroups is larger than the range of
+    // the first subgroup, we first combine the items with a higher
+    // index.
+    if (n_active_subgroups > local_range) {
+      for (unsigned int offset = local_range; offset < n_active_subgroups;
+           offset += local_range)
+        if (id_in_sg + offset < n_active_subgroups) {
+          final_reducer.join(&sg_value, &local_mem[(id_in_sg + offset)]);
+        }
+      sg.barrier();
+    }
+
+    // Then, we proceed as before.
+    for (unsigned int stride = 1; stride < local_range; stride <<= 1) {
+      auto tmp = sg.shuffle_down(sg_value, stride);
+      if (id_in_sg + stride < n_active_subgroups)
+        final_reducer.join(&sg_value, &tmp);
+    }
+
+    // Finally, we copy the workgroup results back to global memory
+    // to be used in the next iteration. If this is the last
+    // iteration, i.e., there is only one workgroup also call
+    // final() if necessary.
+    if (id_in_sg == 0) {
+      if (final) {
+        final_reducer.final(&sg_value);
+        if (device_accessible_result_ptr != nullptr)
+          device_accessible_result_ptr[0] = sg_value;
+        else
+          results_ptr[0] = sg_value;
+      } else
+        results_ptr[(item.get_group_linear_id())] = sg_value;
+    }
+  }
+}
+
+}  // namespace Kokkos::Impl::SYCLReduction
+
+#endif /* KOKKOS_SYCL_WORKGROUP_REDUCTION_HPP */
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9548f211d9e3fb090e2d10ca24ac77e5871df51e
--- /dev/null
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp
@@ -0,0 +1,49 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_SYCL_ZEROMEMSET_HPP
+#define KOKKOS_SYCL_ZEROMEMSET_HPP
+
+#include <impl/Kokkos_ZeroMemset_fwd.hpp>
+#include <SYCL/Kokkos_SYCL.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+template <class T, class... P>
+struct ZeroMemset<Kokkos::Experimental::SYCL, View<T, P...>> {
+  ZeroMemset(const Kokkos::Experimental::SYCL& exec_space,
+             const View<T, P...>& dst,
+             typename View<T, P...>::const_value_type&) {
+    auto event = exec_space.impl_internal_space_instance()->m_queue->memset(
+        dst.data(), 0, dst.size() * sizeof(typename View<T, P...>::value_type));
+#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
+    exec_space.impl_internal_space_instance()
+        ->m_queue->ext_oneapi_submit_barrier(std::vector<sycl::event>{event});
+#endif
+  }
+
+  ZeroMemset(const View<T, P...>& dst,
+             typename View<T, P...>::const_value_type&) {
+    Experimental::Impl::SYCLInternal::singleton().m_queue->memset(
+        dst.data(), 0, dst.size() * sizeof(typename View<T, P...>::value_type));
+  }
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif  // !defined(KOKKOS_SYCL_ZEROMEMSET_HPP)
diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial.cpp b/packages/kokkos/core/src/Serial/Kokkos_Serial.cpp
index df91d8499ac02ce4cbd7e42fce525f9956a29744..e81e8349391e8ef5b51e80bd162ad77e82eba61e 100644
--- a/packages/kokkos/core/src/Serial/Kokkos_Serial.cpp
+++ b/packages/kokkos/core/src/Serial/Kokkos_Serial.cpp
@@ -20,7 +20,7 @@
 
 #include <Kokkos_Core.hpp>
 
-#include <Kokkos_Serial.hpp>
+#include <Serial/Kokkos_Serial.hpp>
 #include <impl/Kokkos_Traits.hpp>
 #include <impl/Kokkos_Error.hpp>
 #include <impl/Kokkos_ExecSpaceManager.hpp>
@@ -42,9 +42,6 @@ void SerialInternal::initialize() {
 
   Impl::SharedAllocationRecord<void, void>::tracking_enable();
 
-  // Init the array of locks used for arbitrarily sized atomics
-  Impl::init_lock_array_host_space();
-
   m_is_initialized = true;
 }
 
@@ -148,16 +145,18 @@ Serial::Serial()
     : m_space_instance(&Impl::SerialInternal::singleton(),
                        [](Impl::SerialInternal*) {}) {}
 
+Serial::Serial(NewInstance)
+    : m_space_instance(new Impl::SerialInternal, [](Impl::SerialInternal* ptr) {
+        ptr->finalize();
+        delete ptr;
+      }) {}
+
 void Serial::print_configuration(std::ostream& os, bool /*verbose*/) const {
   os << "Host Serial Execution Space:\n";
   os << "  KOKKOS_ENABLE_SERIAL: yes\n";
 
-  os << "Serial Atomics:\n";
-  os << "  KOKKOS_ENABLE_SERIAL_ATOMICS: ";
-#ifdef KOKKOS_ENABLE_SERIAL_ATOMICS
-  os << "yes\n";
-#else
-  os << "no\n";
+#ifdef KOKKOS_INTERNAL_NOT_PARALLEL
+  os << "Kokkos atomics disabled\n";
 #endif
 
   os << "\nSerial Runtime Configuration:\n";
diff --git a/packages/kokkos/core/src/Kokkos_Serial.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial.hpp
similarity index 86%
rename from packages/kokkos/core/src/Kokkos_Serial.hpp
rename to packages/kokkos/core/src/Serial/Kokkos_Serial.hpp
index ede3c96b8bd071941d1d2a993b22157a6725bbc7..db1567610b23bcd4b7ced931321e9c5c46b4496b 100644
--- a/packages/kokkos/core/src/Kokkos_Serial.hpp
+++ b/packages/kokkos/core/src/Serial/Kokkos_Serial.hpp
@@ -72,6 +72,10 @@ class SerialInternal {
 };
 }  // namespace Impl
 
+struct NewInstance {
+  explicit NewInstance() = default;
+};
+
 /// \class Serial
 /// \brief Kokkos device for non-parallel execution
 ///
@@ -108,6 +112,8 @@ class Serial {
 
   Serial();
 
+  Serial(NewInstance);
+
   /// \brief True if and only if this method is being called in a
   ///   thread-parallel function.
   ///
@@ -207,23 +213,6 @@ struct DeviceTypeTraits<Serial> {
 namespace Kokkos {
 namespace Impl {
 
-// We only need to provide a specialization for Serial if there is a host
-// parallel execution space since the specialization for
-// DefaultHostExecutionSpace is defined elsewhere.
-struct DummyExecutionSpace;
-template <class DT, class... DP>
-struct ZeroMemset<
-    std::conditional_t<!std::is_same<Serial, DefaultHostExecutionSpace>::value,
-                       Serial, DummyExecutionSpace>,
-    DT, DP...> : public ZeroMemset<DefaultHostExecutionSpace, DT, DP...> {
-  using Base = ZeroMemset<DefaultHostExecutionSpace, DT, DP...>;
-  using Base::Base;
-
-  ZeroMemset(const Serial&, const View<DT, DP...>& dst,
-             typename View<DT, DP...>::const_value_type& value)
-      : Base(dst, value) {}
-};
-
 template <>
 struct MemorySpaceAccess<Kokkos::Serial::memory_space,
                          Kokkos::Serial::scratch_memory_space> {
@@ -235,6 +224,38 @@ struct MemorySpaceAccess<Kokkos::Serial::memory_space,
 }  // namespace Impl
 }  // namespace Kokkos
 
+namespace Kokkos::Experimental {
+
+template <class... Args>
+std::vector<Serial> partition_space(const Serial&, Args...) {
+  static_assert(
+      (... && std::is_arithmetic_v<Args>),
+      "Kokkos Error: partitioning arguments must be integers or floats");
+  std::vector<Serial> instances;
+  instances.reserve(sizeof...(Args));
+  std::generate_n(std::back_inserter(instances), sizeof...(Args),
+                  []() { return Serial{NewInstance{}}; });
+  return instances;
+}
+
+template <class T>
+std::vector<Serial> partition_space(const Serial&,
+                                    std::vector<T> const& weights) {
+  static_assert(
+      std::is_arithmetic<T>::value,
+      "Kokkos Error: partitioning arguments must be integers or floats");
+
+  // We only care about the number of instances to create and ignore weights
+  // otherwise.
+  std::vector<Serial> instances;
+  instances.reserve(weights.size());
+  std::generate_n(std::back_inserter(instances), weights.size(),
+                  []() { return Serial{NewInstance{}}; });
+  return instances;
+}
+
+}  // namespace Kokkos::Experimental
+
 #include <Serial/Kokkos_Serial_Parallel_Range.hpp>
 #include <Serial/Kokkos_Serial_Parallel_MDRange.hpp>
 #include <Serial/Kokkos_Serial_Parallel_Team.hpp>
diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp
index afdecd2f054652d5937c6e0eb23688debc37460e..69787aa5001ab7104024a62976a51efe2d34e2ef 100644
--- a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp
+++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp
@@ -58,35 +58,24 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
       : m_iter(arg_policy, arg_functor) {}
 };
 
-template <class FunctorType, class ReducerType, class... Traits>
-class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
-                     Kokkos::Serial> {
+template <class CombinedFunctorReducerType, class... Traits>
+class ParallelReduce<CombinedFunctorReducerType,
+                     Kokkos::MDRangePolicy<Traits...>, Kokkos::Serial> {
  private:
   using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
   using Policy        = typename MDRangePolicy::impl_range_policy;
+  using FunctorType   = typename CombinedFunctorReducerType::functor_type;
+  using ReducerType   = typename CombinedFunctorReducerType::reducer_type;
 
   using WorkTag = typename MDRangePolicy::work_tag;
 
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using WorkTagFwd =
-      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
-                         void>;
+  using pointer_type   = typename ReducerType::pointer_type;
+  using value_type     = typename ReducerType::value_type;
+  using reference_type = typename ReducerType::reference_type;
 
-  using Analysis = FunctorAnalysis<FunctorPatternInterface::REDUCE,
-                                   MDRangePolicy, ReducerTypeFwd>;
-
-  using pointer_type   = typename Analysis::pointer_type;
-  using value_type     = typename Analysis::value_type;
-  using reference_type = typename Analysis::reference_type;
-
-  using iterate_type =
-      typename Kokkos::Impl::HostIterateTile<MDRangePolicy, FunctorType,
-                                             WorkTag, reference_type>;
+  using iterate_type = typename Kokkos::Impl::HostIterateTile<
+      MDRangePolicy, CombinedFunctorReducerType, WorkTag, reference_type>;
   const iterate_type m_iter;
-  const ReducerType m_reducer;
   const pointer_type m_result_ptr;
 
   inline void exec(reference_type update) const {
@@ -107,8 +96,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
     return 1024;
   }
   inline void execute() const {
-    const size_t pool_reduce_size = Analysis::value_size(
-        ReducerConditional::select(m_iter.m_func, m_reducer));
+    const ReducerType& reducer     = m_iter.m_func.get_reducer();
+    const size_t pool_reduce_size  = reducer.value_size();
     const size_t team_reduce_size  = 0;  // Never shrinks
     const size_t team_shared_size  = 0;  // Never shrinks
     const size_t thread_local_size = 0;  // Never shrinks
@@ -128,44 +117,27 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
             : pointer_type(
                   internal_instance->m_thread_team_data.pool_reduce_local());
 
-    typename Analysis::Reducer final_reducer(
-        &ReducerConditional::select(m_iter.m_func, m_reducer));
-
-    reference_type update = final_reducer.init(ptr);
+    reference_type update = reducer.init(ptr);
 
     this->exec(update);
 
-    final_reducer.final(ptr);
+    reducer.final(ptr);
   }
 
-  template <class HostViewType>
-  ParallelReduce(const FunctorType& arg_functor,
+  template <class ViewType>
+  ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer,
                  const MDRangePolicy& arg_policy,
-                 const HostViewType& arg_result_view,
-                 std::enable_if_t<Kokkos::is_view<HostViewType>::value &&
-                                      !Kokkos::is_reducer<ReducerType>::value,
-                                  void*> = nullptr)
-      : m_iter(arg_policy, arg_functor),
-        m_reducer(InvalidType()),
+                 const ViewType& arg_result_view)
+      : m_iter(arg_policy, arg_functor_reducer),
         m_result_ptr(arg_result_view.data()) {
-    static_assert(Kokkos::is_view<HostViewType>::value,
+    static_assert(Kokkos::is_view<ViewType>::value,
                   "Kokkos::Serial reduce result must be a View");
 
     static_assert(
-        Kokkos::Impl::MemorySpaceAccess<typename HostViewType::memory_space,
+        Kokkos::Impl::MemorySpaceAccess<typename ViewType::memory_space,
                                         Kokkos::HostSpace>::accessible,
-        "Kokkos::Serial reduce result must be a View in HostSpace");
-  }
-
-  inline ParallelReduce(const FunctorType& arg_functor,
-                        MDRangePolicy arg_policy, const ReducerType& reducer)
-      : m_iter(arg_policy, arg_functor),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()) {
-    /*static_assert( std::is_same< typename ViewType::memory_space
-                                    , Kokkos::HostSpace >::value
-      , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
-      );*/
+        "Kokkos::Serial reduce result must be a View accessible from "
+        "HostSpace");
   }
 };
 
diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp
index f35c13170b8d5b0210dec7183d2433f20fd7303f..56894716dbd7bc720da9fc640aa15f110ccd2455 100644
--- a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp
+++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp
@@ -58,31 +58,20 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Serial> {
 
 /*--------------------------------------------------------------------------*/
 
-template <class FunctorType, class ReducerType, class... Traits>
-class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
+template <class CombinedFunctorReducerType, class... Traits>
+class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
                      Kokkos::Serial> {
  private:
-  using Policy  = Kokkos::RangePolicy<Traits...>;
-  using WorkTag = typename Policy::work_tag;
-
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using WorkTagFwd =
-      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
-                         void>;
-
-  using Analysis =
-      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, ReducerTypeFwd>;
+  using Policy      = Kokkos::RangePolicy<Traits...>;
+  using WorkTag     = typename Policy::work_tag;
+  using FunctorType = typename CombinedFunctorReducerType::functor_type;
+  using ReducerType = typename CombinedFunctorReducerType::reducer_type;
 
-  using pointer_type   = typename Analysis::pointer_type;
-  using reference_type = typename Analysis::reference_type;
+  using pointer_type   = typename ReducerType::pointer_type;
+  using reference_type = typename ReducerType::reference_type;
 
-  const FunctorType m_functor;
+  const CombinedFunctorReducerType m_functor_reducer;
   const Policy m_policy;
-  const ReducerType m_reducer;
   const pointer_type m_result_ptr;
 
   template <class TagType>
@@ -90,7 +79,7 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
       reference_type update) const {
     const typename Policy::member_type e = m_policy.end();
     for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
-      m_functor(i, update);
+      m_functor_reducer.get_functor()(i, update);
     }
   }
 
@@ -101,14 +90,14 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
 
     const typename Policy::member_type e = m_policy.end();
     for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
-      m_functor(t, i, update);
+      m_functor_reducer.get_functor()(t, i, update);
     }
   }
 
  public:
   inline void execute() const {
     const size_t pool_reduce_size =
-        Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));
+        m_functor_reducer.get_reducer().value_size();
     const size_t team_reduce_size  = 0;  // Never shrinks
     const size_t team_shared_size  = 0;  // Never shrinks
     const size_t thread_local_size = 0;  // Never shrinks
@@ -127,45 +116,27 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
             : pointer_type(
                   internal_instance->m_thread_team_data.pool_reduce_local());
 
-    typename Analysis::Reducer final_reducer(
-        &ReducerConditional::select(m_functor, m_reducer));
-
-    reference_type update = final_reducer.init(ptr);
+    reference_type update = m_functor_reducer.get_reducer().init(ptr);
 
     this->template exec<WorkTag>(update);
 
-    final_reducer.final(ptr);
+    m_functor_reducer.get_reducer().final(ptr);
   }
 
-  template <class HostViewType>
-  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
-                 const HostViewType& arg_result_view,
-                 std::enable_if_t<Kokkos::is_view<HostViewType>::value &&
-                                      !Kokkos::is_reducer<ReducerType>::value,
-                                  void*> = nullptr)
-      : m_functor(arg_functor),
+  template <class ViewType>
+  ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer,
+                 const Policy& arg_policy, const ViewType& arg_result_view)
+      : m_functor_reducer(arg_functor_reducer),
         m_policy(arg_policy),
-        m_reducer(InvalidType()),
         m_result_ptr(arg_result_view.data()) {
-    static_assert(Kokkos::is_view<HostViewType>::value,
+    static_assert(Kokkos::is_view<ViewType>::value,
                   "Kokkos::Serial reduce result must be a View");
 
     static_assert(
-        Kokkos::Impl::MemorySpaceAccess<typename HostViewType::memory_space,
+        Kokkos::Impl::MemorySpaceAccess<typename ViewType::memory_space,
                                         Kokkos::HostSpace>::accessible,
-        "Kokkos::Serial reduce result must be a View in HostSpace");
-  }
-
-  inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy,
-                        const ReducerType& reducer)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()) {
-    /*static_assert( std::is_same< typename ViewType::memory_space
-                                    , Kokkos::HostSpace >::value
-      , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
-      );*/
+        "Kokkos::Serial reduce result must be a View accessible from "
+        "HostSpace");
   }
 };
 
@@ -179,12 +150,13 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
   using WorkTag = typename Policy::work_tag;
 
   using Analysis =
-      FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType>;
+      FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType, void>;
 
   using pointer_type   = typename Analysis::pointer_type;
   using reference_type = typename Analysis::reference_type;
 
-  const FunctorType m_functor;
+  const CombinedFunctorReducer<FunctorType, typename Analysis::Reducer>
+      m_functor_reducer;
   const Policy m_policy;
 
   template <class TagType>
@@ -192,7 +164,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
       reference_type update) const {
     const typename Policy::member_type e = m_policy.end();
     for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
-      m_functor(i, update, true);
+      m_functor_reducer.get_functor()(i, update, true);
     }
   }
 
@@ -202,13 +174,15 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
     const TagType t{};
     const typename Policy::member_type e = m_policy.end();
     for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
-      m_functor(t, i, update, true);
+      m_functor_reducer.get_functor()(t, i, update, true);
     }
   }
 
  public:
   inline void execute() const {
-    const size_t pool_reduce_size  = Analysis::value_size(m_functor);
+    const typename Analysis::Reducer& final_reducer =
+        m_functor_reducer.get_reducer();
+    const size_t pool_reduce_size  = final_reducer.value_size();
     const size_t team_reduce_size  = 0;  // Never shrinks
     const size_t team_shared_size  = 0;  // Never shrinks
     const size_t thread_local_size = 0;  // Never shrinks
@@ -221,8 +195,6 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
         pool_reduce_size, team_reduce_size, team_shared_size,
         thread_local_size);
 
-    typename Analysis::Reducer final_reducer(&m_functor);
-
     reference_type update = final_reducer.init(pointer_type(
         internal_instance->m_thread_team_data.pool_reduce_local()));
 
@@ -230,7 +202,8 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
   }
 
   inline ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy)
-      : m_functor(arg_functor), m_policy(arg_policy) {}
+      : m_functor_reducer(arg_functor, typename Analysis::Reducer{arg_functor}),
+        m_policy(arg_policy) {}
 };
 
 /*--------------------------------------------------------------------------*/
@@ -241,14 +214,15 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
   using Policy  = Kokkos::RangePolicy<Traits...>;
   using WorkTag = typename Policy::work_tag;
 
-  using Analysis =
-      FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType>;
+  using Analysis = FunctorAnalysis<FunctorPatternInterface::SCAN, Policy,
+                                   FunctorType, ReturnType>;
 
   using value_type     = typename Analysis::value_type;
   using pointer_type   = typename Analysis::pointer_type;
   using reference_type = typename Analysis::reference_type;
 
-  const FunctorType m_functor;
+  const CombinedFunctorReducer<FunctorType, typename Analysis::Reducer>
+      m_functor_reducer;
   const Policy m_policy;
   const pointer_type m_result_ptr;
 
@@ -257,7 +231,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
       reference_type update) const {
     const typename Policy::member_type e = m_policy.end();
     for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
-      m_functor(i, update, true);
+      m_functor_reducer.get_functor()(i, update, true);
     }
   }
 
@@ -267,13 +241,14 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
     const TagType t{};
     const typename Policy::member_type e = m_policy.end();
     for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
-      m_functor(t, i, update, true);
+      m_functor_reducer.get_functor()(t, i, update, true);
     }
   }
 
  public:
   inline void execute() {
-    const size_t pool_reduce_size  = Analysis::value_size(m_functor);
+    const size_t pool_reduce_size =
+        m_functor_reducer.get_reducer().value_size();
     const size_t team_reduce_size  = 0;  // Never shrinks
     const size_t team_shared_size  = 0;  // Never shrinks
     const size_t thread_local_size = 0;  // Never shrinks
@@ -286,7 +261,8 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
         pool_reduce_size, team_reduce_size, team_shared_size,
         thread_local_size);
 
-    typename Analysis::Reducer final_reducer(&m_functor);
+    const typename Analysis::Reducer& final_reducer =
+        m_functor_reducer.get_reducer();
 
     reference_type update = final_reducer.init(pointer_type(
         internal_instance->m_thread_team_data.pool_reduce_local()));
@@ -301,7 +277,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
   ParallelScanWithTotal(const FunctorType& arg_functor,
                         const Policy& arg_policy,
                         const ViewType& arg_result_view)
-      : m_functor(arg_functor),
+      : m_functor_reducer(arg_functor, typename Analysis::Reducer{arg_functor}),
         m_policy(arg_policy),
         m_result_ptr(arg_result_view.data()) {
     static_assert(
diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp
index c5156f1f7f21913589070fe13404a3c801d3252c..0876f1af229d764ca4bebf752f819b75282f1c15 100644
--- a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp
+++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp
@@ -268,35 +268,25 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
 
 /*--------------------------------------------------------------------------*/
 
-template <class FunctorType, class ReducerType, class... Properties>
-class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
-                     ReducerType, Kokkos::Serial> {
+template <class CombinedFunctorReducerType, class... Properties>
+class ParallelReduce<CombinedFunctorReducerType,
+                     Kokkos::TeamPolicy<Properties...>, Kokkos::Serial> {
  private:
-  enum { TEAM_REDUCE_SIZE = 512 };
+  static constexpr int TEAM_REDUCE_SIZE = 512;
 
-  using Policy = TeamPolicyInternal<Kokkos::Serial, Properties...>;
+  using Policy      = TeamPolicyInternal<Kokkos::Serial, Properties...>;
+  using FunctorType = typename CombinedFunctorReducerType::functor_type;
+  using ReducerType = typename CombinedFunctorReducerType::reducer_type;
 
   using Member  = typename Policy::member_type;
   using WorkTag = typename Policy::work_tag;
 
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using WorkTagFwd =
-      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
-                         void>;
-
-  using Analysis =
-      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, ReducerTypeFwd>;
-
-  using pointer_type   = typename Analysis::pointer_type;
-  using reference_type = typename Analysis::reference_type;
+  using pointer_type   = typename ReducerType::pointer_type;
+  using reference_type = typename ReducerType::reference_type;
 
-  const FunctorType m_functor;
+  const CombinedFunctorReducerType m_functor_reducer;
   const Policy m_policy;
   const int m_league;
-  const ReducerType m_reducer;
   pointer_type m_result_ptr;
   size_t m_shared;
 
@@ -304,7 +294,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   inline std::enable_if_t<std::is_void<TagType>::value> exec(
       HostThreadTeamData& data, reference_type update) const {
     for (int ileague = 0; ileague < m_league; ++ileague) {
-      m_functor(Member(data, ileague, m_league), update);
+      m_functor_reducer.get_functor()(Member(data, ileague, m_league), update);
     }
   }
 
@@ -314,14 +304,15 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     const TagType t{};
 
     for (int ileague = 0; ileague < m_league; ++ileague) {
-      m_functor(t, Member(data, ileague, m_league), update);
+      m_functor_reducer.get_functor()(t, Member(data, ileague, m_league),
+                                      update);
     }
   }
 
  public:
   inline void execute() const {
     const size_t pool_reduce_size =
-        Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));
+        m_functor_reducer.get_reducer().value_size();
 
     const size_t team_reduce_size  = TEAM_REDUCE_SIZE;
     const size_t team_shared_size  = m_shared;
@@ -341,53 +332,32 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
             : pointer_type(
                   internal_instance->m_thread_team_data.pool_reduce_local());
 
-    typename Analysis::Reducer final_reducer(
-        &ReducerConditional::select(m_functor, m_reducer));
-
-    reference_type update = final_reducer.init(ptr);
+    reference_type update = m_functor_reducer.get_reducer().init(ptr);
 
     this->template exec<WorkTag>(internal_instance->m_thread_team_data, update);
 
-    final_reducer.final(ptr);
+    m_functor_reducer.get_reducer().final(ptr);
   }
 
   template <class ViewType>
-  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
-                 const ViewType& arg_result,
-                 std::enable_if_t<Kokkos::is_view<ViewType>::value &&
-                                      !Kokkos::is_reducer<ReducerType>::value,
-                                  void*> = nullptr)
-      : m_functor(arg_functor),
+  ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer,
+                 const Policy& arg_policy, const ViewType& arg_result)
+      : m_functor_reducer(arg_functor_reducer),
         m_policy(arg_policy),
         m_league(arg_policy.league_size()),
-        m_reducer(InvalidType()),
         m_result_ptr(arg_result.data()),
         m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
-                 FunctorTeamShmemSize<FunctorType>::value(m_functor, 1)) {
+                 FunctorTeamShmemSize<FunctorType>::value(
+                     m_functor_reducer.get_functor(), 1)) {
     static_assert(Kokkos::is_view<ViewType>::value,
                   "Reduction result on Kokkos::Serial must be a Kokkos::View");
 
     static_assert(
         Kokkos::Impl::MemorySpaceAccess<typename ViewType::memory_space,
                                         Kokkos::HostSpace>::accessible,
-        "Reduction result on Kokkos::Serial must be a Kokkos::View in "
+        "Kokkos::Serial reduce result must be a View accessible from "
         "HostSpace");
   }
-
-  inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy,
-                        const ReducerType& reducer)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_league(arg_policy.league_size()),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()),
-        m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
-                 FunctorTeamShmemSize<FunctorType>::value(arg_functor, 1)) {
-    /*static_assert( std::is_same< typename ViewType::memory_space
-                            , Kokkos::HostSpace >::value
-    , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
-    );*/
-  }
 };
 
 }  // namespace Impl
diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp
index c744f34760ca6a0d42362ac3124bd769bcd957e2..f9c86f55ce05f92e074e37770efc3911830f5e87 100644
--- a/packages/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp
+++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp
@@ -22,9 +22,9 @@
 
 #include <Kokkos_TaskScheduler_fwd.hpp>
 
-#include <impl/Kokkos_TaskQueue.hpp>
-#include <Kokkos_Serial.hpp>
+#include <Serial/Kokkos_Serial.hpp>
 #include <impl/Kokkos_HostThreadTeam.hpp>
+#include <impl/Kokkos_TaskQueue.hpp>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3ec2dfbcfa0a83c20e9d02042c958890074a6c22
--- /dev/null
+++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp
@@ -0,0 +1,50 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_SERIAL_ZEROMEMSET_HPP
+#define KOKKOS_SERIAL_ZEROMEMSET_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <impl/Kokkos_ZeroMemset_fwd.hpp>
+#include <Serial/Kokkos_Serial.hpp>
+
+#include <type_traits>
+
+namespace Kokkos {
+namespace Impl {
+
+// We only need to provide a specialization for Serial if there is a host
+// parallel execution space since the specialization for
+// DefaultHostExecutionSpace is defined elsewhere.
+struct DummyExecutionSpace;
+template <class T, class... P>
+struct ZeroMemset<
+    std::conditional_t<!std::is_same<Serial, DefaultHostExecutionSpace>::value,
+                       Serial, DummyExecutionSpace>,
+    View<T, P...>>
+    : public ZeroMemset<DefaultHostExecutionSpace, View<T, P...>> {
+  using Base = ZeroMemset<DefaultHostExecutionSpace, View<T, P...>>;
+  using Base::Base;
+
+  ZeroMemset(const Serial&, const View<T, P...>& dst,
+             typename View<T, P...>::const_value_type& value)
+      : Base(dst, value) {}
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif  // !defined(KOKKOS_SERIAL_ZEROMEMSET_HPP)
diff --git a/packages/kokkos/core/src/Kokkos_Threads.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads.hpp
similarity index 89%
rename from packages/kokkos/core/src/Kokkos_Threads.hpp
rename to packages/kokkos/core/src/Threads/Kokkos_Threads.hpp
index db3b771f2b46b779edfa4c29a6b5bd720555184c..c0d70c03ecbef0fb10d2757b0fa9fad66235d817 100644
--- a/packages/kokkos/core/src/Kokkos_Threads.hpp
+++ b/packages/kokkos/core/src/Threads/Kokkos_Threads.hpp
@@ -169,21 +169,5 @@ struct MemorySpaceAccess<Kokkos::Threads::memory_space,
 }  // namespace Impl
 }  // namespace Kokkos
 
-/*--------------------------------------------------------------------------*/
-
-#include <Kokkos_ExecPolicy.hpp>
-#include <Kokkos_Parallel.hpp>
-#include <Threads/Kokkos_ThreadsExec.hpp>
-#include <Threads/Kokkos_ThreadsTeam.hpp>
-#include <Threads/Kokkos_Threads_Parallel_Range.hpp>
-#include <Threads/Kokkos_Threads_Parallel_MDRange.hpp>
-#include <Threads/Kokkos_Threads_Parallel_Team.hpp>
-#include <Threads/Kokkos_Threads_UniqueToken.hpp>
-
-#include <KokkosExp_MDRangePolicy.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
 #endif /* #if defined( KOKKOS_ENABLE_THREADS ) */
 #endif /* #define KOKKOS_THREADS_HPP */
diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
index 8f39c726c722a2a1c555905a3f3219074d497ad6..c754091e87e0423790c6b16f86658c8021c136a0 100644
--- a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
+++ b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
@@ -768,9 +768,6 @@ void ThreadsExec::initialize(int thread_count_arg) {
               << thread_count << " threads per process." << std::endl;
   }
 
-  // Init the array for used for arbitrarily sized atomics
-  Impl::init_lock_array_host_space();
-
   Impl::SharedAllocationRecord<void, void>::tracking_enable();
 }
 
diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
index 8b15928debce7048c80bac661deb157a8ffaab7a..377e096bfbebc6df30ca024c82aef2275e0c68a4 100644
--- a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
+++ b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
@@ -20,13 +20,16 @@
 #include <Kokkos_Macros.hpp>
 
 #include <cstdio>
-
+#include <ostream>
 #include <utility>
+
 #include <impl/Kokkos_Spinwait.hpp>
 
 #include <Kokkos_Atomic.hpp>
+#include <Kokkos_Pair.hpp>
 
 #include <impl/Kokkos_ConcurrentBitset.hpp>
+#include <Threads/Kokkos_Threads.hpp>
 
 //----------------------------------------------------------------------------
 
diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp b/packages/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp
index 3f734f08d4f986fcb48c1691d14a794bc744a4de..b1cadc7c485d65b16cc5cfc61d99a50ffecfb67e 100644
--- a/packages/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp
+++ b/packages/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp
@@ -976,16 +976,19 @@ parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<
  * lambda(iType i, ValueType & val, bool final) for each i=0..N-1.
  *
  */
-template <typename iType, class FunctorType>
+template <typename iType, class FunctorType, typename ValueType>
 KOKKOS_INLINE_FUNCTION void parallel_scan(
     const Impl::TeamThreadRangeBoundariesStruct<
         iType, Impl::ThreadsExecTeamMember>& loop_bounds,
-    const FunctorType& lambda) {
-  using value_type = typename Kokkos::Impl::FunctorAnalysis<
-      Kokkos::Impl::FunctorPatternInterface::SCAN, void,
-      FunctorType>::value_type;
+    const FunctorType& lambda, ValueType& return_val) {
+  // Extract ValueType from the Closure
+  using closure_value_type = typename Kokkos::Impl::FunctorAnalysis<
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void, FunctorType,
+      void>::value_type;
+  static_assert(std::is_same_v<closure_value_type, ValueType>,
+                "Non-matching value types of closure and return type");
 
-  auto scan_val = value_type{};
+  auto scan_val = ValueType{};
 
   // Intra-member scan
 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
@@ -1006,6 +1009,21 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
        i += loop_bounds.increment) {
     lambda(i, scan_val, true);
   }
+
+  return_val = scan_val;
+}
+
+template <typename iType, class FunctorType>
+KOKKOS_INLINE_FUNCTION void parallel_scan(
+    const Impl::TeamThreadRangeBoundariesStruct<
+        iType, Impl::ThreadsExecTeamMember>& loop_bounds,
+    const FunctorType& lambda) {
+  using value_type = typename Kokkos::Impl::FunctorAnalysis<
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void, FunctorType,
+      void>::value_type;
+
+  value_type scan_val;
+  parallel_scan(loop_bounds, lambda, scan_val);
 }
 
 /** \brief  Intra-thread vector parallel exclusive prefix sum. Executes
@@ -1020,17 +1038,20 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
  * final==true. Scan_val will be set to the final sum value over all vector
  * lanes.
  */
-template <typename iType, class FunctorType>
+template <typename iType, class FunctorType, typename ValueType>
 KOKKOS_INLINE_FUNCTION void parallel_scan(
     const Impl::ThreadVectorRangeBoundariesStruct<
         iType, Impl::ThreadsExecTeamMember>& loop_boundaries,
-    const FunctorType& lambda) {
-  using value_type =
+    const FunctorType& lambda, ValueType& return_val) {
+  // Extract ValueType from the Closure
+  using closure_value_type =
       typename Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
-                                     TeamPolicy<Threads>,
-                                     FunctorType>::value_type;
+                                     TeamPolicy<Threads>, FunctorType,
+                                     void>::value_type;
+  static_assert(std::is_same<closure_value_type, ValueType>::value,
+                "Non-matching value types of closure and return type");
 
-  value_type scan_val = value_type();
+  ValueType scan_val = ValueType();
 
 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
 #pragma ivdep
@@ -1039,6 +1060,22 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
        i += loop_boundaries.increment) {
     lambda(i, scan_val, true);
   }
+
+  return_val = scan_val;
+}
+
+template <typename iType, class FunctorType>
+KOKKOS_INLINE_FUNCTION void parallel_scan(
+    const Impl::ThreadVectorRangeBoundariesStruct<
+        iType, Impl::ThreadsExecTeamMember>& loop_boundaries,
+    const FunctorType& lambda) {
+  using value_type =
+      typename Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
+                                     TeamPolicy<Threads>, FunctorType,
+                                     void>::value_type;
+
+  value_type scan_val;
+  parallel_scan(loop_boundaries, lambda, scan_val);
 }
 
 /** \brief  Intra-thread vector parallel scan with reducer
diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..0828f262993cc13527d7ee900f7b3db1fbcc68d9
--- /dev/null
+++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp
@@ -0,0 +1,115 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_THREADS_PARALLEL_FOR_MDRANGE_HPP
+#define KOKKOS_THREADS_PARALLEL_FOR_MDRANGE_HPP
+
+#include <Kokkos_Parallel.hpp>
+
+#include <KokkosExp_MDRangePolicy.hpp>
+namespace Kokkos {
+namespace Impl {
+
+template <class FunctorType, class... Traits>
+class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
+                  Kokkos::Threads> {
+ private:
+  using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
+  using Policy        = typename MDRangePolicy::impl_range_policy;
+
+  using WorkTag = typename MDRangePolicy::work_tag;
+
+  using WorkRange = typename Policy::WorkRange;
+  using Member    = typename Policy::member_type;
+
+  using iterate_type = typename Kokkos::Impl::HostIterateTile<
+      MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void>;
+
+  const iterate_type m_iter;
+
+  inline void exec_range(const Member ibeg, const Member iend) const {
+    for (Member i = ibeg; i < iend; ++i) {
+      m_iter(i);
+    }
+  }
+
+  static void exec(ThreadsExec &exec, const void *arg) {
+    exec_schedule<typename Policy::schedule_type::type>(exec, arg);
+  }
+
+  template <class Schedule>
+  static std::enable_if_t<std::is_same<Schedule, Kokkos::Static>::value>
+  exec_schedule(ThreadsExec &exec, const void *arg) {
+    const ParallelFor &self = *((const ParallelFor *)arg);
+
+    auto const num_tiles = self.m_iter.m_rp.m_num_tiles;
+    WorkRange range(Policy(0, num_tiles).set_chunk_size(1), exec.pool_rank(),
+                    exec.pool_size());
+
+    self.exec_range(range.begin(), range.end());
+
+    exec.fan_in();
+  }
+
+  template <class Schedule>
+  static std::enable_if_t<std::is_same<Schedule, Kokkos::Dynamic>::value>
+  exec_schedule(ThreadsExec &exec, const void *arg) {
+    const ParallelFor &self = *((const ParallelFor *)arg);
+
+    auto const num_tiles = self.m_iter.m_rp.m_num_tiles;
+    WorkRange range(Policy(0, num_tiles).set_chunk_size(1), exec.pool_rank(),
+                    exec.pool_size());
+
+    exec.set_work_range(range.begin(), range.end(), 1);
+    exec.reset_steal_target();
+    exec.barrier();
+
+    long work_index = exec.get_work_index();
+
+    while (work_index != -1) {
+      const Member begin = static_cast<Member>(work_index);
+      const Member end   = begin + 1 < num_tiles ? begin + 1 : num_tiles;
+
+      self.exec_range(begin, end);
+      work_index = exec.get_work_index();
+    }
+
+    exec.fan_in();
+  }
+
+ public:
+  inline void execute() const {
+    ThreadsExec::start(&ParallelFor::exec, this);
+    ThreadsExec::fence();
+  }
+
+  ParallelFor(const FunctorType &arg_functor, const MDRangePolicy &arg_policy)
+      : m_iter(arg_policy, arg_functor) {}
+
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy &, const Functor &) {
+    /**
+     * 1024 here is just our guess for a reasonable max tile size,
+     * it isn't a hardware constraint. If people see a use for larger
+     * tile size products, we're happy to change this.
+     */
+    return 1024;
+  }
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+#endif
diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3698416ef187dfcf6649b57ce8ac3d297fad1d05
--- /dev/null
+++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp
@@ -0,0 +1,122 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_THREADS_PARALLEL_FOR_RANGE_HPP
+#define KOKKOS_THREADS_PARALLEL_FOR_RANGE_HPP
+
+#include <Kokkos_Parallel.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+template <class FunctorType, class... Traits>
+class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
+                  Kokkos::Threads> {
+ private:
+  using Policy    = Kokkos::RangePolicy<Traits...>;
+  using WorkTag   = typename Policy::work_tag;
+  using WorkRange = typename Policy::WorkRange;
+  using Member    = typename Policy::member_type;
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+
+  template <class TagType>
+  inline static std::enable_if_t<std::is_void<TagType>::value> exec_range(
+      const FunctorType &functor, const Member ibeg, const Member iend) {
+#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
+    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
+#pragma ivdep
+#endif
+    for (Member i = ibeg; i < iend; ++i) {
+      functor(i);
+    }
+  }
+
+  template <class TagType>
+  inline static std::enable_if_t<!std::is_void<TagType>::value> exec_range(
+      const FunctorType &functor, const Member ibeg, const Member iend) {
+    const TagType t{};
+#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
+    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
+#pragma ivdep
+#endif
+    for (Member i = ibeg; i < iend; ++i) {
+      functor(t, i);
+    }
+  }
+
+  static void exec(ThreadsExec &exec, const void *arg) {
+    exec_schedule<typename Policy::schedule_type::type>(exec, arg);
+  }
+
+  template <class Schedule>
+  static std::enable_if_t<std::is_same<Schedule, Kokkos::Static>::value>
+  exec_schedule(ThreadsExec &exec, const void *arg) {
+    const ParallelFor &self = *((const ParallelFor *)arg);
+
+    WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
+
+    ParallelFor::template exec_range<WorkTag>(self.m_functor, range.begin(),
+                                              range.end());
+
+    exec.fan_in();
+  }
+
+  template <class Schedule>
+  static std::enable_if_t<std::is_same<Schedule, Kokkos::Dynamic>::value>
+  exec_schedule(ThreadsExec &exec, const void *arg) {
+    const ParallelFor &self = *((const ParallelFor *)arg);
+
+    WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
+
+    exec.set_work_range(range.begin() - self.m_policy.begin(),
+                        range.end() - self.m_policy.begin(),
+                        self.m_policy.chunk_size());
+    exec.reset_steal_target();
+    exec.barrier();
+
+    long work_index = exec.get_work_index();
+
+    while (work_index != -1) {
+      const Member begin =
+          static_cast<Member>(work_index) * self.m_policy.chunk_size() +
+          self.m_policy.begin();
+      const Member end =
+          begin + self.m_policy.chunk_size() < self.m_policy.end()
+              ? begin + self.m_policy.chunk_size()
+              : self.m_policy.end();
+      ParallelFor::template exec_range<WorkTag>(self.m_functor, begin, end);
+      work_index = exec.get_work_index();
+    }
+
+    exec.fan_in();
+  }
+
+ public:
+  inline void execute() const {
+    ThreadsExec::start(&ParallelFor::exec, this);
+    ThreadsExec::fence();
+  }
+
+  ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy)
+      : m_functor(arg_functor), m_policy(arg_policy) {}
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..36404857a228beaff5208b9b5051be0b579a1dc0
--- /dev/null
+++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp
@@ -0,0 +1,118 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_THREADS_PARALLEL_FOR_TEAM_HPP
+#define KOKKOS_THREADS_PARALLEL_FOR_TEAM_HPP
+
+#include <Kokkos_Parallel.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+template <class FunctorType, class... Properties>
+class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
+                  Kokkos::Threads> {
+ private:
+  using Policy =
+      Kokkos::Impl::TeamPolicyInternal<Kokkos::Threads, Properties...>;
+  using WorkTag = typename Policy::work_tag;
+  using Member  = typename Policy::member_type;
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+  const size_t m_shared;
+
+  template <class TagType, class Schedule>
+  inline static std::enable_if_t<std::is_void<TagType>::value &&
+                                 std::is_same<Schedule, Kokkos::Static>::value>
+  exec_team(const FunctorType &functor, Member member) {
+    for (; member.valid_static(); member.next_static()) {
+      functor(member);
+    }
+  }
+
+  template <class TagType, class Schedule>
+  inline static std::enable_if_t<!std::is_void<TagType>::value &&
+                                 std::is_same<Schedule, Kokkos::Static>::value>
+  exec_team(const FunctorType &functor, Member member) {
+    const TagType t{};
+    for (; member.valid_static(); member.next_static()) {
+      functor(t, member);
+    }
+  }
+
+  template <class TagType, class Schedule>
+  inline static std::enable_if_t<std::is_void<TagType>::value &&
+                                 std::is_same<Schedule, Kokkos::Dynamic>::value>
+  exec_team(const FunctorType &functor, Member member) {
+    for (; member.valid_dynamic(); member.next_dynamic()) {
+      functor(member);
+    }
+  }
+
+  template <class TagType, class Schedule>
+  inline static std::enable_if_t<!std::is_void<TagType>::value &&
+                                 std::is_same<Schedule, Kokkos::Dynamic>::value>
+  exec_team(const FunctorType &functor, Member member) {
+    const TagType t{};
+    for (; member.valid_dynamic(); member.next_dynamic()) {
+      functor(t, member);
+    }
+  }
+
+  static void exec(ThreadsExec &exec, const void *arg) {
+    const ParallelFor &self = *((const ParallelFor *)arg);
+
+    ParallelFor::exec_team<WorkTag, typename Policy::schedule_type::type>(
+        self.m_functor, Member(&exec, self.m_policy, self.m_shared));
+
+    exec.barrier();
+    exec.fan_in();
+  }
+  template <typename Policy>
+  Policy fix_policy(Policy policy) {
+    if (policy.impl_vector_length() < 0) {
+      policy.impl_set_vector_length(1);
+    }
+    if (policy.team_size() < 0) {
+      policy.impl_set_team_size(
+          policy.team_size_recommended(m_functor, ParallelForTag{}));
+    }
+    return policy;
+  }
+
+ public:
+  inline void execute() const {
+    ThreadsExec::resize_scratch(
+        0, Policy::member_type::team_reduce_size() + m_shared);
+
+    ThreadsExec::start(&ParallelFor::exec, this);
+
+    ThreadsExec::fence();
+  }
+
+  ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy)
+      : m_functor(arg_functor),
+        m_policy(fix_policy(arg_policy)),
+        m_shared(m_policy.scratch_size(0) + m_policy.scratch_size(1) +
+                 FunctorTeamShmemSize<FunctorType>::value(
+                     arg_functor, m_policy.team_size())) {}
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3d06379480f72686dd5773789762fe23fc74d519
--- /dev/null
+++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp
@@ -0,0 +1,156 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_THREADS_PARALLEL_REDUCE_MDRANGE_HPP
+#define KOKKOS_THREADS_PARALLEL_REDUCE_MDRANGE_HPP
+
+#include <Kokkos_Parallel.hpp>
+
+#include <KokkosExp_MDRangePolicy.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+template <class CombinedFunctorReducerType, class... Traits>
+class ParallelReduce<CombinedFunctorReducerType,
+                     Kokkos::MDRangePolicy<Traits...>, Kokkos::Threads> {
+ private:
+  using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
+  using Policy        = typename MDRangePolicy::impl_range_policy;
+  using FunctorType   = typename CombinedFunctorReducerType::functor_type;
+  using ReducerType   = typename CombinedFunctorReducerType::reducer_type;
+
+  using WorkTag   = typename MDRangePolicy::work_tag;
+  using WorkRange = typename Policy::WorkRange;
+  using Member    = typename Policy::member_type;
+
+  using pointer_type   = typename ReducerType::pointer_type;
+  using value_type     = typename ReducerType::value_type;
+  using reference_type = typename ReducerType::reference_type;
+
+  using iterate_type = typename Kokkos::Impl::HostIterateTile<
+      MDRangePolicy, CombinedFunctorReducerType, WorkTag, reference_type>;
+
+  const iterate_type m_iter;
+  const pointer_type m_result_ptr;
+
+  inline void exec_range(const Member &ibeg, const Member &iend,
+                         reference_type update) const {
+    for (Member i = ibeg; i < iend; ++i) {
+      m_iter(i, update);
+    }
+  }
+
+  static void exec(ThreadsExec &exec, const void *arg) {
+    exec_schedule<typename Policy::schedule_type::type>(exec, arg);
+  }
+
+  template <class Schedule>
+  static std::enable_if_t<std::is_same<Schedule, Kokkos::Static>::value>
+  exec_schedule(ThreadsExec &exec, const void *arg) {
+    const ParallelReduce &self = *((const ParallelReduce *)arg);
+
+    const auto num_tiles = self.m_iter.m_rp.m_num_tiles;
+    const WorkRange range(Policy(0, num_tiles).set_chunk_size(1),
+                          exec.pool_rank(), exec.pool_size());
+
+    const ReducerType &reducer = self.m_iter.m_func.get_reducer();
+    self.exec_range(
+        range.begin(), range.end(),
+        reducer.init(static_cast<pointer_type>(exec.reduce_memory())));
+
+    exec.fan_in_reduce(reducer);
+  }
+
+  template <class Schedule>
+  static std::enable_if_t<std::is_same<Schedule, Kokkos::Dynamic>::value>
+  exec_schedule(ThreadsExec &exec, const void *arg) {
+    const ParallelReduce &self = *((const ParallelReduce *)arg);
+
+    const auto num_tiles = self.m_iter.m_rp.m_num_tiles;
+    const WorkRange range(Policy(0, num_tiles).set_chunk_size(1),
+                          exec.pool_rank(), exec.pool_size());
+
+    exec.set_work_range(range.begin(), range.end(), 1);
+    exec.reset_steal_target();
+    exec.barrier();
+
+    long work_index = exec.get_work_index();
+
+    const ReducerType &reducer = self.m_iter.m_func.get_reducer();
+    reference_type update =
+        self.m_reducer.init(static_cast<pointer_type>(exec.reduce_memory()));
+    while (work_index != -1) {
+      const Member begin = static_cast<Member>(work_index);
+      const Member end   = begin + 1 < num_tiles ? begin + 1 : num_tiles;
+      self.exec_range(begin, end, update);
+      work_index = exec.get_work_index();
+    }
+
+    exec.fan_in_reduce(self.m_reducer);
+  }
+
+ public:
+  inline void execute() const {
+    const ReducerType &reducer = m_iter.m_func.get_reducer();
+    ThreadsExec::resize_scratch(reducer.value_size(), 0);
+
+    ThreadsExec::start(&ParallelReduce::exec, this);
+
+    ThreadsExec::fence();
+
+    if (m_result_ptr) {
+      const pointer_type data =
+          (pointer_type)ThreadsExec::root_reduce_scratch();
+
+      const unsigned n = reducer.value_count();
+      for (unsigned i = 0; i < n; ++i) {
+        m_result_ptr[i] = data[i];
+      }
+    }
+  }
+
+  template <class ViewType>
+  ParallelReduce(const CombinedFunctorReducerType &arg_functor_reducer,
+                 const MDRangePolicy &arg_policy,
+                 const ViewType &arg_result_view)
+      : m_iter(arg_policy, arg_functor_reducer),
+        m_result_ptr(arg_result_view.data()) {
+    static_assert(Kokkos::is_view<ViewType>::value,
+                  "Kokkos::Threads reduce result must be a View");
+
+    static_assert(
+        Kokkos::Impl::MemorySpaceAccess<typename ViewType::memory_space,
+                                        Kokkos::HostSpace>::accessible,
+        "Kokkos::Threads reduce result must be a View accessible from "
+        "HostSpace");
+  }
+
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy &, const Functor &) {
+    /**
+     * 1024 here is just our guess for a reasonable max tile size,
+     * it isn't a hardware constraint. If people see a use for larger
+     * tile size products, we're happy to change this.
+     */
+    return 1024;
+  }
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..5fa97b403c4e767321bd2605649c3e9ec2636a48
--- /dev/null
+++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp
@@ -0,0 +1,171 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_THREADS_PARALLEL_REDUCE_RANGE_HPP
+#define KOKKOS_THREADS_PARALLEL_REDUCE_RANGE_HPP
+
+#include <Kokkos_Parallel.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+template <class CombinedFunctorReducerType, class... Traits>
+class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
+                     Kokkos::Threads> {
+ private:
+  using Policy      = Kokkos::RangePolicy<Traits...>;
+  using FunctorType = typename CombinedFunctorReducerType::functor_type;
+  using ReducerType = typename CombinedFunctorReducerType::reducer_type;
+
+  using WorkTag   = typename Policy::work_tag;
+  using WorkRange = typename Policy::WorkRange;
+  using Member    = typename Policy::member_type;
+
+  using pointer_type   = typename ReducerType::pointer_type;
+  using reference_type = typename ReducerType::reference_type;
+
+  const CombinedFunctorReducerType m_functor_reducer;
+  const Policy m_policy;
+  const pointer_type m_result_ptr;
+
+  template <class TagType>
+  inline static std::enable_if_t<std::is_void<TagType>::value> exec_range(
+      const FunctorType &functor, const Member &ibeg, const Member &iend,
+      reference_type update) {
+#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
+    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
+#pragma ivdep
+#endif
+    for (Member i = ibeg; i < iend; ++i) {
+      functor(i, update);
+    }
+  }
+
+  template <class TagType>
+  inline static std::enable_if_t<!std::is_void<TagType>::value> exec_range(
+      const FunctorType &functor, const Member &ibeg, const Member &iend,
+      reference_type update) {
+    const TagType t{};
+#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
+    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
+#pragma ivdep
+#endif
+    for (Member i = ibeg; i < iend; ++i) {
+      functor(t, i, update);
+    }
+  }
+
+  static void exec(ThreadsExec &exec, const void *arg) {
+    exec_schedule<typename Policy::schedule_type::type>(exec, arg);
+  }
+
+  template <class Schedule>
+  static std::enable_if_t<std::is_same<Schedule, Kokkos::Static>::value>
+  exec_schedule(ThreadsExec &exec, const void *arg) {
+    const ParallelReduce &self = *((const ParallelReduce *)arg);
+    const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
+
+    const ReducerType &reducer = self.m_functor_reducer.get_reducer();
+
+    ParallelReduce::template exec_range<WorkTag>(
+        self.m_functor_reducer.get_functor(), range.begin(), range.end(),
+        reducer.init(static_cast<pointer_type>(exec.reduce_memory())));
+
+    exec.fan_in_reduce(reducer);
+  }
+
+  template <class Schedule>
+  static std::enable_if_t<std::is_same<Schedule, Kokkos::Dynamic>::value>
+  exec_schedule(ThreadsExec &exec, const void *arg) {
+    const ParallelReduce &self = *((const ParallelReduce *)arg);
+    const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
+
+    exec.set_work_range(range.begin() - self.m_policy.begin(),
+                        range.end() - self.m_policy.begin(),
+                        self.m_policy.chunk_size());
+    exec.reset_steal_target();
+    exec.barrier();
+
+    long work_index            = exec.get_work_index();
+    const ReducerType &reducer = self.m_functor_reducer.get_reducer();
+
+    reference_type update =
+        reducer.init(static_cast<pointer_type>(exec.reduce_memory()));
+    while (work_index != -1) {
+      const Member begin =
+          static_cast<Member>(work_index) * self.m_policy.chunk_size() +
+          self.m_policy.begin();
+      const Member end =
+          begin + self.m_policy.chunk_size() < self.m_policy.end()
+              ? begin + self.m_policy.chunk_size()
+              : self.m_policy.end();
+      ParallelReduce::template exec_range<WorkTag>(
+          self.m_functor_reducer.get_functor(), begin, end, update);
+      work_index = exec.get_work_index();
+    }
+
+    exec.fan_in_reduce(reducer);
+  }
+
+ public:
+  inline void execute() const {
+    const ReducerType &reducer = m_functor_reducer.get_reducer();
+
+    if (m_policy.end() <= m_policy.begin()) {
+      if (m_result_ptr) {
+        reducer.init(m_result_ptr);
+        reducer.final(m_result_ptr);
+      }
+    } else {
+      ThreadsExec::resize_scratch(reducer.value_size(), 0);
+
+      ThreadsExec::start(&ParallelReduce::exec, this);
+
+      ThreadsExec::fence();
+
+      if (m_result_ptr) {
+        const pointer_type data =
+            (pointer_type)ThreadsExec::root_reduce_scratch();
+
+        const unsigned n = reducer.value_count();
+        for (unsigned i = 0; i < n; ++i) {
+          m_result_ptr[i] = data[i];
+        }
+      }
+    }
+  }
+
+  template <class ViewType>
+  ParallelReduce(const CombinedFunctorReducerType &arg_functor_reducer,
+                 const Policy &arg_policy, const ViewType &arg_result_view)
+      : m_functor_reducer(arg_functor_reducer),
+        m_policy(arg_policy),
+        m_result_ptr(arg_result_view.data()) {
+    static_assert(Kokkos::is_view<ViewType>::value,
+                  "Kokkos::Threads reduce result must be a View");
+
+    static_assert(
+        Kokkos::Impl::MemorySpaceAccess<typename ViewType::memory_space,
+                                        Kokkos::HostSpace>::accessible,
+        "Kokkos::Threads reduce result must be a View accessible from "
+        "HostSpace");
+  }
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c4b6100a9df2ac5d9ac9bfb799ce84489c1e577b
--- /dev/null
+++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp
@@ -0,0 +1,136 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_THREADS_PARALLEL_REDUCE_TEAM_HPP
+#define KOKKOS_THREADS_PARALLEL_REDUCE_TEAM_HPP
+
+#include <Kokkos_Parallel.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+template <class CombinedFunctorReducerType, class... Properties>
+class ParallelReduce<CombinedFunctorReducerType,
+                     Kokkos::TeamPolicy<Properties...>, Kokkos::Threads> {
+ private:
+  using Policy =
+      Kokkos::Impl::TeamPolicyInternal<Kokkos::Threads, Properties...>;
+  using FunctorType = typename CombinedFunctorReducerType::functor_type;
+  using ReducerType = typename CombinedFunctorReducerType::reducer_type;
+  using WorkTag     = typename Policy::work_tag;
+  using Member      = typename Policy::member_type;
+
+  using pointer_type   = typename ReducerType::pointer_type;
+  using reference_type = typename ReducerType::reference_type;
+
+  const CombinedFunctorReducerType m_functor_reducer;
+  const Policy m_policy;
+  const pointer_type m_result_ptr;
+  const size_t m_shared;
+
+  template <class TagType>
+  inline static std::enable_if_t<std::is_void<TagType>::value> exec_team(
+      const FunctorType &functor, Member member, reference_type update) {
+    for (; member.valid_static(); member.next_static()) {
+      functor(member, update);
+    }
+  }
+
+  template <class TagType>
+  inline static std::enable_if_t<!std::is_void<TagType>::value> exec_team(
+      const FunctorType &functor, Member member, reference_type update) {
+    const TagType t{};
+    for (; member.valid_static(); member.next_static()) {
+      functor(t, member, update);
+    }
+  }
+
+  static void exec(ThreadsExec &exec, const void *arg) {
+    const ParallelReduce &self = *((const ParallelReduce *)arg);
+
+    ParallelReduce::template exec_team<WorkTag>(
+        self.m_functor_reducer.get_functor(),
+        Member(&exec, self.m_policy, self.m_shared),
+        self.m_functor_reducer.get_reducer().init(
+            static_cast<pointer_type>(exec.reduce_memory())));
+
+    exec.fan_in_reduce(self.m_functor_reducer.get_reducer());
+  }
+
+ public:
+  inline void execute() const {
+    const ReducerType &reducer = m_functor_reducer.get_reducer();
+
+    if (m_policy.league_size() * m_policy.team_size() == 0) {
+      if (m_result_ptr) {
+        reducer.init(m_result_ptr);
+        reducer.final(m_result_ptr);
+      }
+    } else {
+      ThreadsExec::resize_scratch(
+          reducer.value_size(),
+          Policy::member_type::team_reduce_size() + m_shared);
+
+      ThreadsExec::start(&ParallelReduce::exec, this);
+
+      ThreadsExec::fence();
+
+      if (m_result_ptr) {
+        const pointer_type data =
+            (pointer_type)ThreadsExec::root_reduce_scratch();
+
+        const unsigned n = reducer.value_count();
+        for (unsigned i = 0; i < n; ++i) {
+          m_result_ptr[i] = data[i];
+        }
+      }
+    }
+  }
+
+  template <typename Policy>
+  Policy fix_policy(Policy policy) {
+    if (policy.impl_vector_length() < 0) {
+      policy.impl_set_vector_length(1);
+    }
+    if (policy.team_size() < 0) {
+      policy.impl_set_team_size(policy.team_size_recommended(
+          m_functor_reducer.get_functor(), m_functor_reducer.get_reducer(),
+          ParallelReduceTag{}));
+    }
+    return policy;
+  }
+
+  template <class ViewType>
+  inline ParallelReduce(const CombinedFunctorReducerType &arg_functor_reducer,
+                        const Policy &arg_policy, const ViewType &arg_result)
+      : m_functor_reducer(arg_functor_reducer),
+        m_policy(fix_policy(arg_policy)),
+        m_result_ptr(arg_result.data()),
+        m_shared(m_policy.scratch_size(0) + m_policy.scratch_size(1) +
+                 FunctorTeamShmemSize<FunctorType>::value(
+                     arg_functor_reducer.get_functor(), m_policy.team_size())) {
+    static_assert(
+        Kokkos::Impl::MemorySpaceAccess<typename ViewType::memory_space,
+                                        Kokkos::HostSpace>::accessible,
+        "Kokkos::Threads reduce result must be a View accessible from "
+        "HostSpace");
+  }
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..74d8561a34b7addf740a8d02d03f7f04a12aca25
--- /dev/null
+++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp
@@ -0,0 +1,198 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_THREADS_PARALLEL_SCAN_RANGE_HPP
+#define KOKKOS_THREADS_PARALLEL_SCAN_RANGE_HPP
+
+#include <Kokkos_Parallel.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+template <class FunctorType, class... Traits>
+class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
+                   Kokkos::Threads> {
+ private:
+  using Policy    = Kokkos::RangePolicy<Traits...>;
+  using WorkRange = typename Policy::WorkRange;
+  using WorkTag   = typename Policy::work_tag;
+  using Member    = typename Policy::member_type;
+  using Analysis  = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
+                                         Policy, FunctorType, void>;
+  using pointer_type   = typename Analysis::pointer_type;
+  using reference_type = typename Analysis::reference_type;
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+
+  template <class TagType>
+  inline static std::enable_if_t<std::is_void<TagType>::value> exec_range(
+      const FunctorType &functor, const Member &ibeg, const Member &iend,
+      reference_type update, const bool final) {
+#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
+    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
+#pragma ivdep
+#endif
+    for (Member i = ibeg; i < iend; ++i) {
+      functor(i, update, final);
+    }
+  }
+
+  template <class TagType>
+  inline static std::enable_if_t<!std::is_void<TagType>::value> exec_range(
+      const FunctorType &functor, const Member &ibeg, const Member &iend,
+      reference_type update, const bool final) {
+    const TagType t{};
+#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
+    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
+#pragma ivdep
+#endif
+    for (Member i = ibeg; i < iend; ++i) {
+      functor(t, i, update, final);
+    }
+  }
+
+  static void exec(ThreadsExec &exec, const void *arg) {
+    const ParallelScan &self = *((const ParallelScan *)arg);
+
+    const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
+
+    typename Analysis::Reducer final_reducer(self.m_functor);
+
+    reference_type update =
+        final_reducer.init(static_cast<pointer_type>(exec.reduce_memory()));
+
+    ParallelScan::template exec_range<WorkTag>(self.m_functor, range.begin(),
+                                               range.end(), update, false);
+
+    //  exec.template scan_large( final_reducer );
+    exec.scan_small(final_reducer);
+
+    ParallelScan::template exec_range<WorkTag>(self.m_functor, range.begin(),
+                                               range.end(), update, true);
+
+    exec.fan_in();
+  }
+
+ public:
+  inline void execute() const {
+    ThreadsExec::resize_scratch(2 * Analysis::value_size(m_functor), 0);
+    ThreadsExec::start(&ParallelScan::exec, this);
+    ThreadsExec::fence();
+  }
+
+  ParallelScan(const FunctorType &arg_functor, const Policy &arg_policy)
+      : m_functor(arg_functor), m_policy(arg_policy) {}
+};
+
+template <class FunctorType, class ReturnType, class... Traits>
+class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
+                            ReturnType, Kokkos::Threads> {
+ private:
+  using Policy    = Kokkos::RangePolicy<Traits...>;
+  using WorkRange = typename Policy::WorkRange;
+  using WorkTag   = typename Policy::work_tag;
+  using Member    = typename Policy::member_type;
+
+  using Analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
+                                         Policy, FunctorType, ReturnType>;
+
+  using value_type     = typename Analysis::value_type;
+  using pointer_type   = typename Analysis::pointer_type;
+  using reference_type = typename Analysis::reference_type;
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+  const pointer_type m_result_ptr;
+
+  template <class TagType>
+  inline static std::enable_if_t<std::is_void<TagType>::value> exec_range(
+      const FunctorType &functor, const Member &ibeg, const Member &iend,
+      reference_type update, const bool final) {
+#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
+    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
+#pragma ivdep
+#endif
+    for (Member i = ibeg; i < iend; ++i) {
+      functor(i, update, final);
+    }
+  }
+
+  template <class TagType>
+  inline static std::enable_if_t<!std::is_void<TagType>::value> exec_range(
+      const FunctorType &functor, const Member &ibeg, const Member &iend,
+      reference_type update, const bool final) {
+    const TagType t{};
+#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
+    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
+#pragma ivdep
+#endif
+    for (Member i = ibeg; i < iend; ++i) {
+      functor(t, i, update, final);
+    }
+  }
+
+  static void exec(ThreadsExec &exec, const void *arg) {
+    const ParallelScanWithTotal &self = *((const ParallelScanWithTotal *)arg);
+
+    const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
+
+    typename Analysis::Reducer final_reducer(self.m_functor);
+
+    reference_type update =
+        final_reducer.init(static_cast<pointer_type>(exec.reduce_memory()));
+
+    ParallelScanWithTotal::template exec_range<WorkTag>(
+        self.m_functor, range.begin(), range.end(), update, false);
+
+    //  exec.template scan_large(final_reducer);
+    exec.scan_small(final_reducer);
+
+    ParallelScanWithTotal::template exec_range<WorkTag>(
+        self.m_functor, range.begin(), range.end(), update, true);
+
+    exec.fan_in();
+
+    if (exec.pool_rank() == exec.pool_size() - 1) {
+      *self.m_result_ptr = update;
+    }
+  }
+
+ public:
+  inline void execute() const {
+    ThreadsExec::resize_scratch(2 * Analysis::value_size(m_functor), 0);
+    ThreadsExec::start(&ParallelScanWithTotal::exec, this);
+    ThreadsExec::fence();
+  }
+
+  template <class ViewType>
+  ParallelScanWithTotal(const FunctorType &arg_functor,
+                        const Policy &arg_policy,
+                        const ViewType &arg_result_view)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_result_ptr(arg_result_view.data()) {
+    static_assert(
+        Kokkos::Impl::MemorySpaceAccess<typename ViewType::memory_space,
+                                        Kokkos::HostSpace>::accessible,
+        "Kokkos::Threads parallel_scan result must be host-accessible!");
+  }
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp
deleted file mode 100644
index 35392e3bfb054004ef3f01239e84b15362bc8cfe..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp
+++ /dev/null
@@ -1,271 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#ifndef KOKKOS_THREADS_PARALLEL_MDRANGE_HPP
-#define KOKKOS_THREADS_PARALLEL_MDRANGE_HPP
-
-#include <Kokkos_Parallel.hpp>
-
-#include <KokkosExp_MDRangePolicy.hpp>
-
-namespace Kokkos {
-namespace Impl {
-
-template <class FunctorType, class... Traits>
-class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
-                  Kokkos::Threads> {
- private:
-  using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
-  using Policy        = typename MDRangePolicy::impl_range_policy;
-
-  using WorkTag = typename MDRangePolicy::work_tag;
-
-  using WorkRange = typename Policy::WorkRange;
-  using Member    = typename Policy::member_type;
-
-  using iterate_type = typename Kokkos::Impl::HostIterateTile<
-      MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void>;
-
-  const iterate_type m_iter;
-
-  inline void exec_range(const Member ibeg, const Member iend) const {
-    for (Member i = ibeg; i < iend; ++i) {
-      m_iter(i);
-    }
-  }
-
-  static void exec(ThreadsExec &exec, const void *arg) {
-    exec_schedule<typename Policy::schedule_type::type>(exec, arg);
-  }
-
-  template <class Schedule>
-  static std::enable_if_t<std::is_same<Schedule, Kokkos::Static>::value>
-  exec_schedule(ThreadsExec &exec, const void *arg) {
-    const ParallelFor &self = *((const ParallelFor *)arg);
-
-    auto const num_tiles = self.m_iter.m_rp.m_num_tiles;
-    WorkRange range(Policy(0, num_tiles).set_chunk_size(1), exec.pool_rank(),
-                    exec.pool_size());
-
-    self.exec_range(range.begin(), range.end());
-
-    exec.fan_in();
-  }
-
-  template <class Schedule>
-  static std::enable_if_t<std::is_same<Schedule, Kokkos::Dynamic>::value>
-  exec_schedule(ThreadsExec &exec, const void *arg) {
-    const ParallelFor &self = *((const ParallelFor *)arg);
-
-    auto const num_tiles = self.m_iter.m_rp.m_num_tiles;
-    WorkRange range(Policy(0, num_tiles).set_chunk_size(1), exec.pool_rank(),
-                    exec.pool_size());
-
-    exec.set_work_range(range.begin(), range.end(), 1);
-    exec.reset_steal_target();
-    exec.barrier();
-
-    long work_index = exec.get_work_index();
-
-    while (work_index != -1) {
-      const Member begin = static_cast<Member>(work_index);
-      const Member end   = begin + 1 < num_tiles ? begin + 1 : num_tiles;
-
-      self.exec_range(begin, end);
-      work_index = exec.get_work_index();
-    }
-
-    exec.fan_in();
-  }
-
- public:
-  inline void execute() const {
-    ThreadsExec::start(&ParallelFor::exec, this);
-    ThreadsExec::fence();
-  }
-
-  ParallelFor(const FunctorType &arg_functor, const MDRangePolicy &arg_policy)
-      : m_iter(arg_policy, arg_functor) {}
-
-  template <typename Policy, typename Functor>
-  static int max_tile_size_product(const Policy &, const Functor &) {
-    /**
-     * 1024 here is just our guess for a reasonable max tile size,
-     * it isn't a hardware constraint. If people see a use for larger
-     * tile size products, we're happy to change this.
-     */
-    return 1024;
-  }
-};
-
-template <class FunctorType, class ReducerType, class... Traits>
-class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
-                     Kokkos::Threads> {
- private:
-  using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
-  using Policy        = typename MDRangePolicy::impl_range_policy;
-
-  using WorkTag   = typename MDRangePolicy::work_tag;
-  using WorkRange = typename Policy::WorkRange;
-  using Member    = typename Policy::member_type;
-
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using WorkTagFwd =
-      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                  WorkTag, void>::type;
-
-  using Analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
-                                         MDRangePolicy, ReducerTypeFwd>;
-  using pointer_type   = typename Analysis::pointer_type;
-  using value_type     = typename Analysis::value_type;
-  using reference_type = typename Analysis::reference_type;
-
-  using iterate_type =
-      typename Kokkos::Impl::HostIterateTile<MDRangePolicy, FunctorType,
-                                             WorkTag, reference_type>;
-
-  const iterate_type m_iter;
-  const ReducerType m_reducer;
-  const pointer_type m_result_ptr;
-
-  inline void exec_range(const Member &ibeg, const Member &iend,
-                         reference_type update) const {
-    for (Member i = ibeg; i < iend; ++i) {
-      m_iter(i, update);
-    }
-  }
-
-  static void exec(ThreadsExec &exec, const void *arg) {
-    exec_schedule<typename Policy::schedule_type::type>(exec, arg);
-  }
-
-  template <class Schedule>
-  static std::enable_if_t<std::is_same<Schedule, Kokkos::Static>::value>
-  exec_schedule(ThreadsExec &exec, const void *arg) {
-    const ParallelReduce &self = *((const ParallelReduce *)arg);
-
-    const auto num_tiles = self.m_iter.m_rp.m_num_tiles;
-    const WorkRange range(Policy(0, num_tiles).set_chunk_size(1),
-                          exec.pool_rank(), exec.pool_size());
-
-    typename Analysis::Reducer reducer(
-        &ReducerConditional::select(self.m_iter.m_func, self.m_reducer));
-
-    self.exec_range(
-        range.begin(), range.end(),
-        reducer.init(static_cast<pointer_type>(exec.reduce_memory())));
-
-    exec.fan_in_reduce(reducer);
-  }
-
-  template <class Schedule>
-  static std::enable_if_t<std::is_same<Schedule, Kokkos::Dynamic>::value>
-  exec_schedule(ThreadsExec &exec, const void *arg) {
-    const ParallelReduce &self = *((const ParallelReduce *)arg);
-
-    const auto num_tiles = self.m_iter.m_rp.m_num_tiles;
-    const WorkRange range(Policy(0, num_tiles).set_chunk_size(1),
-                          exec.pool_rank(), exec.pool_size());
-
-    exec.set_work_range(range.begin(), range.end(), 1);
-    exec.reset_steal_target();
-    exec.barrier();
-
-    long work_index = exec.get_work_index();
-    typename Analysis::Reducer reducer(
-        &ReducerConditional::select(self.m_iter.m_func, self.m_reducer));
-
-    reference_type update =
-        reducer.init(static_cast<pointer_type>(exec.reduce_memory()));
-    while (work_index != -1) {
-      const Member begin = static_cast<Member>(work_index);
-      const Member end   = begin + 1 < num_tiles ? begin + 1 : num_tiles;
-      self.exec_range(begin, end, update);
-      work_index = exec.get_work_index();
-    }
-
-    exec.fan_in_reduce(reducer);
-  }
-
- public:
-  inline void execute() const {
-    ThreadsExec::resize_scratch(Analysis::value_size(ReducerConditional::select(
-                                    m_iter.m_func, m_reducer)),
-                                0);
-
-    ThreadsExec::start(&ParallelReduce::exec, this);
-
-    ThreadsExec::fence();
-
-    if (m_result_ptr) {
-      const pointer_type data =
-          (pointer_type)ThreadsExec::root_reduce_scratch();
-
-      const unsigned n = Analysis::value_count(
-          ReducerConditional::select(m_iter.m_func, m_reducer));
-      for (unsigned i = 0; i < n; ++i) {
-        m_result_ptr[i] = data[i];
-      }
-    }
-  }
-
-  template <class HostViewType>
-  ParallelReduce(const FunctorType &arg_functor,
-                 const MDRangePolicy &arg_policy,
-                 const HostViewType &arg_result_view,
-                 std::enable_if_t<Kokkos::is_view<HostViewType>::value &&
-                                      !Kokkos::is_reducer<ReducerType>::value,
-                                  void *> = nullptr)
-      : m_iter(arg_policy, arg_functor),
-        m_reducer(InvalidType()),
-        m_result_ptr(arg_result_view.data()) {
-    static_assert(Kokkos::is_view<HostViewType>::value,
-                  "Kokkos::Threads reduce result must be a View");
-
-    static_assert(
-        std::is_same<typename HostViewType::memory_space, HostSpace>::value,
-        "Kokkos::Threads reduce result must be a View in HostSpace");
-  }
-
-  inline ParallelReduce(const FunctorType &arg_functor,
-                        MDRangePolicy arg_policy, const ReducerType &reducer)
-      : m_iter(arg_policy, arg_functor),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()) {
-    /*static_assert( std::is_same< typename ViewType::memory_space
-                                    , Kokkos::HostSpace >::value
-      , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
-      );*/
-  }
-
-  template <typename Policy, typename Functor>
-  static int max_tile_size_product(const Policy &, const Functor &) {
-    /**
-     * 1024 here is just our guess for a reasonable max tile size,
-     * it isn't a hardware constraint. If people see a use for larger
-     * tile size products, we're happy to change this.
-     */
-    return 1024;
-  }
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-#endif
diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel_Range.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel_Range.hpp
deleted file mode 100644
index 7d3527facdf174b40513228a117656fab64a67f8..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel_Range.hpp
+++ /dev/null
@@ -1,465 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#ifndef KOKKOS_THREADS_PARALLEL_RANGE_HPP
-#define KOKKOS_THREADS_PARALLEL_RANGE_HPP
-
-#include <Kokkos_Parallel.hpp>
-
-namespace Kokkos {
-namespace Impl {
-
-template <class FunctorType, class... Traits>
-class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
-                  Kokkos::Threads> {
- private:
-  using Policy    = Kokkos::RangePolicy<Traits...>;
-  using WorkTag   = typename Policy::work_tag;
-  using WorkRange = typename Policy::WorkRange;
-  using Member    = typename Policy::member_type;
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-
-  template <class TagType>
-  inline static std::enable_if_t<std::is_void<TagType>::value> exec_range(
-      const FunctorType &functor, const Member ibeg, const Member iend) {
-#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
-    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
-#pragma ivdep
-#endif
-    for (Member i = ibeg; i < iend; ++i) {
-      functor(i);
-    }
-  }
-
-  template <class TagType>
-  inline static std::enable_if_t<!std::is_void<TagType>::value> exec_range(
-      const FunctorType &functor, const Member ibeg, const Member iend) {
-    const TagType t{};
-#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
-    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
-#pragma ivdep
-#endif
-    for (Member i = ibeg; i < iend; ++i) {
-      functor(t, i);
-    }
-  }
-
-  static void exec(ThreadsExec &exec, const void *arg) {
-    exec_schedule<typename Policy::schedule_type::type>(exec, arg);
-  }
-
-  template <class Schedule>
-  static std::enable_if_t<std::is_same<Schedule, Kokkos::Static>::value>
-  exec_schedule(ThreadsExec &exec, const void *arg) {
-    const ParallelFor &self = *((const ParallelFor *)arg);
-
-    WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
-
-    ParallelFor::template exec_range<WorkTag>(self.m_functor, range.begin(),
-                                              range.end());
-
-    exec.fan_in();
-  }
-
-  template <class Schedule>
-  static std::enable_if_t<std::is_same<Schedule, Kokkos::Dynamic>::value>
-  exec_schedule(ThreadsExec &exec, const void *arg) {
-    const ParallelFor &self = *((const ParallelFor *)arg);
-
-    WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
-
-    exec.set_work_range(range.begin() - self.m_policy.begin(),
-                        range.end() - self.m_policy.begin(),
-                        self.m_policy.chunk_size());
-    exec.reset_steal_target();
-    exec.barrier();
-
-    long work_index = exec.get_work_index();
-
-    while (work_index != -1) {
-      const Member begin =
-          static_cast<Member>(work_index) * self.m_policy.chunk_size() +
-          self.m_policy.begin();
-      const Member end =
-          begin + self.m_policy.chunk_size() < self.m_policy.end()
-              ? begin + self.m_policy.chunk_size()
-              : self.m_policy.end();
-      ParallelFor::template exec_range<WorkTag>(self.m_functor, begin, end);
-      work_index = exec.get_work_index();
-    }
-
-    exec.fan_in();
-  }
-
- public:
-  inline void execute() const {
-    ThreadsExec::start(&ParallelFor::exec, this);
-    ThreadsExec::fence();
-  }
-
-  ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy)
-      : m_functor(arg_functor), m_policy(arg_policy) {}
-};
-
-template <class FunctorType, class ReducerType, class... Traits>
-class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
-                     Kokkos::Threads> {
- private:
-  using Policy = Kokkos::RangePolicy<Traits...>;
-
-  using WorkTag   = typename Policy::work_tag;
-  using WorkRange = typename Policy::WorkRange;
-  using Member    = typename Policy::member_type;
-
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using WorkTagFwd =
-      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                  WorkTag, void>::type;
-
-  using Analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
-                                         Policy, ReducerTypeFwd>;
-
-  using pointer_type   = typename Analysis::pointer_type;
-  using reference_type = typename Analysis::reference_type;
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-  const ReducerType m_reducer;
-  const pointer_type m_result_ptr;
-
-  template <class TagType>
-  inline static std::enable_if_t<std::is_void<TagType>::value> exec_range(
-      const FunctorType &functor, const Member &ibeg, const Member &iend,
-      reference_type update) {
-#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
-    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
-#pragma ivdep
-#endif
-    for (Member i = ibeg; i < iend; ++i) {
-      functor(i, update);
-    }
-  }
-
-  template <class TagType>
-  inline static std::enable_if_t<!std::is_void<TagType>::value> exec_range(
-      const FunctorType &functor, const Member &ibeg, const Member &iend,
-      reference_type update) {
-    const TagType t{};
-#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
-    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
-#pragma ivdep
-#endif
-    for (Member i = ibeg; i < iend; ++i) {
-      functor(t, i, update);
-    }
-  }
-
-  static void exec(ThreadsExec &exec, const void *arg) {
-    exec_schedule<typename Policy::schedule_type::type>(exec, arg);
-  }
-
-  template <class Schedule>
-  static std::enable_if_t<std::is_same<Schedule, Kokkos::Static>::value>
-  exec_schedule(ThreadsExec &exec, const void *arg) {
-    const ParallelReduce &self = *((const ParallelReduce *)arg);
-    const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
-
-    typename Analysis::Reducer reducer(
-        &ReducerConditional::select(self.m_functor, self.m_reducer));
-
-    ParallelReduce::template exec_range<WorkTag>(
-        self.m_functor, range.begin(), range.end(),
-        reducer.init(static_cast<pointer_type>(exec.reduce_memory())));
-
-    exec.fan_in_reduce(reducer);
-  }
-
-  template <class Schedule>
-  static std::enable_if_t<std::is_same<Schedule, Kokkos::Dynamic>::value>
-  exec_schedule(ThreadsExec &exec, const void *arg) {
-    const ParallelReduce &self = *((const ParallelReduce *)arg);
-    const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
-
-    exec.set_work_range(range.begin() - self.m_policy.begin(),
-                        range.end() - self.m_policy.begin(),
-                        self.m_policy.chunk_size());
-    exec.reset_steal_target();
-    exec.barrier();
-
-    long work_index = exec.get_work_index();
-    typename Analysis::Reducer reducer(
-        &ReducerConditional::select(self.m_functor, self.m_reducer));
-
-    reference_type update =
-        reducer.init(static_cast<pointer_type>(exec.reduce_memory()));
-    while (work_index != -1) {
-      const Member begin =
-          static_cast<Member>(work_index) * self.m_policy.chunk_size() +
-          self.m_policy.begin();
-      const Member end =
-          begin + self.m_policy.chunk_size() < self.m_policy.end()
-              ? begin + self.m_policy.chunk_size()
-              : self.m_policy.end();
-      ParallelReduce::template exec_range<WorkTag>(self.m_functor, begin, end,
-                                                   update);
-      work_index = exec.get_work_index();
-    }
-
-    exec.fan_in_reduce(reducer);
-  }
-
- public:
-  inline void execute() const {
-    if (m_policy.end() <= m_policy.begin()) {
-      if (m_result_ptr) {
-        typename Analysis::Reducer final_reducer(
-            &ReducerConditional::select(m_functor, m_reducer));
-        final_reducer.init(m_result_ptr);
-        final_reducer.final(m_result_ptr);
-      }
-    } else {
-      ThreadsExec::resize_scratch(
-          Analysis::value_size(
-              ReducerConditional::select(m_functor, m_reducer)),
-          0);
-
-      ThreadsExec::start(&ParallelReduce::exec, this);
-
-      ThreadsExec::fence();
-
-      if (m_result_ptr) {
-        const pointer_type data =
-            (pointer_type)ThreadsExec::root_reduce_scratch();
-
-        const unsigned n = Analysis::value_count(
-            ReducerConditional::select(m_functor, m_reducer));
-        for (unsigned i = 0; i < n; ++i) {
-          m_result_ptr[i] = data[i];
-        }
-      }
-    }
-  }
-
-  template <class HostViewType>
-  ParallelReduce(const FunctorType &arg_functor, const Policy &arg_policy,
-                 const HostViewType &arg_result_view,
-                 std::enable_if_t<Kokkos::is_view<HostViewType>::value &&
-                                      !Kokkos::is_reducer<ReducerType>::value,
-                                  void *> = nullptr)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(InvalidType()),
-        m_result_ptr(arg_result_view.data()) {
-    static_assert(Kokkos::is_view<HostViewType>::value,
-                  "Kokkos::Threads reduce result must be a View");
-
-    static_assert(
-        std::is_same<typename HostViewType::memory_space, HostSpace>::value,
-        "Kokkos::Threads reduce result must be a View in HostSpace");
-  }
-
-  inline ParallelReduce(const FunctorType &arg_functor, Policy arg_policy,
-                        const ReducerType &reducer)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()) {
-    /*static_assert( std::is_same< typename ViewType::memory_space
-                                    , Kokkos::HostSpace >::value
-      , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
-      );*/
-  }
-};
-
-template <class FunctorType, class... Traits>
-class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
-                   Kokkos::Threads> {
- private:
-  using Policy    = Kokkos::RangePolicy<Traits...>;
-  using WorkRange = typename Policy::WorkRange;
-  using WorkTag   = typename Policy::work_tag;
-  using Member    = typename Policy::member_type;
-  using Analysis  = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
-                                         Policy, FunctorType>;
-  using pointer_type   = typename Analysis::pointer_type;
-  using reference_type = typename Analysis::reference_type;
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-
-  template <class TagType>
-  inline static std::enable_if_t<std::is_void<TagType>::value> exec_range(
-      const FunctorType &functor, const Member &ibeg, const Member &iend,
-      reference_type update, const bool final) {
-#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
-    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
-#pragma ivdep
-#endif
-    for (Member i = ibeg; i < iend; ++i) {
-      functor(i, update, final);
-    }
-  }
-
-  template <class TagType>
-  inline static std::enable_if_t<!std::is_void<TagType>::value> exec_range(
-      const FunctorType &functor, const Member &ibeg, const Member &iend,
-      reference_type update, const bool final) {
-    const TagType t{};
-#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
-    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
-#pragma ivdep
-#endif
-    for (Member i = ibeg; i < iend; ++i) {
-      functor(t, i, update, final);
-    }
-  }
-
-  static void exec(ThreadsExec &exec, const void *arg) {
-    const ParallelScan &self = *((const ParallelScan *)arg);
-
-    const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
-
-    typename Analysis::Reducer final_reducer(&self.m_functor);
-
-    reference_type update =
-        final_reducer.init(static_cast<pointer_type>(exec.reduce_memory()));
-
-    ParallelScan::template exec_range<WorkTag>(self.m_functor, range.begin(),
-                                               range.end(), update, false);
-
-    //  exec.template scan_large( final_reducer );
-    exec.scan_small(final_reducer);
-
-    ParallelScan::template exec_range<WorkTag>(self.m_functor, range.begin(),
-                                               range.end(), update, true);
-
-    exec.fan_in();
-  }
-
- public:
-  inline void execute() const {
-    ThreadsExec::resize_scratch(2 * Analysis::value_size(m_functor), 0);
-    ThreadsExec::start(&ParallelScan::exec, this);
-    ThreadsExec::fence();
-  }
-
-  ParallelScan(const FunctorType &arg_functor, const Policy &arg_policy)
-      : m_functor(arg_functor), m_policy(arg_policy) {}
-};
-
-template <class FunctorType, class ReturnType, class... Traits>
-class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
-                            ReturnType, Kokkos::Threads> {
- private:
-  using Policy    = Kokkos::RangePolicy<Traits...>;
-  using WorkRange = typename Policy::WorkRange;
-  using WorkTag   = typename Policy::work_tag;
-  using Member    = typename Policy::member_type;
-
-  using Analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
-                                         Policy, FunctorType>;
-
-  using value_type     = typename Analysis::value_type;
-  using pointer_type   = typename Analysis::pointer_type;
-  using reference_type = typename Analysis::reference_type;
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-  const pointer_type m_result_ptr;
-
-  template <class TagType>
-  inline static std::enable_if_t<std::is_void<TagType>::value> exec_range(
-      const FunctorType &functor, const Member &ibeg, const Member &iend,
-      reference_type update, const bool final) {
-#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
-    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
-#pragma ivdep
-#endif
-    for (Member i = ibeg; i < iend; ++i) {
-      functor(i, update, final);
-    }
-  }
-
-  template <class TagType>
-  inline static std::enable_if_t<!std::is_void<TagType>::value> exec_range(
-      const FunctorType &functor, const Member &ibeg, const Member &iend,
-      reference_type update, const bool final) {
-    const TagType t{};
-#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
-    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
-#pragma ivdep
-#endif
-    for (Member i = ibeg; i < iend; ++i) {
-      functor(t, i, update, final);
-    }
-  }
-
-  static void exec(ThreadsExec &exec, const void *arg) {
-    const ParallelScanWithTotal &self = *((const ParallelScanWithTotal *)arg);
-
-    const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
-
-    typename Analysis::Reducer final_reducer(&self.m_functor);
-
-    reference_type update =
-        final_reducer.init(static_cast<pointer_type>(exec.reduce_memory()));
-
-    ParallelScanWithTotal::template exec_range<WorkTag>(
-        self.m_functor, range.begin(), range.end(), update, false);
-
-    //  exec.template scan_large(final_reducer);
-    exec.scan_small(final_reducer);
-
-    ParallelScanWithTotal::template exec_range<WorkTag>(
-        self.m_functor, range.begin(), range.end(), update, true);
-
-    exec.fan_in();
-
-    if (exec.pool_rank() == exec.pool_size() - 1) {
-      *self.m_result_ptr = update;
-    }
-  }
-
- public:
-  inline void execute() const {
-    ThreadsExec::resize_scratch(2 * Analysis::value_size(m_functor), 0);
-    ThreadsExec::start(&ParallelScanWithTotal::exec, this);
-    ThreadsExec::fence();
-  }
-
-  template <class ViewType>
-  ParallelScanWithTotal(const FunctorType &arg_functor,
-                        const Policy &arg_policy,
-                        const ViewType &arg_result_view)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_result_ptr(arg_result_view.data()) {
-    static_assert(
-        Kokkos::Impl::MemorySpaceAccess<typename ViewType::memory_space,
-                                        Kokkos::HostSpace>::accessible,
-        "Kokkos::Threads parallel_scan result must be host-accessible!");
-  }
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-#endif
diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel_Team.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel_Team.hpp
deleted file mode 100644
index a602078c5229eb5323d1be14d2f8936ecef2a32b..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel_Team.hpp
+++ /dev/null
@@ -1,251 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#ifndef KOKKOS_THREADS_PARALLEL_TEAM_HPP
-#define KOKKOS_THREADS_PARALLEL_TEAM_HPP
-
-#include <Kokkos_Parallel.hpp>
-
-namespace Kokkos {
-namespace Impl {
-
-template <class FunctorType, class... Properties>
-class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
-                  Kokkos::Threads> {
- private:
-  using Policy =
-      Kokkos::Impl::TeamPolicyInternal<Kokkos::Threads, Properties...>;
-  using WorkTag = typename Policy::work_tag;
-  using Member  = typename Policy::member_type;
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-  const size_t m_shared;
-
-  template <class TagType, class Schedule>
-  inline static std::enable_if_t<std::is_void<TagType>::value &&
-                                 std::is_same<Schedule, Kokkos::Static>::value>
-  exec_team(const FunctorType &functor, Member member) {
-    for (; member.valid_static(); member.next_static()) {
-      functor(member);
-    }
-  }
-
-  template <class TagType, class Schedule>
-  inline static std::enable_if_t<!std::is_void<TagType>::value &&
-                                 std::is_same<Schedule, Kokkos::Static>::value>
-  exec_team(const FunctorType &functor, Member member) {
-    const TagType t{};
-    for (; member.valid_static(); member.next_static()) {
-      functor(t, member);
-    }
-  }
-
-  template <class TagType, class Schedule>
-  inline static std::enable_if_t<std::is_void<TagType>::value &&
-                                 std::is_same<Schedule, Kokkos::Dynamic>::value>
-  exec_team(const FunctorType &functor, Member member) {
-    for (; member.valid_dynamic(); member.next_dynamic()) {
-      functor(member);
-    }
-  }
-
-  template <class TagType, class Schedule>
-  inline static std::enable_if_t<!std::is_void<TagType>::value &&
-                                 std::is_same<Schedule, Kokkos::Dynamic>::value>
-  exec_team(const FunctorType &functor, Member member) {
-    const TagType t{};
-    for (; member.valid_dynamic(); member.next_dynamic()) {
-      functor(t, member);
-    }
-  }
-
-  static void exec(ThreadsExec &exec, const void *arg) {
-    const ParallelFor &self = *((const ParallelFor *)arg);
-
-    ParallelFor::exec_team<WorkTag, typename Policy::schedule_type::type>(
-        self.m_functor, Member(&exec, self.m_policy, self.m_shared));
-
-    exec.barrier();
-    exec.fan_in();
-  }
-  template <typename Policy>
-  Policy fix_policy(Policy policy) {
-    if (policy.impl_vector_length() < 0) {
-      policy.impl_set_vector_length(1);
-    }
-    if (policy.team_size() < 0) {
-      policy.impl_set_team_size(
-          policy.team_size_recommended(m_functor, ParallelForTag{}));
-    }
-    return policy;
-  }
-
- public:
-  inline void execute() const {
-    ThreadsExec::resize_scratch(
-        0, Policy::member_type::team_reduce_size() + m_shared);
-
-    ThreadsExec::start(&ParallelFor::exec, this);
-
-    ThreadsExec::fence();
-  }
-
-  ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy)
-      : m_functor(arg_functor),
-        m_policy(fix_policy(arg_policy)),
-        m_shared(m_policy.scratch_size(0) + m_policy.scratch_size(1) +
-                 FunctorTeamShmemSize<FunctorType>::value(
-                     arg_functor, m_policy.team_size())) {}
-};
-
-template <class FunctorType, class ReducerType, class... Properties>
-class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
-                     ReducerType, Kokkos::Threads> {
- private:
-  using Policy =
-      Kokkos::Impl::TeamPolicyInternal<Kokkos::Threads, Properties...>;
-  using WorkTag = typename Policy::work_tag;
-  using Member  = typename Policy::member_type;
-
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using WorkTagFwd =
-      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                  WorkTag, void>::type;
-
-  using Analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
-                                         Policy, ReducerTypeFwd>;
-  using pointer_type   = typename Analysis::pointer_type;
-  using reference_type = typename Analysis::reference_type;
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-  const ReducerType m_reducer;
-  const pointer_type m_result_ptr;
-  const size_t m_shared;
-
-  template <class TagType>
-  inline static std::enable_if_t<std::is_void<TagType>::value> exec_team(
-      const FunctorType &functor, Member member, reference_type update) {
-    for (; member.valid_static(); member.next_static()) {
-      functor(member, update);
-    }
-  }
-
-  template <class TagType>
-  inline static std::enable_if_t<!std::is_void<TagType>::value> exec_team(
-      const FunctorType &functor, Member member, reference_type update) {
-    const TagType t{};
-    for (; member.valid_static(); member.next_static()) {
-      functor(t, member, update);
-    }
-  }
-
-  static void exec(ThreadsExec &exec, const void *arg) {
-    const ParallelReduce &self = *((const ParallelReduce *)arg);
-
-    typename Analysis::Reducer reducer(
-        &ReducerConditional::select(self.m_functor, self.m_reducer));
-
-    ParallelReduce::template exec_team<WorkTag>(
-        self.m_functor, Member(&exec, self.m_policy, self.m_shared),
-        reducer.init(static_cast<pointer_type>(exec.reduce_memory())));
-
-    exec.fan_in_reduce(reducer);
-  }
-
- public:
-  inline void execute() const {
-    if (m_policy.league_size() * m_policy.team_size() == 0) {
-      if (m_result_ptr) {
-        typename Analysis::Reducer final_reducer(
-            &ReducerConditional::select(m_functor, m_reducer));
-        final_reducer.init(m_result_ptr);
-        final_reducer.final(m_result_ptr);
-      }
-    } else {
-      ThreadsExec::resize_scratch(
-          Analysis::value_size(
-              ReducerConditional::select(m_functor, m_reducer)),
-          Policy::member_type::team_reduce_size() + m_shared);
-
-      ThreadsExec::start(&ParallelReduce::exec, this);
-
-      ThreadsExec::fence();
-
-      if (m_result_ptr) {
-        const pointer_type data =
-            (pointer_type)ThreadsExec::root_reduce_scratch();
-
-        const unsigned n = Analysis::value_count(
-            ReducerConditional::select(m_functor, m_reducer));
-        for (unsigned i = 0; i < n; ++i) {
-          m_result_ptr[i] = data[i];
-        }
-      }
-    }
-  }
-
-  template <typename Policy>
-  Policy fix_policy(Policy policy) {
-    if (policy.impl_vector_length() < 0) {
-      policy.impl_set_vector_length(1);
-    }
-    if (policy.team_size() < 0) {
-      policy.impl_set_team_size(policy.team_size_recommended(
-          m_functor, m_reducer, ParallelReduceTag{}));
-    }
-    return policy;
-  }
-
-  template <class ViewType>
-  inline ParallelReduce(
-      const FunctorType &arg_functor, const Policy &arg_policy,
-      const ViewType &arg_result,
-      std::enable_if_t<Kokkos::is_view<ViewType>::value &&
-                           !Kokkos::is_reducer<ReducerType>::value,
-                       void *> = nullptr)
-      : m_functor(arg_functor),
-        m_policy(fix_policy(arg_policy)),
-        m_reducer(InvalidType()),
-        m_result_ptr(arg_result.data()),
-        m_shared(m_policy.scratch_size(0) + m_policy.scratch_size(1) +
-                 FunctorTeamShmemSize<FunctorType>::value(
-                     arg_functor, m_policy.team_size())) {}
-
-  inline ParallelReduce(const FunctorType &arg_functor, Policy arg_policy,
-                        const ReducerType &reducer)
-      : m_functor(arg_functor),
-        m_policy(fix_policy(arg_policy)),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()),
-        m_shared(m_policy.scratch_size(0) + m_policy.scratch_size(1) +
-                 FunctorTeamShmemSize<FunctorType>::value(
-                     arg_functor, m_policy.team_size())) {
-    /*static_assert( std::is_same< typename ViewType::memory_space
-                            , Kokkos::HostSpace >::value
-    , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
-    );*/
-  }
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-#endif
diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp
index 7c29ce57395d5ea6cdba296fa6307882fdde1e72..d4ce697548fab07735f363c9026d1f2d8164eb3e 100644
--- a/packages/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp
+++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp
@@ -18,7 +18,7 @@
 #define KOKKOS_THREADS_WORKGRAPHPOLICY_HPP
 
 #include <Kokkos_Core_fwd.hpp>
-#include <Kokkos_Threads.hpp>
+#include <Threads/Kokkos_ThreadsExec.hpp>
 
 namespace Kokkos {
 namespace Impl {
diff --git a/packages/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Extents.hpp b/packages/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Extents.hpp
index fb5195254344c4295a1339ec871df37be37d7622..3846b52d2396cb58497379cbb851e71af1902685 100644
--- a/packages/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Extents.hpp
+++ b/packages/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Extents.hpp
@@ -54,7 +54,7 @@ struct ExtentFromDimension {
 // Kokkos uses a dimension of '0' to denote a dynamic dimension.
 template <>
 struct ExtentFromDimension<std::size_t{0}> {
-  static constexpr std::size_t value = std::experimental::dynamic_extent;
+  static constexpr std::size_t value = dynamic_extent;
 };
 
 template <std::size_t N>
@@ -63,7 +63,7 @@ struct DimensionFromExtent {
 };
 
 template <>
-struct DimensionFromExtent<std::experimental::dynamic_extent> {
+struct DimensionFromExtent<dynamic_extent> {
   static constexpr std::size_t value = std::size_t{0};
 };
 
@@ -73,9 +73,9 @@ struct ExtentsFromDimension;
 template <class IndexType, class Dimension, std::size_t... Indices>
 struct ExtentsFromDimension<IndexType, Dimension,
                             std::index_sequence<Indices...>> {
-  using type = std::experimental::extents<
-      IndexType,
-      ExtentFromDimension<Dimension::static_extent(Indices)>::value...>;
+  using type =
+      extents<IndexType,
+              ExtentFromDimension<Dimension::static_extent(Indices)>::value...>;
 };
 
 template <class Extents, class Indices>
diff --git a/packages/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Header.hpp b/packages/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Header.hpp
index 495b891cca3b8c1f5d4c55f823453d44e41d76d3..f4ae702fc4450af05155f8b0a0bd9f62e9b7b349 100644
--- a/packages/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Header.hpp
+++ b/packages/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Header.hpp
@@ -23,12 +23,28 @@ static_assert(false,
 #define KOKKOS_EXPERIMENTAL_MDSPAN_HPP
 
 // Look for the right mdspan
-#if __has_include(<mdspan>)
+#if __cplusplus >= 202002L
+#include <version>
+#endif
+
+// Only use standard library mdspan if we are not running Cuda or HIP.
+// Likely these implementations won't be supported on device, so we should use
+// our own device-compatible version for now.
+#if (__cpp_lib_mdspan >= 202207L) && !defined(KOKKOS_ENABLE_CUDA) && \
+    !defined(KOKKOS_ENABLE_HIP)
 #include <mdspan>
-namespace mdspan_ns = std;
+namespace Kokkos {
+using std::default_accessor;
+using std::dextents;
+using std::dynamic_extent;
+using std::extents;
+using std::layout_left;
+using std::layout_right;
+using std::layout_stride;
+using std::mdspan;
+}  // namespace Kokkos
 #else
-#include <experimental/mdspan>
-namespace mdspan_ns = std::experimental;
+#include <mdspan/mdspan.hpp>
 #endif
 
 #endif  // KOKKOS_EXPERIMENTAL_MDSPAN_HPP
diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp
index 215b18f221980a921f5bfed7c1c60668da4ad13a..ebdf2c8211fe9209c2106e1df51b242cf760883c 100644
--- a/packages/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp
+++ b/packages/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp
@@ -18,7 +18,7 @@
 #define KOKKOS_DECLARE_CUDA_HPP
 
 #if defined(KOKKOS_ENABLE_CUDA)
-#include <Kokkos_Cuda.hpp>
+#include <Cuda/Kokkos_Cuda.hpp>
 #include <Cuda/Kokkos_Cuda_Half_Impl_Type.hpp>
 #include <Cuda/Kokkos_Cuda_Half_Conversion.hpp>
 #include <Cuda/Kokkos_Cuda_Parallel_MDRange.hpp>
@@ -31,6 +31,7 @@
 #include <Cuda/Kokkos_Cuda_Task.hpp>
 #include <Cuda/Kokkos_Cuda_MDRangePolicy.hpp>
 #include <Cuda/Kokkos_Cuda_UniqueToken.hpp>
+#include <Cuda/Kokkos_Cuda_ZeroMemset.hpp>
 #endif
 
 #endif
diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_HIP.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_HIP.hpp
index 09ea8826004be8bc409e708a626972f25668b39c..e115f7051f3a18434c04c5a5474267429ba0f3fb 100644
--- a/packages/kokkos/core/src/decl/Kokkos_Declare_HIP.hpp
+++ b/packages/kokkos/core/src/decl/Kokkos_Declare_HIP.hpp
@@ -30,6 +30,7 @@
 #include <HIP/Kokkos_HIP_Parallel_Team.hpp>
 #include <HIP/Kokkos_HIP_SharedAllocationRecord.hpp>
 #include <HIP/Kokkos_HIP_UniqueToken.hpp>
+#include <HIP/Kokkos_HIP_ZeroMemset.hpp>
 
 namespace Kokkos {
 namespace Experimental {
diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_HPX.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_HPX.hpp
index 73f94591f528256313d8d57a16bae994ef653b1b..f9012362465d8e7c88d8e503138a64e92de1929b 100644
--- a/packages/kokkos/core/src/decl/Kokkos_Declare_HPX.hpp
+++ b/packages/kokkos/core/src/decl/Kokkos_Declare_HPX.hpp
@@ -18,7 +18,7 @@
 #define KOKKOS_DECLARE_HPX_HPP
 
 #if defined(KOKKOS_ENABLE_HPX)
-#include <Kokkos_HPX.hpp>
+#include <HPX/Kokkos_HPX.hpp>
 #include <HPX/Kokkos_HPX_MDRangePolicy.hpp>
 #endif
 
diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_OPENACC.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_OPENACC.hpp
index 137286c741b413e6bbe581e6912ad2f3b455b198..727e551cd8a5434f5270111267a2763e8785dcf8 100644
--- a/packages/kokkos/core/src/decl/Kokkos_Declare_OPENACC.hpp
+++ b/packages/kokkos/core/src/decl/Kokkos_Declare_OPENACC.hpp
@@ -24,9 +24,12 @@
 #include <OpenACC/Kokkos_OpenACC_SharedAllocationRecord.hpp>
 #include <OpenACC/Kokkos_OpenACC_ParallelFor_Range.hpp>
 #include <OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp>
+#include <OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp>
 #include <OpenACC/Kokkos_OpenACC_MDRangePolicy.hpp>
 #include <OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp>
+#include <OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp>
 #include <OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp>
+#include <OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp>
 #endif
 
 #endif
diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_OPENMP.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_OPENMP.hpp
index 1e1314145d203154ecf01a09e2e110f87c4673f9..52aefa25add0f1db895d46555311d87dee119338 100644
--- a/packages/kokkos/core/src/decl/Kokkos_Declare_OPENMP.hpp
+++ b/packages/kokkos/core/src/decl/Kokkos_Declare_OPENMP.hpp
@@ -18,8 +18,12 @@
 #define KOKKOS_DECLARE_OPENMP_HPP
 
 #if defined(KOKKOS_ENABLE_OPENMP)
-#include <Kokkos_OpenMP.hpp>
+#include <OpenMP/Kokkos_OpenMP.hpp>
 #include <OpenMP/Kokkos_OpenMP_MDRangePolicy.hpp>
+#include <OpenMP/Kokkos_OpenMP_UniqueToken.hpp>
+#include <OpenMP/Kokkos_OpenMP_Parallel_For.hpp>
+#include <OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp>
+#include <OpenMP/Kokkos_OpenMP_Parallel_Scan.hpp>
 #endif
 
 #endif
diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_OPENMPTARGET.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_OPENMPTARGET.hpp
index 0bd89ef4cf4ba9af2f59a61b622c6f7efa200743..6bde8f59d883fe136aab97b8622d1ff49c05d7ea 100644
--- a/packages/kokkos/core/src/decl/Kokkos_Declare_OPENMPTARGET.hpp
+++ b/packages/kokkos/core/src/decl/Kokkos_Declare_OPENMPTARGET.hpp
@@ -18,10 +18,17 @@
 #define KOKKOS_DECLARE_OPENMPTARGET_HPP
 
 #if defined(KOKKOS_ENABLE_OPENMPTARGET)
-#include <Kokkos_OpenMPTarget.hpp>
-#include <Kokkos_OpenMPTargetSpace.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp>
 #include <OpenMPTarget/Kokkos_OpenMPTarget_MDRangePolicy.hpp>
 #include <OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp>
 #endif
 
 #endif
diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_SERIAL.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_SERIAL.hpp
index bb59ae2ce8bea2b9fb9817398b00bb3451039150..86b044bee5f9fa144fc319bf52981bd249c8fc44 100644
--- a/packages/kokkos/core/src/decl/Kokkos_Declare_SERIAL.hpp
+++ b/packages/kokkos/core/src/decl/Kokkos_Declare_SERIAL.hpp
@@ -18,8 +18,9 @@
 #define KOKKOS_DECLARE_SERIAL_HPP
 
 #if defined(KOKKOS_ENABLE_SERIAL)
-#include <Kokkos_Serial.hpp>
+#include <Serial/Kokkos_Serial.hpp>
 #include <Serial/Kokkos_Serial_MDRangePolicy.hpp>
+#include <Serial/Kokkos_Serial_ZeroMemset.hpp>
 #endif
 
 #endif
diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp
index 0c8dddbeb3cc19a1eb6ffc1ab8c57b10bcd1870a..bd12c5c6a99f9d52cba072e361fd4d661f261774 100644
--- a/packages/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp
+++ b/packages/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp
@@ -18,16 +18,20 @@
 #define KOKKOS_DECLARE_SYCL_HPP
 
 #if defined(KOKKOS_ENABLE_SYCL)
-#include <Kokkos_SYCL.hpp>
+#include <SYCL/Kokkos_SYCL.hpp>
 #include <SYCL/Kokkos_SYCL_Half_Impl_Type.hpp>
 #include <SYCL/Kokkos_SYCL_Half_Conversion.hpp>
 #include <SYCL/Kokkos_SYCL_DeepCopy.hpp>
 #include <SYCL/Kokkos_SYCL_MDRangePolicy.hpp>
-#include <SYCL/Kokkos_SYCL_Parallel_Range.hpp>
-#include <SYCL/Kokkos_SYCL_Parallel_Reduce.hpp>
-#include <SYCL/Kokkos_SYCL_Parallel_Scan.hpp>
-#include <SYCL/Kokkos_SYCL_Parallel_Team.hpp>
+#include <SYCL/Kokkos_SYCL_ParallelFor_Range.hpp>
+#include <SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp>
+#include <SYCL/Kokkos_SYCL_ParallelFor_Team.hpp>
+#include <SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp>
+#include <SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp>
+#include <SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp>
+#include <SYCL/Kokkos_SYCL_ParallelScan_Range.hpp>
 #include <SYCL/Kokkos_SYCL_UniqueToken.hpp>
+#include <SYCL/Kokkos_SYCL_ZeroMemset.hpp>
 #endif
 
 #endif
diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_THREADS.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_THREADS.hpp
index b7af04be3c33da95de03427cf0c1ee60c79c01dd..f5cbc0c1d1d67886719c2d512bdbab5ce371898b 100644
--- a/packages/kokkos/core/src/decl/Kokkos_Declare_THREADS.hpp
+++ b/packages/kokkos/core/src/decl/Kokkos_Declare_THREADS.hpp
@@ -18,8 +18,18 @@
 #define KOKKOS_DECLARE_THREADS_HPP
 
 #if defined(KOKKOS_ENABLE_THREADS)
-#include <Kokkos_Threads.hpp>
+#include <Threads/Kokkos_Threads.hpp>
+#include <Threads/Kokkos_ThreadsExec.hpp>
 #include <Threads/Kokkos_Threads_MDRangePolicy.hpp>
+#include <Threads/Kokkos_Threads_ParallelFor_Range.hpp>
+#include <Threads/Kokkos_Threads_ParallelFor_MDRange.hpp>
+#include <Threads/Kokkos_Threads_ParallelFor_Team.hpp>
+#include <Threads/Kokkos_Threads_ParallelReduce_Range.hpp>
+#include <Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp>
+#include <Threads/Kokkos_Threads_ParallelReduce_Team.hpp>
+#include <Threads/Kokkos_Threads_ParallelScan_Range.hpp>
+#include <Threads/Kokkos_ThreadsTeam.hpp>
+#include <Threads/Kokkos_Threads_UniqueToken.hpp>
 #endif
 
 #endif
diff --git a/packages/kokkos/core/src/impl/CMakeLists.txt b/packages/kokkos/core/src/impl/CMakeLists.txt
deleted file mode 100644
index 203fd4a3a44adbc41e2615bd73e91a3bfaaec0f4..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/impl/CMakeLists.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-
-SET(HEADERS "")
-SET(SOURCES "")
-
-FILE(GLOB HEADERS *.hpp *.h)
-FILE(GLOB SOURCES *.cpp)
-
-TRIBITS_ADD_LIBRARY(
-    kokkoscore_impl
-    NOINSTALLHEADERS ${HEADERS}
-    SOURCES ${SOURCES}
-    DEPLIBS
-    )
-
-SET(TRILINOS_INCDIR ${${PROJECT_NAME}_INSTALL_INCLUDE_DIR})
-
-INSTALL(FILES ${HEADERS} DESTINATION ${TRILINOS_INCDIR}/impl/)
-
diff --git a/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp b/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp
index e2b606004fece193df834ee95c2b7617aef84281..a44ffefa6b72d489d2027572f2566fe64a492057 100644
--- a/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp
+++ b/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp
@@ -37,1167 +37,1278 @@ namespace Impl {
 // Temporary, for testing new loop macros
 #define KOKKOS_ENABLE_NEW_LOOP_MACROS 1
 
-#define LOOP_1L(type, tile)   \
-  KOKKOS_ENABLE_IVDEP_MDRANGE \
+#define KOKKOS_IMPL_LOOP_1L(type, tile) \
+  KOKKOS_ENABLE_IVDEP_MDRANGE           \
   for (type i0 = 0; i0 < static_cast<type>(tile[0]); ++i0)
 
-#define LOOP_2L(type, tile) \
-  for (type i1 = 0; i1 < static_cast<type>(tile[1]); ++i1) LOOP_1L(type, tile)
+#define KOKKOS_IMPL_LOOP_2L(type, tile)                    \
+  for (type i1 = 0; i1 < static_cast<type>(tile[1]); ++i1) \
+  KOKKOS_IMPL_LOOP_1L(type, tile)
 
-#define LOOP_3L(type, tile) \
-  for (type i2 = 0; i2 < static_cast<type>(tile[2]); ++i2) LOOP_2L(type, tile)
+#define KOKKOS_IMPL_LOOP_3L(type, tile)                    \
+  for (type i2 = 0; i2 < static_cast<type>(tile[2]); ++i2) \
+  KOKKOS_IMPL_LOOP_2L(type, tile)
 
-#define LOOP_4L(type, tile) \
-  for (type i3 = 0; i3 < static_cast<type>(tile[3]); ++i3) LOOP_3L(type, tile)
+#define KOKKOS_IMPL_LOOP_4L(type, tile)                    \
+  for (type i3 = 0; i3 < static_cast<type>(tile[3]); ++i3) \
+  KOKKOS_IMPL_LOOP_3L(type, tile)
 
-#define LOOP_5L(type, tile) \
-  for (type i4 = 0; i4 < static_cast<type>(tile[4]); ++i4) LOOP_4L(type, tile)
+#define KOKKOS_IMPL_LOOP_5L(type, tile)                    \
+  for (type i4 = 0; i4 < static_cast<type>(tile[4]); ++i4) \
+  KOKKOS_IMPL_LOOP_4L(type, tile)
 
-#define LOOP_6L(type, tile) \
-  for (type i5 = 0; i5 < static_cast<type>(tile[5]); ++i5) LOOP_5L(type, tile)
+#define KOKKOS_IMPL_LOOP_6L(type, tile)                    \
+  for (type i5 = 0; i5 < static_cast<type>(tile[5]); ++i5) \
+  KOKKOS_IMPL_LOOP_5L(type, tile)
 
-#define LOOP_7L(type, tile) \
-  for (type i6 = 0; i6 < static_cast<type>(tile[6]); ++i6) LOOP_6L(type, tile)
+#define KOKKOS_IMPL_LOOP_7L(type, tile)                    \
+  for (type i6 = 0; i6 < static_cast<type>(tile[6]); ++i6) \
+  KOKKOS_IMPL_LOOP_6L(type, tile)
 
-#define LOOP_8L(type, tile) \
-  for (type i7 = 0; i7 < static_cast<type>(tile[7]); ++i7) LOOP_7L(type, tile)
+#define KOKKOS_IMPL_LOOP_8L(type, tile)                    \
+  for (type i7 = 0; i7 < static_cast<type>(tile[7]); ++i7) \
+  KOKKOS_IMPL_LOOP_7L(type, tile)
 
-#define LOOP_1R(type, tile)   \
-  KOKKOS_ENABLE_IVDEP_MDRANGE \
+#define KOKKOS_IMPL_LOOP_1R(type, tile) \
+  KOKKOS_ENABLE_IVDEP_MDRANGE           \
   for (type i0 = 0; i0 < static_cast<type>(tile[0]); ++i0)
 
-#define LOOP_2R(type, tile) \
-  LOOP_1R(type, tile)       \
+#define KOKKOS_IMPL_LOOP_2R(type, tile) \
+  KOKKOS_IMPL_LOOP_1R(type, tile)       \
   for (type i1 = 0; i1 < static_cast<type>(tile[1]); ++i1)
 
-#define LOOP_3R(type, tile) \
-  LOOP_2R(type, tile)       \
+#define KOKKOS_IMPL_LOOP_3R(type, tile) \
+  KOKKOS_IMPL_LOOP_2R(type, tile)       \
   for (type i2 = 0; i2 < static_cast<type>(tile[2]); ++i2)
 
-#define LOOP_4R(type, tile) \
-  LOOP_3R(type, tile)       \
+#define KOKKOS_IMPL_LOOP_4R(type, tile) \
+  KOKKOS_IMPL_LOOP_3R(type, tile)       \
   for (type i3 = 0; i3 < static_cast<type>(tile[3]); ++i3)
 
-#define LOOP_5R(type, tile) \
-  LOOP_4R(type, tile)       \
+#define KOKKOS_IMPL_LOOP_5R(type, tile) \
+  KOKKOS_IMPL_LOOP_4R(type, tile)       \
   for (type i4 = 0; i4 < static_cast<type>(tile[4]); ++i4)
 
-#define LOOP_6R(type, tile) \
-  LOOP_5R(type, tile)       \
+#define KOKKOS_IMPL_LOOP_6R(type, tile) \
+  KOKKOS_IMPL_LOOP_5R(type, tile)       \
   for (type i5 = 0; i5 < static_cast<type>(tile[5]); ++i5)
 
-#define LOOP_7R(type, tile) \
-  LOOP_6R(type, tile)       \
+#define KOKKOS_IMPL_LOOP_7R(type, tile) \
+  KOKKOS_IMPL_LOOP_6R(type, tile)       \
   for (type i6 = 0; i6 < static_cast<type>(tile[6]); ++i6)
 
-#define LOOP_8R(type, tile) \
-  LOOP_7R(type, tile)       \
+#define KOKKOS_IMPL_LOOP_8R(type, tile) \
+  KOKKOS_IMPL_LOOP_7R(type, tile)       \
   for (type i7 = 0; i7 < static_cast<type>(tile[7]); ++i7)
 
-#define LOOP_ARGS_1 i0 + m_offset[0]
-#define LOOP_ARGS_2 LOOP_ARGS_1, i1 + m_offset[1]
-#define LOOP_ARGS_3 LOOP_ARGS_2, i2 + m_offset[2]
-#define LOOP_ARGS_4 LOOP_ARGS_3, i3 + m_offset[3]
-#define LOOP_ARGS_5 LOOP_ARGS_4, i4 + m_offset[4]
-#define LOOP_ARGS_6 LOOP_ARGS_5, i5 + m_offset[5]
-#define LOOP_ARGS_7 LOOP_ARGS_6, i6 + m_offset[6]
-#define LOOP_ARGS_8 LOOP_ARGS_7, i7 + m_offset[7]
+#define KOKKOS_IMPL_LOOP_ARGS_1 i0 + m_offset[0]
+#define KOKKOS_IMPL_LOOP_ARGS_2 KOKKOS_IMPL_LOOP_ARGS_1, i1 + m_offset[1]
+#define KOKKOS_IMPL_LOOP_ARGS_3 KOKKOS_IMPL_LOOP_ARGS_2, i2 + m_offset[2]
+#define KOKKOS_IMPL_LOOP_ARGS_4 KOKKOS_IMPL_LOOP_ARGS_3, i3 + m_offset[3]
+#define KOKKOS_IMPL_LOOP_ARGS_5 KOKKOS_IMPL_LOOP_ARGS_4, i4 + m_offset[4]
+#define KOKKOS_IMPL_LOOP_ARGS_6 KOKKOS_IMPL_LOOP_ARGS_5, i5 + m_offset[5]
+#define KOKKOS_IMPL_LOOP_ARGS_7 KOKKOS_IMPL_LOOP_ARGS_6, i6 + m_offset[6]
+#define KOKKOS_IMPL_LOOP_ARGS_8 KOKKOS_IMPL_LOOP_ARGS_7, i7 + m_offset[7]
 
 // New Loop Macros...
 // parallel_for, non-tagged
-#define APPLY(func, ...) func(__VA_ARGS__);
+#define KOKKOS_IMPL_APPLY(func, ...) func(__VA_ARGS__);
 
 // LayoutRight
 // d = 0 to start
-#define LOOP_R_1(func, type, m_offset, extent, d, ...)               \
+#define KOKKOS_IMPL_LOOP_R_1(func, type, m_offset, extent, d, ...)   \
   KOKKOS_ENABLE_IVDEP_MDRANGE                                        \
   for (type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
-    APPLY(func, __VA_ARGS__, i0 + m_offset[d])                       \
+    KOKKOS_IMPL_APPLY(func, __VA_ARGS__, i0 + m_offset[d])           \
   }
 
-#define LOOP_R_2(func, type, m_offset, extent, d, ...)               \
-  for (type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) { \
-    LOOP_R_1(func, type, m_offset, extent, d + 1, __VA_ARGS__,       \
-             i1 + m_offset[d])                                       \
+#define KOKKOS_IMPL_LOOP_R_2(func, type, m_offset, extent, d, ...)         \
+  for (type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {       \
+    KOKKOS_IMPL_LOOP_R_1(func, type, m_offset, extent, d + 1, __VA_ARGS__, \
+                         i1 + m_offset[d])                                 \
   }
 
-#define LOOP_R_3(func, type, m_offset, extent, d, ...)               \
-  for (type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) { \
-    LOOP_R_2(func, type, m_offset, extent, d + 1, __VA_ARGS__,       \
-             i2 + m_offset[d])                                       \
+#define KOKKOS_IMPL_LOOP_R_3(func, type, m_offset, extent, d, ...)         \
+  for (type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {       \
+    KOKKOS_IMPL_LOOP_R_2(func, type, m_offset, extent, d + 1, __VA_ARGS__, \
+                         i2 + m_offset[d])                                 \
   }
 
-#define LOOP_R_4(func, type, m_offset, extent, d, ...)               \
-  for (type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) { \
-    LOOP_R_3(func, type, m_offset, extent, d + 1, __VA_ARGS__,       \
-             i3 + m_offset[d])                                       \
+#define KOKKOS_IMPL_LOOP_R_4(func, type, m_offset, extent, d, ...)         \
+  for (type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {       \
+    KOKKOS_IMPL_LOOP_R_3(func, type, m_offset, extent, d + 1, __VA_ARGS__, \
+                         i3 + m_offset[d])                                 \
   }
 
-#define LOOP_R_5(func, type, m_offset, extent, d, ...)               \
-  for (type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) { \
-    LOOP_R_4(func, type, m_offset, extent, d + 1, __VA_ARGS__,       \
-             i4 + m_offset[d])                                       \
+#define KOKKOS_IMPL_LOOP_R_5(func, type, m_offset, extent, d, ...)         \
+  for (type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {       \
+    KOKKOS_IMPL_LOOP_R_4(func, type, m_offset, extent, d + 1, __VA_ARGS__, \
+                         i4 + m_offset[d])                                 \
   }
 
-#define LOOP_R_6(func, type, m_offset, extent, d, ...)               \
-  for (type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) { \
-    LOOP_R_5(func, type, m_offset, extent, d + 1, __VA_ARGS__,       \
-             i5 + m_offset[d])                                       \
+#define KOKKOS_IMPL_LOOP_R_6(func, type, m_offset, extent, d, ...)         \
+  for (type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {       \
+    KOKKOS_IMPL_LOOP_R_5(func, type, m_offset, extent, d + 1, __VA_ARGS__, \
+                         i5 + m_offset[d])                                 \
   }
 
-#define LOOP_R_7(func, type, m_offset, extent, d, ...)               \
-  for (type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) { \
-    LOOP_R_6(func, type, m_offset, extent, d + 1, __VA_ARGS__,       \
-             i6 + m_offset[d])                                       \
+#define KOKKOS_IMPL_LOOP_R_7(func, type, m_offset, extent, d, ...)         \
+  for (type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {       \
+    KOKKOS_IMPL_LOOP_R_6(func, type, m_offset, extent, d + 1, __VA_ARGS__, \
+                         i6 + m_offset[d])                                 \
   }
 
-#define LOOP_R_8(func, type, m_offset, extent, d, ...)               \
-  for (type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) { \
-    LOOP_R_7(func, type, m_offset, extent, d + 1, __VA_ARGS__,       \
-             i7 + m_offset[d])                                       \
+#define KOKKOS_IMPL_LOOP_R_8(func, type, m_offset, extent, d, ...)         \
+  for (type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {       \
+    KOKKOS_IMPL_LOOP_R_7(func, type, m_offset, extent, d + 1, __VA_ARGS__, \
+                         i7 + m_offset[d])                                 \
   }
 
 // LayoutLeft
 // d = rank-1 to start
-#define LOOP_L_1(func, type, m_offset, extent, d, ...)               \
+#define KOKKOS_IMPL_LOOP_L_1(func, type, m_offset, extent, d, ...)   \
   KOKKOS_ENABLE_IVDEP_MDRANGE                                        \
   for (type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
-    APPLY(func, i0 + m_offset[d], __VA_ARGS__)                       \
+    KOKKOS_IMPL_APPLY(func, i0 + m_offset[d], __VA_ARGS__)           \
   }
 
-#define LOOP_L_2(func, type, m_offset, extent, d, ...)               \
+#define KOKKOS_IMPL_LOOP_L_2(func, type, m_offset, extent, d, ...)   \
   for (type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) { \
-    LOOP_L_1(func, type, m_offset, extent, d - 1, i1 + m_offset[d],  \
-             __VA_ARGS__)                                            \
+    KOKKOS_IMPL_LOOP_L_1(func, type, m_offset, extent, d - 1,        \
+                         i1 + m_offset[d], __VA_ARGS__)              \
   }
 
-#define LOOP_L_3(func, type, m_offset, extent, d, ...)               \
+#define KOKKOS_IMPL_LOOP_L_3(func, type, m_offset, extent, d, ...)   \
   for (type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) { \
-    LOOP_L_2(func, type, m_offset, extent, d - 1, i2 + m_offset[d],  \
-             __VA_ARGS__)                                            \
+    KOKKOS_IMPL_LOOP_L_2(func, type, m_offset, extent, d - 1,        \
+                         i2 + m_offset[d], __VA_ARGS__)              \
   }
 
-#define LOOP_L_4(func, type, m_offset, extent, d, ...)               \
+#define KOKKOS_IMPL_LOOP_L_4(func, type, m_offset, extent, d, ...)   \
   for (type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) { \
-    LOOP_L_3(func, type, m_offset, extent, d - 1, i3 + m_offset[d],  \
-             __VA_ARGS__)                                            \
+    KOKKOS_IMPL_LOOP_L_3(func, type, m_offset, extent, d - 1,        \
+                         i3 + m_offset[d], __VA_ARGS__)              \
   }
 
-#define LOOP_L_5(func, type, m_offset, extent, d, ...)               \
+#define KOKKOS_IMPL_LOOP_L_5(func, type, m_offset, extent, d, ...)   \
   for (type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) { \
-    LOOP_L_4(func, type, m_offset, extent, d - 1, i4 + m_offset[d],  \
-             __VA_ARGS__)                                            \
+    KOKKOS_IMPL_LOOP_L_4(func, type, m_offset, extent, d - 1,        \
+                         i4 + m_offset[d], __VA_ARGS__)              \
   }
 
-#define LOOP_L_6(func, type, m_offset, extent, d, ...)               \
+#define KOKKOS_IMPL_LOOP_L_6(func, type, m_offset, extent, d, ...)   \
   for (type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) { \
-    LOOP_L_5(func, type, m_offset, extent, d - 1, i5 + m_offset[d],  \
-             __VA_ARGS__)                                            \
+    KOKKOS_IMPL_LOOP_L_5(func, type, m_offset, extent, d - 1,        \
+                         i5 + m_offset[d], __VA_ARGS__)              \
   }
 
-#define LOOP_L_7(func, type, m_offset, extent, d, ...)               \
+#define KOKKOS_IMPL_LOOP_L_7(func, type, m_offset, extent, d, ...)   \
   for (type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) { \
-    LOOP_L_6(func, type, m_offset, extent, d - 1, i6 + m_offset[d],  \
-             __VA_ARGS__)                                            \
+    KOKKOS_IMPL_LOOP_L_6(func, type, m_offset, extent, d - 1,        \
+                         i6 + m_offset[d], __VA_ARGS__)              \
   }
 
-#define LOOP_L_8(func, type, m_offset, extent, d, ...)               \
+#define KOKKOS_IMPL_LOOP_L_8(func, type, m_offset, extent, d, ...)   \
   for (type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) { \
-    LOOP_L_7(func, type, m_offset, extent, d - 1, i7 + m_offset[d],  \
-             __VA_ARGS__)                                            \
+    KOKKOS_IMPL_LOOP_L_7(func, type, m_offset, extent, d - 1,        \
+                         i7 + m_offset[d], __VA_ARGS__)              \
   }
 
 // Left vs Right
 // TODO: rank not necessary to pass through, can hardcode the values
-#define LOOP_LAYOUT_1(func, type, is_left, m_offset, extent, rank)   \
-  KOKKOS_ENABLE_IVDEP_MDRANGE                                        \
-  for (type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
-    APPLY(func, i0 + m_offset[0])                                    \
+#define KOKKOS_IMPL_LOOP_LAYOUT_1(func, type, is_left, m_offset, extent, rank) \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                                                  \
+  for (type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) {           \
+    KOKKOS_IMPL_APPLY(func, i0 + m_offset[0])                                  \
   }
 
-#define LOOP_LAYOUT_2(func, type, is_left, m_offset, extent, rank)            \
-  if (is_left) {                                                              \
-    for (type i1 = (type)0; i1 < static_cast<type>(extent[rank - 1]); ++i1) { \
-      LOOP_L_1(func, type, m_offset, extent, rank - 2,                        \
-               i1 + m_offset[rank - 1])                                       \
-    }                                                                         \
-  } else {                                                                    \
-    for (type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) {        \
-      LOOP_R_1(func, type, m_offset, extent, 1, i1 + m_offset[0])             \
-    }                                                                         \
+#define KOKKOS_IMPL_LOOP_LAYOUT_2(func, type, is_left, m_offset, extent, rank) \
+  if (is_left) {                                                               \
+    for (type i1 = (type)0; i1 < static_cast<type>(extent[rank - 1]); ++i1) {  \
+      KOKKOS_IMPL_LOOP_L_1(func, type, m_offset, extent, rank - 2,             \
+                           i1 + m_offset[rank - 1])                            \
+    }                                                                          \
+  } else {                                                                     \
+    for (type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) {         \
+      KOKKOS_IMPL_LOOP_R_1(func, type, m_offset, extent, 1, i1 + m_offset[0])  \
+    }                                                                          \
   }
 
-#define LOOP_LAYOUT_3(func, type, is_left, m_offset, extent, rank)            \
-  if (is_left) {                                                              \
-    for (type i2 = (type)0; i2 < static_cast<type>(extent[rank - 1]); ++i2) { \
-      LOOP_L_2(func, type, m_offset, extent, rank - 2,                        \
-               i2 + m_offset[rank - 1])                                       \
-    }                                                                         \
-  } else {                                                                    \
-    for (type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) {        \
-      LOOP_R_2(func, type, m_offset, extent, 1, i2 + m_offset[0])             \
-    }                                                                         \
+#define KOKKOS_IMPL_LOOP_LAYOUT_3(func, type, is_left, m_offset, extent, rank) \
+  if (is_left) {                                                               \
+    for (type i2 = (type)0; i2 < static_cast<type>(extent[rank - 1]); ++i2) {  \
+      KOKKOS_IMPL_LOOP_L_2(func, type, m_offset, extent, rank - 2,             \
+                           i2 + m_offset[rank - 1])                            \
+    }                                                                          \
+  } else {                                                                     \
+    for (type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) {         \
+      KOKKOS_IMPL_LOOP_R_2(func, type, m_offset, extent, 1, i2 + m_offset[0])  \
+    }                                                                          \
   }
 
-#define LOOP_LAYOUT_4(func, type, is_left, m_offset, extent, rank)            \
-  if (is_left) {                                                              \
-    for (type i3 = (type)0; i3 < static_cast<type>(extent[rank - 1]); ++i3) { \
-      LOOP_L_3(func, type, m_offset, extent, rank - 2,                        \
-               i3 + m_offset[rank - 1])                                       \
-    }                                                                         \
-  } else {                                                                    \
-    for (type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) {        \
-      LOOP_R_3(func, type, m_offset, extent, 1, i3 + m_offset[0])             \
-    }                                                                         \
+#define KOKKOS_IMPL_LOOP_LAYOUT_4(func, type, is_left, m_offset, extent, rank) \
+  if (is_left) {                                                               \
+    for (type i3 = (type)0; i3 < static_cast<type>(extent[rank - 1]); ++i3) {  \
+      KOKKOS_IMPL_LOOP_L_3(func, type, m_offset, extent, rank - 2,             \
+                           i3 + m_offset[rank - 1])                            \
+    }                                                                          \
+  } else {                                                                     \
+    for (type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) {         \
+      KOKKOS_IMPL_LOOP_R_3(func, type, m_offset, extent, 1, i3 + m_offset[0])  \
+    }                                                                          \
   }
 
-#define LOOP_LAYOUT_5(func, type, is_left, m_offset, extent, rank)            \
-  if (is_left) {                                                              \
-    for (type i4 = (type)0; i4 < static_cast<type>(extent[rank - 1]); ++i4) { \
-      LOOP_L_4(func, type, m_offset, extent, rank - 2,                        \
-               i4 + m_offset[rank - 1])                                       \
-    }                                                                         \
-  } else {                                                                    \
-    for (type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) {        \
-      LOOP_R_4(func, type, m_offset, extent, 1, i4 + m_offset[0])             \
-    }                                                                         \
+#define KOKKOS_IMPL_LOOP_LAYOUT_5(func, type, is_left, m_offset, extent, rank) \
+  if (is_left) {                                                               \
+    for (type i4 = (type)0; i4 < static_cast<type>(extent[rank - 1]); ++i4) {  \
+      KOKKOS_IMPL_LOOP_L_4(func, type, m_offset, extent, rank - 2,             \
+                           i4 + m_offset[rank - 1])                            \
+    }                                                                          \
+  } else {                                                                     \
+    for (type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) {         \
+      KOKKOS_IMPL_LOOP_R_4(func, type, m_offset, extent, 1, i4 + m_offset[0])  \
+    }                                                                          \
   }
 
-#define LOOP_LAYOUT_6(func, type, is_left, m_offset, extent, rank)            \
-  if (is_left) {                                                              \
-    for (type i5 = (type)0; i5 < static_cast<type>(extent[rank - 1]); ++i5) { \
-      LOOP_L_5(func, type, m_offset, extent, rank - 2,                        \
-               i5 + m_offset[rank - 1])                                       \
-    }                                                                         \
-  } else {                                                                    \
-    for (type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) {        \
-      LOOP_R_5(func, type, m_offset, extent, 1, i5 + m_offset[0])             \
-    }                                                                         \
+#define KOKKOS_IMPL_LOOP_LAYOUT_6(func, type, is_left, m_offset, extent, rank) \
+  if (is_left) {                                                               \
+    for (type i5 = (type)0; i5 < static_cast<type>(extent[rank - 1]); ++i5) {  \
+      KOKKOS_IMPL_LOOP_L_5(func, type, m_offset, extent, rank - 2,             \
+                           i5 + m_offset[rank - 1])                            \
+    }                                                                          \
+  } else {                                                                     \
+    for (type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) {         \
+      KOKKOS_IMPL_LOOP_R_5(func, type, m_offset, extent, 1, i5 + m_offset[0])  \
+    }                                                                          \
   }
 
-#define LOOP_LAYOUT_7(func, type, is_left, m_offset, extent, rank)            \
-  if (is_left) {                                                              \
-    for (type i6 = (type)0; i6 < static_cast<type>(extent[rank - 1]); ++i6) { \
-      LOOP_L_6(func, type, m_offset, extent, rank - 2,                        \
-               i6 + m_offset[rank - 1])                                       \
-    }                                                                         \
-  } else {                                                                    \
-    for (type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) {        \
-      LOOP_R_6(func, type, m_offset, extent, 1, i6 + m_offset[0])             \
-    }                                                                         \
+#define KOKKOS_IMPL_LOOP_LAYOUT_7(func, type, is_left, m_offset, extent, rank) \
+  if (is_left) {                                                               \
+    for (type i6 = (type)0; i6 < static_cast<type>(extent[rank - 1]); ++i6) {  \
+      KOKKOS_IMPL_LOOP_L_6(func, type, m_offset, extent, rank - 2,             \
+                           i6 + m_offset[rank - 1])                            \
+    }                                                                          \
+  } else {                                                                     \
+    for (type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) {         \
+      KOKKOS_IMPL_LOOP_R_6(func, type, m_offset, extent, 1, i6 + m_offset[0])  \
+    }                                                                          \
   }
 
-#define LOOP_LAYOUT_8(func, type, is_left, m_offset, extent, rank)            \
-  if (is_left) {                                                              \
-    for (type i7 = (type)0; i7 < static_cast<type>(extent[rank - 1]); ++i7) { \
-      LOOP_L_7(func, type, m_offset, extent, rank - 2,                        \
-               i7 + m_offset[rank - 1])                                       \
-    }                                                                         \
-  } else {                                                                    \
-    for (type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) {        \
-      LOOP_R_7(func, type, m_offset, extent, 1, i7 + m_offset[0])             \
-    }                                                                         \
+#define KOKKOS_IMPL_LOOP_LAYOUT_8(func, type, is_left, m_offset, extent, rank) \
+  if (is_left) {                                                               \
+    for (type i7 = (type)0; i7 < static_cast<type>(extent[rank - 1]); ++i7) {  \
+      KOKKOS_IMPL_LOOP_L_7(func, type, m_offset, extent, rank - 2,             \
+                           i7 + m_offset[rank - 1])                            \
+    }                                                                          \
+  } else {                                                                     \
+    for (type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) {         \
+      KOKKOS_IMPL_LOOP_R_7(func, type, m_offset, extent, 1, i7 + m_offset[0])  \
+    }                                                                          \
   }
 
 // Partial vs Full Tile
-#define TILE_LOOP_1(func, type, is_left, cond, m_offset, extent_full,  \
-                    extent_partial, rank)                              \
-  if (cond) {                                                          \
-    LOOP_LAYOUT_1(func, type, is_left, m_offset, extent_full, rank)    \
-  } else {                                                             \
-    LOOP_LAYOUT_1(func, type, is_left, m_offset, extent_partial, rank) \
+#define KOKKOS_IMPL_TILE_LOOP_1(func, type, is_left, cond, m_offset,         \
+                                extent_full, extent_partial, rank)           \
+  if (cond) {                                                                \
+    KOKKOS_IMPL_LOOP_LAYOUT_1(func, type, is_left, m_offset, extent_full,    \
+                              rank)                                          \
+  } else {                                                                   \
+    KOKKOS_IMPL_LOOP_LAYOUT_1(func, type, is_left, m_offset, extent_partial, \
+                              rank)                                          \
   }
 
-#define TILE_LOOP_2(func, type, is_left, cond, m_offset, extent_full,  \
-                    extent_partial, rank)                              \
-  if (cond) {                                                          \
-    LOOP_LAYOUT_2(func, type, is_left, m_offset, extent_full, rank)    \
-  } else {                                                             \
-    LOOP_LAYOUT_2(func, type, is_left, m_offset, extent_partial, rank) \
+#define KOKKOS_IMPL_TILE_LOOP_2(func, type, is_left, cond, m_offset,         \
+                                extent_full, extent_partial, rank)           \
+  if (cond) {                                                                \
+    KOKKOS_IMPL_LOOP_LAYOUT_2(func, type, is_left, m_offset, extent_full,    \
+                              rank)                                          \
+  } else {                                                                   \
+    KOKKOS_IMPL_LOOP_LAYOUT_2(func, type, is_left, m_offset, extent_partial, \
+                              rank)                                          \
   }
 
-#define TILE_LOOP_3(func, type, is_left, cond, m_offset, extent_full,  \
-                    extent_partial, rank)                              \
-  if (cond) {                                                          \
-    LOOP_LAYOUT_3(func, type, is_left, m_offset, extent_full, rank)    \
-  } else {                                                             \
-    LOOP_LAYOUT_3(func, type, is_left, m_offset, extent_partial, rank) \
+#define KOKKOS_IMPL_TILE_LOOP_3(func, type, is_left, cond, m_offset,         \
+                                extent_full, extent_partial, rank)           \
+  if (cond) {                                                                \
+    KOKKOS_IMPL_LOOP_LAYOUT_3(func, type, is_left, m_offset, extent_full,    \
+                              rank)                                          \
+  } else {                                                                   \
+    KOKKOS_IMPL_LOOP_LAYOUT_3(func, type, is_left, m_offset, extent_partial, \
+                              rank)                                          \
   }
 
-#define TILE_LOOP_4(func, type, is_left, cond, m_offset, extent_full,  \
-                    extent_partial, rank)                              \
-  if (cond) {                                                          \
-    LOOP_LAYOUT_4(func, type, is_left, m_offset, extent_full, rank)    \
-  } else {                                                             \
-    LOOP_LAYOUT_4(func, type, is_left, m_offset, extent_partial, rank) \
+#define KOKKOS_IMPL_TILE_LOOP_4(func, type, is_left, cond, m_offset,         \
+                                extent_full, extent_partial, rank)           \
+  if (cond) {                                                                \
+    KOKKOS_IMPL_LOOP_LAYOUT_4(func, type, is_left, m_offset, extent_full,    \
+                              rank)                                          \
+  } else {                                                                   \
+    KOKKOS_IMPL_LOOP_LAYOUT_4(func, type, is_left, m_offset, extent_partial, \
+                              rank)                                          \
   }
 
-#define TILE_LOOP_5(func, type, is_left, cond, m_offset, extent_full,  \
-                    extent_partial, rank)                              \
-  if (cond) {                                                          \
-    LOOP_LAYOUT_5(func, type, is_left, m_offset, extent_full, rank)    \
-  } else {                                                             \
-    LOOP_LAYOUT_5(func, type, is_left, m_offset, extent_partial, rank) \
+#define KOKKOS_IMPL_TILE_LOOP_5(func, type, is_left, cond, m_offset,         \
+                                extent_full, extent_partial, rank)           \
+  if (cond) {                                                                \
+    KOKKOS_IMPL_LOOP_LAYOUT_5(func, type, is_left, m_offset, extent_full,    \
+                              rank)                                          \
+  } else {                                                                   \
+    KOKKOS_IMPL_LOOP_LAYOUT_5(func, type, is_left, m_offset, extent_partial, \
+                              rank)                                          \
   }
 
-#define TILE_LOOP_6(func, type, is_left, cond, m_offset, extent_full,  \
-                    extent_partial, rank)                              \
-  if (cond) {                                                          \
-    LOOP_LAYOUT_6(func, type, is_left, m_offset, extent_full, rank)    \
-  } else {                                                             \
-    LOOP_LAYOUT_6(func, type, is_left, m_offset, extent_partial, rank) \
+#define KOKKOS_IMPL_TILE_LOOP_6(func, type, is_left, cond, m_offset,         \
+                                extent_full, extent_partial, rank)           \
+  if (cond) {                                                                \
+    KOKKOS_IMPL_LOOP_LAYOUT_6(func, type, is_left, m_offset, extent_full,    \
+                              rank)                                          \
+  } else {                                                                   \
+    KOKKOS_IMPL_LOOP_LAYOUT_6(func, type, is_left, m_offset, extent_partial, \
+                              rank)                                          \
   }
 
-#define TILE_LOOP_7(func, type, is_left, cond, m_offset, extent_full,  \
-                    extent_partial, rank)                              \
-  if (cond) {                                                          \
-    LOOP_LAYOUT_7(func, type, is_left, m_offset, extent_full, rank)    \
-  } else {                                                             \
-    LOOP_LAYOUT_7(func, type, is_left, m_offset, extent_partial, rank) \
+#define KOKKOS_IMPL_TILE_LOOP_7(func, type, is_left, cond, m_offset,         \
+                                extent_full, extent_partial, rank)           \
+  if (cond) {                                                                \
+    KOKKOS_IMPL_LOOP_LAYOUT_7(func, type, is_left, m_offset, extent_full,    \
+                              rank)                                          \
+  } else {                                                                   \
+    KOKKOS_IMPL_LOOP_LAYOUT_7(func, type, is_left, m_offset, extent_partial, \
+                              rank)                                          \
   }
 
-#define TILE_LOOP_8(func, type, is_left, cond, m_offset, extent_full,  \
-                    extent_partial, rank)                              \
-  if (cond) {                                                          \
-    LOOP_LAYOUT_8(func, type, is_left, m_offset, extent_full, rank)    \
-  } else {                                                             \
-    LOOP_LAYOUT_8(func, type, is_left, m_offset, extent_partial, rank) \
+#define KOKKOS_IMPL_TILE_LOOP_8(func, type, is_left, cond, m_offset,         \
+                                extent_full, extent_partial, rank)           \
+  if (cond) {                                                                \
+    KOKKOS_IMPL_LOOP_LAYOUT_8(func, type, is_left, m_offset, extent_full,    \
+                              rank)                                          \
+  } else {                                                                   \
+    KOKKOS_IMPL_LOOP_LAYOUT_8(func, type, is_left, m_offset, extent_partial, \
+                              rank)                                          \
   }
 
 // parallel_reduce, non-tagged
 // Reduction version
-#define APPLY_REDUX(val, func, ...) func(__VA_ARGS__, val);
+#define KOKKOS_IMPL_APPLY_REDUX(val, func, ...) func(__VA_ARGS__, val);
 
 // LayoutRight
 // d = 0 to start
-#define LOOP_R_1_REDUX(val, func, type, m_offset, extent, d, ...)    \
-  KOKKOS_ENABLE_IVDEP_MDRANGE                                        \
-  for (type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
-    APPLY_REDUX(val, func, __VA_ARGS__, i0 + m_offset[d])            \
+#define KOKKOS_IMPL_LOOP_R_1_REDUX(val, func, type, m_offset, extent, d, ...) \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                                                 \
+  for (type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) {          \
+    KOKKOS_IMPL_APPLY_REDUX(val, func, __VA_ARGS__, i0 + m_offset[d])         \
   }
 
-#define LOOP_R_2_REDUX(val, func, type, m_offset, extent, d, ...)         \
-  for (type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {      \
-    LOOP_R_1_REDUX(val, func, type, m_offset, extent, d + 1, __VA_ARGS__, \
-                   i1 + m_offset[d])                                      \
+#define KOKKOS_IMPL_LOOP_R_2_REDUX(val, func, type, m_offset, extent, d, ...) \
+  for (type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {          \
+    KOKKOS_IMPL_LOOP_R_1_REDUX(val, func, type, m_offset, extent, d + 1,      \
+                               __VA_ARGS__, i1 + m_offset[d])                 \
   }
 
-#define LOOP_R_3_REDUX(val, func, type, m_offset, extent, d, ...)         \
-  for (type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {      \
-    LOOP_R_2_REDUX(val, func, type, m_offset, extent, d + 1, __VA_ARGS__, \
-                   i2 + m_offset[d])                                      \
+#define KOKKOS_IMPL_LOOP_R_3_REDUX(val, func, type, m_offset, extent, d, ...) \
+  for (type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {          \
+    KOKKOS_IMPL_LOOP_R_2_REDUX(val, func, type, m_offset, extent, d + 1,      \
+                               __VA_ARGS__, i2 + m_offset[d])                 \
   }
 
-#define LOOP_R_4_REDUX(val, func, type, m_offset, extent, d, ...)         \
-  for (type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {      \
-    LOOP_R_3_REDUX(val, func, type, m_offset, extent, d + 1, __VA_ARGS__, \
-                   i3 + m_offset[d])                                      \
+#define KOKKOS_IMPL_LOOP_R_4_REDUX(val, func, type, m_offset, extent, d, ...) \
+  for (type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {          \
+    KOKKOS_IMPL_LOOP_R_3_REDUX(val, func, type, m_offset, extent, d + 1,      \
+                               __VA_ARGS__, i3 + m_offset[d])                 \
   }
 
-#define LOOP_R_5_REDUX(val, func, type, m_offset, extent, d, ...)         \
-  for (type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {      \
-    LOOP_R_4_REDUX(val, func, type, m_offset, extent, d + 1, __VA_ARGS__, \
-                   i4 + m_offset[d])                                      \
+#define KOKKOS_IMPL_LOOP_R_5_REDUX(val, func, type, m_offset, extent, d, ...) \
+  for (type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {          \
+    KOKKOS_IMPL_LOOP_R_4_REDUX(val, func, type, m_offset, extent, d + 1,      \
+                               __VA_ARGS__, i4 + m_offset[d])                 \
   }
 
-#define LOOP_R_6_REDUX(val, func, type, m_offset, extent, d, ...)         \
-  for (type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {      \
-    LOOP_R_5_REDUX(val, func, type, m_offset, extent, d + 1, __VA_ARGS__, \
-                   i5 + m_offset[d])                                      \
+#define KOKKOS_IMPL_LOOP_R_6_REDUX(val, func, type, m_offset, extent, d, ...) \
+  for (type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {          \
+    KOKKOS_IMPL_LOOP_R_5_REDUX(val, func, type, m_offset, extent, d + 1,      \
+                               __VA_ARGS__, i5 + m_offset[d])                 \
   }
 
-#define LOOP_R_7_REDUX(val, func, type, m_offset, extent, d, ...)         \
-  for (type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {      \
-    LOOP_R_6_REDUX(val, func, type, m_offset, extent, d + 1, __VA_ARGS__, \
-                   i6 + m_offset[d])                                      \
+#define KOKKOS_IMPL_LOOP_R_7_REDUX(val, func, type, m_offset, extent, d, ...) \
+  for (type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {          \
+    KOKKOS_IMPL_LOOP_R_6_REDUX(val, func, type, m_offset, extent, d + 1,      \
+                               __VA_ARGS__, i6 + m_offset[d])                 \
   }
 
-#define LOOP_R_8_REDUX(val, func, type, m_offset, extent, d, ...)         \
-  for (type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {      \
-    LOOP_R_7_REDUX(val, func, type, m_offset, extent, d + 1, __VA_ARGS__, \
-                   i7 + m_offset[d])                                      \
+#define KOKKOS_IMPL_LOOP_R_8_REDUX(val, func, type, m_offset, extent, d, ...) \
+  for (type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {          \
+    KOKKOS_IMPL_LOOP_R_7_REDUX(val, func, type, m_offset, extent, d + 1,      \
+                               __VA_ARGS__, i7 + m_offset[d])                 \
   }
 
 // LayoutLeft
 // d = rank-1 to start
-#define LOOP_L_1_REDUX(val, func, type, m_offset, extent, d, ...)    \
-  KOKKOS_ENABLE_IVDEP_MDRANGE                                        \
-  for (type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
-    APPLY_REDUX(val, func, i0 + m_offset[d], __VA_ARGS__)            \
+#define KOKKOS_IMPL_LOOP_L_1_REDUX(val, func, type, m_offset, extent, d, ...) \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                                                 \
+  for (type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) {          \
+    KOKKOS_IMPL_APPLY_REDUX(val, func, i0 + m_offset[d], __VA_ARGS__)         \
   }
 
-#define LOOP_L_2_REDUX(val, func, type, m_offset, extent, d, ...)              \
-  for (type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {           \
-    LOOP_L_1_REDUX(val, func, type, m_offset, extent, d - 1, i1 + m_offset[d], \
-                   __VA_ARGS__)                                                \
+#define KOKKOS_IMPL_LOOP_L_2_REDUX(val, func, type, m_offset, extent, d, ...) \
+  for (type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {          \
+    KOKKOS_IMPL_LOOP_L_1_REDUX(val, func, type, m_offset, extent, d - 1,      \
+                               i1 + m_offset[d], __VA_ARGS__)                 \
   }
 
-#define LOOP_L_3_REDUX(val, func, type, m_offset, extent, d, ...)              \
-  for (type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {           \
-    LOOP_L_2_REDUX(val, func, type, m_offset, extent, d - 1, i2 + m_offset[d], \
-                   __VA_ARGS__)                                                \
+#define KOKKOS_IMPL_LOOP_L_3_REDUX(val, func, type, m_offset, extent, d, ...) \
+  for (type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {          \
+    KOKKOS_IMPL_LOOP_L_2_REDUX(val, func, type, m_offset, extent, d - 1,      \
+                               i2 + m_offset[d], __VA_ARGS__)                 \
   }
 
-#define LOOP_L_4_REDUX(val, func, type, m_offset, extent, d, ...)              \
-  for (type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {           \
-    LOOP_L_3_REDUX(val, func, type, m_offset, extent, d - 1, i3 + m_offset[d], \
-                   __VA_ARGS__)                                                \
+#define KOKKOS_IMPL_LOOP_L_4_REDUX(val, func, type, m_offset, extent, d, ...) \
+  for (type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {          \
+    KOKKOS_IMPL_LOOP_L_3_REDUX(val, func, type, m_offset, extent, d - 1,      \
+                               i3 + m_offset[d], __VA_ARGS__)                 \
   }
 
-#define LOOP_L_5_REDUX(val, func, type, m_offset, extent, d, ...)              \
-  for (type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {           \
-    LOOP_L_4_REDUX(val, func, type, m_offset, extent, d - 1, i4 + m_offset[d], \
-                   __VA_ARGS__)                                                \
+#define KOKKOS_IMPL_LOOP_L_5_REDUX(val, func, type, m_offset, extent, d, ...) \
+  for (type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {          \
+    KOKKOS_IMPL_LOOP_L_4_REDUX(val, func, type, m_offset, extent, d - 1,      \
+                               i4 + m_offset[d], __VA_ARGS__)                 \
   }
 
-#define LOOP_L_6_REDUX(val, func, type, m_offset, extent, d, ...)              \
-  for (type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {           \
-    LOOP_L_5_REDUX(val, func, type, m_offset, extent, d - 1, i5 + m_offset[d], \
-                   __VA_ARGS__)                                                \
+#define KOKKOS_IMPL_LOOP_L_6_REDUX(val, func, type, m_offset, extent, d, ...) \
+  for (type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {          \
+    KOKKOS_IMPL_LOOP_L_5_REDUX(val, func, type, m_offset, extent, d - 1,      \
+                               i5 + m_offset[d], __VA_ARGS__)                 \
   }
 
-#define LOOP_L_7_REDUX(val, func, type, m_offset, extent, d, ...)              \
-  for (type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {           \
-    LOOP_L_6_REDUX(val, func, type, m_offset, extent, d - 1, i6 + m_offset[d], \
-                   __VA_ARGS__)                                                \
+#define KOKKOS_IMPL_LOOP_L_7_REDUX(val, func, type, m_offset, extent, d, ...) \
+  for (type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {          \
+    KOKKOS_IMPL_LOOP_L_6_REDUX(val, func, type, m_offset, extent, d - 1,      \
+                               i6 + m_offset[d], __VA_ARGS__)                 \
   }
 
-#define LOOP_L_8_REDUX(val, func, type, m_offset, extent, d, ...)              \
-  for (type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {           \
-    LOOP_L_7_REDUX(val, func, type, m_offset, extent, d - 1, i7 + m_offset[d], \
-                   __VA_ARGS__)                                                \
+#define KOKKOS_IMPL_LOOP_L_8_REDUX(val, func, type, m_offset, extent, d, ...) \
+  for (type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {          \
+    KOKKOS_IMPL_LOOP_L_7_REDUX(val, func, type, m_offset, extent, d - 1,      \
+                               i7 + m_offset[d], __VA_ARGS__)                 \
   }
 
 // Left vs Right
-#define LOOP_LAYOUT_1_REDUX(val, func, type, is_left, m_offset, extent, rank) \
-  KOKKOS_ENABLE_IVDEP_MDRANGE                                                 \
-  for (type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) {          \
-    APPLY_REDUX(val, func, i0 + m_offset[0])                                  \
+#define KOKKOS_IMPL_LOOP_LAYOUT_1_REDUX(val, func, type, is_left, m_offset, \
+                                        extent, rank)                       \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                                               \
+  for (type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) {        \
+    KOKKOS_IMPL_APPLY_REDUX(val, func, i0 + m_offset[0])                    \
   }
 
-#define LOOP_LAYOUT_2_REDUX(val, func, type, is_left, m_offset, extent, rank) \
+#define KOKKOS_IMPL_LOOP_LAYOUT_2_REDUX(val, func, type, is_left, m_offset,   \
+                                        extent, rank)                         \
   if (is_left) {                                                              \
     for (type i1 = (type)0; i1 < static_cast<type>(extent[rank - 1]); ++i1) { \
-      LOOP_L_1_REDUX(val, func, type, m_offset, extent, rank - 2,             \
-                     i1 + m_offset[rank - 1])                                 \
+      KOKKOS_IMPL_LOOP_L_1_REDUX(val, func, type, m_offset, extent, rank - 2, \
+                                 i1 + m_offset[rank - 1])                     \
     }                                                                         \
   } else {                                                                    \
     for (type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) {        \
-      LOOP_R_1_REDUX(val, func, type, m_offset, extent, 1, i1 + m_offset[0])  \
+      KOKKOS_IMPL_LOOP_R_1_REDUX(val, func, type, m_offset, extent, 1,        \
+                                 i1 + m_offset[0])                            \
     }                                                                         \
   }
 
-#define LOOP_LAYOUT_3_REDUX(val, func, type, is_left, m_offset, extent, rank) \
+#define KOKKOS_IMPL_LOOP_LAYOUT_3_REDUX(val, func, type, is_left, m_offset,   \
+                                        extent, rank)                         \
   if (is_left) {                                                              \
     for (type i2 = (type)0; i2 < static_cast<type>(extent[rank - 1]); ++i2) { \
-      LOOP_L_2_REDUX(val, func, type, m_offset, extent, rank - 2,             \
-                     i2 + m_offset[rank - 1])                                 \
+      KOKKOS_IMPL_LOOP_L_2_REDUX(val, func, type, m_offset, extent, rank - 2, \
+                                 i2 + m_offset[rank - 1])                     \
     }                                                                         \
   } else {                                                                    \
     for (type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) {        \
-      LOOP_R_2_REDUX(val, func, type, m_offset, extent, 1, i2 + m_offset[0])  \
+      KOKKOS_IMPL_LOOP_R_2_REDUX(val, func, type, m_offset, extent, 1,        \
+                                 i2 + m_offset[0])                            \
     }                                                                         \
   }
 
-#define LOOP_LAYOUT_4_REDUX(val, func, type, is_left, m_offset, extent, rank) \
+#define KOKKOS_IMPL_LOOP_LAYOUT_4_REDUX(val, func, type, is_left, m_offset,   \
+                                        extent, rank)                         \
   if (is_left) {                                                              \
     for (type i3 = (type)0; i3 < static_cast<type>(extent[rank - 1]); ++i3) { \
-      LOOP_L_3_REDUX(val, func, type, m_offset, extent, rank - 2,             \
-                     i3 + m_offset[rank - 1])                                 \
+      KOKKOS_IMPL_LOOP_L_3_REDUX(val, func, type, m_offset, extent, rank - 2, \
+                                 i3 + m_offset[rank - 1])                     \
     }                                                                         \
   } else {                                                                    \
     for (type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) {        \
-      LOOP_R_3_REDUX(val, func, type, m_offset, extent, 1, i3 + m_offset[0])  \
+      KOKKOS_IMPL_LOOP_R_3_REDUX(val, func, type, m_offset, extent, 1,        \
+                                 i3 + m_offset[0])                            \
     }                                                                         \
   }
 
-#define LOOP_LAYOUT_5_REDUX(val, func, type, is_left, m_offset, extent, rank) \
+#define KOKKOS_IMPL_LOOP_LAYOUT_5_REDUX(val, func, type, is_left, m_offset,   \
+                                        extent, rank)                         \
   if (is_left) {                                                              \
     for (type i4 = (type)0; i4 < static_cast<type>(extent[rank - 1]); ++i4) { \
-      LOOP_L_4_REDUX(val, func, type, m_offset, extent, rank - 2,             \
-                     i4 + m_offset[rank - 1])                                 \
+      KOKKOS_IMPL_LOOP_L_4_REDUX(val, func, type, m_offset, extent, rank - 2, \
+                                 i4 + m_offset[rank - 1])                     \
     }                                                                         \
   } else {                                                                    \
     for (type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) {        \
-      LOOP_R_4_REDUX(val, func, type, m_offset, extent, 1, i4 + m_offset[0])  \
+      KOKKOS_IMPL_LOOP_R_4_REDUX(val, func, type, m_offset, extent, 1,        \
+                                 i4 + m_offset[0])                            \
     }                                                                         \
   }
 
-#define LOOP_LAYOUT_6_REDUX(val, func, type, is_left, m_offset, extent, rank) \
+#define KOKKOS_IMPL_LOOP_LAYOUT_6_REDUX(val, func, type, is_left, m_offset,   \
+                                        extent, rank)                         \
   if (is_left) {                                                              \
     for (type i5 = (type)0; i5 < static_cast<type>(extent[rank - 1]); ++i5) { \
-      LOOP_L_5_REDUX(val, func, type, m_offset, extent, rank - 2,             \
-                     i5 + m_offset[rank - 1])                                 \
+      KOKKOS_IMPL_LOOP_L_5_REDUX(val, func, type, m_offset, extent, rank - 2, \
+                                 i5 + m_offset[rank - 1])                     \
     }                                                                         \
   } else {                                                                    \
     for (type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) {        \
-      LOOP_R_5_REDUX(val, func, type, m_offset, extent, 1, i5 + m_offset[0])  \
+      KOKKOS_IMPL_LOOP_R_5_REDUX(val, func, type, m_offset, extent, 1,        \
+                                 i5 + m_offset[0])                            \
     }                                                                         \
   }
 
-#define LOOP_LAYOUT_7_REDUX(val, func, type, is_left, m_offset, extent, rank) \
+#define KOKKOS_IMPL_LOOP_LAYOUT_7_REDUX(val, func, type, is_left, m_offset,   \
+                                        extent, rank)                         \
   if (is_left) {                                                              \
     for (type i6 = (type)0; i6 < static_cast<type>(extent[rank - 1]); ++i6) { \
-      LOOP_L_6_REDUX(val, func, type, m_offset, extent, rank - 2,             \
-                     i6 + m_offset[rank - 1])                                 \
+      KOKKOS_IMPL_LOOP_L_6_REDUX(val, func, type, m_offset, extent, rank - 2, \
+                                 i6 + m_offset[rank - 1])                     \
     }                                                                         \
   } else {                                                                    \
     for (type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) {        \
-      LOOP_R_6_REDUX(val, func, type, m_offset, extent, 1, i6 + m_offset[0])  \
+      KOKKOS_IMPL_LOOP_R_6_REDUX(val, func, type, m_offset, extent, 1,        \
+                                 i6 + m_offset[0])                            \
     }                                                                         \
   }
 
-#define LOOP_LAYOUT_8_REDUX(val, func, type, is_left, m_offset, extent, rank) \
+#define KOKKOS_IMPL_LOOP_LAYOUT_8_REDUX(val, func, type, is_left, m_offset,   \
+                                        extent, rank)                         \
   if (is_left) {                                                              \
     for (type i7 = (type)0; i7 < static_cast<type>(extent[rank - 1]); ++i7) { \
-      LOOP_L_7_REDUX(val, func, type, m_offset, extent, rank - 2,             \
-                     i7 + m_offset[rank - 1])                                 \
+      KOKKOS_IMPL_LOOP_L_7_REDUX(val, func, type, m_offset, extent, rank - 2, \
+                                 i7 + m_offset[rank - 1])                     \
     }                                                                         \
   } else {                                                                    \
     for (type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) {        \
-      LOOP_R_7_REDUX(val, func, type, m_offset, extent, 1, i7 + m_offset[0])  \
+      KOKKOS_IMPL_LOOP_R_7_REDUX(val, func, type, m_offset, extent, 1,        \
+                                 i7 + m_offset[0])                            \
     }                                                                         \
   }
 
 // Partial vs Full Tile
-#define TILE_LOOP_1_REDUX(val, func, type, is_left, cond, m_offset,            \
-                          extent_full, extent_partial, rank)                   \
-  if (cond) {                                                                  \
-    LOOP_LAYOUT_1_REDUX(val, func, type, is_left, m_offset, extent_full, rank) \
-  } else {                                                                     \
-    LOOP_LAYOUT_1_REDUX(val, func, type, is_left, m_offset, extent_partial,    \
-                        rank)                                                  \
+#define KOKKOS_IMPL_TILE_LOOP_1_REDUX(val, func, type, is_left, cond,        \
+                                      m_offset, extent_full, extent_partial, \
+                                      rank)                                  \
+  if (cond) {                                                                \
+    KOKKOS_IMPL_LOOP_LAYOUT_1_REDUX(val, func, type, is_left, m_offset,      \
+                                    extent_full, rank)                       \
+  } else {                                                                   \
+    KOKKOS_IMPL_LOOP_LAYOUT_1_REDUX(val, func, type, is_left, m_offset,      \
+                                    extent_partial, rank)                    \
   }
 
-#define TILE_LOOP_2_REDUX(val, func, type, is_left, cond, m_offset,            \
-                          extent_full, extent_partial, rank)                   \
-  if (cond) {                                                                  \
-    LOOP_LAYOUT_2_REDUX(val, func, type, is_left, m_offset, extent_full, rank) \
-  } else {                                                                     \
-    LOOP_LAYOUT_2_REDUX(val, func, type, is_left, m_offset, extent_partial,    \
-                        rank)                                                  \
+#define KOKKOS_IMPL_TILE_LOOP_2_REDUX(val, func, type, is_left, cond,        \
+                                      m_offset, extent_full, extent_partial, \
+                                      rank)                                  \
+  if (cond) {                                                                \
+    KOKKOS_IMPL_LOOP_LAYOUT_2_REDUX(val, func, type, is_left, m_offset,      \
+                                    extent_full, rank)                       \
+  } else {                                                                   \
+    KOKKOS_IMPL_LOOP_LAYOUT_2_REDUX(val, func, type, is_left, m_offset,      \
+                                    extent_partial, rank)                    \
   }
 
-#define TILE_LOOP_3_REDUX(val, func, type, is_left, cond, m_offset,            \
-                          extent_full, extent_partial, rank)                   \
-  if (cond) {                                                                  \
-    LOOP_LAYOUT_3_REDUX(val, func, type, is_left, m_offset, extent_full, rank) \
-  } else {                                                                     \
-    LOOP_LAYOUT_3_REDUX(val, func, type, is_left, m_offset, extent_partial,    \
-                        rank)                                                  \
+#define KOKKOS_IMPL_TILE_LOOP_3_REDUX(val, func, type, is_left, cond,        \
+                                      m_offset, extent_full, extent_partial, \
+                                      rank)                                  \
+  if (cond) {                                                                \
+    KOKKOS_IMPL_LOOP_LAYOUT_3_REDUX(val, func, type, is_left, m_offset,      \
+                                    extent_full, rank)                       \
+  } else {                                                                   \
+    KOKKOS_IMPL_LOOP_LAYOUT_3_REDUX(val, func, type, is_left, m_offset,      \
+                                    extent_partial, rank)                    \
   }
 
-#define TILE_LOOP_4_REDUX(val, func, type, is_left, cond, m_offset,            \
-                          extent_full, extent_partial, rank)                   \
-  if (cond) {                                                                  \
-    LOOP_LAYOUT_4_REDUX(val, func, type, is_left, m_offset, extent_full, rank) \
-  } else {                                                                     \
-    LOOP_LAYOUT_4_REDUX(val, func, type, is_left, m_offset, extent_partial,    \
-                        rank)                                                  \
+#define KOKKOS_IMPL_TILE_LOOP_4_REDUX(val, func, type, is_left, cond,        \
+                                      m_offset, extent_full, extent_partial, \
+                                      rank)                                  \
+  if (cond) {                                                                \
+    KOKKOS_IMPL_LOOP_LAYOUT_4_REDUX(val, func, type, is_left, m_offset,      \
+                                    extent_full, rank)                       \
+  } else {                                                                   \
+    KOKKOS_IMPL_LOOP_LAYOUT_4_REDUX(val, func, type, is_left, m_offset,      \
+                                    extent_partial, rank)                    \
   }
 
-#define TILE_LOOP_5_REDUX(val, func, type, is_left, cond, m_offset,            \
-                          extent_full, extent_partial, rank)                   \
-  if (cond) {                                                                  \
-    LOOP_LAYOUT_5_REDUX(val, func, type, is_left, m_offset, extent_full, rank) \
-  } else {                                                                     \
-    LOOP_LAYOUT_5_REDUX(val, func, type, is_left, m_offset, extent_partial,    \
-                        rank)                                                  \
+#define KOKKOS_IMPL_TILE_LOOP_5_REDUX(val, func, type, is_left, cond,        \
+                                      m_offset, extent_full, extent_partial, \
+                                      rank)                                  \
+  if (cond) {                                                                \
+    KOKKOS_IMPL_LOOP_LAYOUT_5_REDUX(val, func, type, is_left, m_offset,      \
+                                    extent_full, rank)                       \
+  } else {                                                                   \
+    KOKKOS_IMPL_LOOP_LAYOUT_5_REDUX(val, func, type, is_left, m_offset,      \
+                                    extent_partial, rank)                    \
   }
 
-#define TILE_LOOP_6_REDUX(val, func, type, is_left, cond, m_offset,            \
-                          extent_full, extent_partial, rank)                   \
-  if (cond) {                                                                  \
-    LOOP_LAYOUT_6_REDUX(val, func, type, is_left, m_offset, extent_full, rank) \
-  } else {                                                                     \
-    LOOP_LAYOUT_6_REDUX(val, func, type, is_left, m_offset, extent_partial,    \
-                        rank)                                                  \
+#define KOKKOS_IMPL_TILE_LOOP_6_REDUX(val, func, type, is_left, cond,        \
+                                      m_offset, extent_full, extent_partial, \
+                                      rank)                                  \
+  if (cond) {                                                                \
+    KOKKOS_IMPL_LOOP_LAYOUT_6_REDUX(val, func, type, is_left, m_offset,      \
+                                    extent_full, rank)                       \
+  } else {                                                                   \
+    KOKKOS_IMPL_LOOP_LAYOUT_6_REDUX(val, func, type, is_left, m_offset,      \
+                                    extent_partial, rank)                    \
   }
 
-#define TILE_LOOP_7_REDUX(val, func, type, is_left, cond, m_offset,            \
-                          extent_full, extent_partial, rank)                   \
-  if (cond) {                                                                  \
-    LOOP_LAYOUT_7_REDUX(val, func, type, is_left, m_offset, extent_full, rank) \
-  } else {                                                                     \
-    LOOP_LAYOUT_7_REDUX(val, func, type, is_left, m_offset, extent_partial,    \
-                        rank)                                                  \
+#define KOKKOS_IMPL_TILE_LOOP_7_REDUX(val, func, type, is_left, cond,        \
+                                      m_offset, extent_full, extent_partial, \
+                                      rank)                                  \
+  if (cond) {                                                                \
+    KOKKOS_IMPL_LOOP_LAYOUT_7_REDUX(val, func, type, is_left, m_offset,      \
+                                    extent_full, rank)                       \
+  } else {                                                                   \
+    KOKKOS_IMPL_LOOP_LAYOUT_7_REDUX(val, func, type, is_left, m_offset,      \
+                                    extent_partial, rank)                    \
   }
 
-#define TILE_LOOP_8_REDUX(val, func, type, is_left, cond, m_offset,            \
-                          extent_full, extent_partial, rank)                   \
-  if (cond) {                                                                  \
-    LOOP_LAYOUT_8_REDUX(val, func, type, is_left, m_offset, extent_full, rank) \
-  } else {                                                                     \
-    LOOP_LAYOUT_8_REDUX(val, func, type, is_left, m_offset, extent_partial,    \
-                        rank)                                                  \
+#define KOKKOS_IMPL_TILE_LOOP_8_REDUX(val, func, type, is_left, cond,        \
+                                      m_offset, extent_full, extent_partial, \
+                                      rank)                                  \
+  if (cond) {                                                                \
+    KOKKOS_IMPL_LOOP_LAYOUT_8_REDUX(val, func, type, is_left, m_offset,      \
+                                    extent_full, rank)                       \
+  } else {                                                                   \
+    KOKKOS_IMPL_LOOP_LAYOUT_8_REDUX(val, func, type, is_left, m_offset,      \
+                                    extent_partial, rank)                    \
   }
 // end New Loop Macros
 
 // tagged macros
-#define TAGGED_APPLY(tag, func, ...) func(tag, __VA_ARGS__);
+#define KOKKOS_IMPL_TAGGED_APPLY(tag, func, ...) func(tag, __VA_ARGS__);
 
 // LayoutRight
 // d = 0 to start
-#define TAGGED_LOOP_R_1(tag, func, type, m_offset, extent, d, ...)   \
-  KOKKOS_ENABLE_IVDEP_MDRANGE                                        \
-  for (type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
-    TAGGED_APPLY(tag, func, __VA_ARGS__, i0 + m_offset[d])           \
+#define KOKKOS_IMPL_TAGGED_LOOP_R_1(tag, func, type, m_offset, extent, d, ...) \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                                                  \
+  for (type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) {           \
+    KOKKOS_IMPL_TAGGED_APPLY(tag, func, __VA_ARGS__, i0 + m_offset[d])         \
   }
 
-#define TAGGED_LOOP_R_2(tag, func, type, m_offset, extent, d, ...)         \
-  for (type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {       \
-    TAGGED_LOOP_R_1(tag, func, type, m_offset, extent, d + 1, __VA_ARGS__, \
-                    i1 + m_offset[d])                                      \
+#define KOKKOS_IMPL_TAGGED_LOOP_R_2(tag, func, type, m_offset, extent, d, ...) \
+  for (type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {           \
+    KOKKOS_IMPL_TAGGED_LOOP_R_1(tag, func, type, m_offset, extent, d + 1,      \
+                                __VA_ARGS__, i1 + m_offset[d])                 \
   }
 
-#define TAGGED_LOOP_R_3(tag, func, type, m_offset, extent, d, ...)         \
-  for (type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {       \
-    TAGGED_LOOP_R_2(tag, func, type, m_offset, extent, d + 1, __VA_ARGS__, \
-                    i2 + m_offset[d])                                      \
+#define KOKKOS_IMPL_TAGGED_LOOP_R_3(tag, func, type, m_offset, extent, d, ...) \
+  for (type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {           \
+    KOKKOS_IMPL_TAGGED_LOOP_R_2(tag, func, type, m_offset, extent, d + 1,      \
+                                __VA_ARGS__, i2 + m_offset[d])                 \
   }
 
-#define TAGGED_LOOP_R_4(tag, func, type, m_offset, extent, d, ...)         \
-  for (type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {       \
-    TAGGED_LOOP_R_3(tag, func, type, m_offset, extent, d + 1, __VA_ARGS__, \
-                    i3 + m_offset[d])                                      \
+#define KOKKOS_IMPL_TAGGED_LOOP_R_4(tag, func, type, m_offset, extent, d, ...) \
+  for (type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {           \
+    KOKKOS_IMPL_TAGGED_LOOP_R_3(tag, func, type, m_offset, extent, d + 1,      \
+                                __VA_ARGS__, i3 + m_offset[d])                 \
   }
 
-#define TAGGED_LOOP_R_5(tag, func, type, m_offset, extent, d, ...)         \
-  for (type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {       \
-    TAGGED_LOOP_R_4(tag, func, type, m_offset, extent, d + 1, __VA_ARGS__, \
-                    i4 + m_offset[d])                                      \
+#define KOKKOS_IMPL_TAGGED_LOOP_R_5(tag, func, type, m_offset, extent, d, ...) \
+  for (type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {           \
+    KOKKOS_IMPL_TAGGED_LOOP_R_4(tag, func, type, m_offset, extent, d + 1,      \
+                                __VA_ARGS__, i4 + m_offset[d])                 \
   }
 
-#define TAGGED_LOOP_R_6(tag, func, type, m_offset, extent, d, ...)         \
-  for (type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {       \
-    TAGGED_LOOP_R_5(tag, func, type, m_offset, extent, d + 1, __VA_ARGS__, \
-                    i5 + m_offset[d])                                      \
+#define KOKKOS_IMPL_TAGGED_LOOP_R_6(tag, func, type, m_offset, extent, d, ...) \
+  for (type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {           \
+    KOKKOS_IMPL_TAGGED_LOOP_R_5(tag, func, type, m_offset, extent, d + 1,      \
+                                __VA_ARGS__, i5 + m_offset[d])                 \
   }
 
-#define TAGGED_LOOP_R_7(tag, func, type, m_offset, extent, d, ...)         \
-  for (type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {       \
-    TAGGED_LOOP_R_6(tag, func, type, m_offset, extent, d + 1, __VA_ARGS__, \
-                    i6 + m_offset[d])                                      \
+#define KOKKOS_IMPL_TAGGED_LOOP_R_7(tag, func, type, m_offset, extent, d, ...) \
+  for (type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {           \
+    KOKKOS_IMPL_TAGGED_LOOP_R_6(tag, func, type, m_offset, extent, d + 1,      \
+                                __VA_ARGS__, i6 + m_offset[d])                 \
   }
 
-#define TAGGED_LOOP_R_8(tag, func, type, m_offset, extent, d, ...)         \
-  for (type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {       \
-    TAGGED_LOOP_R_7(tag, func, type, m_offset, extent, d + 1, __VA_ARGS__, \
-                    i7 + m_offset[d])                                      \
+#define KOKKOS_IMPL_TAGGED_LOOP_R_8(tag, func, type, m_offset, extent, d, ...) \
+  for (type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {           \
+    KOKKOS_IMPL_TAGGED_LOOP_R_7(tag, func, type, m_offset, extent, d + 1,      \
+                                __VA_ARGS__, i7 + m_offset[d])                 \
   }
 
 // LayoutLeft
 // d = rank-1 to start
-#define TAGGED_LOOP_L_1(tag, func, type, m_offset, extent, d, ...)   \
-  KOKKOS_ENABLE_IVDEP_MDRANGE                                        \
-  for (type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
-    TAGGED_APPLY(tag, func, i0 + m_offset[d], __VA_ARGS__)           \
+#define KOKKOS_IMPL_TAGGED_LOOP_L_1(tag, func, type, m_offset, extent, d, ...) \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                                                  \
+  for (type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) {           \
+    KOKKOS_IMPL_TAGGED_APPLY(tag, func, i0 + m_offset[d], __VA_ARGS__)         \
   }
 
-#define TAGGED_LOOP_L_2(tag, func, type, m_offset, extent, d, ...)   \
-  for (type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) { \
-    TAGGED_LOOP_L_1(tag, func, type, m_offset, extent, d - 1,        \
-                    i1 + m_offset[d], __VA_ARGS__)                   \
+#define KOKKOS_IMPL_TAGGED_LOOP_L_2(tag, func, type, m_offset, extent, d, ...) \
+  for (type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {           \
+    KOKKOS_IMPL_TAGGED_LOOP_L_1(tag, func, type, m_offset, extent, d - 1,      \
+                                i1 + m_offset[d], __VA_ARGS__)                 \
   }
 
-#define TAGGED_LOOP_L_3(tag, func, type, m_offset, extent, d, ...)   \
-  for (type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) { \
-    TAGGED_LOOP_L_2(tag, func, type, m_offset, extent, d - 1,        \
-                    i2 + m_offset[d], __VA_ARGS__)                   \
+#define KOKKOS_IMPL_TAGGED_LOOP_L_3(tag, func, type, m_offset, extent, d, ...) \
+  for (type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {           \
+    KOKKOS_IMPL_TAGGED_LOOP_L_2(tag, func, type, m_offset, extent, d - 1,      \
+                                i2 + m_offset[d], __VA_ARGS__)                 \
   }
 
-#define TAGGED_LOOP_L_4(tag, func, type, m_offset, extent, d, ...)   \
-  for (type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) { \
-    TAGGED_LOOP_L_3(tag, func, type, m_offset, extent, d - 1,        \
-                    i3 + m_offset[d], __VA_ARGS__)                   \
+#define KOKKOS_IMPL_TAGGED_LOOP_L_4(tag, func, type, m_offset, extent, d, ...) \
+  for (type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {           \
+    KOKKOS_IMPL_TAGGED_LOOP_L_3(tag, func, type, m_offset, extent, d - 1,      \
+                                i3 + m_offset[d], __VA_ARGS__)                 \
   }
 
-#define TAGGED_LOOP_L_5(tag, func, type, m_offset, extent, d, ...)   \
-  for (type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) { \
-    TAGGED_LOOP_L_4(tag, func, type, m_offset, extent, d - 1,        \
-                    i4 + m_offset[d], __VA_ARGS__)                   \
+#define KOKKOS_IMPL_TAGGED_LOOP_L_5(tag, func, type, m_offset, extent, d, ...) \
+  for (type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {           \
+    KOKKOS_IMPL_TAGGED_LOOP_L_4(tag, func, type, m_offset, extent, d - 1,      \
+                                i4 + m_offset[d], __VA_ARGS__)                 \
   }
 
-#define TAGGED_LOOP_L_6(tag, func, type, m_offset, extent, d, ...)   \
-  for (type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) { \
-    TAGGED_LOOP_L_5(tag, func, type, m_offset, extent, d - 1,        \
-                    i5 + m_offset[d], __VA_ARGS__)                   \
+#define KOKKOS_IMPL_TAGGED_LOOP_L_6(tag, func, type, m_offset, extent, d, ...) \
+  for (type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {           \
+    KOKKOS_IMPL_TAGGED_LOOP_L_5(tag, func, type, m_offset, extent, d - 1,      \
+                                i5 + m_offset[d], __VA_ARGS__)                 \
   }
 
-#define TAGGED_LOOP_L_7(tag, func, type, m_offset, extent, d, ...)   \
-  for (type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) { \
-    TAGGED_LOOP_L_6(tag, func, type, m_offset, extent, d - 1,        \
-                    i6 + m_offset[d], __VA_ARGS__)                   \
+#define KOKKOS_IMPL_TAGGED_LOOP_L_7(tag, func, type, m_offset, extent, d, ...) \
+  for (type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {           \
+    KOKKOS_IMPL_TAGGED_LOOP_L_6(tag, func, type, m_offset, extent, d - 1,      \
+                                i6 + m_offset[d], __VA_ARGS__)                 \
   }
 
-#define TAGGED_LOOP_L_8(tag, func, type, m_offset, extent, d, ...)   \
-  for (type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) { \
-    TAGGED_LOOP_L_7(tag, func, type, m_offset, extent, d - 1,        \
-                    i7 + m_offset[d], __VA_ARGS__)                   \
+#define KOKKOS_IMPL_TAGGED_LOOP_L_8(tag, func, type, m_offset, extent, d, ...) \
+  for (type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {           \
+    KOKKOS_IMPL_TAGGED_LOOP_L_7(tag, func, type, m_offset, extent, d - 1,      \
+                                i7 + m_offset[d], __VA_ARGS__)                 \
   }
 
 // Left vs Right
 // TODO: rank not necessary to pass through, can hardcode the values
-#define TAGGED_LOOP_LAYOUT_1(tag, func, type, is_left, m_offset, extent, rank) \
-  KOKKOS_ENABLE_IVDEP_MDRANGE                                                  \
-  for (type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) {           \
-    TAGGED_APPLY(tag, func, i0 + m_offset[0])                                  \
+#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_1(tag, func, type, is_left, m_offset, \
+                                         extent, rank)                       \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                                                \
+  for (type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) {         \
+    KOKKOS_IMPL_TAGGED_APPLY(tag, func, i0 + m_offset[0])                    \
   }
 
-#define TAGGED_LOOP_LAYOUT_2(tag, func, type, is_left, m_offset, extent, rank) \
+#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_2(tag, func, type, is_left, m_offset,   \
+                                         extent, rank)                         \
   if (is_left) {                                                               \
     for (type i1 = (type)0; i1 < static_cast<type>(extent[rank - 1]); ++i1) {  \
-      TAGGED_LOOP_L_1(tag, func, type, m_offset, extent, rank - 2,             \
-                      i1 + m_offset[rank - 1])                                 \
+      KOKKOS_IMPL_TAGGED_LOOP_L_1(tag, func, type, m_offset, extent, rank - 2, \
+                                  i1 + m_offset[rank - 1])                     \
     }                                                                          \
   } else {                                                                     \
     for (type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) {         \
-      TAGGED_LOOP_R_1(tag, func, type, m_offset, extent, 1, i1 + m_offset[0])  \
+      KOKKOS_IMPL_TAGGED_LOOP_R_1(tag, func, type, m_offset, extent, 1,        \
+                                  i1 + m_offset[0])                            \
     }                                                                          \
   }
 
-#define TAGGED_LOOP_LAYOUT_3(tag, func, type, is_left, m_offset, extent, rank) \
+#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_3(tag, func, type, is_left, m_offset,   \
+                                         extent, rank)                         \
   if (is_left) {                                                               \
     for (type i2 = (type)0; i2 < static_cast<type>(extent[rank - 1]); ++i2) {  \
-      TAGGED_LOOP_L_2(tag, func, type, m_offset, extent, rank - 2,             \
-                      i2 + m_offset[rank - 1])                                 \
+      KOKKOS_IMPL_TAGGED_LOOP_L_2(tag, func, type, m_offset, extent, rank - 2, \
+                                  i2 + m_offset[rank - 1])                     \
     }                                                                          \
   } else {                                                                     \
     for (type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) {         \
-      TAGGED_LOOP_R_2(tag, func, type, m_offset, extent, 1, i2 + m_offset[0])  \
+      KOKKOS_IMPL_TAGGED_LOOP_R_2(tag, func, type, m_offset, extent, 1,        \
+                                  i2 + m_offset[0])                            \
     }                                                                          \
   }
 
-#define TAGGED_LOOP_LAYOUT_4(tag, func, type, is_left, m_offset, extent, rank) \
+#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_4(tag, func, type, is_left, m_offset,   \
+                                         extent, rank)                         \
   if (is_left) {                                                               \
     for (type i3 = (type)0; i3 < static_cast<type>(extent[rank - 1]); ++i3) {  \
-      TAGGED_LOOP_L_3(tag, func, type, m_offset, extent, rank - 2,             \
-                      i3 + m_offset[rank - 1])                                 \
+      KOKKOS_IMPL_TAGGED_LOOP_L_3(tag, func, type, m_offset, extent, rank - 2, \
+                                  i3 + m_offset[rank - 1])                     \
     }                                                                          \
   } else {                                                                     \
     for (type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) {         \
-      TAGGED_LOOP_R_3(tag, func, type, m_offset, extent, 1, i3 + m_offset[0])  \
+      KOKKOS_IMPL_TAGGED_LOOP_R_3(tag, func, type, m_offset, extent, 1,        \
+                                  i3 + m_offset[0])                            \
     }                                                                          \
   }
 
-#define TAGGED_LOOP_LAYOUT_5(tag, func, type, is_left, m_offset, extent, rank) \
+#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_5(tag, func, type, is_left, m_offset,   \
+                                         extent, rank)                         \
   if (is_left) {                                                               \
     for (type i4 = (type)0; i4 < static_cast<type>(extent[rank - 1]); ++i4) {  \
-      TAGGED_LOOP_L_4(tag, func, type, m_offset, extent, rank - 2,             \
-                      i4 + m_offset[rank - 1])                                 \
+      KOKKOS_IMPL_TAGGED_LOOP_L_4(tag, func, type, m_offset, extent, rank - 2, \
+                                  i4 + m_offset[rank - 1])                     \
     }                                                                          \
   } else {                                                                     \
     for (type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) {         \
-      TAGGED_LOOP_R_4(tag, func, type, m_offset, extent, 1, i4 + m_offset[0])  \
+      KOKKOS_IMPL_TAGGED_LOOP_R_4(tag, func, type, m_offset, extent, 1,        \
+                                  i4 + m_offset[0])                            \
     }                                                                          \
   }
 
-#define TAGGED_LOOP_LAYOUT_6(tag, func, type, is_left, m_offset, extent, rank) \
+#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_6(tag, func, type, is_left, m_offset,   \
+                                         extent, rank)                         \
   if (is_left) {                                                               \
     for (type i5 = (type)0; i5 < static_cast<type>(extent[rank - 1]); ++i5) {  \
-      TAGGED_LOOP_L_5(tag, func, type, m_offset, extent, rank - 2,             \
-                      i5 + m_offset[rank - 1])                                 \
+      KOKKOS_IMPL_TAGGED_LOOP_L_5(tag, func, type, m_offset, extent, rank - 2, \
+                                  i5 + m_offset[rank - 1])                     \
     }                                                                          \
   } else {                                                                     \
     for (type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) {         \
-      TAGGED_LOOP_R_5(tag, func, type, m_offset, extent, 1, i5 + m_offset[0])  \
+      KOKKOS_IMPL_TAGGED_LOOP_R_5(tag, func, type, m_offset, extent, 1,        \
+                                  i5 + m_offset[0])                            \
     }                                                                          \
   }
 
-#define TAGGED_LOOP_LAYOUT_7(tag, func, type, is_left, m_offset, extent, rank) \
+#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_7(tag, func, type, is_left, m_offset,   \
+                                         extent, rank)                         \
   if (is_left) {                                                               \
     for (type i6 = (type)0; i6 < static_cast<type>(extent[rank - 1]); ++i6) {  \
-      TAGGED_LOOP_L_6(tag, func, type, m_offset, extent, rank - 2,             \
-                      i6 + m_offset[rank - 1])                                 \
+      KOKKOS_IMPL_TAGGED_LOOP_L_6(tag, func, type, m_offset, extent, rank - 2, \
+                                  i6 + m_offset[rank - 1])                     \
     }                                                                          \
   } else {                                                                     \
     for (type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) {         \
-      TAGGED_LOOP_R_6(tag, func, type, m_offset, extent, 1, i6 + m_offset[0])  \
+      KOKKOS_IMPL_TAGGED_LOOP_R_6(tag, func, type, m_offset, extent, 1,        \
+                                  i6 + m_offset[0])                            \
     }                                                                          \
   }
 
-#define TAGGED_LOOP_LAYOUT_8(tag, func, type, is_left, m_offset, extent, rank) \
+#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_8(tag, func, type, is_left, m_offset,   \
+                                         extent, rank)                         \
   if (is_left) {                                                               \
     for (type i7 = (type)0; i7 < static_cast<type>(extent[rank - 1]); ++i7) {  \
-      TAGGED_LOOP_L_7(tag, func, type, m_offset, extent, rank - 2,             \
-                      i7 + m_offset[rank - 1])                                 \
+      KOKKOS_IMPL_TAGGED_LOOP_L_7(tag, func, type, m_offset, extent, rank - 2, \
+                                  i7 + m_offset[rank - 1])                     \
     }                                                                          \
   } else {                                                                     \
     for (type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) {         \
-      TAGGED_LOOP_R_7(tag, func, type, m_offset, extent, 1, i7 + m_offset[0])  \
+      KOKKOS_IMPL_TAGGED_LOOP_R_7(tag, func, type, m_offset, extent, 1,        \
+                                  i7 + m_offset[0])                            \
     }                                                                          \
   }
 
 // Partial vs Full Tile
-#define TAGGED_TILE_LOOP_1(tag, func, type, is_left, cond, m_offset,         \
-                           extent_full, extent_partial, rank)                \
-  if (cond) {                                                                \
-    TAGGED_LOOP_LAYOUT_1(tag, func, type, is_left, m_offset, extent_full,    \
-                         rank)                                               \
-  } else {                                                                   \
-    TAGGED_LOOP_LAYOUT_1(tag, func, type, is_left, m_offset, extent_partial, \
-                         rank)                                               \
+#define KOKKOS_IMPL_TAGGED_TILE_LOOP_1(tag, func, type, is_left, cond,        \
+                                       m_offset, extent_full, extent_partial, \
+                                       rank)                                  \
+  if (cond) {                                                                 \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_1(tag, func, type, is_left, m_offset,      \
+                                     extent_full, rank)                       \
+  } else {                                                                    \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_1(tag, func, type, is_left, m_offset,      \
+                                     extent_partial, rank)                    \
   }
 
-#define TAGGED_TILE_LOOP_2(tag, func, type, is_left, cond, m_offset,         \
-                           extent_full, extent_partial, rank)                \
-  if (cond) {                                                                \
-    TAGGED_LOOP_LAYOUT_2(tag, func, type, is_left, m_offset, extent_full,    \
-                         rank)                                               \
-  } else {                                                                   \
-    TAGGED_LOOP_LAYOUT_2(tag, func, type, is_left, m_offset, extent_partial, \
-                         rank)                                               \
+#define KOKKOS_IMPL_TAGGED_TILE_LOOP_2(tag, func, type, is_left, cond,        \
+                                       m_offset, extent_full, extent_partial, \
+                                       rank)                                  \
+  if (cond) {                                                                 \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_2(tag, func, type, is_left, m_offset,      \
+                                     extent_full, rank)                       \
+  } else {                                                                    \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_2(tag, func, type, is_left, m_offset,      \
+                                     extent_partial, rank)                    \
   }
 
-#define TAGGED_TILE_LOOP_3(tag, func, type, is_left, cond, m_offset,         \
-                           extent_full, extent_partial, rank)                \
-  if (cond) {                                                                \
-    TAGGED_LOOP_LAYOUT_3(tag, func, type, is_left, m_offset, extent_full,    \
-                         rank)                                               \
-  } else {                                                                   \
-    TAGGED_LOOP_LAYOUT_3(tag, func, type, is_left, m_offset, extent_partial, \
-                         rank)                                               \
+#define KOKKOS_IMPL_TAGGED_TILE_LOOP_3(tag, func, type, is_left, cond,        \
+                                       m_offset, extent_full, extent_partial, \
+                                       rank)                                  \
+  if (cond) {                                                                 \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_3(tag, func, type, is_left, m_offset,      \
+                                     extent_full, rank)                       \
+  } else {                                                                    \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_3(tag, func, type, is_left, m_offset,      \
+                                     extent_partial, rank)                    \
   }
 
-#define TAGGED_TILE_LOOP_4(tag, func, type, is_left, cond, m_offset,         \
-                           extent_full, extent_partial, rank)                \
-  if (cond) {                                                                \
-    TAGGED_LOOP_LAYOUT_4(tag, func, type, is_left, m_offset, extent_full,    \
-                         rank)                                               \
-  } else {                                                                   \
-    TAGGED_LOOP_LAYOUT_4(tag, func, type, is_left, m_offset, extent_partial, \
-                         rank)                                               \
+#define KOKKOS_IMPL_TAGGED_TILE_LOOP_4(tag, func, type, is_left, cond,        \
+                                       m_offset, extent_full, extent_partial, \
+                                       rank)                                  \
+  if (cond) {                                                                 \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_4(tag, func, type, is_left, m_offset,      \
+                                     extent_full, rank)                       \
+  } else {                                                                    \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_4(tag, func, type, is_left, m_offset,      \
+                                     extent_partial, rank)                    \
   }
 
-#define TAGGED_TILE_LOOP_5(tag, func, type, is_left, cond, m_offset,         \
-                           extent_full, extent_partial, rank)                \
-  if (cond) {                                                                \
-    TAGGED_LOOP_LAYOUT_5(tag, func, type, is_left, m_offset, extent_full,    \
-                         rank)                                               \
-  } else {                                                                   \
-    TAGGED_LOOP_LAYOUT_5(tag, func, type, is_left, m_offset, extent_partial, \
-                         rank)                                               \
+#define KOKKOS_IMPL_TAGGED_TILE_LOOP_5(tag, func, type, is_left, cond,        \
+                                       m_offset, extent_full, extent_partial, \
+                                       rank)                                  \
+  if (cond) {                                                                 \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_5(tag, func, type, is_left, m_offset,      \
+                                     extent_full, rank)                       \
+  } else {                                                                    \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_5(tag, func, type, is_left, m_offset,      \
+                                     extent_partial, rank)                    \
   }
 
-#define TAGGED_TILE_LOOP_6(tag, func, type, is_left, cond, m_offset,         \
-                           extent_full, extent_partial, rank)                \
-  if (cond) {                                                                \
-    TAGGED_LOOP_LAYOUT_6(tag, func, type, is_left, m_offset, extent_full,    \
-                         rank)                                               \
-  } else {                                                                   \
-    TAGGED_LOOP_LAYOUT_6(tag, func, type, is_left, m_offset, extent_partial, \
-                         rank)                                               \
+#define KOKKOS_IMPL_TAGGED_TILE_LOOP_6(tag, func, type, is_left, cond,        \
+                                       m_offset, extent_full, extent_partial, \
+                                       rank)                                  \
+  if (cond) {                                                                 \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_6(tag, func, type, is_left, m_offset,      \
+                                     extent_full, rank)                       \
+  } else {                                                                    \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_6(tag, func, type, is_left, m_offset,      \
+                                     extent_partial, rank)                    \
   }
 
-#define TAGGED_TILE_LOOP_7(tag, func, type, is_left, cond, m_offset,         \
-                           extent_full, extent_partial, rank)                \
-  if (cond) {                                                                \
-    TAGGED_LOOP_LAYOUT_7(tag, func, type, is_left, m_offset, extent_full,    \
-                         rank)                                               \
-  } else {                                                                   \
-    TAGGED_LOOP_LAYOUT_7(tag, func, type, is_left, m_offset, extent_partial, \
-                         rank)                                               \
+#define KOKKOS_IMPL_TAGGED_TILE_LOOP_7(tag, func, type, is_left, cond,        \
+                                       m_offset, extent_full, extent_partial, \
+                                       rank)                                  \
+  if (cond) {                                                                 \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_7(tag, func, type, is_left, m_offset,      \
+                                     extent_full, rank)                       \
+  } else {                                                                    \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_7(tag, func, type, is_left, m_offset,      \
+                                     extent_partial, rank)                    \
   }
 
-#define TAGGED_TILE_LOOP_8(tag, func, type, is_left, cond, m_offset,         \
-                           extent_full, extent_partial, rank)                \
-  if (cond) {                                                                \
-    TAGGED_LOOP_LAYOUT_8(tag, func, type, is_left, m_offset, extent_full,    \
-                         rank)                                               \
-  } else {                                                                   \
-    TAGGED_LOOP_LAYOUT_8(tag, func, type, is_left, m_offset, extent_partial, \
-                         rank)                                               \
+#define KOKKOS_IMPL_TAGGED_TILE_LOOP_8(tag, func, type, is_left, cond,        \
+                                       m_offset, extent_full, extent_partial, \
+                                       rank)                                  \
+  if (cond) {                                                                 \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_8(tag, func, type, is_left, m_offset,      \
+                                     extent_full, rank)                       \
+  } else {                                                                    \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_8(tag, func, type, is_left, m_offset,      \
+                                     extent_partial, rank)                    \
   }
 
 // parallel_reduce, tagged
 // Reduction version
-#define TAGGED_APPLY_REDUX(val, tag, func, ...) func(tag, __VA_ARGS__, val);
+#define KOKKOS_IMPL_TAGGED_APPLY_REDUX(val, tag, func, ...) \
+  func(tag, __VA_ARGS__, val);
 
 // LayoutRight
 // d = 0 to start
-#define TAGGED_LOOP_R_1_REDUX(val, tag, func, type, m_offset, extent, d, ...) \
-  KOKKOS_ENABLE_IVDEP_MDRANGE                                                 \
-  for (type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) {          \
-    TAGGED_APPLY_REDUX(val, tag, func, __VA_ARGS__, i0 + m_offset[d])         \
+#define KOKKOS_IMPL_TAGGED_LOOP_R_1_REDUX(val, tag, func, type, m_offset, \
+                                          extent, d, ...)                 \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                                             \
+  for (type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) {      \
+    KOKKOS_IMPL_TAGGED_APPLY_REDUX(val, tag, func, __VA_ARGS__,           \
+                                   i0 + m_offset[d])                      \
   }
 
-#define TAGGED_LOOP_R_2_REDUX(val, tag, func, type, m_offset, extent, d, ...) \
+#define KOKKOS_IMPL_TAGGED_LOOP_R_2_REDUX(val, tag, func, type, m_offset,     \
+                                          extent, d, ...)                     \
   for (type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {          \
-    TAGGED_LOOP_R_1_REDUX(val, tag, func, type, m_offset, extent, d + 1,      \
-                          __VA_ARGS__, i1 + m_offset[d])                      \
+    KOKKOS_IMPL_TAGGED_LOOP_R_1_REDUX(val, tag, func, type, m_offset, extent, \
+                                      d + 1, __VA_ARGS__, i1 + m_offset[d])   \
   }
 
-#define TAGGED_LOOP_R_3_REDUX(val, tag, func, type, m_offset, extent, d, ...) \
+#define KOKKOS_IMPL_TAGGED_LOOP_R_3_REDUX(val, tag, func, type, m_offset,     \
+                                          extent, d, ...)                     \
   for (type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {          \
-    TAGGED_LOOP_R_2_REDUX(val, tag, func, type, m_offset, extent, d + 1,      \
-                          __VA_ARGS__, i2 + m_offset[d])                      \
+    KOKKOS_IMPL_TAGGED_LOOP_R_2_REDUX(val, tag, func, type, m_offset, extent, \
+                                      d + 1, __VA_ARGS__, i2 + m_offset[d])   \
   }
 
-#define TAGGED_LOOP_R_4_REDUX(val, tag, func, type, m_offset, extent, d, ...) \
+#define KOKKOS_IMPL_TAGGED_LOOP_R_4_REDUX(val, tag, func, type, m_offset,     \
+                                          extent, d, ...)                     \
   for (type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {          \
-    TAGGED_LOOP_R_3_REDUX(val, tag, func, type, m_offset, extent, d + 1,      \
-                          __VA_ARGS__, i3 + m_offset[d])                      \
+    KOKKOS_IMPL_TAGGED_LOOP_R_3_REDUX(val, tag, func, type, m_offset, extent, \
+                                      d + 1, __VA_ARGS__, i3 + m_offset[d])   \
   }
 
-#define TAGGED_LOOP_R_5_REDUX(val, tag, func, type, m_offset, extent, d, ...) \
+#define KOKKOS_IMPL_TAGGED_LOOP_R_5_REDUX(val, tag, func, type, m_offset,     \
+                                          extent, d, ...)                     \
   for (type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {          \
-    TAGGED_LOOP_R_4_REDUX(val, tag, func, type, m_offset, extent, d + 1,      \
-                          __VA_ARGS__, i4 + m_offset[d])                      \
+    KOKKOS_IMPL_TAGGED_LOOP_R_4_REDUX(val, tag, func, type, m_offset, extent, \
+                                      d + 1, __VA_ARGS__, i4 + m_offset[d])   \
   }
 
-#define TAGGED_LOOP_R_6_REDUX(val, tag, func, type, m_offset, extent, d, ...) \
+#define KOKKOS_IMPL_TAGGED_LOOP_R_6_REDUX(val, tag, func, type, m_offset,     \
+                                          extent, d, ...)                     \
   for (type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {          \
-    TAGGED_LOOP_R_5_REDUX(val, tag, func, type, m_offset, extent, d + 1,      \
-                          __VA_ARGS__, i5 + m_offset[d])                      \
+    KOKKOS_IMPL_TAGGED_LOOP_R_5_REDUX(val, tag, func, type, m_offset, extent, \
+                                      d + 1, __VA_ARGS__, i5 + m_offset[d])   \
   }
 
-#define TAGGED_LOOP_R_7_REDUX(val, tag, func, type, m_offset, extent, d, ...) \
+#define KOKKOS_IMPL_TAGGED_LOOP_R_7_REDUX(val, tag, func, type, m_offset,     \
+                                          extent, d, ...)                     \
   for (type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {          \
-    TAGGED_LOOP_R_6_REDUX(val, tag, func, type, m_offset, extent, d + 1,      \
-                          __VA_ARGS__, i6 + m_offset[d])                      \
+    KOKKOS_IMPL_TAGGED_LOOP_R_6_REDUX(val, tag, func, type, m_offset, extent, \
+                                      d + 1, __VA_ARGS__, i6 + m_offset[d])   \
   }
 
-#define TAGGED_LOOP_R_8_REDUX(val, tag, func, type, m_offset, extent, d, ...) \
+#define KOKKOS_IMPL_TAGGED_LOOP_R_8_REDUX(val, tag, func, type, m_offset,     \
+                                          extent, d, ...)                     \
   for (type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {          \
-    TAGGED_LOOP_R_7_REDUX(val, tag, func, type, m_offset, extent, d + 1,      \
-                          __VA_ARGS__, i7 + m_offset[d])                      \
+    KOKKOS_IMPL_TAGGED_LOOP_R_7_REDUX(val, tag, func, type, m_offset, extent, \
+                                      d + 1, __VA_ARGS__, i7 + m_offset[d])   \
   }
 
 // LayoutLeft
 // d = rank-1 to start
-#define TAGGED_LOOP_L_1_REDUX(val, tag, func, type, m_offset, extent, d, ...) \
-  KOKKOS_ENABLE_IVDEP_MDRANGE                                                 \
-  for (type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) {          \
-    TAGGED_APPLY_REDUX(val, tag, func, i0 + m_offset[d], __VA_ARGS__)         \
+#define KOKKOS_IMPL_TAGGED_LOOP_L_1_REDUX(val, tag, func, type, m_offset, \
+                                          extent, d, ...)                 \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                                             \
+  for (type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) {      \
+    KOKKOS_IMPL_TAGGED_APPLY_REDUX(val, tag, func, i0 + m_offset[d],      \
+                                   __VA_ARGS__)                           \
   }
 
-#define TAGGED_LOOP_L_2_REDUX(val, tag, func, type, m_offset, extent, d, ...) \
+#define KOKKOS_IMPL_TAGGED_LOOP_L_2_REDUX(val, tag, func, type, m_offset,     \
+                                          extent, d, ...)                     \
   for (type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {          \
-    TAGGED_LOOP_L_1_REDUX(val, tag, func, type, m_offset, extent, d - 1,      \
-                          i1 + m_offset[d], __VA_ARGS__)                      \
+    KOKKOS_IMPL_TAGGED_LOOP_L_1_REDUX(val, tag, func, type, m_offset, extent, \
+                                      d - 1, i1 + m_offset[d], __VA_ARGS__)   \
   }
 
-#define TAGGED_LOOP_L_3_REDUX(val, tag, func, type, m_offset, extent, d, ...) \
+#define KOKKOS_IMPL_TAGGED_LOOP_L_3_REDUX(val, tag, func, type, m_offset,     \
+                                          extent, d, ...)                     \
   for (type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {          \
-    TAGGED_LOOP_L_2_REDUX(val, tag, func, type, m_offset, extent, d - 1,      \
-                          i2 + m_offset[d], __VA_ARGS__)                      \
+    KOKKOS_IMPL_TAGGED_LOOP_L_2_REDUX(val, tag, func, type, m_offset, extent, \
+                                      d - 1, i2 + m_offset[d], __VA_ARGS__)   \
   }
 
-#define TAGGED_LOOP_L_4_REDUX(val, tag, func, type, m_offset, extent, d, ...) \
+#define KOKKOS_IMPL_TAGGED_LOOP_L_4_REDUX(val, tag, func, type, m_offset,     \
+                                          extent, d, ...)                     \
   for (type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {          \
-    TAGGED_LOOP_L_3_REDUX(val, tag, func, type, m_offset, extent, d - 1,      \
-                          i3 + m_offset[d], __VA_ARGS__)                      \
+    KOKKOS_IMPL_TAGGED_LOOP_L_3_REDUX(val, tag, func, type, m_offset, extent, \
+                                      d - 1, i3 + m_offset[d], __VA_ARGS__)   \
   }
 
-#define TAGGED_LOOP_L_5_REDUX(val, tag, func, type, m_offset, extent, d, ...) \
+#define KOKKOS_IMPL_TAGGED_LOOP_L_5_REDUX(val, tag, func, type, m_offset,     \
+                                          extent, d, ...)                     \
   for (type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {          \
-    TAGGED_LOOP_L_4_REDUX(val, tag, func, type, m_offset, extent, d - 1,      \
-                          i4 + m_offset[d], __VA_ARGS__)                      \
+    KOKKOS_IMPL_TAGGED_LOOP_L_4_REDUX(val, tag, func, type, m_offset, extent, \
+                                      d - 1, i4 + m_offset[d], __VA_ARGS__)   \
   }
 
-#define TAGGED_LOOP_L_6_REDUX(val, tag, func, type, m_offset, extent, d, ...) \
+#define KOKKOS_IMPL_TAGGED_LOOP_L_6_REDUX(val, tag, func, type, m_offset,     \
+                                          extent, d, ...)                     \
   for (type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {          \
-    TAGGED_LOOP_L_5_REDUX(val, tag, func, type, m_offset, extent, d - 1,      \
-                          i5 + m_offset[d], __VA_ARGS__)                      \
+    KOKKOS_IMPL_TAGGED_LOOP_L_5_REDUX(val, tag, func, type, m_offset, extent, \
+                                      d - 1, i5 + m_offset[d], __VA_ARGS__)   \
   }
 
-#define TAGGED_LOOP_L_7_REDUX(val, tag, func, type, m_offset, extent, d, ...) \
+#define KOKKOS_IMPL_TAGGED_LOOP_L_7_REDUX(val, tag, func, type, m_offset,     \
+                                          extent, d, ...)                     \
   for (type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {          \
-    TAGGED_LOOP_L_6_REDUX(val, tag, func, type, m_offset, extent, d - 1,      \
-                          i6 + m_offset[d], __VA_ARGS__)                      \
+    KOKKOS_IMPL_TAGGED_LOOP_L_6_REDUX(val, tag, func, type, m_offset, extent, \
+                                      d - 1, i6 + m_offset[d], __VA_ARGS__)   \
   }
 
-#define TAGGED_LOOP_L_8_REDUX(val, tag, func, type, m_offset, extent, d, ...) \
+#define KOKKOS_IMPL_TAGGED_LOOP_L_8_REDUX(val, tag, func, type, m_offset,     \
+                                          extent, d, ...)                     \
   for (type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {          \
-    TAGGED_LOOP_L_7_REDUX(val, tag, func, type, m_offset, extent, d - 1,      \
-                          i7 + m_offset[d], __VA_ARGS__)                      \
+    KOKKOS_IMPL_TAGGED_LOOP_L_7_REDUX(val, tag, func, type, m_offset, extent, \
+                                      d - 1, i7 + m_offset[d], __VA_ARGS__)   \
   }
 
 // Left vs Right
-#define TAGGED_LOOP_LAYOUT_1_REDUX(val, tag, func, type, is_left, m_offset, \
-                                   extent, rank)                            \
-  KOKKOS_ENABLE_IVDEP_MDRANGE                                               \
-  for (type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) {        \
-    TAGGED_APPLY_REDUX(val, tag, func, i0 + m_offset[0])                    \
+#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_1_REDUX(val, tag, func, type, is_left, \
+                                               m_offset, extent, rank)        \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                                                 \
+  for (type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) {          \
+    KOKKOS_IMPL_TAGGED_APPLY_REDUX(val, tag, func, i0 + m_offset[0])          \
   }
 
-#define TAGGED_LOOP_LAYOUT_2_REDUX(val, tag, func, type, is_left, m_offset,   \
-                                   extent, rank)                              \
+#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_2_REDUX(val, tag, func, type, is_left, \
+                                               m_offset, extent, rank)        \
   if (is_left) {                                                              \
     for (type i1 = (type)0; i1 < static_cast<type>(extent[rank - 1]); ++i1) { \
-      TAGGED_LOOP_L_1_REDUX(val, tag, func, type, m_offset, extent, rank - 2, \
-                            i1 + m_offset[rank - 1])                          \
+      KOKKOS_IMPL_TAGGED_LOOP_L_1_REDUX(val, tag, func, type, m_offset,       \
+                                        extent, rank - 2,                     \
+                                        i1 + m_offset[rank - 1])              \
     }                                                                         \
   } else {                                                                    \
     for (type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) {        \
-      TAGGED_LOOP_R_1_REDUX(val, tag, func, type, m_offset, extent, 1,        \
-                            i1 + m_offset[0])                                 \
+      KOKKOS_IMPL_TAGGED_LOOP_R_1_REDUX(val, tag, func, type, m_offset,       \
+                                        extent, 1, i1 + m_offset[0])          \
     }                                                                         \
   }
 
-#define TAGGED_LOOP_LAYOUT_3_REDUX(val, tag, func, type, is_left, m_offset,   \
-                                   extent, rank)                              \
+#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_3_REDUX(val, tag, func, type, is_left, \
+                                               m_offset, extent, rank)        \
   if (is_left) {                                                              \
     for (type i2 = (type)0; i2 < static_cast<type>(extent[rank - 1]); ++i2) { \
-      TAGGED_LOOP_L_2_REDUX(val, tag, func, type, m_offset, extent, rank - 2, \
-                            i2 + m_offset[rank - 1])                          \
+      KOKKOS_IMPL_TAGGED_LOOP_L_2_REDUX(val, tag, func, type, m_offset,       \
+                                        extent, rank - 2,                     \
+                                        i2 + m_offset[rank - 1])              \
     }                                                                         \
   } else {                                                                    \
     for (type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) {        \
-      TAGGED_LOOP_R_2_REDUX(val, tag, func, type, m_offset, extent, 1,        \
-                            i2 + m_offset[0])                                 \
+      KOKKOS_IMPL_TAGGED_LOOP_R_2_REDUX(val, tag, func, type, m_offset,       \
+                                        extent, 1, i2 + m_offset[0])          \
     }                                                                         \
   }
 
-#define TAGGED_LOOP_LAYOUT_4_REDUX(val, tag, func, type, is_left, m_offset,   \
-                                   extent, rank)                              \
+#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_4_REDUX(val, tag, func, type, is_left, \
+                                               m_offset, extent, rank)        \
   if (is_left) {                                                              \
     for (type i3 = (type)0; i3 < static_cast<type>(extent[rank - 1]); ++i3) { \
-      TAGGED_LOOP_L_3_REDUX(val, tag, func, type, m_offset, extent, rank - 2, \
-                            i3 + m_offset[rank - 1])                          \
+      KOKKOS_IMPL_TAGGED_LOOP_L_3_REDUX(val, tag, func, type, m_offset,       \
+                                        extent, rank - 2,                     \
+                                        i3 + m_offset[rank - 1])              \
     }                                                                         \
   } else {                                                                    \
     for (type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) {        \
-      TAGGED_LOOP_R_3_REDUX(val, tag, func, type, m_offset, extent, 1,        \
-                            i3 + m_offset[0])                                 \
+      KOKKOS_IMPL_TAGGED_LOOP_R_3_REDUX(val, tag, func, type, m_offset,       \
+                                        extent, 1, i3 + m_offset[0])          \
     }                                                                         \
   }
 
-#define TAGGED_LOOP_LAYOUT_5_REDUX(val, tag, func, type, is_left, m_offset,   \
-                                   extent, rank)                              \
+#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_5_REDUX(val, tag, func, type, is_left, \
+                                               m_offset, extent, rank)        \
   if (is_left) {                                                              \
     for (type i4 = (type)0; i4 < static_cast<type>(extent[rank - 1]); ++i4) { \
-      TAGGED_LOOP_L_4_REDUX(val, tag, func, type, m_offset, extent, rank - 2, \
-                            i4 + m_offset[rank - 1])                          \
+      KOKKOS_IMPL_TAGGED_LOOP_L_4_REDUX(val, tag, func, type, m_offset,       \
+                                        extent, rank - 2,                     \
+                                        i4 + m_offset[rank - 1])              \
     }                                                                         \
   } else {                                                                    \
     for (type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) {        \
-      TAGGED_LOOP_R_4_REDUX(val, tag, func, type, m_offset, extent, 1,        \
-                            i4 + m_offset[0])                                 \
+      KOKKOS_IMPL_TAGGED_LOOP_R_4_REDUX(val, tag, func, type, m_offset,       \
+                                        extent, 1, i4 + m_offset[0])          \
     }                                                                         \
   }
 
-#define TAGGED_LOOP_LAYOUT_6_REDUX(val, tag, func, type, is_left, m_offset,   \
-                                   extent, rank)                              \
+#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_6_REDUX(val, tag, func, type, is_left, \
+                                               m_offset, extent, rank)        \
   if (is_left) {                                                              \
     for (type i5 = (type)0; i5 < static_cast<type>(extent[rank - 1]); ++i5) { \
-      TAGGED_LOOP_L_5_REDUX(val, tag, func, type, m_offset, extent, rank - 2, \
-                            i5 + m_offset[rank - 1])                          \
+      KOKKOS_IMPL_TAGGED_LOOP_L_5_REDUX(val, tag, func, type, m_offset,       \
+                                        extent, rank - 2,                     \
+                                        i5 + m_offset[rank - 1])              \
     }                                                                         \
   } else {                                                                    \
     for (type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) {        \
-      TAGGED_LOOP_R_5_REDUX(val, tag, func, type, m_offset, extent, 1,        \
-                            i5 + m_offset[0])                                 \
+      KOKKOS_IMPL_TAGGED_LOOP_R_5_REDUX(val, tag, func, type, m_offset,       \
+                                        extent, 1, i5 + m_offset[0])          \
     }                                                                         \
   }
 
-#define TAGGED_LOOP_LAYOUT_7_REDUX(val, tag, func, type, is_left, m_offset,   \
-                                   extent, rank)                              \
+#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_7_REDUX(val, tag, func, type, is_left, \
+                                               m_offset, extent, rank)        \
   if (is_left) {                                                              \
     for (type i6 = (type)0; i6 < static_cast<type>(extent[rank - 1]); ++i6) { \
-      TAGGED_LOOP_L_6_REDUX(val, tag, func, type, m_offset, extent, rank - 2, \
-                            i6 + m_offset[rank - 1])                          \
+      KOKKOS_IMPL_TAGGED_LOOP_L_6_REDUX(val, tag, func, type, m_offset,       \
+                                        extent, rank - 2,                     \
+                                        i6 + m_offset[rank - 1])              \
     }                                                                         \
   } else {                                                                    \
     for (type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) {        \
-      TAGGED_LOOP_R_6_REDUX(val, tag, func, type, m_offset, extent, 1,        \
-                            i6 + m_offset[0])                                 \
+      KOKKOS_IMPL_TAGGED_LOOP_R_6_REDUX(val, tag, func, type, m_offset,       \
+                                        extent, 1, i6 + m_offset[0])          \
     }                                                                         \
   }
 
-#define TAGGED_LOOP_LAYOUT_8_REDUX(val, tag, func, type, is_left, m_offset,   \
-                                   extent, rank)                              \
+#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_8_REDUX(val, tag, func, type, is_left, \
+                                               m_offset, extent, rank)        \
   if (is_left) {                                                              \
     for (type i7 = (type)0; i7 < static_cast<type>(extent[rank - 1]); ++i7) { \
-      TAGGED_LOOP_L_7_REDUX(val, tag, func, type, m_offset, extent, rank - 2, \
-                            i7 + m_offset[rank - 1])                          \
+      KOKKOS_IMPL_TAGGED_LOOP_L_7_REDUX(val, tag, func, type, m_offset,       \
+                                        extent, rank - 2,                     \
+                                        i7 + m_offset[rank - 1])              \
     }                                                                         \
   } else {                                                                    \
     for (type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) {        \
-      TAGGED_LOOP_R_7_REDUX(val, tag, func, type, m_offset, extent, 1,        \
-                            i7 + m_offset[0])                                 \
+      KOKKOS_IMPL_TAGGED_LOOP_R_7_REDUX(val, tag, func, type, m_offset,       \
+                                        extent, 1, i7 + m_offset[0])          \
     }                                                                         \
   }
 
 // Partial vs Full Tile
-#define TAGGED_TILE_LOOP_1_REDUX(val, tag, func, type, is_left, cond,         \
-                                 m_offset, extent_full, extent_partial, rank) \
-  if (cond) {                                                                 \
-    TAGGED_LOOP_LAYOUT_1_REDUX(val, tag, func, type, is_left, m_offset,       \
-                               extent_full, rank)                             \
-  } else {                                                                    \
-    TAGGED_LOOP_LAYOUT_1_REDUX(val, tag, func, type, is_left, m_offset,       \
-                               extent_partial, rank)                          \
-  }
-
-#define TAGGED_TILE_LOOP_2_REDUX(val, tag, func, type, is_left, cond,         \
-                                 m_offset, extent_full, extent_partial, rank) \
-  if (cond) {                                                                 \
-    TAGGED_LOOP_LAYOUT_2_REDUX(val, tag, func, type, is_left, m_offset,       \
-                               extent_full, rank)                             \
-  } else {                                                                    \
-    TAGGED_LOOP_LAYOUT_2_REDUX(val, tag, func, type, is_left, m_offset,       \
-                               extent_partial, rank)                          \
-  }
-
-#define TAGGED_TILE_LOOP_3_REDUX(val, tag, func, type, is_left, cond,         \
-                                 m_offset, extent_full, extent_partial, rank) \
-  if (cond) {                                                                 \
-    TAGGED_LOOP_LAYOUT_3_REDUX(val, tag, func, type, is_left, m_offset,       \
-                               extent_full, rank)                             \
-  } else {                                                                    \
-    TAGGED_LOOP_LAYOUT_3_REDUX(val, tag, func, type, is_left, m_offset,       \
-                               extent_partial, rank)                          \
-  }
-
-#define TAGGED_TILE_LOOP_4_REDUX(val, tag, func, type, is_left, cond,         \
-                                 m_offset, extent_full, extent_partial, rank) \
-  if (cond) {                                                                 \
-    TAGGED_LOOP_LAYOUT_4_REDUX(val, tag, func, type, is_left, m_offset,       \
-                               extent_full, rank)                             \
-  } else {                                                                    \
-    TAGGED_LOOP_LAYOUT_4_REDUX(val, tag, func, type, is_left, m_offset,       \
-                               extent_partial, rank)                          \
-  }
-
-#define TAGGED_TILE_LOOP_5_REDUX(val, tag, func, type, is_left, cond,         \
-                                 m_offset, extent_full, extent_partial, rank) \
-  if (cond) {                                                                 \
-    TAGGED_LOOP_LAYOUT_5_REDUX(val, tag, func, type, is_left, m_offset,       \
-                               extent_full, rank)                             \
-  } else {                                                                    \
-    TAGGED_LOOP_LAYOUT_5_REDUX(val, tag, func, type, is_left, m_offset,       \
-                               extent_partial, rank)                          \
-  }
-
-#define TAGGED_TILE_LOOP_6_REDUX(val, tag, func, type, is_left, cond,         \
-                                 m_offset, extent_full, extent_partial, rank) \
-  if (cond) {                                                                 \
-    TAGGED_LOOP_LAYOUT_6_REDUX(val, tag, func, type, is_left, m_offset,       \
-                               extent_full, rank)                             \
-  } else {                                                                    \
-    TAGGED_LOOP_LAYOUT_6_REDUX(val, tag, func, type, is_left, m_offset,       \
-                               extent_partial, rank)                          \
-  }
-
-#define TAGGED_TILE_LOOP_7_REDUX(val, tag, func, type, is_left, cond,         \
-                                 m_offset, extent_full, extent_partial, rank) \
-  if (cond) {                                                                 \
-    TAGGED_LOOP_LAYOUT_7_REDUX(val, tag, func, type, is_left, m_offset,       \
-                               extent_full, rank)                             \
-  } else {                                                                    \
-    TAGGED_LOOP_LAYOUT_7_REDUX(val, tag, func, type, is_left, m_offset,       \
-                               extent_partial, rank)                          \
-  }
-
-#define TAGGED_TILE_LOOP_8_REDUX(val, tag, func, type, is_left, cond,         \
-                                 m_offset, extent_full, extent_partial, rank) \
-  if (cond) {                                                                 \
-    TAGGED_LOOP_LAYOUT_8_REDUX(val, tag, func, type, is_left, m_offset,       \
-                               extent_full, rank)                             \
-  } else {                                                                    \
-    TAGGED_LOOP_LAYOUT_8_REDUX(val, tag, func, type, is_left, m_offset,       \
-                               extent_partial, rank)                          \
+#define KOKKOS_IMPL_TAGGED_TILE_LOOP_1_REDUX(val, tag, func, type, is_left, \
+                                             cond, m_offset, extent_full,   \
+                                             extent_partial, rank)          \
+  if (cond) {                                                               \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_1_REDUX(val, tag, func, type, is_left,   \
+                                           m_offset, extent_full, rank)     \
+  } else {                                                                  \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_1_REDUX(val, tag, func, type, is_left,   \
+                                           m_offset, extent_partial, rank)  \
+  }
+
+#define KOKKOS_IMPL_TAGGED_TILE_LOOP_2_REDUX(val, tag, func, type, is_left, \
+                                             cond, m_offset, extent_full,   \
+                                             extent_partial, rank)          \
+  if (cond) {                                                               \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_2_REDUX(val, tag, func, type, is_left,   \
+                                           m_offset, extent_full, rank)     \
+  } else {                                                                  \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_2_REDUX(val, tag, func, type, is_left,   \
+                                           m_offset, extent_partial, rank)  \
+  }
+
+#define KOKKOS_IMPL_TAGGED_TILE_LOOP_3_REDUX(val, tag, func, type, is_left, \
+                                             cond, m_offset, extent_full,   \
+                                             extent_partial, rank)          \
+  if (cond) {                                                               \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_3_REDUX(val, tag, func, type, is_left,   \
+                                           m_offset, extent_full, rank)     \
+  } else {                                                                  \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_3_REDUX(val, tag, func, type, is_left,   \
+                                           m_offset, extent_partial, rank)  \
+  }
+
+#define KOKKOS_IMPL_TAGGED_TILE_LOOP_4_REDUX(val, tag, func, type, is_left, \
+                                             cond, m_offset, extent_full,   \
+                                             extent_partial, rank)          \
+  if (cond) {                                                               \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_4_REDUX(val, tag, func, type, is_left,   \
+                                           m_offset, extent_full, rank)     \
+  } else {                                                                  \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_4_REDUX(val, tag, func, type, is_left,   \
+                                           m_offset, extent_partial, rank)  \
+  }
+
+#define KOKKOS_IMPL_TAGGED_TILE_LOOP_5_REDUX(val, tag, func, type, is_left, \
+                                             cond, m_offset, extent_full,   \
+                                             extent_partial, rank)          \
+  if (cond) {                                                               \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_5_REDUX(val, tag, func, type, is_left,   \
+                                           m_offset, extent_full, rank)     \
+  } else {                                                                  \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_5_REDUX(val, tag, func, type, is_left,   \
+                                           m_offset, extent_partial, rank)  \
+  }
+
+#define KOKKOS_IMPL_TAGGED_TILE_LOOP_6_REDUX(val, tag, func, type, is_left, \
+                                             cond, m_offset, extent_full,   \
+                                             extent_partial, rank)          \
+  if (cond) {                                                               \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_6_REDUX(val, tag, func, type, is_left,   \
+                                           m_offset, extent_full, rank)     \
+  } else {                                                                  \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_6_REDUX(val, tag, func, type, is_left,   \
+                                           m_offset, extent_partial, rank)  \
+  }
+
+#define KOKKOS_IMPL_TAGGED_TILE_LOOP_7_REDUX(val, tag, func, type, is_left, \
+                                             cond, m_offset, extent_full,   \
+                                             extent_partial, rank)          \
+  if (cond) {                                                               \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_7_REDUX(val, tag, func, type, is_left,   \
+                                           m_offset, extent_full, rank)     \
+  } else {                                                                  \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_7_REDUX(val, tag, func, type, is_left,   \
+                                           m_offset, extent_partial, rank)  \
+  }
+
+#define KOKKOS_IMPL_TAGGED_TILE_LOOP_8_REDUX(val, tag, func, type, is_left, \
+                                             cond, m_offset, extent_full,   \
+                                             extent_partial, rank)          \
+  if (cond) {                                                               \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_8_REDUX(val, tag, func, type, is_left,   \
+                                           m_offset, extent_full, rank)     \
+  } else {                                                                  \
+    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_8_REDUX(val, tag, func, type, is_left,   \
+                                           m_offset, extent_partial, rank)  \
   }
 
 // end tagged macros
@@ -1212,14 +1323,15 @@ struct Tile_Loop_Type<1, IsLeft, IType, void, void> {
   template <typename Func, typename Offset, typename ExtentA, typename ExtentB>
   static void apply(Func const& func, bool cond, Offset const& offset,
                     ExtentA const& a, ExtentB const& b) {
-    TILE_LOOP_1(func, IType, IsLeft, cond, offset, a, b, 1);
+    KOKKOS_IMPL_TILE_LOOP_1(func, IType, IsLeft, cond, offset, a, b, 1);
   }
 
   template <typename ValType, typename Func, typename Offset, typename ExtentA,
             typename ExtentB>
   static void apply(ValType& value, Func const& func, bool cond,
                     Offset const& offset, ExtentA const& a, ExtentB const& b) {
-    TILE_LOOP_1_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 1);
+    KOKKOS_IMPL_TILE_LOOP_1_REDUX(value, func, IType, IsLeft, cond, offset, a,
+                                  b, 1);
   }
 };
 
@@ -1228,14 +1340,15 @@ struct Tile_Loop_Type<2, IsLeft, IType, void, void> {
   template <typename Func, typename Offset, typename ExtentA, typename ExtentB>
   static void apply(Func const& func, bool cond, Offset const& offset,
                     ExtentA const& a, ExtentB const& b) {
-    TILE_LOOP_2(func, IType, IsLeft, cond, offset, a, b, 2);
+    KOKKOS_IMPL_TILE_LOOP_2(func, IType, IsLeft, cond, offset, a, b, 2);
   }
 
   template <typename ValType, typename Func, typename Offset, typename ExtentA,
             typename ExtentB>
   static void apply(ValType& value, Func const& func, bool cond,
                     Offset const& offset, ExtentA const& a, ExtentB const& b) {
-    TILE_LOOP_2_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 2);
+    KOKKOS_IMPL_TILE_LOOP_2_REDUX(value, func, IType, IsLeft, cond, offset, a,
+                                  b, 2);
   }
 };
 
@@ -1244,14 +1357,15 @@ struct Tile_Loop_Type<3, IsLeft, IType, void, void> {
   template <typename Func, typename Offset, typename ExtentA, typename ExtentB>
   static void apply(Func const& func, bool cond, Offset const& offset,
                     ExtentA const& a, ExtentB const& b) {
-    TILE_LOOP_3(func, IType, IsLeft, cond, offset, a, b, 3);
+    KOKKOS_IMPL_TILE_LOOP_3(func, IType, IsLeft, cond, offset, a, b, 3);
   }
 
   template <typename ValType, typename Func, typename Offset, typename ExtentA,
             typename ExtentB>
   static void apply(ValType& value, Func const& func, bool cond,
                     Offset const& offset, ExtentA const& a, ExtentB const& b) {
-    TILE_LOOP_3_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 3);
+    KOKKOS_IMPL_TILE_LOOP_3_REDUX(value, func, IType, IsLeft, cond, offset, a,
+                                  b, 3);
   }
 };
 
@@ -1260,14 +1374,15 @@ struct Tile_Loop_Type<4, IsLeft, IType, void, void> {
   template <typename Func, typename Offset, typename ExtentA, typename ExtentB>
   static void apply(Func const& func, bool cond, Offset const& offset,
                     ExtentA const& a, ExtentB const& b) {
-    TILE_LOOP_4(func, IType, IsLeft, cond, offset, a, b, 4);
+    KOKKOS_IMPL_TILE_LOOP_4(func, IType, IsLeft, cond, offset, a, b, 4);
   }
 
   template <typename ValType, typename Func, typename Offset, typename ExtentA,
             typename ExtentB>
   static void apply(ValType& value, Func const& func, bool cond,
                     Offset const& offset, ExtentA const& a, ExtentB const& b) {
-    TILE_LOOP_4_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 4);
+    KOKKOS_IMPL_TILE_LOOP_4_REDUX(value, func, IType, IsLeft, cond, offset, a,
+                                  b, 4);
   }
 };
 
@@ -1276,14 +1391,15 @@ struct Tile_Loop_Type<5, IsLeft, IType, void, void> {
   template <typename Func, typename Offset, typename ExtentA, typename ExtentB>
   static void apply(Func const& func, bool cond, Offset const& offset,
                     ExtentA const& a, ExtentB const& b) {
-    TILE_LOOP_5(func, IType, IsLeft, cond, offset, a, b, 5);
+    KOKKOS_IMPL_TILE_LOOP_5(func, IType, IsLeft, cond, offset, a, b, 5);
   }
 
   template <typename ValType, typename Func, typename Offset, typename ExtentA,
             typename ExtentB>
   static void apply(ValType& value, Func const& func, bool cond,
                     Offset const& offset, ExtentA const& a, ExtentB const& b) {
-    TILE_LOOP_5_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 5);
+    KOKKOS_IMPL_TILE_LOOP_5_REDUX(value, func, IType, IsLeft, cond, offset, a,
+                                  b, 5);
   }
 };
 
@@ -1292,14 +1408,15 @@ struct Tile_Loop_Type<6, IsLeft, IType, void, void> {
   template <typename Func, typename Offset, typename ExtentA, typename ExtentB>
   static void apply(Func const& func, bool cond, Offset const& offset,
                     ExtentA const& a, ExtentB const& b) {
-    TILE_LOOP_6(func, IType, IsLeft, cond, offset, a, b, 6);
+    KOKKOS_IMPL_TILE_LOOP_6(func, IType, IsLeft, cond, offset, a, b, 6);
   }
 
   template <typename ValType, typename Func, typename Offset, typename ExtentA,
             typename ExtentB>
   static void apply(ValType& value, Func const& func, bool cond,
                     Offset const& offset, ExtentA const& a, ExtentB const& b) {
-    TILE_LOOP_6_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 6);
+    KOKKOS_IMPL_TILE_LOOP_6_REDUX(value, func, IType, IsLeft, cond, offset, a,
+                                  b, 6);
   }
 };
 
@@ -1308,14 +1425,15 @@ struct Tile_Loop_Type<7, IsLeft, IType, void, void> {
   template <typename Func, typename Offset, typename ExtentA, typename ExtentB>
   static void apply(Func const& func, bool cond, Offset const& offset,
                     ExtentA const& a, ExtentB const& b) {
-    TILE_LOOP_7(func, IType, IsLeft, cond, offset, a, b, 7);
+    KOKKOS_IMPL_TILE_LOOP_7(func, IType, IsLeft, cond, offset, a, b, 7);
   }
 
   template <typename ValType, typename Func, typename Offset, typename ExtentA,
             typename ExtentB>
   static void apply(ValType& value, Func const& func, bool cond,
                     Offset const& offset, ExtentA const& a, ExtentB const& b) {
-    TILE_LOOP_7_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 7);
+    KOKKOS_IMPL_TILE_LOOP_7_REDUX(value, func, IType, IsLeft, cond, offset, a,
+                                  b, 7);
   }
 };
 
@@ -1324,14 +1442,15 @@ struct Tile_Loop_Type<8, IsLeft, IType, void, void> {
   template <typename Func, typename Offset, typename ExtentA, typename ExtentB>
   static void apply(Func const& func, bool cond, Offset const& offset,
                     ExtentA const& a, ExtentB const& b) {
-    TILE_LOOP_8(func, IType, IsLeft, cond, offset, a, b, 8);
+    KOKKOS_IMPL_TILE_LOOP_8(func, IType, IsLeft, cond, offset, a, b, 8);
   }
 
   template <typename ValType, typename Func, typename Offset, typename ExtentA,
             typename ExtentB>
   static void apply(ValType& value, Func const& func, bool cond,
                     Offset const& offset, ExtentA const& a, ExtentB const& b) {
-    TILE_LOOP_8_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 8);
+    KOKKOS_IMPL_TILE_LOOP_8_REDUX(value, func, IType, IsLeft, cond, offset, a,
+                                  b, 8);
   }
 };
 
@@ -1343,15 +1462,16 @@ struct Tile_Loop_Type<1, IsLeft, IType, Tagged,
   template <typename Func, typename Offset, typename ExtentA, typename ExtentB>
   static void apply(Func const& func, bool cond, Offset const& offset,
                     ExtentA const& a, ExtentB const& b) {
-    TAGGED_TILE_LOOP_1(Tagged(), func, IType, IsLeft, cond, offset, a, b, 1);
+    KOKKOS_IMPL_TAGGED_TILE_LOOP_1(Tagged(), func, IType, IsLeft, cond, offset,
+                                   a, b, 1);
   }
 
   template <typename ValType, typename Func, typename Offset, typename ExtentA,
             typename ExtentB>
   static void apply(ValType& value, Func const& func, bool cond,
                     Offset const& offset, ExtentA const& a, ExtentB const& b) {
-    TAGGED_TILE_LOOP_1_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset,
-                             a, b, 1);
+    KOKKOS_IMPL_TAGGED_TILE_LOOP_1_REDUX(value, Tagged(), func, IType, IsLeft,
+                                         cond, offset, a, b, 1);
   }
 };
 
@@ -1361,15 +1481,16 @@ struct Tile_Loop_Type<2, IsLeft, IType, Tagged,
   template <typename Func, typename Offset, typename ExtentA, typename ExtentB>
   static void apply(Func const& func, bool cond, Offset const& offset,
                     ExtentA const& a, ExtentB const& b) {
-    TAGGED_TILE_LOOP_2(Tagged(), func, IType, IsLeft, cond, offset, a, b, 2);
+    KOKKOS_IMPL_TAGGED_TILE_LOOP_2(Tagged(), func, IType, IsLeft, cond, offset,
+                                   a, b, 2);
   }
 
   template <typename ValType, typename Func, typename Offset, typename ExtentA,
             typename ExtentB>
   static void apply(ValType& value, Func const& func, bool cond,
                     Offset const& offset, ExtentA const& a, ExtentB const& b) {
-    TAGGED_TILE_LOOP_2_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset,
-                             a, b, 2);
+    KOKKOS_IMPL_TAGGED_TILE_LOOP_2_REDUX(value, Tagged(), func, IType, IsLeft,
+                                         cond, offset, a, b, 2);
   }
 };
 
@@ -1379,15 +1500,16 @@ struct Tile_Loop_Type<3, IsLeft, IType, Tagged,
   template <typename Func, typename Offset, typename ExtentA, typename ExtentB>
   static void apply(Func const& func, bool cond, Offset const& offset,
                     ExtentA const& a, ExtentB const& b) {
-    TAGGED_TILE_LOOP_3(Tagged(), func, IType, IsLeft, cond, offset, a, b, 3);
+    KOKKOS_IMPL_TAGGED_TILE_LOOP_3(Tagged(), func, IType, IsLeft, cond, offset,
+                                   a, b, 3);
   }
 
   template <typename ValType, typename Func, typename Offset, typename ExtentA,
             typename ExtentB>
   static void apply(ValType& value, Func const& func, bool cond,
                     Offset const& offset, ExtentA const& a, ExtentB const& b) {
-    TAGGED_TILE_LOOP_3_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset,
-                             a, b, 3);
+    KOKKOS_IMPL_TAGGED_TILE_LOOP_3_REDUX(value, Tagged(), func, IType, IsLeft,
+                                         cond, offset, a, b, 3);
   }
 };
 
@@ -1397,15 +1519,16 @@ struct Tile_Loop_Type<4, IsLeft, IType, Tagged,
   template <typename Func, typename Offset, typename ExtentA, typename ExtentB>
   static void apply(Func const& func, bool cond, Offset const& offset,
                     ExtentA const& a, ExtentB const& b) {
-    TAGGED_TILE_LOOP_4(Tagged(), func, IType, IsLeft, cond, offset, a, b, 4);
+    KOKKOS_IMPL_TAGGED_TILE_LOOP_4(Tagged(), func, IType, IsLeft, cond, offset,
+                                   a, b, 4);
   }
 
   template <typename ValType, typename Func, typename Offset, typename ExtentA,
             typename ExtentB>
   static void apply(ValType& value, Func const& func, bool cond,
                     Offset const& offset, ExtentA const& a, ExtentB const& b) {
-    TAGGED_TILE_LOOP_4_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset,
-                             a, b, 4);
+    KOKKOS_IMPL_TAGGED_TILE_LOOP_4_REDUX(value, Tagged(), func, IType, IsLeft,
+                                         cond, offset, a, b, 4);
   }
 };
 
@@ -1415,15 +1538,16 @@ struct Tile_Loop_Type<5, IsLeft, IType, Tagged,
   template <typename Func, typename Offset, typename ExtentA, typename ExtentB>
   static void apply(Func const& func, bool cond, Offset const& offset,
                     ExtentA const& a, ExtentB const& b) {
-    TAGGED_TILE_LOOP_5(Tagged(), func, IType, IsLeft, cond, offset, a, b, 5);
+    KOKKOS_IMPL_TAGGED_TILE_LOOP_5(Tagged(), func, IType, IsLeft, cond, offset,
+                                   a, b, 5);
   }
 
   template <typename ValType, typename Func, typename Offset, typename ExtentA,
             typename ExtentB>
   static void apply(ValType& value, Func const& func, bool cond,
                     Offset const& offset, ExtentA const& a, ExtentB const& b) {
-    TAGGED_TILE_LOOP_5_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset,
-                             a, b, 5);
+    KOKKOS_IMPL_TAGGED_TILE_LOOP_5_REDUX(value, Tagged(), func, IType, IsLeft,
+                                         cond, offset, a, b, 5);
   }
 };
 
@@ -1433,15 +1557,16 @@ struct Tile_Loop_Type<6, IsLeft, IType, Tagged,
   template <typename Func, typename Offset, typename ExtentA, typename ExtentB>
   static void apply(Func const& func, bool cond, Offset const& offset,
                     ExtentA const& a, ExtentB const& b) {
-    TAGGED_TILE_LOOP_6(Tagged(), func, IType, IsLeft, cond, offset, a, b, 6);
+    KOKKOS_IMPL_TAGGED_TILE_LOOP_6(Tagged(), func, IType, IsLeft, cond, offset,
+                                   a, b, 6);
   }
 
   template <typename ValType, typename Func, typename Offset, typename ExtentA,
             typename ExtentB>
   static void apply(ValType& value, Func const& func, bool cond,
                     Offset const& offset, ExtentA const& a, ExtentB const& b) {
-    TAGGED_TILE_LOOP_6_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset,
-                             a, b, 6);
+    KOKKOS_IMPL_TAGGED_TILE_LOOP_6_REDUX(value, Tagged(), func, IType, IsLeft,
+                                         cond, offset, a, b, 6);
   }
 };
 
@@ -1451,15 +1576,16 @@ struct Tile_Loop_Type<7, IsLeft, IType, Tagged,
   template <typename Func, typename Offset, typename ExtentA, typename ExtentB>
   static void apply(Func const& func, bool cond, Offset const& offset,
                     ExtentA const& a, ExtentB const& b) {
-    TAGGED_TILE_LOOP_7(Tagged(), func, IType, IsLeft, cond, offset, a, b, 7);
+    KOKKOS_IMPL_TAGGED_TILE_LOOP_7(Tagged(), func, IType, IsLeft, cond, offset,
+                                   a, b, 7);
   }
 
   template <typename ValType, typename Func, typename Offset, typename ExtentA,
             typename ExtentB>
   static void apply(ValType& value, Func const& func, bool cond,
                     Offset const& offset, ExtentA const& a, ExtentB const& b) {
-    TAGGED_TILE_LOOP_7_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset,
-                             a, b, 7);
+    KOKKOS_IMPL_TAGGED_TILE_LOOP_7_REDUX(value, Tagged(), func, IType, IsLeft,
+                                         cond, offset, a, b, 7);
   }
 };
 
@@ -1469,15 +1595,16 @@ struct Tile_Loop_Type<8, IsLeft, IType, Tagged,
   template <typename Func, typename Offset, typename ExtentA, typename ExtentB>
   static void apply(Func const& func, bool cond, Offset const& offset,
                     ExtentA const& a, ExtentB const& b) {
-    TAGGED_TILE_LOOP_8(Tagged(), func, IType, IsLeft, cond, offset, a, b, 8);
+    KOKKOS_IMPL_TAGGED_TILE_LOOP_8(Tagged(), func, IType, IsLeft, cond, offset,
+                                   a, b, 8);
   }
 
   template <typename ValType, typename Func, typename Offset, typename ExtentA,
             typename ExtentB>
   static void apply(ValType& value, Func const& func, bool cond,
                     Offset const& offset, ExtentA const& a, ExtentB const& b) {
-    TAGGED_TILE_LOOP_8_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset,
-                             a, b, 8);
+    KOKKOS_IMPL_TAGGED_TILE_LOOP_8_REDUX(value, Tagged(), func, IType, IsLeft,
+                                         cond, offset, a, b, 8);
   }
 };
 // end Structs for calling loops
@@ -1589,19 +1716,19 @@ struct HostIterateTile<RP, Functor, Tag, ValueType,
     if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
-        LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
+        KOKKOS_IMPL_LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
       } else {
         //      #pragma simd
-        LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
+        KOKKOS_IMPL_LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
       }
     }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
-        LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
+        KOKKOS_IMPL_LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
       } else {
         //      #pragma simd
-        LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
+        KOKKOS_IMPL_LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
       }
     }  // end Iterate::Right
 
@@ -1633,19 +1760,19 @@ struct HostIterateTile<RP, Functor, Tag, ValueType,
     if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
-        LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
+        KOKKOS_IMPL_LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
       } else {
         //      #pragma simd
-        LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
+        KOKKOS_IMPL_LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
       }
     }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
-        LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
+        KOKKOS_IMPL_LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
       } else {
         //      #pragma simd
-        LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
+        KOKKOS_IMPL_LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
       }
     }  // end Iterate::Right
 
@@ -1677,19 +1804,19 @@ struct HostIterateTile<RP, Functor, Tag, ValueType,
     if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
-        LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
+        KOKKOS_IMPL_LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
       } else {
         //      #pragma simd
-        LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
+        KOKKOS_IMPL_LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
       }
     }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
-        LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
+        KOKKOS_IMPL_LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
       } else {
         //      #pragma simd
-        LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
+        KOKKOS_IMPL_LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
       }
     }  // end Iterate::Right
 
@@ -1721,19 +1848,19 @@ struct HostIterateTile<RP, Functor, Tag, ValueType,
     if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
-        LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
+        KOKKOS_IMPL_LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
       } else {
         //      #pragma simd
-        LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
+        KOKKOS_IMPL_LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
       }
     }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
-        LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
+        KOKKOS_IMPL_LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
       } else {
         //      #pragma simd
-        LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
+        KOKKOS_IMPL_LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
       }
     }  // end Iterate::Right
 
@@ -1765,19 +1892,19 @@ struct HostIterateTile<RP, Functor, Tag, ValueType,
     if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
-        LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
+        KOKKOS_IMPL_LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
       } else {
         //      #pragma simd
-        LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
+        KOKKOS_IMPL_LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
       }
     }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
-        LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
+        KOKKOS_IMPL_LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
       } else {
         //      #pragma simd
-        LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
+        KOKKOS_IMPL_LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
       }
     }  // end Iterate::Right
 
@@ -1809,19 +1936,19 @@ struct HostIterateTile<RP, Functor, Tag, ValueType,
     if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
-        LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
+        KOKKOS_IMPL_LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
       } else {
         //      #pragma simd
-        LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
+        KOKKOS_IMPL_LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
       }
     }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
-        LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
+        KOKKOS_IMPL_LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
       } else {
         //      #pragma simd
-        LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
+        KOKKOS_IMPL_LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
       }
     }  // end Iterate::Right
 
@@ -1853,19 +1980,19 @@ struct HostIterateTile<RP, Functor, Tag, ValueType,
     if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
-        LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
+        KOKKOS_IMPL_LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
       } else {
         //      #pragma simd
-        LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
+        KOKKOS_IMPL_LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
       }
     }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
-        LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
+        KOKKOS_IMPL_LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
       } else {
         //      #pragma simd
-        LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
+        KOKKOS_IMPL_LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
       }
     }  // end Iterate::Right
 
@@ -1966,8 +2093,8 @@ struct HostIterateTile<RP, Functor, Tag, ValueType,
     const bool full_tile = check_iteration_bounds(m_tiledims, m_offset);
 
     Tile_Loop_Type<RP::rank, (RP::inner_direction == Iterate::Left), index_type,
-                   Tag>::apply(val, m_func, full_tile, m_offset, m_rp.m_tile,
-                               m_tiledims);
+                   Tag>::apply(val, m_func.get_functor(), full_tile, m_offset,
+                               m_rp.m_tile, m_tiledims);
   }
 
 #else
@@ -2004,19 +2131,19 @@ struct HostIterateTile<RP, Functor, Tag, ValueType,
     if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
-        LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
+        KOKKOS_IMPL_LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
       } else {
         //      #pragma simd
-        LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
+        KOKKOS_IMPL_LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
       }
     }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
-        LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
+        KOKKOS_IMPL_LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
       } else {
         //      #pragma simd
-        LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
+        KOKKOS_IMPL_LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
       }
     }  // end Iterate::Right
 
@@ -2048,19 +2175,19 @@ struct HostIterateTile<RP, Functor, Tag, ValueType,
     if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
-        LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
+        KOKKOS_IMPL_LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
       } else {
         //      #pragma simd
-        LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
+        KOKKOS_IMPL_LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
       }
     }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
-        LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
+        KOKKOS_IMPL_LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
       } else {
         //      #pragma simd
-        LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
+        KOKKOS_IMPL_LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
       }
     }  // end Iterate::Right
 
@@ -2092,19 +2219,19 @@ struct HostIterateTile<RP, Functor, Tag, ValueType,
     if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
-        LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
+        KOKKOS_IMPL_LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
       } else {
         //      #pragma simd
-        LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
+        KOKKOS_IMPL_LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
       }
     }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
-        LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
+        KOKKOS_IMPL_LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
       } else {
         //      #pragma simd
-        LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
+        KOKKOS_IMPL_LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
       }
     }  // end Iterate::Right
 
@@ -2136,19 +2263,19 @@ struct HostIterateTile<RP, Functor, Tag, ValueType,
     if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
-        LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
+        KOKKOS_IMPL_LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
       } else {
         //      #pragma simd
-        LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
+        KOKKOS_IMPL_LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
       }
     }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
-        LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
+        KOKKOS_IMPL_LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
       } else {
         //      #pragma simd
-        LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
+        KOKKOS_IMPL_LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
       }
     }  // end Iterate::Right
 
@@ -2180,19 +2307,19 @@ struct HostIterateTile<RP, Functor, Tag, ValueType,
     if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
-        LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
+        KOKKOS_IMPL_LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
       } else {
         //      #pragma simd
-        LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
+        KOKKOS_IMPL_LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
       }
     }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
-        LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
+        KOKKOS_IMPL_LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
       } else {
         //      #pragma simd
-        LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
+        KOKKOS_IMPL_LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
       }
     }  // end Iterate::Right
 
@@ -2224,19 +2351,19 @@ struct HostIterateTile<RP, Functor, Tag, ValueType,
     if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
-        LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
+        KOKKOS_IMPL_LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
       } else {
         //      #pragma simd
-        LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
+        KOKKOS_IMPL_LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
       }
     }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
-        LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
+        KOKKOS_IMPL_LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
       } else {
         //      #pragma simd
-        LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
+        KOKKOS_IMPL_LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
       }
     }  // end Iterate::Right
 
@@ -2268,19 +2395,19 @@ struct HostIterateTile<RP, Functor, Tag, ValueType,
     if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
-        LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
+        KOKKOS_IMPL_LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
       } else {
         //      #pragma simd
-        LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
+        KOKKOS_IMPL_LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
       }
     }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
-        LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
+        KOKKOS_IMPL_LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
       } else {
         //      #pragma simd
-        LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
+        KOKKOS_IMPL_LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
       }
     }  // end Iterate::Right
 
@@ -2417,19 +2544,19 @@ struct HostIterateTile<RP, Functor, Tag, ValueType,
     if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
-        LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
+        KOKKOS_IMPL_LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
       } else {
         //      #pragma simd
-        LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
+        KOKKOS_IMPL_LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
       }
     }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
-        LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
+        KOKKOS_IMPL_LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
       } else {
         //      #pragma simd
-        LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
+        KOKKOS_IMPL_LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
       }
     }  // end Iterate::Right
 
@@ -2461,19 +2588,19 @@ struct HostIterateTile<RP, Functor, Tag, ValueType,
     if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
-        LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
+        KOKKOS_IMPL_LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
       } else {
         //      #pragma simd
-        LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
+        KOKKOS_IMPL_LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
       }
     }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
-        LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
+        KOKKOS_IMPL_LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
       } else {
         //      #pragma simd
-        LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
+        KOKKOS_IMPL_LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
       }
     }  // end Iterate::Right
 
@@ -2505,19 +2632,19 @@ struct HostIterateTile<RP, Functor, Tag, ValueType,
     if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
-        LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
+        KOKKOS_IMPL_LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
       } else {
         //      #pragma simd
-        LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
+        KOKKOS_IMPL_LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
       }
     }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
-        LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
+        KOKKOS_IMPL_LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
       } else {
         //      #pragma simd
-        LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
+        KOKKOS_IMPL_LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
       }
     }  // end Iterate::Right
 
@@ -2549,19 +2676,19 @@ struct HostIterateTile<RP, Functor, Tag, ValueType,
     if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
-        LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
+        KOKKOS_IMPL_LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
       } else {
         //      #pragma simd
-        LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
+        KOKKOS_IMPL_LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
       }
     }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
-        LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
+        KOKKOS_IMPL_LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
       } else {
         //      #pragma simd
-        LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
+        KOKKOS_IMPL_LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
       }
     }  // end Iterate::Right
 
@@ -2593,19 +2720,19 @@ struct HostIterateTile<RP, Functor, Tag, ValueType,
     if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
-        LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
+        KOKKOS_IMPL_LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
       } else {
         //      #pragma simd
-        LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
+        KOKKOS_IMPL_LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
       }
     }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
-        LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
+        KOKKOS_IMPL_LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
       } else {
         //      #pragma simd
-        LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
+        KOKKOS_IMPL_LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
       }
     }  // end Iterate::Right
 
@@ -2637,19 +2764,19 @@ struct HostIterateTile<RP, Functor, Tag, ValueType,
     if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
-        LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
+        KOKKOS_IMPL_LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
       } else {
         //      #pragma simd
-        LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
+        KOKKOS_IMPL_LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
       }
     }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
-        LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
+        KOKKOS_IMPL_LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
       } else {
         //      #pragma simd
-        LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
+        KOKKOS_IMPL_LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
       }
     }  // end Iterate::Right
 
@@ -2681,19 +2808,19 @@ struct HostIterateTile<RP, Functor, Tag, ValueType,
     if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
-        LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
+        KOKKOS_IMPL_LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
       } else {
         //      #pragma simd
-        LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
+        KOKKOS_IMPL_LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
       }
     }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
-        LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
+        KOKKOS_IMPL_LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
       } else {
         //      #pragma simd
-        LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
+        KOKKOS_IMPL_LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
       }
     }  // end Iterate::Right
 
@@ -2721,6 +2848,162 @@ struct HostIterateTile<RP, Functor, Tag, ValueType,
 // ------------------------------------------------------------------ //
 
 #undef KOKKOS_ENABLE_NEW_LOOP_MACROS
+#undef KOKKOS_IMPL_LOOP_1L
+#undef KOKKOS_IMPL_LOOP_2L
+#undef KOKKOS_IMPL_LOOP_3L
+#undef KOKKOS_IMPL_LOOP_4L
+#undef KOKKOS_IMPL_LOOP_5L
+#undef KOKKOS_IMPL_LOOP_6L
+#undef KOKKOS_IMPL_LOOP_7L
+#undef KOKKOS_IMPL_LOOP_8L
+#undef KOKKOS_IMPL_LOOP_1R
+#undef KOKKOS_IMPL_LOOP_2R
+#undef KOKKOS_IMPL_LOOP_3R
+#undef KOKKOS_IMPL_LOOP_4R
+#undef KOKKOS_IMPL_LOOP_5R
+#undef KOKKOS_IMPL_LOOP_6R
+#undef KOKKOS_IMPL_LOOP_7R
+#undef KOKKOS_IMPL_LOOP_8R
+#undef KOKKOS_IMPL_LOOP_ARGS_1
+#undef KOKKOS_IMPL_LOOP_ARGS_2
+#undef KOKKOS_IMPL_LOOP_ARGS_3
+#undef KOKKOS_IMPL_LOOP_ARGS_4
+#undef KOKKOS_IMPL_LOOP_ARGS_5
+#undef KOKKOS_IMPL_LOOP_ARGS_6
+#undef KOKKOS_IMPL_LOOP_ARGS_7
+#undef KOKKOS_IMPL_LOOP_ARGS_8
+#undef KOKKOS_IMPL_APPLY
+#undef KOKKOS_IMPL_LOOP_R_1
+#undef KOKKOS_IMPL_LOOP_R_2
+#undef KOKKOS_IMPL_LOOP_R_3
+#undef KOKKOS_IMPL_LOOP_R_4
+#undef KOKKOS_IMPL_LOOP_R_5
+#undef KOKKOS_IMPL_LOOP_R_6
+#undef KOKKOS_IMPL_LOOP_R_7
+#undef KOKKOS_IMPL_LOOP_R_8
+#undef KOKKOS_IMPL_LOOP_L_1
+#undef KOKKOS_IMPL_LOOP_L_2
+#undef KOKKOS_IMPL_LOOP_L_3
+#undef KOKKOS_IMPL_LOOP_L_4
+#undef KOKKOS_IMPL_LOOP_L_5
+#undef KOKKOS_IMPL_LOOP_L_6
+#undef KOKKOS_IMPL_LOOP_L_7
+#undef KOKKOS_IMPL_LOOP_L_8
+#undef KOKKOS_IMPL_LOOP_LAYOUT_1
+#undef KOKKOS_IMPL_LOOP_LAYOUT_2
+#undef KOKKOS_IMPL_LOOP_LAYOUT_3
+#undef KOKKOS_IMPL_LOOP_LAYOUT_4
+#undef KOKKOS_IMPL_LOOP_LAYOUT_5
+#undef KOKKOS_IMPL_LOOP_LAYOUT_6
+#undef KOKKOS_IMPL_LOOP_LAYOUT_7
+#undef KOKKOS_IMPL_LOOP_LAYOUT_8
+#undef KOKKOS_IMPL_TILE_LOOP_1
+#undef KOKKOS_IMPL_TILE_LOOP_2
+#undef KOKKOS_IMPL_TILE_LOOP_3
+#undef KOKKOS_IMPL_TILE_LOOP_4
+#undef KOKKOS_IMPL_TILE_LOOP_5
+#undef KOKKOS_IMPL_TILE_LOOP_6
+#undef KOKKOS_IMPL_TILE_LOOP_7
+#undef KOKKOS_IMPL_TILE_LOOP_8
+#undef KOKKOS_IMPL_APPLY_REDUX
+#undef KOKKOS_IMPL_LOOP_R_1_REDUX
+#undef KOKKOS_IMPL_LOOP_R_2_REDUX
+#undef KOKKOS_IMPL_LOOP_R_3_REDUX
+#undef KOKKOS_IMPL_LOOP_R_4_REDUX
+#undef KOKKOS_IMPL_LOOP_R_5_REDUX
+#undef KOKKOS_IMPL_LOOP_R_6_REDUX
+#undef KOKKOS_IMPL_LOOP_R_7_REDUX
+#undef KOKKOS_IMPL_LOOP_R_8_REDUX
+#undef KOKKOS_IMPL_LOOP_L_1_REDUX
+#undef KOKKOS_IMPL_LOOP_L_2_REDUX
+#undef KOKKOS_IMPL_LOOP_L_3_REDUX
+#undef KOKKOS_IMPL_LOOP_L_4_REDUX
+#undef KOKKOS_IMPL_LOOP_L_5_REDUX
+#undef KOKKOS_IMPL_LOOP_L_6_REDUX
+#undef KOKKOS_IMPL_LOOP_L_7_REDUX
+#undef KOKKOS_IMPL_LOOP_L_8_REDUX
+#undef KOKKOS_IMPL_LOOP_LAYOUT_1_REDUX
+#undef KOKKOS_IMPL_LOOP_LAYOUT_2_REDUX
+#undef KOKKOS_IMPL_LOOP_LAYOUT_3_REDUX
+#undef KOKKOS_IMPL_LOOP_LAYOUT_4_REDUX
+#undef KOKKOS_IMPL_LOOP_LAYOUT_5_REDUX
+#undef KOKKOS_IMPL_LOOP_LAYOUT_6_REDUX
+#undef KOKKOS_IMPL_LOOP_LAYOUT_7_REDUX
+#undef KOKKOS_IMPL_LOOP_LAYOUT_8_REDUX
+#undef KOKKOS_IMPL_TILE_LOOP_1_REDUX
+#undef KOKKOS_IMPL_TILE_LOOP_2_REDUX
+#undef KOKKOS_IMPL_TILE_LOOP_3_REDUX
+#undef KOKKOS_IMPL_TILE_LOOP_4_REDUX
+#undef KOKKOS_IMPL_TILE_LOOP_5_REDUX
+#undef KOKKOS_IMPL_TILE_LOOP_6_REDUX
+#undef KOKKOS_IMPL_TILE_LOOP_7_REDUX
+#undef KOKKOS_IMPL_TILE_LOOP_8_REDUX
+#undef KOKKOS_IMPL_TAGGED_APPLY
+#undef KOKKOS_IMPL_TAGGED_LOOP_R_1
+#undef KOKKOS_IMPL_TAGGED_LOOP_R_2
+#undef KOKKOS_IMPL_TAGGED_LOOP_R_3
+#undef KOKKOS_IMPL_TAGGED_LOOP_R_4
+#undef KOKKOS_IMPL_TAGGED_LOOP_R_5
+#undef KOKKOS_IMPL_TAGGED_LOOP_R_6
+#undef KOKKOS_IMPL_TAGGED_LOOP_R_7
+#undef KOKKOS_IMPL_TAGGED_LOOP_R_8
+#undef KOKKOS_IMPL_TAGGED_LOOP_L_1
+#undef KOKKOS_IMPL_TAGGED_LOOP_L_2
+#undef KOKKOS_IMPL_TAGGED_LOOP_L_3
+#undef KOKKOS_IMPL_TAGGED_LOOP_L_4
+#undef KOKKOS_IMPL_TAGGED_LOOP_L_5
+#undef KOKKOS_IMPL_TAGGED_LOOP_L_6
+#undef KOKKOS_IMPL_TAGGED_LOOP_L_7
+#undef KOKKOS_IMPL_TAGGED_LOOP_L_8
+#undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_1
+#undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_2
+#undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_3
+#undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_4
+#undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_5
+#undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_6
+#undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_7
+#undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_8
+#undef KOKKOS_IMPL_TAGGED_TILE_LOOP_1
+#undef KOKKOS_IMPL_TAGGED_TILE_LOOP_2
+#undef KOKKOS_IMPL_TAGGED_TILE_LOOP_3
+#undef KOKKOS_IMPL_TAGGED_TILE_LOOP_4
+#undef KOKKOS_IMPL_TAGGED_TILE_LOOP_5
+#undef KOKKOS_IMPL_TAGGED_TILE_LOOP_6
+#undef KOKKOS_IMPL_TAGGED_TILE_LOOP_7
+#undef KOKKOS_IMPL_TAGGED_TILE_LOOP_8
+#undef KOKKOS_IMPL_TAGGED_APPLY_REDUX
+#undef KOKKOS_IMPL_TAGGED_LOOP_R_1_REDUX
+#undef KOKKOS_IMPL_TAGGED_LOOP_R_2_REDUX
+#undef KOKKOS_IMPL_TAGGED_LOOP_R_3_REDUX
+#undef KOKKOS_IMPL_TAGGED_LOOP_R_4_REDUX
+#undef KOKKOS_IMPL_TAGGED_LOOP_R_5_REDUX
+#undef KOKKOS_IMPL_TAGGED_LOOP_R_6_REDUX
+#undef KOKKOS_IMPL_TAGGED_LOOP_R_7_REDUX
+#undef KOKKOS_IMPL_TAGGED_LOOP_R_8_REDUX
+#undef KOKKOS_IMPL_TAGGED_LOOP_L_1_REDUX
+#undef KOKKOS_IMPL_TAGGED_LOOP_L_2_REDUX
+#undef KOKKOS_IMPL_TAGGED_LOOP_L_3_REDUX
+#undef KOKKOS_IMPL_TAGGED_LOOP_L_4_REDUX
+#undef KOKKOS_IMPL_TAGGED_LOOP_L_5_REDUX
+#undef KOKKOS_IMPL_TAGGED_LOOP_L_6_REDUX
+#undef KOKKOS_IMPL_TAGGED_LOOP_L_7_REDUX
+#undef KOKKOS_IMPL_TAGGED_LOOP_L_8_REDUX
+#undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_1_REDUX
+#undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_2_REDUX
+#undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_3_REDUX
+#undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_4_REDUX
+#undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_5_REDUX
+#undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_6_REDUX
+#undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_7_REDUX
+#undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_8_REDUX
+#undef KOKKOS_IMPL_TAGGED_TILE_LOOP_1_REDUX
+#undef KOKKOS_IMPL_TAGGED_TILE_LOOP_2_REDUX
+#undef KOKKOS_IMPL_TAGGED_TILE_LOOP_3_REDUX
+#undef KOKKOS_IMPL_TAGGED_TILE_LOOP_4_REDUX
+#undef KOKKOS_IMPL_TAGGED_TILE_LOOP_5_REDUX
+#undef KOKKOS_IMPL_TAGGED_TILE_LOOP_6_REDUX
+#undef KOKKOS_IMPL_TAGGED_TILE_LOOP_7_REDUX
+#undef KOKKOS_IMPL_TAGGED_TILE_LOOP_8_REDUX
 
 }  // namespace Impl
 }  // namespace Kokkos
diff --git a/packages/kokkos/algorithms/unit_tests/TestOpenMP_Sort1D.cpp b/packages/kokkos/core/src/impl/Kokkos_Abort.cpp
similarity index 52%
rename from packages/kokkos/algorithms/unit_tests/TestOpenMP_Sort1D.cpp
rename to packages/kokkos/core/src/impl/Kokkos_Abort.cpp
index e06486618f38e13a709da4e6225f241cd323a494..23f663e37fb647c55c0d1fd273ec05484c3454db 100644
--- a/packages/kokkos/algorithms/unit_tests/TestOpenMP_Sort1D.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Abort.cpp
@@ -14,26 +14,31 @@
 //
 //@HEADER
 
-#include <Kokkos_Macros.hpp>
-#ifdef KOKKOS_ENABLE_OPENMP
-
-#include <gtest/gtest.h>
-#include <Kokkos_Core.hpp>
-
-//----------------------------------------------------------------------------
-#include <TestRandom.hpp>
-#include <TestSort.hpp>
-#include <iomanip>
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
 
-namespace Test {
+#include <cstdlib>
+#include <iostream>
+#include <Kokkos_Abort.hpp>
+#include <impl/Kokkos_Stacktrace.hpp>
 
-TEST(openmp, SortUnsigned1D) {
-  Impl::test_1D_sort<Kokkos::OpenMP, unsigned>(171);
-}
+namespace Kokkos {
+namespace Impl {
 
-TEST(openmp, SortIssue1160) { Impl::test_issue_1160_sort<Kokkos::OpenMP>(); }
+void host_abort(const char *const message) {
+  std::cerr << message;
 
-}  // namespace Test
+#ifdef KOKKOS_IMPL_ENABLE_STACKTRACE
+  std::cerr << "\nBacktrace:\n";
+  save_stacktrace();
+  print_demangled_saved_stacktrace(std::cerr);
 #else
-void KOKKOS_ALGORITHMS_UNITTESTS_TESTOPENMP_PREVENT_LINK_ERROR() {}
+  std::cerr << "\nTraceback functionality not available\n";
 #endif
+
+  ::abort();
+}
+
+}  // namespace Impl
+}  // namespace Kokkos
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Assembly.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Assembly.hpp
deleted file mode 100644
index 59d70e7f7c8523d67a4ec97bd41b25b6a7fc5ec0..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Assembly.hpp
+++ /dev/null
@@ -1,79 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#include <Kokkos_Macros.hpp>
-#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_ASSEMBLY_HPP)
-#define KOKKOS_ATOMIC_ASSEMBLY_HPP
-namespace Kokkos {
-
-namespace Impl {
-
-#if !defined(_WIN32)
-struct cas128_t {
-  uint64_t lower;
-  uint64_t upper;
-
-  KOKKOS_INLINE_FUNCTION
-  cas128_t() {
-    lower = 0;
-    upper = 0;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  cas128_t(const cas128_t& a) {
-    lower = a.lower;
-    upper = a.upper;
-  }
-  KOKKOS_INLINE_FUNCTION
-  cas128_t(volatile cas128_t* a) {
-    lower = a->lower;
-    upper = a->upper;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  bool operator!=(const cas128_t& a) const {
-    return (lower != a.lower) || upper != a.upper;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator=(const cas128_t& a) {
-    lower = a.lower;
-    upper = a.upper;
-  }
-  KOKKOS_INLINE_FUNCTION
-  void operator=(const cas128_t& a) volatile {
-    lower = a.lower;
-    upper = a.upper;
-  }
-} __attribute__((__aligned__(16)));
-#endif
-
-#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64)
-inline cas128_t cas128(volatile cas128_t* ptr, cas128_t cmp, cas128_t swap) {
-  bool swapped = false;
-  __asm__ __volatile__(
-      "lock cmpxchg16b %1\n\t"
-      "setz %0"
-      : "=q"(swapped), "+m"(*ptr), "+d"(cmp.upper), "+a"(cmp.lower)
-      : "c"(swap.upper), "b"(swap.lower), "q"(swapped));
-  return cmp;
-}
-#endif
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
deleted file mode 100644
index 08091ab9ce4c56da255e153fac714e3962026d0f..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
+++ /dev/null
@@ -1,409 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-#include <xmmintrin.h>
-#endif
-
-#include <Kokkos_Macros.hpp>
-#if defined(KOKKOS_ATOMIC_HPP) && \
-    !defined(KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP)
-#define KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP
-
-#include <impl/Kokkos_Atomic_Memory_Order.hpp>
-#include <impl/Kokkos_Memory_Fence.hpp>
-
-#if defined(KOKKOS_ENABLE_CUDA)
-#include <Cuda/Kokkos_Cuda_Atomic_Intrinsics.hpp>
-#endif
-
-namespace Kokkos {
-
-//----------------------------------------------------------------------------
-// Cuda native CAS supports int, unsigned int, and unsigned long long int
-// (non-standard type). Must cast-away 'volatile' for the CAS call.
-
-#if defined(KOKKOS_ENABLE_CUDA)
-
-#if defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
-__inline__ __device__ int atomic_compare_exchange(volatile int* const dest,
-                                                  const int compare,
-                                                  const int val) {
-  return atomicCAS((int*)dest, compare, val);
-}
-
-__inline__ __device__ unsigned int atomic_compare_exchange(
-    volatile unsigned int* const dest, const unsigned int compare,
-    const unsigned int val) {
-  return atomicCAS((unsigned int*)dest, compare, val);
-}
-
-__inline__ __device__ unsigned long long int atomic_compare_exchange(
-    volatile unsigned long long int* const dest,
-    const unsigned long long int compare, const unsigned long long int val) {
-  return atomicCAS((unsigned long long int*)dest, compare, val);
-}
-
-template <typename T>
-__inline__ __device__ T atomic_compare_exchange(
-    volatile T* const dest, const T& compare,
-    std::enable_if_t<sizeof(T) == sizeof(int), const T&> val) {
-  const int tmp = atomicCAS((int*)dest, *((int*)&compare), *((int*)&val));
-  return *((T*)&tmp);
-}
-
-template <typename T>
-__inline__ __device__ T atomic_compare_exchange(
-    volatile T* const dest, const T& compare,
-    std::enable_if_t<sizeof(T) != sizeof(int) &&
-                         sizeof(T) == sizeof(unsigned long long int),
-                     const T&>
-        val) {
-  using type     = unsigned long long int;
-  const type tmp = atomicCAS((type*)dest, *((type*)&compare), *((type*)&val));
-  return *((T*)&tmp);
-}
-
-template <typename T>
-__inline__ __device__ T atomic_compare_exchange(
-    volatile T* const dest, const T& compare,
-    std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8), const T>& val) {
-  T return_val;
-  // This is a way to (hopefully) avoid dead lock in a warp
-  int done                 = 0;
-  unsigned int mask        = __activemask();
-  unsigned int active      = __ballot_sync(mask, 1);
-  unsigned int done_active = 0;
-  while (active != done_active) {
-    if (!done) {
-      if (Impl::lock_address_cuda_space((void*)dest)) {
-        Kokkos::memory_fence();
-        return_val = *dest;
-        if (return_val == compare) *dest = val;
-        Kokkos::memory_fence();
-        Impl::unlock_address_cuda_space((void*)dest);
-        done = 1;
-      }
-    }
-    done_active = __ballot_sync(mask, done);
-  }
-  return return_val;
-}
-#endif
-#endif
-
-//----------------------------------------------------------------------------
-// GCC native CAS supports int, long, unsigned int, unsigned long.
-// Intel native CAS support int and long with the same interface as GCC.
-#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
-#if defined(KOKKOS_ENABLE_WINDOWS_ATOMICS)
-// atomic_compare_exchange are already defined in Kokkos_Atomic_Windows.hpp
-#elif defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS)
-
-inline int atomic_compare_exchange(volatile int* const dest, const int compare,
-                                   const int val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-  return __sync_val_compare_and_swap(dest, compare, val);
-}
-
-inline long atomic_compare_exchange(volatile long* const dest,
-                                    const long compare, const long val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-  return __sync_val_compare_and_swap(dest, compare, val);
-}
-
-#if defined(KOKKOS_ENABLE_GNU_ATOMICS)
-
-// GCC supports unsigned
-
-inline unsigned int atomic_compare_exchange(volatile unsigned int* const dest,
-                                            const unsigned int compare,
-                                            const unsigned int val) {
-  return __sync_val_compare_and_swap(dest, compare, val);
-}
-
-inline unsigned long atomic_compare_exchange(volatile unsigned long* const dest,
-                                             const unsigned long compare,
-                                             const unsigned long val) {
-  return __sync_val_compare_and_swap(dest, compare, val);
-}
-
-inline unsigned long long atomic_compare_exchange(
-    volatile unsigned long long* const dest, const unsigned long long compare,
-    const unsigned long long val) {
-  return __sync_val_compare_and_swap(dest, compare, val);
-}
-
-#endif
-
-template <typename T>
-inline T atomic_compare_exchange(
-    volatile T* const dest, const T& compare,
-    std::enable_if_t<sizeof(T) == sizeof(int), const T&> val) {
-  union U {
-    int i;
-    T t;
-    KOKKOS_INLINE_FUNCTION U() {}
-  } tmp;
-
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-
-  tmp.i =
-      __sync_val_compare_and_swap((int*)dest, *((int*)&compare), *((int*)&val));
-  return tmp.t;
-}
-
-template <typename T>
-inline T atomic_compare_exchange(
-    volatile T* const dest, const T& compare,
-    std::enable_if_t<sizeof(T) != sizeof(int) && sizeof(T) == sizeof(long),
-                     const T&>
-        val) {
-  union U {
-    long i;
-    T t;
-    KOKKOS_INLINE_FUNCTION U() {}
-  } tmp;
-
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-
-  tmp.i = __sync_val_compare_and_swap((long*)dest, *((long*)&compare),
-                                      *((long*)&val));
-  return tmp.t;
-}
-
-#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64)
-template <typename T>
-inline T atomic_compare_exchange(
-    volatile T* const dest, const T& compare,
-    std::enable_if_t<sizeof(T) != sizeof(int) && sizeof(T) != sizeof(long) &&
-                         sizeof(T) == sizeof(Impl::cas128_t),
-                     const T&>
-        val) {
-  union U {
-    Impl::cas128_t i;
-    T t;
-    KOKKOS_INLINE_FUNCTION U() {}
-  } tmp;
-
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-
-  tmp.i = Impl::cas128((Impl::cas128_t*)dest, *((Impl::cas128_t*)&compare),
-                       *((Impl::cas128_t*)&val));
-  return tmp.t;
-}
-#endif
-
-template <typename T>
-inline T atomic_compare_exchange(
-    volatile T* const dest, const T compare,
-    std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8)
-#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64)
-                         && (sizeof(T) != 16)
-#endif
-                         ,
-                     const T>& val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-
-  while (!Impl::lock_address_host_space((void*)dest))
-    ;
-  Kokkos::memory_fence();
-  T return_val = *dest;
-  if (return_val == compare) {
-    // Don't use the following line of code here:
-    //
-    // const T tmp = *dest = val;
-    //
-    // Instead, put each assignment in its own statement.  This is
-    // because the overload of T::operator= for volatile *this should
-    // return void, not volatile T&.  See Kokkos #177:
-    //
-    // https://github.com/kokkos/kokkos/issues/177
-    *dest       = val;
-    const T tmp = *dest;
-#ifndef KOKKOS_COMPILER_CLANG
-    (void)tmp;
-#endif
-    Kokkos::memory_fence();
-  }
-  Impl::unlock_address_host_space((void*)dest);
-  return return_val;
-}
-//----------------------------------------------------------------------------
-
-#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS)
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION T atomic_compare_exchange(volatile T* const dest,
-                                                 const T compare, const T val) {
-  T retval;
-#pragma omp critical
-  {
-    retval = dest[0];
-    if (retval == compare) dest[0] = val;
-  }
-  return retval;
-}
-
-#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION T atomic_compare_exchange(volatile T* const dest_v,
-                                                 const T compare, const T val) {
-  T* dest  = const_cast<T*>(dest_v);
-  T retval = *dest;
-  if (retval == compare) *dest = val;
-  return retval;
-}
-
-#endif
-#endif
-
-// dummy for non-CUDA Kokkos headers being processed by NVCC
-#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA)
-template <typename T>
-__inline__ __device__ T atomic_compare_exchange(
-    volatile T* const, const Kokkos::Impl::type_identity_t<T>,
-    const Kokkos::Impl::type_identity_t<T>) {
-  return T();
-}
-#endif
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION bool atomic_compare_exchange_strong(
-    volatile T* const dest, const T compare, const T val) {
-  return compare == atomic_compare_exchange(dest, compare, val);
-}
-//----------------------------------------------------------------------------
-
-namespace Impl {
-// memory-ordered versions are in the Impl namespace
-
-template <class T, class MemoryOrderFailure>
-KOKKOS_INLINE_FUNCTION bool _atomic_compare_exchange_strong_fallback(
-    T* dest, T compare, T val, memory_order_seq_cst_t, MemoryOrderFailure) {
-  Kokkos::memory_fence();
-  auto rv = Kokkos::atomic_compare_exchange_strong(dest, compare, val);
-  Kokkos::memory_fence();
-  return rv;
-}
-
-template <class T, class MemoryOrderFailure>
-KOKKOS_INLINE_FUNCTION bool _atomic_compare_exchange_strong_fallback(
-    T* dest, T compare, T val, memory_order_acquire_t, MemoryOrderFailure) {
-  auto rv = Kokkos::atomic_compare_exchange_strong(dest, compare, val);
-  Kokkos::memory_fence();
-  return rv;
-}
-
-template <class T, class MemoryOrderFailure>
-KOKKOS_INLINE_FUNCTION bool _atomic_compare_exchange_strong_fallback(
-    T* dest, T compare, T val, memory_order_release_t, MemoryOrderFailure) {
-  Kokkos::memory_fence();
-  return Kokkos::atomic_compare_exchange_strong(dest, compare, val);
-}
-
-template <class T, class MemoryOrderFailure>
-KOKKOS_INLINE_FUNCTION bool _atomic_compare_exchange_strong_fallback(
-    T* dest, T compare, T val, memory_order_relaxed_t, MemoryOrderFailure) {
-  return Kokkos::atomic_compare_exchange_strong(dest, compare, val);
-}
-
-#if (defined(KOKKOS_ENABLE_GNU_ATOMICS) && !defined(__CUDA_ARCH__)) ||   \
-    (defined(KOKKOS_ENABLE_INTEL_ATOMICS) && !defined(__CUDA_ARCH__)) || \
-    defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS)
-
-#if defined(__CUDA_ARCH__)
-#define KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH __inline__ __device__
-#else
-#define KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH inline
-#endif
-
-template <class T, class MemoryOrderSuccess, class MemoryOrderFailure>
-KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH bool _atomic_compare_exchange_strong(
-    T* dest, T compare, T val, MemoryOrderSuccess, MemoryOrderFailure,
-    std::enable_if_t<
-        (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8 ||
-         sizeof(T) == 16) &&
-            std::is_same<typename MemoryOrderSuccess::memory_order,
-                         std::remove_cv_t<MemoryOrderSuccess>>::value &&
-            std::is_same<typename MemoryOrderFailure::memory_order,
-                         std::remove_cv_t<MemoryOrderFailure>>::value,
-        void const**> = nullptr) {
-  return __atomic_compare_exchange_n(dest, &compare, val, /* weak = */ false,
-                                     MemoryOrderSuccess::gnu_constant,
-                                     MemoryOrderFailure::gnu_constant);
-}
-
-template <class T, class MemoryOrderSuccess, class MemoryOrderFailure>
-KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH bool _atomic_compare_exchange_strong(
-    T* dest, T compare, T val, MemoryOrderSuccess order_success,
-    MemoryOrderFailure order_failure,
-    std::enable_if_t<
-        !(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 ||
-          sizeof(T) == 8 || sizeof(T) == 16) &&
-            std::is_same<typename MemoryOrderSuccess::memory_order,
-                         std::remove_cv_t<MemoryOrderSuccess>>::value &&
-            std::is_same<typename MemoryOrderFailure::memory_order,
-                         std::remove_cv_t<MemoryOrderFailure>>::value,
-        void const**> = nullptr) {
-  return _atomic_compare_exchange_fallback(dest, compare, val, order_success,
-                                           order_failure);
-}
-
-#else
-
-template <class T, class MemoryOrderSuccess, class MemoryOrderFailure>
-KOKKOS_INLINE_FUNCTION bool _atomic_compare_exchange_strong(
-    T* dest, T compare, T val, MemoryOrderSuccess order_success,
-    MemoryOrderFailure order_failure) {
-  return _atomic_compare_exchange_strong_fallback(dest, compare, val,
-                                                  order_success, order_failure);
-}
-
-#endif
-
-// TODO static asserts in overloads that don't make sense (as listed in
-// https://gcc.gnu.org/onlinedocs/gcc-5.2.0/gcc/_005f_005fatomic-Builtins.html)
-template <class T, class MemoryOrderSuccess, class MemoryOrderFailure>
-KOKKOS_FORCEINLINE_FUNCTION bool atomic_compare_exchange_strong(
-    T* dest, T compare, T val, MemoryOrderSuccess order_success,
-    MemoryOrderFailure order_failure) {
-  return _atomic_compare_exchange_strong(dest, compare, val, order_success,
-                                         order_failure);
-}
-
-}  // end namespace Impl
-
-}  // namespace Kokkos
-
-#if defined(KOKKOS_ENABLE_CUDA)
-#include <Cuda/Kokkos_Cuda_Atomic_Intrinsics_Restore_Builtins.hpp>
-#endif
-
-#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Weak.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Weak.hpp
deleted file mode 100644
index 884927783602b6cf427164b4635ff7a53db045f2..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Weak.hpp
+++ /dev/null
@@ -1,380 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-#include <xmmintrin.h>
-#endif
-
-#include <Kokkos_Macros.hpp>
-#include <Kokkos_Atomic.hpp>
-#ifndef KOKKOS_ATOMIC_COMPARE_EXCHANGE_WEAK_HPP
-#define KOKKOS_ATOMIC_COMPARE_EXCHANGE_WEAK_HPP
-
-namespace Kokkos {
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-// Cuda sm_70 or greater supports C++-like semantics directly
-
-#if defined(KOKKOS_ENABLE_CUDA)
-
-#if defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
-
-#if __CUDA_ARCH__ >= 700
-// See: https://github.com/ogiroux/freestanding
-#define kokkos_cuda_internal_cas_release_32(ptr, old, expected, desired) \
-  asm volatile("atom.cas.release.sys.b32 %0, [%1], %2, %3;"              \
-               : "=r"(old)                                               \
-               : "l"(ptr), "r"(expected), "r"(desired)                   \
-               : "memory")
-#define kokkos_cuda_internal_cas_acquire_32(ptr, old, expected, desired) \
-  asm volatile("atom.cas.acquire.sys.b32 %0, [%1], %2, %3;"              \
-               : "=r"(old)                                               \
-               : "l"(ptr), "r"(expected), "r"(desired)                   \
-               : "memory")
-#define kokkos_cuda_internal_cas_acq_rel_32(ptr, old, expected, desired) \
-  asm volatile("atom.cas.acq_rel.sys.b32 %0, [%1], %2, %3;"              \
-               : "=r"(old)                                               \
-               : "l"(ptr), "r"(expected), "r"(desired)                   \
-               : "memory")
-#define kokkos_cuda_internal_cas_relaxed_32(ptr, old, expected, desired) \
-  asm volatile("atom.cas.relaxed.sys.b32 %0, [%1], %2, %3;"              \
-               : "=r"(old)                                               \
-               : "l"(ptr), "r"(expected), "r"(desired)                   \
-               : "memory")
-#define kokkos_cuda_internal_fence_seq_cst() \
-  asm volatile("fence.sc.sys;" : : : "memory")
-#define kokkos_cuda_internal_fence_acq_rel() \
-  asm volatile("fence.acq_rel.sys;" : : : "memory")
-#else
-#define kokkos_cuda_internal_fence_acq_rel() \
-  asm volatile("membar.sys;" : : : "memory")
-#define kokkos_cuda_internal_fence_seq_cst() \
-  asm volatile("membar.sys;" : : : "memory")
-#endif
-
-// 32-bit version
-template <class T, std::enable_if_t<sizeof(T) == 4, int> = 0>
-__inline__ __device__ bool atomic_compare_exchange_weak(
-    T volatile* const dest, T* const expected, T const desired,
-    std::memory_order success_order = std::memory_order_seq_cst,
-    std::memory_order failure_order = std::memory_order_seq_cst) {
-  // TODO assert that success_order >= failure_order
-  // See: https://github.com/ogiroux/freestanding
-  int32_t tmp = 0;
-  int32_t old = 0;
-  memcpy(&tmp, &desired, sizeof(T));
-  memcpy(&old, expected, sizeof(T));
-  int32_t old_tmp = old;
-#if __CUDA_ARCH__ >= 700
-  switch (success_order) {
-    case std::memory_order_seq_cst:
-      // sequentially consistent is just an acquire with a seq_cst fence
-      kokkos_cuda_internal_fence_seq_cst();
-      kokkos_cuda_internal_cas_acquire_32((T*)dest, old, old_tmp, tmp);
-      break;
-    case std::memory_order_acquire:
-      kokkos_cuda_internal_cas_acquire_32((T*)dest, old, old_tmp, tmp);
-      break;
-    case std::memory_order_consume:
-      // same as acquire on PTX compatible platforms
-      kokkos_cuda_internal_cas_acquire_32((T*)dest, old, old_tmp, tmp);
-      break;
-    case std::memory_order_acq_rel:
-      kokkos_cuda_internal_cas_acq_rel_32((T*)dest, old, old_tmp, tmp);
-      break;
-    case std::memory_order_release:
-      kokkos_cuda_internal_cas_release_32((T*)dest, old, old_tmp, tmp);
-      break;
-    case std::memory_order_relaxed:
-      kokkos_cuda_internal_cas_relaxed_32((T*)dest, old, old_tmp, tmp);
-      break;
-  };
-#else
-  // All of the orders that require a fence before the relaxed atomic operation:
-  if (success_order == std::memory_order_release ||
-      success_order == std::memory_order_acq_rel) {
-    kokkos_cuda_internal_fence_acq_rel();
-  } else if (success_order == std::memory_order_seq_cst) {
-    kokkos_cuda_internal_fence_seq_cst();
-  }
-  // This is relaxed:
-  // Cuda API requires casting away volatile
-  atomicCAS((T*)dest, old_tmp, tmp);
-#endif
-  bool const rv = (old == old_tmp);
-#if __CUDA_ARCH__ < 700
-  if (rv) {
-    if (success_order == std::memory_order_acquire ||
-        success_order == std::memory_order_consume ||
-        success_order == std::memory_order_acq_rel) {
-      kokkos_cuda_internal_fence_acq_rel();
-    } else if (success_order == std::memory_order_seq_cst) {
-      kokkos_cuda_internal_fence_seq_cst();
-    }
-  } else {
-    if (failure_order == std::memory_order_acquire ||
-        failure_order == std::memory_order_consume ||
-        failure_order == std::memory_order_acq_rel) {
-      kokkos_cuda_internal_fence_acq_rel();
-    } else if (failure_order == std::memory_order_seq_cst) {
-      kokkos_cuda_internal_fence_seq_cst();
-    }
-  }
-#endif
-  memcpy(expected, &old, sizeof(T));
-  return rv;
-}
-
-// 64-bit version
-template <class T, std::enable_if_t<sizeof(T) == 8, int> = 0>
-bool atomic_compare_exchange_weak(
-    T volatile* const dest, T* const expected, T const desired,
-    std::memory_order success_order = std::memory_order_seq_cst,
-    std::memory_order failure_order = std::memory_order_seq_cst) {
-  // TODO assert that success_order >= failure_order
-  // See: https://github.com/ogiroux/freestanding
-  int64_t tmp = 0;
-  int64_t old = 0;
-  memcpy(&tmp, &desired, sizeof(T));
-  memcpy(&old, expected, sizeof(T));
-  int64_t old_tmp = old;
-#if __CUDA_ARCH__ >= 700
-  switch (success_order) {
-    case std::memory_order_seq_cst:
-      // sequentially consistent is just an acquire with a seq_cst fence
-      kokkos_cuda_internal_fence_seq_cst();
-      kokkos_cuda_internal_cas_acquire_64((T*)dest, old, old_tmp, tmp);
-      break;
-    case std::memory_order_acquire:
-      kokkos_cuda_internal_cas_acquire_64((T*)dest, old, old_tmp, tmp);
-      break;
-    case std::memory_order_consume:
-      // same as acquire on PTX compatible platforms
-      kokkos_cuda_internal_cas_acquire_64((T*)dest, old, old_tmp, tmp);
-      break;
-    case std::memory_order_acq_rel:
-      kokkos_cuda_internal_cas_acq_rel_64((T*)dest, old, old_tmp, tmp);
-      break;
-    case std::memory_order_release:
-      kokkos_cuda_internal_cas_release_64((T*)dest, old, old_tmp, tmp);
-      break;
-    case std::memory_order_relaxed:
-      kokkos_cuda_internal_cas_relaxed_64((T*)dest, old, old_tmp, tmp);
-      break;
-  };
-#else
-  // Cuda API requires casting away volatile
-  atomicCAS((T*)dest, old_tmp, tmp);
-#endif
-  bool const rv = (old == old_tmp);
-  memcpy(expected, &old, sizeof(T));
-  return rv;
-}
-
-#endif  // defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
-
-#endif  // defined( KOKKOS_ENABLE_CUDA )
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-// GCC native CAS supports int, long, unsigned int, unsigned long.
-// Intel native CAS support int and long with the same interface as GCC.
-#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
-#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS)
-
-inline int atomic_compare_exchange(volatile int* const dest, const int compare,
-                                   const int val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-  return __sync_val_compare_and_swap(dest, compare, val);
-}
-
-inline long atomic_compare_exchange(volatile long* const dest,
-                                    const long compare, const long val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-  return __sync_val_compare_and_swap(dest, compare, val);
-}
-
-#if defined(KOKKOS_ENABLE_GNU_ATOMICS)
-
-// GCC supports unsigned
-
-inline unsigned int atomic_compare_exchange(volatile unsigned int* const dest,
-                                            const unsigned int compare,
-                                            const unsigned int val) {
-  return __sync_val_compare_and_swap(dest, compare, val);
-}
-
-inline unsigned long atomic_compare_exchange(volatile unsigned long* const dest,
-                                             const unsigned long compare,
-                                             const unsigned long val) {
-  return __sync_val_compare_and_swap(dest, compare, val);
-}
-
-inline unsigned long long atomic_compare_exchange(
-    volatile unsigned long long* const dest, const unsigned long long compare,
-    const unsigned long long val) {
-  return __sync_val_compare_and_swap(dest, compare, val);
-}
-
-#endif
-
-template <typename T>
-inline T atomic_compare_exchange(
-    volatile T* const dest, const T& compare,
-    std::enable_if_t<sizeof(T) == sizeof(int), const T&> val) {
-  union U {
-    int i;
-    T t;
-    KOKKOS_INLINE_FUNCTION U() {}
-  } tmp;
-
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-
-  tmp.i =
-      __sync_val_compare_and_swap((int*)dest, *((int*)&compare), *((int*)&val));
-  return tmp.t;
-}
-
-template <typename T>
-inline T atomic_compare_exchange(
-    volatile T* const dest, const T& compare,
-    std::enable_if_t<sizeof(T) != sizeof(int) && sizeof(T) == sizeof(long),
-                     const T&>
-        val) {
-  union U {
-    long i;
-    T t;
-    KOKKOS_INLINE_FUNCTION U() {}
-  } tmp;
-
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-
-  tmp.i = __sync_val_compare_and_swap((long*)dest, *((long*)&compare),
-                                      *((long*)&val));
-  return tmp.t;
-}
-
-#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64)
-template <typename T>
-inline T atomic_compare_exchange(
-    volatile T* const dest, const T& compare,
-    std::enable_if_t<sizeof(T) != sizeof(int) && sizeof(T) != sizeof(long) &&
-                         sizeof(T) == sizeof(Impl::cas128_t),
-                     const T&>
-        val) {
-  union U {
-    Impl::cas128_t i;
-    T t;
-    KOKKOS_INLINE_FUNCTION U() {}
-  } tmp;
-
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-
-  tmp.i = Impl::cas128((Impl::cas128_t*)dest, *((Impl::cas128_t*)&compare),
-                       *((Impl::cas128_t*)&val));
-  return tmp.t;
-}
-#endif
-
-template <typename T>
-inline T atomic_compare_exchange(
-    volatile T* const dest, const T compare,
-    std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8)
-#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64)
-                         && (sizeof(T) != 16)
-#endif
-                         ,
-                     const T>& val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-
-  while (!Impl::lock_address_host_space((void*)dest))
-    ;
-  Kokkos::memory_fence();
-  T return_val = *dest;
-  if (return_val == compare) {
-    // Don't use the following line of code here:
-    //
-    // const T tmp = *dest = val;
-    //
-    // Instead, put each assignment in its own statement.  This is
-    // because the overload of T::operator= for volatile *this should
-    // return void, not volatile T&.  See Kokkos #177:
-    //
-    // https://github.com/kokkos/kokkos/issues/177
-    *dest       = val;
-    const T tmp = *dest;
-#ifndef KOKKOS_COMPILER_CLANG
-    (void)tmp;
-#endif
-    Kokkos::memory_fence();
-  }
-  Impl::unlock_address_host_space((void*)dest);
-  return return_val;
-}
-//----------------------------------------------------------------------------
-
-#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS)
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION T atomic_compare_exchange(volatile T* const dest,
-                                                 const T compare, const T val) {
-  T retval;
-#pragma omp critical
-  {
-    retval = dest[0];
-    if (retval == compare) dest[0] = val;
-  }
-  return retval;
-}
-
-#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION T atomic_compare_exchange(volatile T* const dest_v,
-                                                 const T compare, const T val) {
-  T* dest  = const_cast<T*>(dest_v);
-  T retval = *dest;
-  if (retval == compare) *dest = val;
-  return retval;
-}
-
-#endif
-#endif
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION bool atomic_compare_exchange_strong(
-    volatile T* const dest, const T compare, const T val) {
-  return compare == atomic_compare_exchange(dest, compare, val);
-}
-//----------------------------------------------------------------------------
-
-}  // namespace Kokkos
-
-#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Decrement.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Decrement.hpp
deleted file mode 100644
index aac5233b3a579662bfe0b2ff2fad75521bc747dd..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Decrement.hpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-#include <xmmintrin.h>
-#endif
-
-#include <Kokkos_Macros.hpp>
-#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_DECREMENT_HPP)
-#define KOKKOS_ATOMIC_DECREMENT_HPP
-
-#include "impl/Kokkos_Atomic_Fetch_Sub.hpp"
-
-namespace Kokkos {
-
-// Atomic decrement
-template <>
-KOKKOS_INLINE_FUNCTION void atomic_decrement<char>(volatile char* a) {
-#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) && \
-    !defined(_WIN32) && !defined(__CUDA_ARCH__)
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)a, _MM_HINT_ET0);
-#endif
-  __asm__ __volatile__("lock decb %0"
-                       : /* no output registers */
-                       : "m"(a[0])
-                       : "memory");
-#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
-  char* a_nv = const_cast<char*>(a);
-  --(*a_nv);
-#else
-  Kokkos::atomic_fetch_sub(a, char(1));
-#endif
-}
-
-template <>
-KOKKOS_INLINE_FUNCTION void atomic_decrement<short>(volatile short* a) {
-#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) && \
-    !defined(_WIN32) && !defined(__CUDA_ARCH__)
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)a, _MM_HINT_ET0);
-#endif
-  __asm__ __volatile__("lock decw %0"
-                       : /* no output registers */
-                       : "m"(a[0])
-                       : "memory");
-#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
-  short* a_nv = const_cast<short*>(a);
-  --(*a_nv);
-#else
-  Kokkos::atomic_fetch_sub(a, short(1));
-#endif
-}
-
-template <>
-KOKKOS_INLINE_FUNCTION void atomic_decrement<int>(volatile int* a) {
-#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) && \
-    !defined(_WIN32) && !defined(__CUDA_ARCH__)
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)a, _MM_HINT_ET0);
-#endif
-  __asm__ __volatile__("lock decl %0"
-                       : /* no output registers */
-                       : "m"(a[0])
-                       : "memory");
-#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
-  int* a_nv = const_cast<int*>(a);
-  --(*a_nv);
-#else
-  Kokkos::atomic_fetch_sub(a, int(1));
-#endif
-}
-
-template <>
-KOKKOS_INLINE_FUNCTION void atomic_decrement<long long int>(
-    volatile long long int* a) {
-#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) && \
-    !defined(_WIN32) && !defined(__CUDA_ARCH__)
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)a, _MM_HINT_ET0);
-#endif
-  __asm__ __volatile__("lock decq %0"
-                       : /* no output registers */
-                       : "m"(a[0])
-                       : "memory");
-#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
-  long long int* a_nv = const_cast<long long int*>(a);
-  --(*a_nv);
-#else
-  using T = long long int;
-  Kokkos::atomic_fetch_sub(a, T(1));
-#endif
-}
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION void atomic_decrement(volatile T* a) {
-#if defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
-  T* a_nv = const_cast<T*>(a);
-  --(*a_nv);
-#else
-  Kokkos::atomic_fetch_sub(a, T(1));
-#endif
-}
-
-}  // End of namespace Kokkos
-#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
deleted file mode 100644
index abfc1f631a5a428f83c61fa332bd169fc52a7201..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
+++ /dev/null
@@ -1,376 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-#include <xmmintrin.h>
-#endif
-
-#include <Kokkos_Macros.hpp>
-#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_EXCHANGE_HPP)
-#define KOKKOS_ATOMIC_EXCHANGE_HPP
-
-namespace Kokkos {
-
-//----------------------------------------------------------------------------
-
-#if defined(KOKKOS_ENABLE_CUDA)
-#if defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
-
-__inline__ __device__ int atomic_exchange(volatile int* const dest,
-                                          const int val) {
-  // return __iAtomicExch( (int*) dest , val );
-  return atomicExch((int*)dest, val);
-}
-
-__inline__ __device__ unsigned int atomic_exchange(
-    volatile unsigned int* const dest, const unsigned int val) {
-  // return __uAtomicExch( (unsigned int*) dest , val );
-  return atomicExch((unsigned int*)dest, val);
-}
-
-__inline__ __device__ unsigned long long int atomic_exchange(
-    volatile unsigned long long int* const dest,
-    const unsigned long long int val) {
-  // return __ullAtomicExch( (unsigned long long*) dest , val );
-  return atomicExch((unsigned long long*)dest, val);
-}
-
-/** \brief  Atomic exchange for any type with compatible size */
-template <typename T>
-__inline__ __device__ T
-atomic_exchange(volatile T* const dest,
-                std::enable_if_t<sizeof(T) == sizeof(int), const T&> val) {
-  // int tmp = __ullAtomicExch( (int*) dest , *((int*)&val) );
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-
-  int tmp = atomicExch(((int*)dest), *((int*)&val));
-  return *((T*)&tmp);
-}
-
-template <typename T>
-__inline__ __device__ T atomic_exchange(
-    volatile T* const dest,
-    std::enable_if_t<sizeof(T) != sizeof(int) &&
-                         sizeof(T) == sizeof(unsigned long long int),
-                     const T&>
-        val) {
-  using type = unsigned long long int;
-
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-
-  // type tmp = __ullAtomicExch( (type*) dest , *((type*)&val) );
-  type tmp = atomicExch(((type*)dest), *((type*)&val));
-  return *((T*)&tmp);
-}
-
-template <typename T>
-__inline__ __device__ T atomic_exchange(
-    volatile T* const dest,
-    std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8), const T>& val) {
-  T return_val;
-  // This is a way to (hopefully) avoid dead lock in a warp
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-
-  int done                 = 0;
-  unsigned int mask        = __activemask();
-  unsigned int active      = __ballot_sync(mask, 1);
-  unsigned int done_active = 0;
-  while (active != done_active) {
-    if (!done) {
-      if (Impl::lock_address_cuda_space((void*)dest)) {
-        Kokkos::memory_fence();
-        return_val = *dest;
-        *dest      = val;
-        Kokkos::memory_fence();
-        Impl::unlock_address_cuda_space((void*)dest);
-        done = 1;
-      }
-    }
-    done_active = __ballot_sync(mask, done);
-  }
-  return return_val;
-}
-/** \brief  Atomic exchange for any type with compatible size */
-template <typename T>
-__inline__ __device__ void atomic_assign(
-    volatile T* const dest,
-    std::enable_if_t<sizeof(T) == sizeof(int), const T&> val) {
-  // (void) __ullAtomicExch( (int*) dest , *((int*)&val) );
-  (void)atomicExch(((int*)dest), *((int*)&val));
-}
-
-template <typename T>
-__inline__ __device__ void atomic_assign(
-    volatile T* const dest,
-    std::enable_if_t<sizeof(T) != sizeof(int) &&
-                         sizeof(T) == sizeof(unsigned long long int),
-                     const T&>
-        val) {
-  using type = unsigned long long int;
-  // (void) __ullAtomicExch( (type*) dest , *((type*)&val) );
-  (void)atomicExch(((type*)dest), *((type*)&val));
-}
-
-template <typename T>
-__inline__ __device__ void atomic_assign(
-    volatile T* const dest,
-    std::enable_if_t<sizeof(T) != sizeof(int) &&
-                         sizeof(T) != sizeof(unsigned long long int),
-                     const T&>
-        val) {
-  (void)atomic_exchange(dest, val);
-}
-
-#endif
-#endif
-
-//----------------------------------------------------------------------------
-
-#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
-#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS)
-
-template <typename T>
-inline T atomic_exchange(
-    volatile T* const dest,
-    std::enable_if_t<sizeof(T) == sizeof(int) || sizeof(T) == sizeof(long),
-                     const T&>
-        val) {
-  using type = std::conditional_t<sizeof(T) == sizeof(int), int, long>;
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-
-  const type v = *((type*)&val);  // Extract to be sure the value doesn't change
-
-  type assumed;
-
-  union U {
-    T val_T;
-    type val_type;
-    inline U() {}
-  } old;
-
-  old.val_T = *dest;
-
-  do {
-    assumed = old.val_type;
-    old.val_type =
-        __sync_val_compare_and_swap((volatile type*)dest, assumed, v);
-  } while (assumed != old.val_type);
-
-  return old.val_T;
-}
-
-#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64)
-template <typename T>
-inline T atomic_exchange(
-    volatile T* const dest,
-    std::enable_if_t<sizeof(T) == sizeof(Impl::cas128_t), const T&> val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-
-  union U {
-    Impl::cas128_t i;
-    T t;
-    inline U() {}
-  } assume, oldval, newval;
-
-  oldval.t = *dest;
-  newval.t = val;
-
-  do {
-    assume.i = oldval.i;
-    oldval.i = Impl::cas128((volatile Impl::cas128_t*)dest, assume.i, newval.i);
-  } while (assume.i != oldval.i);
-
-  return oldval.t;
-}
-#endif
-
-//----------------------------------------------------------------------------
-
-template <typename T>
-inline T atomic_exchange(volatile T* const dest,
-                         std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8)
-#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64)
-                                              && (sizeof(T) != 16)
-#endif
-                                              ,
-                                          const T>& val) {
-  while (!Impl::lock_address_host_space((void*)dest))
-    ;
-  Kokkos::memory_fence();
-  T return_val = *dest;
-  // Don't use the following line of code here:
-  //
-  // const T tmp = *dest = val;
-  //
-  // Instead, put each assignment in its own statement.  This is
-  // because the overload of T::operator= for volatile *this should
-  // return void, not volatile T&.  See Kokkos #177:
-  //
-  // https://github.com/kokkos/kokkos/issues/177
-  *dest       = val;
-  const T tmp = *dest;
-#ifndef KOKKOS_COMPILER_CLANG
-  (void)tmp;
-#endif
-  Kokkos::memory_fence();
-  Impl::unlock_address_host_space((void*)dest);
-  return return_val;
-}
-
-template <typename T>
-inline void atomic_assign(
-    volatile T* const dest,
-    std::enable_if_t<sizeof(T) == sizeof(int) || sizeof(T) == sizeof(long),
-                     const T&>
-        val) {
-  using type = std::conditional_t<sizeof(T) == sizeof(int), int, long>;
-
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-
-  const type v = *((type*)&val);  // Extract to be sure the value doesn't change
-
-  type assumed;
-
-  union U {
-    T val_T;
-    type val_type;
-    inline U() {}
-  } old;
-
-  old.val_T = *dest;
-
-  do {
-    assumed = old.val_type;
-    old.val_type =
-        __sync_val_compare_and_swap((volatile type*)dest, assumed, v);
-  } while (assumed != old.val_type);
-}
-
-#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64)
-template <typename T>
-inline void atomic_assign(
-    volatile T* const dest,
-    std::enable_if_t<sizeof(T) == sizeof(Impl::cas128_t), const T&> val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-
-  union U {
-    Impl::cas128_t i;
-    T t;
-    inline U() {}
-  } assume, oldval, newval;
-
-  oldval.t = *dest;
-  newval.t = val;
-  do {
-    assume.i = oldval.i;
-    oldval.i = Impl::cas128((volatile Impl::cas128_t*)dest, assume.i, newval.i);
-  } while (assume.i != oldval.i);
-}
-#endif
-
-template <typename T>
-inline void atomic_assign(volatile T* const dest,
-                          std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8)
-#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64)
-                                               && (sizeof(T) != 16)
-#endif
-                                               ,
-                                           const T>& val) {
-  while (!Impl::lock_address_host_space((void*)dest))
-    ;
-  Kokkos::memory_fence();
-  // This is likely an aggregate type with a defined
-  // 'volatile T & operator = ( const T & ) volatile'
-  // member.  The volatile return value implicitly defines a
-  // dereference that some compilers (gcc 4.7.2) warn is being ignored.
-  // Suppress warning by casting return to void.
-  //(void)( *dest = val );
-  *dest = val;
-  Kokkos::memory_fence();
-  Impl::unlock_address_host_space((void*)dest);
-}
-//----------------------------------------------------------------------------
-
-#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS)
-
-template <typename T>
-inline T atomic_exchange(volatile T* const dest, const T val) {
-  T retval;
-  //#pragma omp atomic capture
-#pragma omp critical
-  {
-    retval  = dest[0];
-    dest[0] = val;
-  }
-  return retval;
-}
-
-template <typename T>
-inline void atomic_assign(volatile T* const dest, const T val) {
-  //#pragma omp atomic
-#pragma omp critical
-  { dest[0] = val; }
-}
-
-#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
-
-template <typename T>
-inline T atomic_exchange(volatile T* const dest_v, const T val) {
-  T* dest  = const_cast<T*>(dest_v);
-  T retval = *dest;
-  *dest    = val;
-  return retval;
-}
-
-template <typename T>
-inline void atomic_assign(volatile T* const dest_v, const T val) {
-  T* dest = const_cast<T*>(dest_v);
-  *dest   = val;
-}
-
-#endif
-#endif
-
-// dummy for non-CUDA Kokkos headers being processed by NVCC
-#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA)
-template <typename T>
-__inline__ __device__ T
-atomic_exchange(volatile T* const, const Kokkos::Impl::type_identity_t<T>) {
-  return T();
-}
-
-template <typename T>
-__inline__ __device__ void atomic_assign(
-    volatile T* const, const Kokkos::Impl::type_identity_t<T>) {}
-#endif
-
-}  // namespace Kokkos
-
-#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
deleted file mode 100644
index a8c421fbdbd84e169eb9046f5593573b5de4a189..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
+++ /dev/null
@@ -1,360 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-#include <xmmintrin.h>
-#endif
-
-#include <Kokkos_Macros.hpp>
-#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_FETCH_ADD_HPP)
-#define KOKKOS_ATOMIC_FETCH_ADD_HPP
-
-namespace Kokkos {
-
-//----------------------------------------------------------------------------
-
-#if defined(KOKKOS_ENABLE_CUDA)
-#if defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
-
-// Support for int, unsigned int, unsigned long long int, and float
-
-__inline__ __device__ int atomic_fetch_add(volatile int* const dest,
-                                           const int val) {
-  return atomicAdd((int*)dest, val);
-}
-
-__inline__ __device__ unsigned int atomic_fetch_add(
-    volatile unsigned int* const dest, const unsigned int val) {
-  return atomicAdd((unsigned int*)dest, val);
-}
-
-__inline__ __device__ unsigned long long int atomic_fetch_add(
-    volatile unsigned long long int* const dest,
-    const unsigned long long int val) {
-  return atomicAdd((unsigned long long int*)dest, val);
-}
-
-__inline__ __device__ float atomic_fetch_add(volatile float* const dest,
-                                             const float val) {
-  return atomicAdd((float*)dest, val);
-}
-
-#if (600 <= __CUDA_ARCH__)
-__inline__ __device__ double atomic_fetch_add(volatile double* const dest,
-                                              const double val) {
-  return atomicAdd((double*)dest, val);
-}
-#endif
-
-template <typename T>
-__inline__ __device__ T
-atomic_fetch_add(volatile T* const dest,
-                 std::enable_if_t<sizeof(T) == sizeof(int), const T> val) {
-  // to work around a bug in the clang cuda compiler, the name here needs to be
-  // different from the one internal to the other overloads
-  union U1 {
-    int i;
-    T t;
-    KOKKOS_INLINE_FUNCTION U1() {}
-  } assume, oldval, newval;
-
-  oldval.t = *dest;
-
-  do {
-    assume.i = oldval.i;
-    newval.t = assume.t + val;
-    oldval.i = atomicCAS((int*)dest, assume.i, newval.i);
-  } while (assume.i != oldval.i);
-
-  return oldval.t;
-}
-
-template <typename T>
-__inline__ __device__ T atomic_fetch_add(
-    volatile T* const dest,
-    std::enable_if_t<sizeof(T) != sizeof(int) &&
-                         sizeof(T) == sizeof(unsigned long long int),
-                     const T>
-        val) {
-  // to work around a bug in the clang cuda compiler, the name here needs to be
-  // different from the one internal to the other overloads
-  union U2 {
-    unsigned long long int i;
-    T t;
-    KOKKOS_INLINE_FUNCTION U2() {}
-  } assume, oldval, newval;
-
-  oldval.t = *dest;
-
-  do {
-    assume.i = oldval.i;
-    newval.t = assume.t + val;
-    oldval.i = atomicCAS((unsigned long long int*)dest, assume.i, newval.i);
-  } while (assume.i != oldval.i);
-
-  return oldval.t;
-}
-
-//----------------------------------------------------------------------------
-
-template <typename T>
-__inline__ __device__ T atomic_fetch_add(
-    volatile T* const dest,
-    std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8), const T>& val) {
-  T return_val;
-  // This is a way to (hopefully) avoid dead lock in a warp
-  int done                 = 0;
-  unsigned int mask        = __activemask();
-  unsigned int active      = __ballot_sync(mask, 1);
-  unsigned int done_active = 0;
-  while (active != done_active) {
-    if (!done) {
-      bool locked = Impl::lock_address_cuda_space((void*)dest);
-      if (locked) {
-        Kokkos::memory_fence();
-        return_val = *dest;
-        *dest      = return_val + val;
-        Kokkos::memory_fence();
-        Impl::unlock_address_cuda_space((void*)dest);
-        done = 1;
-      }
-    }
-
-    done_active = __ballot_sync(mask, done);
-  }
-  return return_val;
-}
-#endif
-#endif
-//----------------------------------------------------------------------------
-#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
-#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS)
-
-#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64)
-inline int atomic_fetch_add(volatile int* dest, const int val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-
-  int original = val;
-
-  __asm__ __volatile__("lock xadd %1, %0"
-                       : "+m"(*dest), "+r"(original)
-                       : "m"(*dest), "r"(original)
-                       : "memory");
-
-  return original;
-}
-#else
-inline int atomic_fetch_add(volatile int* const dest, const int val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-  return __sync_fetch_and_add(dest, val);
-}
-#endif
-
-inline long int atomic_fetch_add(volatile long int* const dest,
-                                 const long int val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-  return __sync_fetch_and_add(dest, val);
-}
-
-#if defined(KOKKOS_ENABLE_GNU_ATOMICS)
-
-inline unsigned int atomic_fetch_add(volatile unsigned int* const dest,
-                                     const unsigned int val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-  return __sync_fetch_and_add(dest, val);
-}
-
-inline unsigned long int atomic_fetch_add(
-    volatile unsigned long int* const dest, const unsigned long int val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-  return __sync_fetch_and_add(dest, val);
-}
-
-inline unsigned long long int atomic_fetch_add(
-    volatile unsigned long long int* const dest,
-    const unsigned long long int val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-  return __sync_fetch_and_add(dest, val);
-}
-
-#endif
-
-template <typename T>
-inline T atomic_fetch_add(
-    volatile T* const dest,
-    std::enable_if_t<sizeof(T) == sizeof(int), const T> val) {
-  union U {
-    int i;
-    T t;
-    inline U() {}
-  } assume, oldval, newval;
-
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-
-  oldval.t = *dest;
-
-  do {
-    assume.i = oldval.i;
-    newval.t = assume.t + val;
-    oldval.i = __sync_val_compare_and_swap((int*)dest, assume.i, newval.i);
-  } while (assume.i != oldval.i);
-
-  return oldval.t;
-}
-
-template <typename T>
-inline T atomic_fetch_add(
-    volatile T* const dest,
-    std::enable_if_t<sizeof(T) != sizeof(int) && sizeof(T) == sizeof(long),
-                     const T>
-        val) {
-  union U {
-    long i;
-    T t;
-    inline U() {}
-  } assume, oldval, newval;
-
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-
-  oldval.t = *dest;
-
-  do {
-    assume.i = oldval.i;
-    newval.t = assume.t + val;
-    oldval.i = __sync_val_compare_and_swap((long*)dest, assume.i, newval.i);
-  } while (assume.i != oldval.i);
-
-  return oldval.t;
-}
-
-#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64)
-template <typename T>
-inline T atomic_fetch_add(
-    volatile T* const dest,
-    std::enable_if_t<sizeof(T) != sizeof(int) && sizeof(T) != sizeof(long) &&
-                         sizeof(T) == sizeof(Impl::cas128_t),
-                     const T>
-        val) {
-  union U {
-    Impl::cas128_t i;
-    T t;
-    inline U() {}
-  } assume, oldval, newval;
-
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-
-  oldval.t = *dest;
-
-  do {
-    assume.i = oldval.i;
-    newval.t = assume.t + val;
-    oldval.i = Impl::cas128((volatile Impl::cas128_t*)dest, assume.i, newval.i);
-  } while (assume.i != oldval.i);
-
-  return oldval.t;
-}
-#endif
-
-//----------------------------------------------------------------------------
-
-template <typename T>
-inline T atomic_fetch_add(volatile T* const dest,
-                          std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8)
-#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64)
-                                               && (sizeof(T) != 16)
-#endif
-                                               ,
-                                           const T>& val) {
-  while (!Impl::lock_address_host_space((void*)dest))
-    ;
-  Kokkos::memory_fence();
-  T return_val = *dest;
-
-  // Don't use the following line of code here:
-  //
-  // const T tmp = *dest = return_val + val;
-  //
-  // Instead, put each assignment in its own statement.  This is
-  // because the overload of T::operator= for volatile *this should
-  // return void, not volatile T&.  See Kokkos #177:
-  //
-  // https://github.com/kokkos/kokkos/issues/177
-  *dest       = return_val + val;
-  const T tmp = *dest;
-  (void)tmp;
-  Kokkos::memory_fence();
-  Impl::unlock_address_host_space((void*)dest);
-
-  return return_val;
-}
-//----------------------------------------------------------------------------
-
-#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS)
-
-template <typename T>
-T atomic_fetch_add(volatile T* const dest, const T val) {
-  T retval;
-#pragma omp atomic capture
-  {
-    retval = dest[0];
-    dest[0] += val;
-  }
-  return retval;
-}
-
-#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
-
-template <typename T>
-T atomic_fetch_add(volatile T* const dest_v, std::add_const_t<T> val) {
-  T* dest  = const_cast<T*>(dest_v);
-  T retval = *dest;
-  *dest += val;
-  return retval;
-}
-
-#endif
-#endif
-//----------------------------------------------------------------------------
-
-// dummy for non-CUDA Kokkos headers being processed by NVCC
-#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA)
-template <typename T>
-__inline__ __device__ T atomic_fetch_add(volatile T* const,
-                                         Kokkos::Impl::type_identity_t<T>) {
-  return T();
-}
-#endif
-
-}  // namespace Kokkos
-#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp
deleted file mode 100644
index 25049db8f0b87ae33b94a9570f6f2e96f3d64982..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp
+++ /dev/null
@@ -1,164 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-#include <xmmintrin.h>
-#endif
-
-#include <Kokkos_Macros.hpp>
-#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_FETCH_AND_HPP)
-#define KOKKOS_ATOMIC_FETCH_AND_HPP
-
-namespace Kokkos {
-
-//----------------------------------------------------------------------------
-
-#if defined(KOKKOS_ENABLE_CUDA)
-#if defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
-
-// Support for int, unsigned int, unsigned long long int, and float
-
-__inline__ __device__ int atomic_fetch_and(volatile int* const dest,
-                                           const int val) {
-  return atomicAnd((int*)dest, val);
-}
-
-__inline__ __device__ unsigned int atomic_fetch_and(
-    volatile unsigned int* const dest, const unsigned int val) {
-  return atomicAnd((unsigned int*)dest, val);
-}
-
-#if defined(__CUDA_ARCH__) && (350 <= __CUDA_ARCH__)
-__inline__ __device__ unsigned long long int atomic_fetch_and(
-    volatile unsigned long long int* const dest,
-    const unsigned long long int val) {
-  return atomicAnd((unsigned long long int*)dest, val);
-}
-#endif
-#endif
-#endif
-
-// 08/05/20 Overload to work around https://bugs.llvm.org/show_bug.cgi?id=46922
-
-#if (defined(KOKKOS_ENABLE_CUDA) &&                   \
-     (defined(__CUDA_ARCH__) ||                       \
-      defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND))) || \
-    (defined(KOKKOS_ENABLE_HIP))
-__inline__ __device__ unsigned long atomic_fetch_and(
-    volatile unsigned long* const dest, const unsigned long val) {
-  return atomic_fetch_and<unsigned long>(dest, val);
-}
-__inline__ __device__ long atomic_fetch_and(volatile long* const dest,
-                                            long val) {
-  return atomic_fetch_and<long>(dest, val);
-}
-#endif
-
-//----------------------------------------------------------------------------
-#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
-#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS)
-
-inline int atomic_fetch_and(volatile int* const dest, const int val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-  return __sync_fetch_and_and(dest, val);
-}
-
-inline long int atomic_fetch_and(volatile long int* const dest,
-                                 const long int val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-  return __sync_fetch_and_and(dest, val);
-}
-
-#if defined(KOKKOS_ENABLE_GNU_ATOMICS)
-
-inline unsigned int atomic_fetch_and(volatile unsigned int* const dest,
-                                     const unsigned int val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-  return __sync_fetch_and_and(dest, val);
-}
-
-inline unsigned long int atomic_fetch_and(
-    volatile unsigned long int* const dest, const unsigned long int val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-  return __sync_fetch_and_and(dest, val);
-}
-
-inline unsigned long long int atomic_fetch_and(
-    volatile unsigned long long int* const dest,
-    const unsigned long long int val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-  return __sync_fetch_and_and(dest, val);
-}
-
-#endif
-
-//----------------------------------------------------------------------------
-
-#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS)
-
-template <typename T>
-T atomic_fetch_and(volatile T* const dest, const T val) {
-  T retval;
-#pragma omp atomic capture
-  {
-    retval = dest[0];
-    dest[0] &= val;
-  }
-  return retval;
-}
-
-#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
-
-template <typename T>
-T atomic_fetch_and(volatile T* const dest_v, const T val) {
-  T* dest  = const_cast<T*>(dest_v);
-  T retval = *dest;
-  *dest &= val;
-  return retval;
-}
-
-#endif
-#endif
-//----------------------------------------------------------------------------
-
-// dummy for non-CUDA Kokkos headers being processed by NVCC
-#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA)
-template <typename T>
-__inline__ __device__ T atomic_fetch_and(volatile T* const,
-                                         Kokkos::Impl::type_identity_t<T>) {
-  return T();
-}
-#endif
-
-// Simpler version of atomic_fetch_and without the fetch
-template <typename T>
-KOKKOS_INLINE_FUNCTION void atomic_and(volatile T* const dest, const T src) {
-  (void)atomic_fetch_and(dest, src);
-}
-
-}  // namespace Kokkos
-
-#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp
deleted file mode 100644
index fa581bc1552fc75b3ade87071a92755e70b3c95f..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp
+++ /dev/null
@@ -1,165 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-#include <xmmintrin.h>
-#endif
-
-#include <Kokkos_Macros.hpp>
-#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_FETCH_OR_HPP)
-#define KOKKOS_ATOMIC_FETCH_OR_HPP
-
-namespace Kokkos {
-
-//----------------------------------------------------------------------------
-
-#if defined(KOKKOS_ENABLE_CUDA)
-#if defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
-
-// Support for int, unsigned int, unsigned long long int, and float
-
-__inline__ __device__ int atomic_fetch_or(volatile int* const dest,
-                                          const int val) {
-  return atomicOr((int*)dest, val);
-}
-
-__inline__ __device__ unsigned int atomic_fetch_or(
-    volatile unsigned int* const dest, const unsigned int val) {
-  return atomicOr((unsigned int*)dest, val);
-}
-
-#if defined(__CUDA_ARCH__) && (350 <= __CUDA_ARCH__)
-__inline__ __device__ unsigned long long int atomic_fetch_or(
-    volatile unsigned long long int* const dest,
-    const unsigned long long int val) {
-  return atomicOr((unsigned long long int*)dest, val);
-}
-#endif
-#endif
-#endif
-
-// 08/05/20 Overload to work around https://bugs.llvm.org/show_bug.cgi?id=46922
-
-#if (defined(KOKKOS_ENABLE_CUDA) &&                   \
-     (defined(__CUDA_ARCH__) ||                       \
-      defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND))) || \
-    (defined(KOKKOS_ENABLE_HIP))
-__inline__ __device__ unsigned long atomic_fetch_or(
-    volatile unsigned long* const dest, const unsigned long val) {
-  return atomic_fetch_or<unsigned long>(dest, val);
-}
-
-__inline__ __device__ long atomic_fetch_or(volatile long* const dest,
-                                           long val) {
-  return atomic_fetch_or<long>(dest, val);
-}
-#endif
-
-//----------------------------------------------------------------------------
-#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
-#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS)
-
-inline int atomic_fetch_or(volatile int* const dest, const int val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-  return __sync_fetch_and_or(dest, val);
-}
-
-inline long int atomic_fetch_or(volatile long int* const dest,
-                                const long int val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-  return __sync_fetch_and_or(dest, val);
-}
-
-#if defined(KOKKOS_ENABLE_GNU_ATOMICS)
-
-inline unsigned int atomic_fetch_or(volatile unsigned int* const dest,
-                                    const unsigned int val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-  return __sync_fetch_and_or(dest, val);
-}
-
-inline unsigned long int atomic_fetch_or(volatile unsigned long int* const dest,
-                                         const unsigned long int val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-  return __sync_fetch_and_or(dest, val);
-}
-
-inline unsigned long long int atomic_fetch_or(
-    volatile unsigned long long int* const dest,
-    const unsigned long long int val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-  return __sync_fetch_and_or(dest, val);
-}
-
-#endif
-
-//----------------------------------------------------------------------------
-
-#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS)
-
-template <typename T>
-T atomic_fetch_or(volatile T* const dest, const T val) {
-  T retval;
-#pragma omp atomic capture
-  {
-    retval = dest[0];
-    dest[0] |= val;
-  }
-  return retval;
-}
-
-#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
-
-template <typename T>
-T atomic_fetch_or(volatile T* const dest_v, const T val) {
-  T* dest  = const_cast<T*>(dest_v);
-  T retval = *dest;
-  *dest |= val;
-  return retval;
-}
-
-#endif
-#endif
-//----------------------------------------------------------------------------
-
-// dummy for non-CUDA Kokkos headers being processed by NVCC
-#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA)
-template <typename T>
-__inline__ __device__ T atomic_fetch_or(volatile T* const,
-                                        Kokkos::Impl::type_identity_t<T>) {
-  return T();
-}
-#endif
-
-// Simpler version of atomic_fetch_or without the fetch
-template <typename T>
-KOKKOS_INLINE_FUNCTION void atomic_or(volatile T* const dest, const T src) {
-  (void)atomic_fetch_or(dest, src);
-}
-
-}  // namespace Kokkos
-
-#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp
deleted file mode 100644
index a4db7d7cf45fc5bf7a30dd1ac286007453554b1f..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp
+++ /dev/null
@@ -1,295 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-#include <xmmintrin.h>
-#endif
-
-#include <Kokkos_Macros.hpp>
-#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_FETCH_SUB_HPP)
-#define KOKKOS_ATOMIC_FETCH_SUB_HPP
-
-namespace Kokkos {
-
-//----------------------------------------------------------------------------
-
-#if defined(KOKKOS_ENABLE_CUDA)
-#if defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
-
-// Support for int, unsigned int, unsigned long long int, and float
-
-__inline__ __device__ int atomic_fetch_sub(volatile int* const dest,
-                                           const int val) {
-  return atomicSub((int*)dest, val);
-}
-
-__inline__ __device__ unsigned int atomic_fetch_sub(
-    volatile unsigned int* const dest, const unsigned int val) {
-  return atomicSub((unsigned int*)dest, val);
-}
-
-__inline__ __device__ unsigned int atomic_fetch_sub(
-    volatile int64_t* const dest, const int64_t val) {
-  return atomic_fetch_add(dest, -val);
-}
-
-__inline__ __device__ unsigned int atomic_fetch_sub(volatile float* const dest,
-                                                    const float val) {
-  return atomicAdd((float*)dest, -val);
-}
-
-#if (600 <= __CUDA_ARCH__)
-__inline__ __device__ unsigned int atomic_fetch_sub(volatile double* const dest,
-                                                    const double val) {
-  return atomicAdd((double*)dest, -val);
-}
-#endif
-
-template <typename T>
-__inline__ __device__ T
-atomic_fetch_sub(volatile T* const dest,
-                 std::enable_if_t<sizeof(T) == sizeof(int), const T> val) {
-  union U {
-    int i;
-    T t;
-    KOKKOS_INLINE_FUNCTION U() {}
-  } oldval, assume, newval;
-
-  oldval.t = *dest;
-
-  do {
-    assume.i = oldval.i;
-    newval.t = assume.t - val;
-    oldval.i = atomicCAS((int*)dest, assume.i, newval.i);
-  } while (assume.i != oldval.i);
-
-  return oldval.t;
-}
-
-template <typename T>
-__inline__ __device__ T atomic_fetch_sub(
-    volatile T* const dest,
-    std::enable_if_t<sizeof(T) != sizeof(int) &&
-                         sizeof(T) == sizeof(unsigned long long int),
-                     const T>
-        val) {
-  union U {
-    unsigned long long int i;
-    T t;
-    KOKKOS_INLINE_FUNCTION U() {}
-  } oldval, assume, newval;
-
-  oldval.t = *dest;
-
-  do {
-    assume.i = oldval.i;
-    newval.t = assume.t - val;
-    oldval.i = atomicCAS((unsigned long long int*)dest, assume.i, newval.i);
-  } while (assume.i != oldval.i);
-
-  return oldval.t;
-}
-
-//----------------------------------------------------------------------------
-
-template <typename T>
-__inline__ __device__ T atomic_fetch_sub(
-    volatile T* const dest,
-    std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8), const T>& val) {
-  T return_val;
-  // This is a way to (hopefully) avoid dead lock in a warp
-  int done                 = 0;
-  unsigned int mask        = __activemask();
-  unsigned int active      = __ballot_sync(mask, 1);
-  unsigned int done_active = 0;
-  while (active != done_active) {
-    if (!done) {
-      if (Impl::lock_address_cuda_space((void*)dest)) {
-        Kokkos::memory_fence();
-        return_val = *dest;
-        *dest      = return_val - val;
-        Kokkos::memory_fence();
-        Impl::unlock_address_cuda_space((void*)dest);
-        done = 1;
-      }
-    }
-    done_active = __ballot_sync(mask, done);
-  }
-  return return_val;
-}
-#endif
-#endif
-//----------------------------------------------------------------------------
-#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
-#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS)
-
-inline int atomic_fetch_sub(volatile int* const dest, const int val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-  return __sync_fetch_and_sub(dest, val);
-}
-
-inline long int atomic_fetch_sub(volatile long int* const dest,
-                                 const long int val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-  return __sync_fetch_and_sub(dest, val);
-}
-
-#if defined(KOKKOS_ENABLE_GNU_ATOMICS)
-
-inline unsigned int atomic_fetch_sub(volatile unsigned int* const dest,
-                                     const unsigned int val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-  return __sync_fetch_and_sub(dest, val);
-}
-
-inline unsigned long int atomic_fetch_sub(
-    volatile unsigned long int* const dest, const unsigned long int val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-  return __sync_fetch_and_sub(dest, val);
-}
-
-inline unsigned long long int atomic_fetch_sub(
-    volatile unsigned long long int* const dest,
-    const unsigned long long int val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-  return __sync_fetch_and_sub(dest, val);
-}
-
-#endif
-
-template <typename T>
-inline T atomic_fetch_sub(
-    volatile T* const dest,
-    std::enable_if_t<sizeof(T) == sizeof(int), const T> val) {
-  union U {
-    int i;
-    T t;
-    KOKKOS_INLINE_FUNCTION U() {}
-  } oldval, assume, newval;
-
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-
-  oldval.t = *dest;
-
-  do {
-    assume.i = oldval.i;
-    newval.t = assume.t - val;
-    oldval.i = __sync_val_compare_and_swap((int*)dest, assume.i, newval.i);
-  } while (assume.i != oldval.i);
-
-  return oldval.t;
-}
-
-template <typename T>
-inline T atomic_fetch_sub(
-    volatile T* const dest,
-    std::enable_if_t<sizeof(T) != sizeof(int) && sizeof(T) == sizeof(long),
-                     const T>
-        val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-
-  union U {
-    long i;
-    T t;
-    KOKKOS_INLINE_FUNCTION U() {}
-  } oldval, assume, newval;
-
-  oldval.t = *dest;
-
-  do {
-    assume.i = oldval.i;
-    newval.t = assume.t - val;
-    oldval.i = __sync_val_compare_and_swap((long*)dest, assume.i, newval.i);
-  } while (assume.i != oldval.i);
-
-  return oldval.t;
-}
-
-//----------------------------------------------------------------------------
-
-template <typename T>
-inline T atomic_fetch_sub(
-    volatile T* const dest,
-    std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8), const T>& val) {
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
-#endif
-
-  while (!Impl::lock_address_host_space((void*)dest))
-    ;
-  Kokkos::memory_fence();
-  T return_val = *dest;
-  *dest        = return_val - val;
-  Kokkos::memory_fence();
-  Impl::unlock_address_host_space((void*)dest);
-  return return_val;
-}
-
-//----------------------------------------------------------------------------
-
-#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS)
-
-template <typename T>
-T atomic_fetch_sub(volatile T* const dest, const T val) {
-  T retval;
-#pragma omp atomic capture
-  {
-    retval = dest[0];
-    dest[0] -= val;
-  }
-  return retval;
-}
-
-#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
-
-template <typename T>
-T atomic_fetch_sub(volatile T* const dest_v, const T val) {
-  T* dest  = const_cast<T*>(dest_v);
-  T retval = *dest;
-  *dest -= val;
-  return retval;
-}
-
-#endif
-#endif
-
-// dummy for non-CUDA Kokkos headers being processed by NVCC
-#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA)
-template <typename T>
-__inline__ __device__ T atomic_fetch_sub(volatile T* const,
-                                         Kokkos::Impl::type_identity_t<T>) {
-  return T();
-}
-#endif
-
-}  // namespace Kokkos
-
-#include <impl/Kokkos_Atomic_Assembly.hpp>
-#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
deleted file mode 100644
index 69d101fb8ea09706a3494fa974558a92a89d40c3..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
+++ /dev/null
@@ -1,527 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#include <Kokkos_Macros.hpp>
-#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_GENERIC_HPP)
-#define KOKKOS_ATOMIC_GENERIC_HPP
-#include <Kokkos_Macros.hpp>
-
-// Combination operands to be used in an Compare and Exchange based atomic
-// operation
-namespace Kokkos {
-namespace Impl {
-
-template <class Op, class Scalar1, class Scalar2, class Enable = bool>
-struct _check_early_exit_impl {
-  KOKKOS_FORCEINLINE_FUNCTION
-  static constexpr bool check(Op const&, Scalar1 const&,
-                              Scalar2 const&) noexcept {
-    return false;
-  }
-};
-
-template <class Op, class Scalar1, class Scalar2>
-struct _check_early_exit_impl<
-    Op, Scalar1, Scalar2,
-    decltype(std::declval<Op const&>().check_early_exit(
-        std::declval<Scalar1 const&>(), std::declval<Scalar2 const&>()))> {
-  KOKKOS_FORCEINLINE_FUNCTION
-  static constexpr bool check(Op const& op, Scalar1 const& v1,
-                              Scalar2 const& v2) {
-    return op.check_early_exit(v1, v2);
-  }
-};
-
-template <class Op, class Scalar1, class Scalar2>
-KOKKOS_FORCEINLINE_FUNCTION constexpr bool check_early_exit(
-    Op const& op, Scalar1 const& v1, Scalar2 const& v2) noexcept {
-  return _check_early_exit_impl<Op, Scalar1, Scalar2>::check(op, v1, v2);
-}
-
-template <class Scalar1, class Scalar2>
-struct MaxOper {
-  KOKKOS_FORCEINLINE_FUNCTION
-  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
-    return (val1 > val2 ? val1 : val2);
-  }
-  KOKKOS_FORCEINLINE_FUNCTION
-  static constexpr bool check_early_exit(Scalar1 const& val1,
-                                         Scalar2 const& val2) noexcept {
-    return (val1 > val2);
-  }
-};
-
-template <class Scalar1, class Scalar2>
-struct MinOper {
-  KOKKOS_FORCEINLINE_FUNCTION
-  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
-    return (val1 < val2 ? val1 : val2);
-  }
-  KOKKOS_FORCEINLINE_FUNCTION
-  static constexpr bool check_early_exit(Scalar1 const& val1,
-                                         Scalar2 const& val2) noexcept {
-    return (val1 < val2);
-  }
-};
-
-template <class Scalar1, class Scalar2>
-struct AddOper {
-  KOKKOS_FORCEINLINE_FUNCTION
-  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
-    return val1 + val2;
-  }
-};
-
-template <class Scalar1, class Scalar2>
-struct SubOper {
-  KOKKOS_FORCEINLINE_FUNCTION
-  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
-    return val1 - val2;
-  }
-};
-
-template <class Scalar1, class Scalar2>
-struct MulOper {
-  KOKKOS_FORCEINLINE_FUNCTION
-  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
-    return val1 * val2;
-  }
-};
-
-template <class Scalar1, class Scalar2>
-struct DivOper {
-  KOKKOS_FORCEINLINE_FUNCTION
-  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
-    return val1 / val2;
-  }
-};
-
-template <class Scalar1, class Scalar2>
-struct ModOper {
-  KOKKOS_FORCEINLINE_FUNCTION
-  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
-    return val1 % val2;
-  }
-};
-
-template <class Scalar1, class Scalar2>
-struct AndOper {
-  KOKKOS_FORCEINLINE_FUNCTION
-  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
-    return val1 & val2;
-  }
-};
-
-template <class Scalar1, class Scalar2>
-struct OrOper {
-  KOKKOS_FORCEINLINE_FUNCTION
-  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
-    return val1 | val2;
-  }
-};
-
-template <class Scalar1, class Scalar2>
-struct XorOper {
-  KOKKOS_FORCEINLINE_FUNCTION
-  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
-    return val1 ^ val2;
-  }
-};
-
-template <class Scalar1, class Scalar2>
-struct LShiftOper {
-  KOKKOS_FORCEINLINE_FUNCTION
-  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
-    return val1 << val2;
-  }
-};
-
-template <class Scalar1, class Scalar2>
-struct RShiftOper {
-  KOKKOS_FORCEINLINE_FUNCTION
-  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
-    return val1 >> val2;
-  }
-};
-
-template <class Oper, typename T>
-KOKKOS_INLINE_FUNCTION T atomic_fetch_oper(
-    const Oper& op, volatile T* const dest,
-    std::enable_if_t<sizeof(T) != sizeof(int) &&
-                         sizeof(T) == sizeof(unsigned long long int),
-                     const T>
-        val) {
-  union U {
-    unsigned long long int i;
-    T t;
-    KOKKOS_INLINE_FUNCTION U() {}
-  } oldval, assume, newval;
-
-  oldval.t = *dest;
-
-  do {
-    if (check_early_exit(op, oldval.t, val)) return oldval.t;
-    assume.i = oldval.i;
-    newval.t = op.apply(assume.t, val);
-    oldval.i = Kokkos::atomic_compare_exchange((unsigned long long int*)dest,
-                                               assume.i, newval.i);
-  } while (assume.i != oldval.i);
-
-  return oldval.t;
-}
-
-template <class Oper, typename T>
-KOKKOS_INLINE_FUNCTION T atomic_oper_fetch(
-    const Oper& op, volatile T* const dest,
-    std::enable_if_t<sizeof(T) != sizeof(int) &&
-                         sizeof(T) == sizeof(unsigned long long int),
-                     const T>
-        val) {
-  union U {
-    unsigned long long int i;
-    T t;
-    KOKKOS_INLINE_FUNCTION U() {}
-  } oldval, assume, newval;
-
-  oldval.t = *dest;
-
-  do {
-    if (check_early_exit(op, oldval.t, val)) return oldval.t;
-    assume.i = oldval.i;
-    newval.t = op.apply(assume.t, val);
-    oldval.i = Kokkos::atomic_compare_exchange((unsigned long long int*)dest,
-                                               assume.i, newval.i);
-  } while (assume.i != oldval.i);
-
-  return newval.t;
-}
-
-template <class Oper, typename T>
-KOKKOS_INLINE_FUNCTION T
-atomic_fetch_oper(const Oper& op, volatile T* const dest,
-                  std::enable_if_t<sizeof(T) == sizeof(int), const T> val) {
-  union U {
-    int i;
-    T t;
-    KOKKOS_INLINE_FUNCTION U() {}
-  } oldval, assume, newval;
-
-  oldval.t = *dest;
-
-  do {
-    if (check_early_exit(op, oldval.t, val)) return oldval.t;
-    assume.i = oldval.i;
-    newval.t = op.apply(assume.t, val);
-    oldval.i = Kokkos::atomic_compare_exchange((int*)dest, assume.i, newval.i);
-  } while (assume.i != oldval.i);
-
-  return oldval.t;
-}
-
-template <class Oper, typename T>
-KOKKOS_INLINE_FUNCTION T
-atomic_oper_fetch(const Oper& op, volatile T* const dest,
-                  std::enable_if_t<sizeof(T) == sizeof(int), const T> val) {
-  union U {
-    int i;
-    T t;
-    KOKKOS_INLINE_FUNCTION U() {}
-  } oldval, assume, newval;
-
-  oldval.t = *dest;
-
-  do {
-    if (check_early_exit(op, oldval.t, val)) return oldval.t;
-    assume.i = oldval.i;
-    newval.t = op.apply(assume.t, val);
-    oldval.i = Kokkos::atomic_compare_exchange((int*)dest, assume.i, newval.i);
-  } while (assume.i != oldval.i);
-
-  return newval.t;
-}
-
-template <class Oper, typename T>
-KOKKOS_INLINE_FUNCTION T atomic_fetch_oper(
-    const Oper& op, volatile T* const dest,
-    std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8), const T> val) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-  while (!Impl::lock_address_host_space((void*)dest))
-    ;
-  Kokkos::memory_fence();
-  T return_val = *dest;
-  *dest        = op.apply(return_val, val);
-  Kokkos::memory_fence();
-  Impl::unlock_address_host_space((void*)dest);
-  return return_val;
-#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA)
-  // This is a way to (hopefully) avoid dead lock in a warp
-  T return_val;
-  int done                 = 0;
-  unsigned int mask        = __activemask();
-  unsigned int active      = __ballot_sync(mask, 1);
-  unsigned int done_active = 0;
-  while (active != done_active) {
-    if (!done) {
-      if (Impl::lock_address_cuda_space((void*)dest)) {
-        Kokkos::memory_fence();
-        return_val = *dest;
-        *dest      = op.apply(return_val, val);
-        Kokkos::memory_fence();
-        Impl::unlock_address_cuda_space((void*)dest);
-        done = 1;
-      }
-    }
-    done_active = __ballot_sync(mask, done);
-  }
-  return return_val;
-#elif defined(__HIP_DEVICE_COMPILE__)
-  T return_val             = *dest;
-  int done                 = 0;
-  unsigned int active      = __ballot(1);
-  unsigned int done_active = 0;
-  while (active != done_active) {
-    if (!done) {
-      if (Impl::lock_address_hip_space((void*)dest)) {
-        return_val = *dest;
-        *dest      = op.apply(return_val, val);
-        Impl::unlock_address_hip_space((void*)dest);
-        done = 1;
-      }
-    }
-    done_active = __ballot(done);
-  }
-  return return_val;
-#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL)
-  // FIXME_SYCL
-  Kokkos::abort("Not implemented!");
-  (void)op;
-  (void)dest;
-  (void)val;
-  return 0;
-#endif
-}
-
-template <class Oper, typename T>
-KOKKOS_INLINE_FUNCTION T
-atomic_oper_fetch(const Oper& op, volatile T* const dest,
-                  std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8)
-#if defined(KOKKOS_ENABLE_ASM) && \
-    defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
-                                       && (sizeof(T) != 16)
-#endif
-                                       ,
-                                   const T>& val) {
-
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-  while (!Impl::lock_address_host_space((void*)dest))
-    ;
-  Kokkos::memory_fence();
-  T return_val = op.apply(*dest, val);
-  *dest        = return_val;
-  Kokkos::memory_fence();
-  Impl::unlock_address_host_space((void*)dest);
-  return return_val;
-#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA)
-  T return_val;
-  // This is a way to (hopefully) avoid dead lock in a warp
-  int done                 = 0;
-  unsigned int mask        = __activemask();
-  unsigned int active      = __ballot_sync(mask, 1);
-  unsigned int done_active = 0;
-  while (active != done_active) {
-    if (!done) {
-      if (Impl::lock_address_cuda_space((void*)dest)) {
-        Kokkos::memory_fence();
-        return_val = op.apply(*dest, val);
-        *dest      = return_val;
-        Kokkos::memory_fence();
-        Impl::unlock_address_cuda_space((void*)dest);
-        done = 1;
-      }
-    }
-    done_active = __ballot_sync(mask, done);
-  }
-  return return_val;
-#elif defined(__HIP_DEVICE_COMPILE__)
-  T return_val;
-  int done                 = 0;
-  unsigned int active      = __ballot(1);
-  unsigned int done_active = 0;
-  while (active != done_active) {
-    if (!done) {
-      if (Impl::lock_address_hip_space((void*)dest)) {
-        return_val = op.apply(*dest, val);
-        *dest      = return_val;
-        Impl::unlock_address_hip_space((void*)dest);
-        done = 1;
-      }
-    }
-    done_active = __ballot(done);
-  }
-  return return_val;
-#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL)
-  // FIXME_SYCL
-  std::abort();
-  (void)op;
-  (void)dest;
-  (void)val;
-  return 0;
-#endif
-}
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-namespace Kokkos {
-
-// Fetch_Oper atomics: return value before operation
-template <typename T>
-KOKKOS_INLINE_FUNCTION T atomic_fetch_max(volatile T* const dest, const T val) {
-  return Impl::atomic_fetch_oper(Impl::MaxOper<T, const T>(), dest, val);
-}
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION T atomic_fetch_min(volatile T* const dest, const T val) {
-  return Impl::atomic_fetch_oper(Impl::MinOper<T, const T>(), dest, val);
-}
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION T atomic_fetch_mul(volatile T* const dest, const T val) {
-  return Impl::atomic_fetch_oper(Impl::MulOper<T, const T>(), dest, val);
-}
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION T atomic_fetch_div(volatile T* const dest, const T val) {
-  return Impl::atomic_fetch_oper(Impl::DivOper<T, const T>(), dest, val);
-}
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION T atomic_fetch_mod(volatile T* const dest, const T val) {
-  return Impl::atomic_fetch_oper(Impl::ModOper<T, const T>(), dest, val);
-}
-
-#if !defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION T atomic_fetch_and(volatile T* const dest, const T val) {
-  return Impl::atomic_fetch_oper(Impl::AndOper<T, const T>(), dest, val);
-}
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION T atomic_fetch_or(volatile T* const dest, const T val) {
-  return Impl::atomic_fetch_oper(Impl::OrOper<T, const T>(), dest, val);
-}
-
-#endif
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION T atomic_fetch_xor(volatile T* const dest, const T val) {
-  return Impl::atomic_fetch_oper(Impl::XorOper<T, const T>(), dest, val);
-}
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION T atomic_fetch_lshift(volatile T* const dest,
-                                             const unsigned int val) {
-  return Impl::atomic_fetch_oper(Impl::LShiftOper<T, const unsigned int>(),
-                                 dest, val);
-}
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION T atomic_fetch_rshift(volatile T* const dest,
-                                             const unsigned int val) {
-  return Impl::atomic_fetch_oper(Impl::RShiftOper<T, const unsigned int>(),
-                                 dest, val);
-}
-
-// Oper Fetch atomics: return value after operation
-template <typename T>
-KOKKOS_INLINE_FUNCTION T atomic_max_fetch(volatile T* const dest, const T val) {
-  return Impl::atomic_oper_fetch(Impl::MaxOper<T, const T>(), dest, val);
-}
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION T atomic_min_fetch(volatile T* const dest, const T val) {
-  return Impl::atomic_oper_fetch(Impl::MinOper<T, const T>(), dest, val);
-}
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION T atomic_mul_fetch(volatile T* const dest, const T val) {
-  return Impl::atomic_oper_fetch(Impl::MulOper<T, const T>(), dest, val);
-}
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION T atomic_div_fetch(volatile T* const dest, const T val) {
-  return Impl::atomic_oper_fetch(Impl::DivOper<T, const T>(), dest, val);
-}
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION T atomic_mod_fetch(volatile T* const dest, const T val) {
-  return Impl::atomic_oper_fetch(Impl::ModOper<T, const T>(), dest, val);
-}
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION T atomic_and_fetch(volatile T* const dest, const T val) {
-  return Impl::atomic_oper_fetch(Impl::AndOper<T, const T>(), dest, val);
-}
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION T atomic_or_fetch(volatile T* const dest, const T val) {
-  return Impl::atomic_oper_fetch(Impl::OrOper<T, const T>(), dest, val);
-}
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION T atomic_xor_fetch(volatile T* const dest, const T val) {
-  return Impl::atomic_oper_fetch(Impl::XorOper<T, const T>(), dest, val);
-}
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION T atomic_lshift_fetch(volatile T* const dest,
-                                             const unsigned int val) {
-  return Impl::atomic_oper_fetch(Impl::LShiftOper<T, const unsigned int>(),
-                                 dest, val);
-}
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION T atomic_rshift_fetch(volatile T* const dest,
-                                             const unsigned int val) {
-  return Impl::atomic_oper_fetch(Impl::RShiftOper<T, const unsigned int>(),
-                                 dest, val);
-}
-
-#ifdef _WIN32
-template <typename T>
-KOKKOS_INLINE_FUNCTION T atomic_add_fetch(volatile T* const dest, const T val) {
-  return Impl::atomic_oper_fetch(Impl::AddOper<T, const T>(), dest, val);
-}
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION T atomic_sub_fetch(volatile T* const dest, const T val) {
-  return Impl::atomic_oper_fetch(Impl::SubOper<T, const T>(), dest, val);
-}
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION T atomic_fetch_add(volatile T* const dest, const T val) {
-  return Impl::atomic_fetch_oper(Impl::AddOper<T, const T>(), dest, val);
-}
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION T atomic_fetch_sub(volatile T* const dest, const T val) {
-  return Impl::atomic_fetch_oper(Impl::SubOper<T, const T>(), dest, val);
-}
-#endif
-
-}  // namespace Kokkos
-#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic_Secondary.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic_Secondary.hpp
deleted file mode 100644
index af43bf66795b88369465c89e16f460df33a92030..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic_Secondary.hpp
+++ /dev/null
@@ -1,58 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_GENERIC_SECONDARY_HPP)
-#define KOKKOS_ATOMIC_GENERIC_SECONDARY_HPP
-#include <Kokkos_Macros.hpp>
-
-namespace Kokkos {
-
-#ifndef KOKKOS_ENABLE_SERIAL_ATOMICS
-template <typename T>
-KOKKOS_INLINE_FUNCTION T atomic_exchange(volatile T* const dest, const T val) {
-  T oldval = *dest;
-  T assume;
-  do {
-    assume = oldval;
-    oldval = atomic_compare_exchange(dest, assume, val);
-  } while (assume != oldval);
-
-  return oldval;
-}
-#endif
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION void atomic_add(volatile T* const dest, const T val) {
-  (void)atomic_fetch_add(dest, val);
-}
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION void atomic_sub(volatile T* const dest, const T val) {
-  (void)atomic_fetch_sub(dest, val);
-}
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION void atomic_mul(volatile T* const dest, const T val) {
-  (void)atomic_fetch_mul(dest, val);
-}
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION void atomic_div(volatile T* const dest, const T val) {
-  (void)atomic_fetch_div(dest, val);
-}
-
-}  // namespace Kokkos
-#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Increment.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Increment.hpp
deleted file mode 100644
index b40e7dfecb2693c3bf434326395e2882721cb841..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Increment.hpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-#include <xmmintrin.h>
-#endif
-
-#include <Kokkos_Macros.hpp>
-#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_INCREMENT_HPP)
-#define KOKKOS_ATOMIC_INCREMENT_HPP
-
-namespace Kokkos {
-
-// Atomic increment
-template <>
-KOKKOS_INLINE_FUNCTION void atomic_increment<char>(volatile char* a) {
-#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) && \
-    !defined(_WIN32) && !defined(__CUDA_ARCH__)
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)a, _MM_HINT_ET0);
-#endif
-  __asm__ __volatile__("lock incb %0"
-                       : /* no output registers */
-                       : "m"(a[0])
-                       : "memory");
-#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
-  char* a_nv = const_cast<char*>(a);
-  ++(*a_nv);
-#else
-  Kokkos::atomic_fetch_add(a, char(1));
-#endif
-}
-
-template <>
-KOKKOS_INLINE_FUNCTION void atomic_increment<short>(volatile short* a) {
-#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) && \
-    !defined(_WIN32) && !defined(__CUDA_ARCH__)
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)a, _MM_HINT_ET0);
-#endif
-  __asm__ __volatile__("lock incw %0"
-                       : /* no output registers */
-                       : "m"(a[0])
-                       : "memory");
-#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
-  short* a_nv = const_cast<short*>(a);
-  ++(*a_nv);
-#else
-  Kokkos::atomic_fetch_add(a, short(1));
-#endif
-}
-
-#ifndef _WIN32
-template <>
-KOKKOS_INLINE_FUNCTION void atomic_increment<int>(volatile int* a) {
-#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) && \
-    !defined(_WIN32) && !defined(__CUDA_ARCH__)
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)a, _MM_HINT_ET0);
-#endif
-  __asm__ __volatile__("lock incl %0"
-                       : /* no output registers */
-                       : "m"(a[0])
-                       : "memory");
-#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
-  int* a_nv = const_cast<int*>(a);
-  ++(*a_nv);
-#else
-  Kokkos::atomic_fetch_add(a, int(1));
-#endif
-}
-#endif
-
-template <>
-KOKKOS_INLINE_FUNCTION void atomic_increment<long long int>(
-    volatile long long int* a) {
-#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) && \
-    !defined(_WIN32) && !defined(__CUDA_ARCH__)
-#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
-  _mm_prefetch((const char*)a, _MM_HINT_ET0);
-#endif
-  __asm__ __volatile__("lock incq %0"
-                       : /* no output registers */
-                       : "m"(a[0])
-                       : "memory");
-#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
-  long long int* a_nv = const_cast<long long int*>(a);
-  ++(*a_nv);
-#else
-  using T = long long int;
-  Kokkos::atomic_fetch_add(a, T(1));
-#endif
-}
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION void atomic_increment(volatile T* a) {
-#if defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
-  T* a_nv = const_cast<T*>(a);
-  ++(*a_nv);
-#else
-  Kokkos::atomic_fetch_add(a, T(1));
-#endif
-}
-
-}  // End of namespace Kokkos
-#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Load.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Load.hpp
deleted file mode 100644
index fc4a04b501362a23f741cccd2726b99249dc7e49..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Load.hpp
+++ /dev/null
@@ -1,201 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#ifndef KOKKOS_IMPL_KOKKOS_ATOMIC_LOAD_HPP
-#define KOKKOS_IMPL_KOKKOS_ATOMIC_LOAD_HPP
-
-#include <Kokkos_Macros.hpp>
-#if defined(KOKKOS_ATOMIC_HPP)
-
-#include <impl/Kokkos_Atomic_Memory_Order.hpp>
-#include <impl/Kokkos_Atomic_Generic.hpp>
-
-#if defined(KOKKOS_ENABLE_CUDA)
-#include <Cuda/Kokkos_Cuda_Atomic_Intrinsics.hpp>
-#endif
-
-namespace Kokkos {
-namespace Impl {
-
-// Olivier's implementation helpfully binds to the same builtins as GNU, so
-// we make this code common across multiple options
-#if (defined(KOKKOS_ENABLE_GNU_ATOMICS) && !defined(__CUDA_ARCH__)) ||   \
-    (defined(KOKKOS_ENABLE_INTEL_ATOMICS) && !defined(__CUDA_ARCH__)) || \
-    defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS)
-
-#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS)
-#define KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH __inline__ __device__
-#else
-#define KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH inline
-#endif
-
-template <class T, class MemoryOrder>
-KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH T _atomic_load(
-    T* ptr, MemoryOrder,
-    std::enable_if_t<(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 ||
-                      sizeof(T) == 8) &&
-                         std::is_same<typename MemoryOrder::memory_order,
-                                      std::remove_cv_t<MemoryOrder>>::value,
-                     void const**> = nullptr) {
-  return __atomic_load_n(ptr, MemoryOrder::gnu_constant);
-}
-
-template <class T, class MemoryOrder>
-KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH T _atomic_load(
-    T* ptr, MemoryOrder,
-    std::enable_if_t<!(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 ||
-                       sizeof(T) == 8) &&
-                         std::is_default_constructible<T>::value &&
-                         std::is_same<typename MemoryOrder::memory_order,
-                                      std::remove_cv_t<MemoryOrder>>::value,
-                     void const**> = nullptr) {
-  T rv{};
-  __atomic_load(ptr, &rv, MemoryOrder::gnu_constant);
-  return rv;
-}
-
-#undef KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH
-
-#elif defined(__CUDA_ARCH__)
-
-// Not compiling for Volta or later, or Cuda ASM atomics were manually disabled
-
-template <class T>
-__device__ __inline__ T _relaxed_atomic_load_impl(
-    T* ptr, std::enable_if_t<(sizeof(T) == 1 || sizeof(T) == 2 ||
-                              sizeof(T) == 4 || sizeof(T) == 8),
-                             void const**> = nullptr) {
-  return *ptr;
-}
-
-template <class T>
-struct NoOpOper {
-  __device__ __inline__ static constexpr T apply(T const& t,
-                                                 T const&) noexcept {
-    return t;
-  }
-};
-
-template <class T>
-__device__ __inline__ T _relaxed_atomic_load_impl(
-    T* ptr, std::enable_if_t<!(sizeof(T) == 1 || sizeof(T) == 2 ||
-                               sizeof(T) == 4 || sizeof(T) == 8),
-                             void const**> = nullptr) {
-  T rv{};
-  // TODO remove a copy operation here?
-  return Kokkos::Impl::atomic_oper_fetch(NoOpOper<T>{}, ptr, rv);
-}
-
-template <class T>
-__device__ __inline__ T _atomic_load(T* ptr, memory_order_seq_cst_t) {
-  Kokkos::memory_fence();
-  T rv = Impl::_relaxed_atomic_load_impl(ptr);
-  Kokkos::memory_fence();
-  return rv;
-}
-
-template <class T>
-__device__ __inline__ T _atomic_load(T* ptr, memory_order_acquire_t) {
-  T rv = Impl::_relaxed_atomic_load_impl(ptr);
-  Kokkos::memory_fence();
-  return rv;
-}
-
-template <class T>
-__device__ __inline__ T _atomic_load(T* ptr, memory_order_relaxed_t) {
-  return _relaxed_atomic_load_impl(ptr);
-}
-
-#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS)
-
-template <class T, class MemoryOrder>
-inline T _atomic_load(T* ptr, MemoryOrder) {
-  // AFAICT, all OpenMP atomics are sequentially consistent, so memory order
-  // doesn't matter
-  T retval{};
-#pragma omp atomic read
-  { retval = *ptr; }
-  return retval;
-}
-
-#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
-
-template <class T, class MemoryOrder>
-inline T _atomic_load(T* ptr, MemoryOrder) {
-  return *ptr;
-}
-
-#elif defined(KOKKOS_ENABLE_WINDOWS_ATOMICS)
-
-template <class T, class MemoryOrder>
-inline T _atomic_load(T* ptr, MemoryOrder) {
-  atomic_compare_exchange(ptr, 0, 0);
-  return *ptr;
-}
-
-#endif  // end of all atomic implementations
-
-template <class T>
-KOKKOS_FORCEINLINE_FUNCTION T atomic_load(T* ptr,
-                                          Impl::memory_order_seq_cst_t) {
-  return _atomic_load(ptr, Impl::memory_order_seq_cst);
-}
-
-template <class T>
-KOKKOS_FORCEINLINE_FUNCTION T atomic_load(T* ptr,
-                                          Impl::memory_order_acquire_t) {
-  return _atomic_load(ptr, Impl::memory_order_acquire);
-}
-
-template <class T>
-KOKKOS_FORCEINLINE_FUNCTION T atomic_load(T* ptr,
-                                          Impl::memory_order_relaxed_t) {
-  return _atomic_load(ptr, Impl::memory_order_relaxed);
-}
-
-template <class T>
-KOKKOS_FORCEINLINE_FUNCTION T atomic_load(T* /*ptr*/,
-                                          Impl::memory_order_release_t) {
-  static_assert(
-      sizeof(T) == 0,  // just something that will always be false, but only on
-                       // instantiation
-      "atomic_load with memory order release doesn't make any sense!");
-}
-
-template <class T>
-KOKKOS_FORCEINLINE_FUNCTION T atomic_load(T* /*ptr*/,
-                                          Impl::memory_order_acq_rel_t) {
-  static_assert(
-      sizeof(T) == 0,  // just something that will always be false, but only on
-                       // instantiation
-      "atomic_load with memory order acq_rel doesn't make any sense!");
-}
-
-template <class T>
-KOKKOS_FORCEINLINE_FUNCTION T atomic_load(T* ptr) {
-  // relaxed by default!
-  return _atomic_load(ptr, Impl::memory_order_relaxed);
-}
-
-}  // end namespace Impl
-}  // end namespace Kokkos
-
-#if defined(KOKKOS_ENABLE_CUDA)
-#include <Cuda/Kokkos_Cuda_Atomic_Intrinsics_Restore_Builtins.hpp>
-#endif
-
-#endif  // defined(KOKKOS_ATOMIC_HPP)
-#endif  // KOKKOS_IMPL_KOKKOS_ATOMIC_LOAD_HPP
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Memory_Order.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Memory_Order.hpp
deleted file mode 100644
index d8c68212676a3e536f348b6c4214a89e37a9e7fe..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Memory_Order.hpp
+++ /dev/null
@@ -1,94 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#ifndef KOKKOS_KOKKOS_ATOMIC_MEMORY_ORDER_HPP
-#define KOKKOS_KOKKOS_ATOMIC_MEMORY_ORDER_HPP
-
-#include <Kokkos_Macros.hpp>
-
-#include <atomic>
-
-namespace Kokkos {
-namespace Impl {
-
-/** @file
- * Provides strongly-typed analogs of the standard memory order enumerators.
- * In addition to (very slightly) reducing the constant propagation burden on
- * the compiler, this allows us to give compile-time errors for things that
- * don't make sense, like atomic_load with memory order release.
- */
-
-struct memory_order_seq_cst_t {
-  using memory_order = memory_order_seq_cst_t;
-#if defined(KOKKOS_ENABLE_GNU_ATOMICS) ||   \
-    defined(KOKKOS_ENABLE_INTEL_ATOMICS) || \
-    defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS)
-  static constexpr auto gnu_constant = __ATOMIC_SEQ_CST;
-#endif
-  static constexpr auto std_constant = std::memory_order_seq_cst;
-};
-constexpr memory_order_seq_cst_t memory_order_seq_cst = {};
-
-struct memory_order_relaxed_t {
-  using memory_order = memory_order_relaxed_t;
-#if defined(KOKKOS_ENABLE_GNU_ATOMICS) ||   \
-    defined(KOKKOS_ENABLE_INTEL_ATOMICS) || \
-    defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS)
-  static constexpr auto gnu_constant = __ATOMIC_RELAXED;
-#endif
-  static constexpr auto std_constant = std::memory_order_relaxed;
-};
-constexpr memory_order_relaxed_t memory_order_relaxed = {};
-
-struct memory_order_acquire_t {
-  using memory_order = memory_order_acquire_t;
-#if defined(KOKKOS_ENABLE_GNU_ATOMICS) ||   \
-    defined(KOKKOS_ENABLE_INTEL_ATOMICS) || \
-    defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS)
-  static constexpr auto gnu_constant = __ATOMIC_ACQUIRE;
-#endif
-  static constexpr auto std_constant = std::memory_order_acquire;
-};
-constexpr memory_order_acquire_t memory_order_acquire = {};
-
-struct memory_order_release_t {
-  using memory_order = memory_order_release_t;
-#if defined(KOKKOS_ENABLE_GNU_ATOMICS) ||   \
-    defined(KOKKOS_ENABLE_INTEL_ATOMICS) || \
-    defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS)
-  static constexpr auto gnu_constant = __ATOMIC_RELEASE;
-#endif
-  static constexpr auto std_constant = std::memory_order_release;
-};
-constexpr memory_order_release_t memory_order_release = {};
-
-struct memory_order_acq_rel_t {
-  using memory_order = memory_order_acq_rel_t;
-#if defined(KOKKOS_ENABLE_GNU_ATOMICS) ||   \
-    defined(KOKKOS_ENABLE_INTEL_ATOMICS) || \
-    defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS)
-  static constexpr auto gnu_constant = __ATOMIC_ACQ_REL;
-#endif
-  static constexpr auto std_constant = std::memory_order_acq_rel;
-};
-constexpr memory_order_acq_rel_t memory_order_acq_rel = {};
-
-// Intentionally omit consume (for now)
-
-}  // end namespace Impl
-}  // end namespace Kokkos
-
-#endif  // KOKKOS_KOKKOS_ATOMIC_MEMORY_ORDER_HPP
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_MinMax.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_MinMax.hpp
deleted file mode 100644
index 42898c82a4631a5efb564ef95c41ea0ad5b6f2f2..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_MinMax.hpp
+++ /dev/null
@@ -1,291 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#include <Kokkos_Macros.hpp>
-#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_MINMAX_HPP)
-#define KOKKOS_ATOMIC_MINMAX_HPP
-
-namespace Kokkos {
-
-//----------------------------------------------------------------------------
-
-#if defined(KOKKOS_ENABLE_CUDA)
-#if defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
-
-// Support for int, unsigned int, unsigned long long int, and float
-
-// Atomic_fetch_{min,max}
-
-#ifdef KOKKOS_IMPL_CUDA_CLANG_WORKAROUND
-
-// Host implementations for CLANG compiler
-
-inline __host__ int atomic_fetch_min(volatile int* const dest, const int val) {
-  return Impl::atomic_fetch_oper(Impl::MinOper<const int, const int>(), dest,
-                                 val);
-}
-
-inline __host__ unsigned int atomic_fetch_min(volatile unsigned int* const dest,
-                                              const unsigned int val) {
-  return Impl::atomic_fetch_oper(
-      Impl::MinOper<const unsigned int, const unsigned int>(), dest, val);
-}
-
-inline __host__ unsigned long long int atomic_fetch_min(
-    volatile unsigned long long int* const dest,
-    const unsigned long long int val) {
-  return Impl::atomic_fetch_oper(Impl::MinOper<const unsigned long long int,
-                                               const unsigned long long int>(),
-                                 dest, val);
-}
-
-inline __host__ int atomic_fetch_max(volatile int* const dest, const int val) {
-  return Impl::atomic_fetch_oper(Impl::MaxOper<const int, const int>(), dest,
-                                 val);
-}
-
-inline __host__ unsigned int atomic_fetch_max(volatile unsigned int* const dest,
-                                              const unsigned int val) {
-  return Impl::atomic_fetch_oper(
-      Impl::MaxOper<const unsigned int, const unsigned int>(), dest, val);
-}
-
-inline __host__ unsigned long long int atomic_fetch_max(
-    volatile unsigned long long int* const dest,
-    const unsigned long long int val) {
-  return Impl::atomic_fetch_oper(Impl::MaxOper<const unsigned long long int,
-                                               const unsigned long long int>(),
-                                 dest, val);
-}
-
-#endif
-
-#if (350 > __CUDA_ARCH__)
-
-// Fallback for atomic{Min,Max} for Kepler
-
-inline __device__ int atomic_fetch_min(volatile int* const dest,
-                                       const int val) {
-  return Impl::atomic_fetch_oper(Impl::MinOper<const int, const int>(), dest,
-                                 val);
-}
-
-inline __device__ unsigned int atomic_fetch_min(
-    volatile unsigned int* const dest, const unsigned int val) {
-  return Impl::atomic_fetch_oper(
-      Impl::MinOper<const unsigned int, const unsigned int>(), dest, val);
-}
-
-inline __device__ unsigned long long int atomic_fetch_min(
-    volatile unsigned long long int* const dest,
-    const unsigned long long int val) {
-  return Impl::atomic_fetch_oper(Impl::MinOper<const unsigned long long int,
-                                               const unsigned long long int>(),
-                                 dest, val);
-}
-
-inline __device__ int atomic_fetch_max(volatile int* const dest,
-                                       const int val) {
-  return Impl::atomic_fetch_oper(Impl::MaxOper<const int, const int>(), dest,
-                                 val);
-}
-
-inline __device__ unsigned int atomic_fetch_max(
-    volatile unsigned int* const dest, const unsigned int val) {
-  return Impl::atomic_fetch_oper(
-      Impl::MaxOper<const unsigned int, const unsigned int>(), dest, val);
-}
-
-inline __device__ unsigned long long int atomic_fetch_max(
-    volatile unsigned long long int* const dest,
-    const unsigned long long int val) {
-  return Impl::atomic_fetch_oper(Impl::MaxOper<const unsigned long long int,
-                                               const unsigned long long int>(),
-                                 dest, val);
-}
-
-#else  // Supported by devices of compute capability 3.5 and higher
-
-inline __device__ int atomic_fetch_min(volatile int* const dest,
-                                       const int val) {
-  return atomicMin((int*)dest, val);
-}
-
-inline __device__ unsigned int atomic_fetch_min(
-    volatile unsigned int* const dest, const unsigned int val) {
-  return atomicMin((unsigned int*)dest, val);
-}
-
-inline __device__ unsigned long long int atomic_fetch_min(
-    volatile unsigned long long int* const dest,
-    const unsigned long long int val) {
-  return atomicMin((unsigned long long int*)dest, val);
-}
-
-inline __device__ int atomic_fetch_max(volatile int* const dest,
-                                       const int val) {
-  return atomicMax((int*)dest, val);
-}
-
-inline __device__ unsigned int atomic_fetch_max(
-    volatile unsigned int* const dest, const unsigned int val) {
-  return atomicMax((unsigned int*)dest, val);
-}
-
-inline __device__ unsigned long long int atomic_fetch_max(
-    volatile unsigned long long int* const dest,
-    const unsigned long long int val) {
-  return atomicMax((unsigned long long int*)dest, val);
-}
-
-#endif
-
-// Atomic_{min,max}_fetch
-
-#ifdef KOKKOS_IMPL_CUDA_CLANG_WORKAROUND
-
-// Host implementations for CLANG compiler
-
-inline __host__ int atomic_min_fetch(volatile int* const dest, const int val) {
-  return Impl::atomic_oper_fetch(Impl::MinOper<const int, const int>(), dest,
-                                 val);
-}
-
-inline __host__ unsigned int atomic_min_fetch(volatile unsigned int* const dest,
-                                              const unsigned int val) {
-  return Impl::atomic_oper_fetch(
-      Impl::MinOper<const unsigned int, const unsigned int>(), dest, val);
-}
-
-inline __host__ unsigned long long int atomic_min_fetch(
-    volatile unsigned long long int* const dest,
-    const unsigned long long int val) {
-  return Impl::atomic_oper_fetch(Impl::MinOper<const unsigned long long int,
-                                               const unsigned long long int>(),
-                                 dest, val);
-}
-
-inline __host__ int atomic_max_fetch(volatile int* const dest, const int val) {
-  return Impl::atomic_oper_fetch(Impl::MaxOper<const int, const int>(), dest,
-                                 val);
-}
-
-inline __host__ unsigned int atomic_max_fetch(volatile unsigned int* const dest,
-                                              const unsigned int val) {
-  return Impl::atomic_oper_fetch(
-      Impl::MaxOper<const unsigned int, const unsigned int>(), dest, val);
-}
-
-inline __host__ unsigned long long int atomic_max_fetch(
-    volatile unsigned long long int* const dest,
-    const unsigned long long int val) {
-  return Impl::atomic_oper_fetch(Impl::MaxOper<const unsigned long long int,
-                                               const unsigned long long int>(),
-                                 dest, val);
-}
-#endif
-
-#if (350 > __CUDA_ARCH__)
-
-// Fallback for atomic{Min,Max} for Kepler
-
-inline __device__ int atomic_min_fetch(volatile int* const dest,
-                                       const int val) {
-  return Impl::atomic_oper_fetch(Impl::MinOper<const int, const int>(), dest,
-                                 val);
-}
-
-inline __device__ unsigned int atomic_min_fetch(
-    volatile unsigned int* const dest, const unsigned int val) {
-  return Impl::atomic_oper_fetch(
-      Impl::MinOper<const unsigned int, const unsigned int>(), dest, val);
-}
-
-inline __device__ unsigned long long int atomic_min_fetch(
-    volatile unsigned long long int* const dest,
-    const unsigned long long int val) {
-  return Impl::atomic_oper_fetch(Impl::MinOper<const unsigned long long int,
-                                               const unsigned long long int>(),
-                                 dest, val);
-}
-
-inline __device__ int atomic_max_fetch(volatile int* const dest,
-                                       const int val) {
-  return Impl::atomic_oper_fetch(Impl::MaxOper<const int, const int>(), dest,
-                                 val);
-}
-
-inline __device__ unsigned int atomic_max_fetch(
-    volatile unsigned int* const dest, const unsigned int val) {
-  return Impl::atomic_oper_fetch(
-      Impl::MaxOper<const unsigned int, const unsigned int>(), dest, val);
-}
-
-inline __device__ unsigned long long int atomic_max_fetch(
-    volatile unsigned long long int* const dest,
-    const unsigned long long int val) {
-  return Impl::atomic_oper_fetch(Impl::MaxOper<const unsigned long long int,
-                                               const unsigned long long int>(),
-                                 dest, val);
-}
-
-#else  // Supported by devices of compute capability 3.5 and higher
-
-inline __device__ int atomic_min_fetch(volatile int* const dest,
-                                       const int val) {
-  const int old = atomicMin((int*)dest, val);
-  return old < val ? old : val;
-}
-
-inline __device__ unsigned int atomic_min_fetch(
-    volatile unsigned int* const dest, const unsigned int val) {
-  const unsigned int old = atomicMin((unsigned int*)dest, val);
-  return old < val ? old : val;
-}
-
-inline __device__ unsigned long long int atomic_min_fetch(
-    volatile unsigned long long int* const dest,
-    const unsigned long long int val) {
-  const unsigned long long old = atomicMin((unsigned long long*)dest, val);
-  return old < val ? old : val;
-}
-
-inline __device__ int atomic_max_fetch(volatile int* const dest,
-                                       const int val) {
-  const int old = atomicMax((int*)dest, val);
-  return old >= val ? old : val;
-}
-
-inline __device__ unsigned int atomic_max_fetch(
-    volatile unsigned int* const dest, const unsigned int val) {
-  const unsigned int old = atomicMax((unsigned int*)dest, val);
-  return old >= val ? old : val;
-}
-
-inline __device__ unsigned long long int atomic_max_fetch(
-    volatile unsigned long long int* const dest,
-    const unsigned long long int val) {
-  const unsigned long long old = atomicMax((unsigned long long*)dest, val);
-  return old >= val ? old : val;
-}
-
-#endif
-
-#endif
-#endif
-}  // namespace Kokkos
-
-#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Store.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Store.hpp
deleted file mode 100644
index 3c82e0e3dd0f84d126cf3912054434913ad52f1e..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Store.hpp
+++ /dev/null
@@ -1,197 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#ifndef KOKKOS_IMPL_KOKKOS_ATOMIC_STORE_HPP
-#define KOKKOS_IMPL_KOKKOS_ATOMIC_STORE_HPP
-
-#include <Kokkos_Macros.hpp>
-#if defined(KOKKOS_ATOMIC_HPP)
-
-#include <impl/Kokkos_Atomic_Memory_Order.hpp>
-#include <impl/Kokkos_Atomic_Generic.hpp>
-
-#if defined(KOKKOS_ENABLE_CUDA)
-#include <Cuda/Kokkos_Cuda_Atomic_Intrinsics.hpp>
-#endif
-
-namespace Kokkos {
-namespace Impl {
-
-// Olivier's implementation helpfully binds to the same builtins as GNU, so
-// we make this code common across multiple options
-#if (defined(KOKKOS_ENABLE_GNU_ATOMICS) && !defined(__CUDA_ARCH__)) ||   \
-    (defined(KOKKOS_ENABLE_INTEL_ATOMICS) && !defined(__CUDA_ARCH__)) || \
-    defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS)
-
-#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS)
-#define KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH __inline__ __device__
-#else
-#define KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH inline
-#endif
-
-template <class T, class MemoryOrder>
-KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH void _atomic_store(
-    T* ptr, T val, MemoryOrder,
-    std::enable_if_t<(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 ||
-                      sizeof(T) == 8) &&
-                         std::is_same<typename MemoryOrder::memory_order,
-                                      std::remove_cv_t<MemoryOrder>>::value,
-                     void const**> = nullptr) {
-  __atomic_store_n(ptr, val, MemoryOrder::gnu_constant);
-}
-
-template <class T, class MemoryOrder>
-KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH void _atomic_store(
-    T* ptr, T val, MemoryOrder,
-    std::enable_if_t<!(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 ||
-                       sizeof(T) == 8) &&
-                         std::is_default_constructible<T>::value &&
-                         std::is_same<typename MemoryOrder::memory_order,
-                                      std::remove_cv_t<MemoryOrder>>::value,
-                     void const**> = nullptr) {
-  __atomic_store(ptr, &val, MemoryOrder::gnu_constant);
-}
-
-#undef KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH
-
-#elif defined(__CUDA_ARCH__)
-
-// Not compiling for Volta or later, or Cuda ASM atomics were manually disabled
-
-template <class T>
-__device__ __inline__ void _relaxed_atomic_store_impl(
-    T* ptr, T val,
-    std::enable_if_t<(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 ||
-                      sizeof(T) == 8),
-                     void const**> = nullptr) {
-  *ptr = val;
-}
-
-template <class T>
-struct StoreOper {
-  __device__ __inline__ static constexpr T apply(T const&,
-                                                 T const& val) noexcept {
-    return val;
-  }
-};
-
-template <class T>
-__device__ __inline__ void _relaxed_atomic_store_impl(
-    T* ptr, T val,
-    std::enable_if_t<!(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 ||
-                       sizeof(T) == 8),
-                     void const**> = nullptr) {
-  Kokkos::Impl::atomic_oper_fetch(StoreOper<T>{}, ptr, (T &&) val);
-}
-
-template <class T>
-__device__ __inline__ void _atomic_store(T* ptr, T val,
-                                         memory_order_seq_cst_t) {
-  Kokkos::memory_fence();
-  Impl::_relaxed_atomic_store_impl(ptr, val);
-  Kokkos::memory_fence();
-}
-
-template <class T>
-__device__ __inline__ void _atomic_store(T* ptr, T val,
-                                         memory_order_release_t) {
-  Kokkos::memory_fence();
-  _relaxed_atomic_store_impl(ptr, val);
-}
-
-template <class T>
-__device__ __inline__ void _atomic_store(T* ptr, T val,
-                                         memory_order_relaxed_t) {
-  _relaxed_atomic_store_impl(ptr, val);
-}
-
-#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS)
-
-template <class T, class MemoryOrder>
-inline void _atomic_store(T* ptr, T val, MemoryOrder) {
-  // AFAICT, all OpenMP atomics are sequentially consistent, so memory order
-  // doesn't matter
-#pragma omp atomic write
-  { *ptr = val; }
-}
-
-#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
-
-template <class T, class MemoryOrder>
-inline void _atomic_store(T* ptr, T val, MemoryOrder) {
-  *ptr = val;
-}
-
-#elif defined(KOKKOS_ENABLE_WINDOWS_ATOMICS)
-
-template <class T, class MemoryOrder>
-inline void _atomic_store(T* ptr, T val, MemoryOrder) {
-  atomic_exchange(ptr, val);
-}
-
-#endif  // end of all atomic implementations
-
-template <class T>
-KOKKOS_FORCEINLINE_FUNCTION void atomic_store(T* ptr, T val,
-                                              Impl::memory_order_seq_cst_t) {
-  _atomic_store(ptr, val, Impl::memory_order_seq_cst);
-}
-
-template <class T>
-KOKKOS_FORCEINLINE_FUNCTION void atomic_store(T* ptr, T val,
-                                              Impl::memory_order_release_t) {
-  _atomic_store(ptr, val, Impl::memory_order_release);
-}
-
-template <class T>
-KOKKOS_FORCEINLINE_FUNCTION void atomic_store(T* ptr, T val,
-                                              Impl::memory_order_relaxed_t) {
-  _atomic_store(ptr, val, Impl::memory_order_relaxed);
-}
-
-template <class T>
-KOKKOS_FORCEINLINE_FUNCTION void atomic_store(T* /*ptr*/, T /*val*/,
-                                              Impl::memory_order_acquire_t) {
-  static_assert(
-      sizeof(T) == 0,  // just something that will always be false, but only on
-                       // instantiation
-      "atomic_store with memory order acquire doesn't make any sense!");
-}
-
-template <class T>
-KOKKOS_FORCEINLINE_FUNCTION void atomic_store(T* /*ptr*/, T /*val*/,
-                                              Impl::memory_order_acq_rel_t) {
-  static_assert(
-      sizeof(T) == 0,  // just something that will always be false, but only on
-                       // instantiation
-      "atomic_store with memory order acq_rel doesn't make any sense!");
-}
-
-template <class T>
-KOKKOS_FORCEINLINE_FUNCTION void atomic_store(T* ptr, T val) {
-  // relaxed by default!
-  _atomic_store(ptr, val, Impl::memory_order_relaxed);
-}
-
-}  // end namespace Impl
-}  // end namespace Kokkos
-
-#if defined(KOKKOS_ENABLE_CUDA)
-#include <Cuda/Kokkos_Cuda_Atomic_Intrinsics_Restore_Builtins.hpp>
-#endif
-
-#endif  // defined(KOKKOS_ATOMIC_HPP)
-#endif  // KOKKOS_IMPL_KOKKOS_ATOMIC_STORE_HPP
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp
index 45d01c9f9f33bc90dd1e42dab4c8207970023586..23d4c2524c791913f4bf8aeeabf00e2bed8edd25 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp
@@ -39,7 +39,7 @@ class AtomicDataElement {
 
   KOKKOS_INLINE_FUNCTION
   const_value_type operator=(const_value_type& val) const {
-    Kokkos::Impl::atomic_store(ptr, val, Kokkos::Impl::memory_order_relaxed);
+    Kokkos::atomic_store(ptr, val);
     return val;
   }
 
@@ -194,9 +194,7 @@ class AtomicDataElement {
   bool operator>(const_value_type& val) const { return *ptr > val; }
 
   KOKKOS_INLINE_FUNCTION
-  operator value_type() const {
-    return Kokkos::Impl::atomic_load(ptr, Kokkos::Impl::memory_order_relaxed);
-  }
+  operator value_type() const { return Kokkos::atomic_load(ptr); }
 };
 
 template <class ViewTraits>
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp
deleted file mode 100644
index ffb0d2bae27ecfb0f595addf8cbd9592c1ecc396..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp
+++ /dev/null
@@ -1,127 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-#ifndef KOKKOS_ATOMIC_WINDOWS_HPP
-#define KOKKOS_ATOMIC_WINDOWS_HPP
-
-#ifdef _WIN32
-
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif
-#include <winsock2.h>
-#include <windows.h>
-
-namespace Kokkos {
-namespace Impl {
-#ifdef _MSC_VER
-_declspec(align(16))
-#endif
-    struct cas128_t {
-  LONGLONG lower;
-  LONGLONG upper;
-  KOKKOS_INLINE_FUNCTION
-  bool operator!=(const cas128_t& a) const {
-    return (lower != a.lower) || upper != a.upper;
-  }
-}
-#if defined(__GNUC__) || defined(__clang__)
-__attribute__((aligned(16)))
-#endif
-;
-}  // namespace Impl
-
-#if !defined(__CUDA_ARCH__) || defined(__clang__)
-template <typename T>
-inline T atomic_compare_exchange(
-    volatile T* const dest, const T& compare,
-    std::enable_if_t<sizeof(T) == sizeof(CHAR), const T&> val) {
-  union U {
-    CHAR i;
-    T t;
-    KOKKOS_INLINE_FUNCTION U(){};
-  } tmp;
-
-  tmp.i = _InterlockedCompareExchange8((CHAR*)dest, *((CHAR*)&val),
-                                       *((CHAR*)&compare));
-  return tmp.t;
-}
-
-template <typename T>
-inline T atomic_compare_exchange(
-    volatile T* const dest, const T& compare,
-    std::enable_if_t<sizeof(T) == sizeof(SHORT), const T&> val) {
-  union U {
-    SHORT i;
-    T t;
-    KOKKOS_INLINE_FUNCTION U(){};
-  } tmp;
-
-  tmp.i = _InterlockedCompareExchange16((SHORT*)dest, *((SHORT*)&val),
-                                        *((SHORT*)&compare));
-  return tmp.t;
-}
-
-template <typename T>
-inline T atomic_compare_exchange(
-    volatile T* const dest, const T& compare,
-    std::enable_if_t<sizeof(T) == sizeof(LONG), const T&> val) {
-  union U {
-    LONG i;
-    T t;
-    KOKKOS_INLINE_FUNCTION U() {}
-  } tmp;
-
-  tmp.i = _InterlockedCompareExchange((LONG*)dest, *((LONG*)&val),
-                                      *((LONG*)&compare));
-  return tmp.t;
-}
-
-template <typename T>
-inline T atomic_compare_exchange(
-    volatile T* const dest, const T& compare,
-    std::enable_if_t<sizeof(T) == sizeof(LONGLONG), const T&> val) {
-  union U {
-    LONGLONG i;
-    T t;
-    KOKKOS_INLINE_FUNCTION U() {}
-  } tmp;
-
-  tmp.i = _InterlockedCompareExchange64((LONGLONG*)dest, *((LONGLONG*)&val),
-                                        *((LONGLONG*)&compare));
-  return tmp.t;
-}
-
-template <typename T>
-inline T atomic_compare_exchange(
-    volatile T* const dest, const T& compare,
-    std::enable_if_t<sizeof(T) == sizeof(Impl::cas128_t), const T&> val) {
-  T compare_and_result(compare);
-  union U {
-    Impl::cas128_t i;
-    T t;
-    KOKKOS_INLINE_FUNCTION U(){};
-  } newval;
-  newval.t = val;
-  _InterlockedCompareExchange128((LONGLONG*)dest, newval.i.upper,
-                                 newval.i.lower,
-                                 ((LONGLONG*)&compare_and_result));
-  return compare_and_result;
-}
-#endif
-
-}  // namespace Kokkos
-#endif
-#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_BitOps.hpp b/packages/kokkos/core/src/impl/Kokkos_BitOps.hpp
index 16a28f2419faa4f0d40ef1c9809ea3b4d53f1333..0d83127df9a57e6011dd5711cb111f7f7e9f5e87 100644
--- a/packages/kokkos/core/src/impl/Kokkos_BitOps.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_BitOps.hpp
@@ -21,7 +21,7 @@
 #include <cstdint>
 #include <climits>
 
-#ifdef KOKKOS_COMPILER_INTEL
+#if defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM)
 #include <immintrin.h>
 #endif
 
@@ -45,7 +45,7 @@ inline int int_log2_device(unsigned i) {
 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
   constexpr int shift = sizeof(unsigned) * CHAR_BIT - 1;
   return shift - __clz(i);
-#elif defined(KOKKOS_COMPILER_INTEL)
+#elif defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM)
   return _bit_scan_reverse(i);
 #else
   return int_log2_fallback(i);
@@ -55,7 +55,7 @@ inline int int_log2_device(unsigned i) {
 KOKKOS_IMPL_HOST_FUNCTION
 inline int int_log2_host(unsigned i) {
 // duplicating shift to avoid unused warning in else branch
-#if defined(KOKKOS_COMPILER_INTEL)
+#if defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM)
   constexpr int shift = sizeof(unsigned) * CHAR_BIT - 1;
   (void)shift;
   return _bit_scan_reverse(i);
@@ -104,7 +104,7 @@ inline int bit_first_zero_device(unsigned i) noexcept {
   constexpr unsigned full = ~0u;
 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
   return full != i ? __ffs(~i) - 1 : -1;
-#elif defined(KOKKOS_COMPILER_INTEL)
+#elif defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM)
   return full != i ? _bit_scan_forward(~i) : -1;
 #else
   (void)full;
@@ -115,7 +115,7 @@ inline int bit_first_zero_device(unsigned i) noexcept {
 KOKKOS_IMPL_HOST_FUNCTION
 inline int bit_first_zero_host(unsigned i) noexcept {
   constexpr unsigned full = ~0u;
-#if defined(KOKKOS_COMPILER_INTEL)
+#if defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM)
   return full != i ? _bit_scan_forward(~i) : -1;
 #elif defined(KOKKOS_COMPILER_CRAYC)
   return full != i ? _popcnt(i ^ (i + 1)) - 1 : -1;
@@ -153,7 +153,7 @@ int bit_scan_forward_fallback(unsigned i) {
 KOKKOS_IMPL_DEVICE_FUNCTION inline int bit_scan_forward_device(unsigned i) {
 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
   return __ffs(i) - 1;
-#elif defined(KOKKOS_COMPILER_INTEL)
+#elif defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM)
   return _bit_scan_forward(i);
 #else
   return bit_scan_forward_fallback(i);
@@ -161,7 +161,7 @@ KOKKOS_IMPL_DEVICE_FUNCTION inline int bit_scan_forward_device(unsigned i) {
 }
 
 KOKKOS_IMPL_HOST_FUNCTION inline int bit_scan_forward_host(unsigned i) {
-#if defined(KOKKOS_COMPILER_INTEL)
+#if defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM)
   return _bit_scan_forward(i);
 #elif defined(KOKKOS_COMPILER_CRAYC)
   return i ? _popcnt(~i & (i - 1)) : -1;
@@ -200,7 +200,7 @@ int bit_count_fallback(unsigned i) {
 KOKKOS_IMPL_DEVICE_FUNCTION inline int bit_count_device(unsigned i) {
 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
   return __popc(i);
-#elif defined(KOKKOS_COMPILER_INTEL)
+#elif defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM)
   return _popcnt32(i);
 #else
   return bit_count_fallback(i);
@@ -208,7 +208,7 @@ KOKKOS_IMPL_DEVICE_FUNCTION inline int bit_count_device(unsigned i) {
 }
 
 KOKKOS_IMPL_HOST_FUNCTION inline int bit_count_host(unsigned i) {
-#if defined(KOKKOS_COMPILER_INTEL)
+#if defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM)
   return _popcnt32(i);
 #elif defined(KOKKOS_COMPILER_CRAYC)
   return _popcnt(i);
diff --git a/packages/kokkos/core/src/impl/Kokkos_CPUDiscovery.cpp b/packages/kokkos/core/src/impl/Kokkos_CPUDiscovery.cpp
index 85ffaaf9698df1279e27ccbdd0d2934350fed4dd..6ff7718cb53e1e23c1dbb91994eacf249a1e7916 100644
--- a/packages/kokkos/core/src/impl/Kokkos_CPUDiscovery.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_CPUDiscovery.cpp
@@ -54,3 +54,5 @@ int Kokkos::Impl::mpi_local_rank_on_node() {
   }
   return -1;
 }
+
+bool Kokkos::Impl::mpi_detected() { return mpi_local_rank_on_node() != -1; }
diff --git a/packages/kokkos/core/src/impl/Kokkos_CPUDiscovery.hpp b/packages/kokkos/core/src/impl/Kokkos_CPUDiscovery.hpp
index 27b8cd3a7c17baaad6951279ca77ba5fedb26c57..a049fd804aa34b506ad6454892892148d8e1918b 100644
--- a/packages/kokkos/core/src/impl/Kokkos_CPUDiscovery.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_CPUDiscovery.hpp
@@ -18,6 +18,8 @@ namespace Impl {
 
 int mpi_ranks_per_node();
 int mpi_local_rank_on_node();
+// returns true if MPI execution environment is detected, false otherwise.
+bool mpi_detected();
 
 }  // namespace Impl
 }  // namespace Kokkos
diff --git a/packages/kokkos/core/src/impl/Kokkos_ChaseLev.hpp b/packages/kokkos/core/src/impl/Kokkos_ChaseLev.hpp
index 855654408e28fdaea7ae46d8701624cd760aac78..d8ab77b205639889980a0b1f3994ed292b1e1aee 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ChaseLev.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ChaseLev.hpp
@@ -172,7 +172,8 @@ struct ChaseLevDeque {
         }
 #else
         if (!Impl::atomic_compare_exchange_strong(
-                &m_top, t, t + 1, memory_order_seq_cst, memory_order_relaxed)) {
+                &m_top, t, t + 1, desul::MemoryOrderSeqCst(),
+                desul::MemoryOrderRelaxed())) {
           /* failed race, someone else stole it */
           return_value = nullptr;
         }
@@ -195,7 +196,7 @@ struct ChaseLevDeque {
   KOKKOS_INLINE_FUNCTION
   bool push(node_type& node) {
     auto b  = m_bottom;  // memory order relaxed
-    auto t  = Impl::atomic_load(&m_top, memory_order_acquire);
+    auto t  = Impl::atomic_load(&m_top, desul::MemoryOrderAcquire());
     auto& a = m_array;
     if (b - t > a.size() - 1) {
       /* queue is full, resize */
@@ -204,7 +205,7 @@ struct ChaseLevDeque {
       return false;
     }
     a[b] = &node;  // relaxed
-    Impl::atomic_store(&m_bottom, b + 1, memory_order_release);
+    Impl::atomic_store(&m_bottom, b + 1, desul::MemoryOrderRelease());
     return true;
   }
 
@@ -213,7 +214,7 @@ struct ChaseLevDeque {
     auto t = m_top;  // TODO @tasking @memory_order DSH: atomic load acquire
     Kokkos::memory_fence();  // seq_cst fence, so why does the above need to be
                              // acquire?
-    auto b = Impl::atomic_load(&m_bottom, memory_order_acquire);
+    auto b = Impl::atomic_load(&m_bottom, desul::MemoryOrderAcquire());
     OptionalRef<T> return_value;
     if (t < b) {
       /* Non-empty queue */
@@ -231,8 +232,9 @@ struct ChaseLevDeque {
         return_value = nullptr;
       }
 #else
-      if (!Impl::atomic_compare_exchange_strong(
-              &m_top, t, t + 1, memory_order_seq_cst, memory_order_relaxed)) {
+      if (!Impl::atomic_compare_exchange_strong(&m_top, t, t + 1,
+                                                desul::MemoryOrderSeqCst(),
+                                                desul::MemoryOrderRelaxed())) {
         return_value = nullptr;
       }
 #endif
@@ -247,7 +249,7 @@ struct ChaseLevDeque {
       // essentially using the memory order in this version as a fence, which
       // may be unnecessary
       auto buffer_ptr = (node_type***)&m_array.buffer;
-      auto a = Impl::atomic_load(buffer_ptr, memory_order_acquire); //
+      auto a = Impl::atomic_load(buffer_ptr, desul::MemoryOrderAcquire()); //
    technically consume ordered, but acquire should be fine return_value =
    *static_cast<T*>(a[t % m_array->size]); // relaxed; we'd have to replace the
    m_array->size if we ever allow growth
diff --git a/packages/kokkos/core/src/impl/Kokkos_CheckedIntegerOps.hpp b/packages/kokkos/core/src/impl/Kokkos_CheckedIntegerOps.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..69e616f6fb7d597632d203d5249f517d9232d92f
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_CheckedIntegerOps.hpp
@@ -0,0 +1,66 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_CHECKED_INTEGER_OPS_HPP
+#define KOKKOS_CHECKED_INTEGER_OPS_HPP
+
+#include <type_traits>
+
+#include <impl/Kokkos_Error.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+#if defined(__has_builtin)
+#if __has_builtin(__builtin_mul_overflow)
+#define KOKKOS_IMPL_USE_MUL_OVERFLOW_BUILTIN
+#endif
+#endif
+
+template <typename T>
+std::enable_if_t<std::is_integral_v<T>, bool> constexpr multiply_overflow(
+    T a, T b, T& res) {
+  static_assert(std::is_unsigned_v<T>,
+                "Operation not implemented for signed integers.");
+
+#if defined(KOKKOS_IMPL_USE_MUL_OVERFLOW_BUILTIN)
+  return __builtin_mul_overflow(a, b, &res);
+#else
+  auto product = a * b;
+  if ((a == 0) || (b == 0) || (a == product / b)) {
+    res = product;
+    return false;
+  } else {
+    return true;
+  }
+#endif
+}
+
+#undef KOKKOS_IMPL_USE_MUL_OVERFLOW_BUILTIN
+
+template <typename T>
+T multiply_overflow_abort(T a, T b) {
+  T result;
+  if (multiply_overflow(a, b, result))
+    Kokkos::abort("Arithmetic overflow detected.");
+
+  return result;
+}
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif  // KOKKOS_CHECKED_INTEGER_OPS_HPP
diff --git a/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp b/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp
index 9e8c70076c6d192349f6aab09e4ad35877c3e9d1..6e3d99ebd685308cc29543fde5475ec148b717c6 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp
@@ -90,6 +90,27 @@ KOKKOS_IMPL_HOST_FUNCTION inline uint64_t clock_tic_host() noexcept {
 
   return (uint64_t)cycles;
 
+#elif defined(__ppc__)
+  // see : pages.cs.wisc.edu/~legault/miniproj-736.pdf or
+  // cmssdt.cern.ch/lxr/source/FWCore/Utilities/interface/HRRealTime.h
+
+  uint64_t result = 0;
+  uint32_t upper, lower, tmp;
+
+  __asm__ volatile(
+      "0: \n"
+      "\tmftbu %0     \n"
+      "\tmftb  %1     \n"
+      "\tmftbu %2     \n"
+      "\tcmpw  %2, %0 \n"
+      "\tbne   0b     \n"
+      : "=r"(upper), "=r"(lower), "=r"(tmp));
+  result = upper;
+  result = result << 32;
+  result = result | lower;
+
+  return (result);
+
 #else
 
   return std::chrono::high_resolution_clock::now().time_since_epoch().count();
diff --git a/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp b/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp
index d7304779c77c76fa623da9601d2048cb9079365b..e6dd3c63391d13b75765a39d02908bbfd1fed819 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp
@@ -210,6 +210,8 @@ struct CombinedReducerImpl<std::integer_sequence<size_t, Idxs...>, Space,
      ...);
   }
 
+  KOKKOS_FUNCTION auto& reference() const { return *m_value_view.data(); }
+
   // TODO figure out if we also need to call through to final
 
   KOKKOS_FUNCTION
@@ -247,6 +249,22 @@ struct CombinedReducerImpl<std::integer_sequence<size_t, Idxs...>, Space,
 
      ...);
   }
+
+  template <int Idx, class View>
+  KOKKOS_FUNCTION static void write_one_value_back_on_device(
+      View const& inputView, typename View::const_value_type& value) noexcept {
+    *inputView.data() = value;
+  }
+
+  template <typename... CombinedReducers>
+  KOKKOS_FUNCTION void write_value_back_to_original_references_on_device(
+      value_type const& value,
+      CombinedReducers const&... reducers_that_reference_original_values) noexcept {
+    (write_one_value_back_on_device<Idxs>(
+         reducers_that_reference_original_values.view(),
+         value.template get<Idxs, typename CombinedReducers::value_type>()),
+     ...);
+  }
 };
 
 // Apparently this can't be an alias template because of a bug/unimplemented
@@ -466,9 +484,9 @@ KOKKOS_INLINE_FUNCTION constexpr auto make_combined_reducer(
   //----------------------------------------
 }
 
-template <class Functor, class Space, class... ReferencesOrViewsOrReducers>
+template <class Space, class Functor, class... ReferencesOrViewsOrReducers>
 KOKKOS_INLINE_FUNCTION constexpr auto make_wrapped_combined_functor(
-    Functor const& functor, Space, ReferencesOrViewsOrReducers&&...) {
+    Functor const& functor, ReferencesOrViewsOrReducers&&...) {
   //----------------------------------------
   return CombinedReductionFunctorWrapper<
       Functor, Space,
@@ -478,6 +496,32 @@ KOKKOS_INLINE_FUNCTION constexpr auto make_wrapped_combined_functor(
 
 template <typename FunctorType>
 using functor_has_value_t = typename FunctorType::value_type;
+
+template <typename MemberType, typename BoundaryStructType, typename Functor,
+          typename ReturnType1, typename ReturnType2, typename... ReturnTypes>
+KOKKOS_INLINE_FUNCTION void parallel_reduce_combined_reducers_impl(
+    BoundaryStructType const& boundaries, Functor const& functor,
+    ReturnType1&& returnType1, ReturnType2&& returnType2,
+    ReturnTypes&&... returnTypes) noexcept {
+  using mem_space_type = typename MemberType::execution_space::memory_space;
+
+  decltype(Impl::make_combined_reducer_value<mem_space_type>(
+      returnType1, returnType2, returnTypes...)) combined_value;
+
+  auto combined_functor = Impl::make_wrapped_combined_functor<mem_space_type>(
+      functor, returnType1, returnType2, returnTypes...);
+
+  auto combined_reducer = Impl::make_combined_reducer<mem_space_type>(
+      combined_value, returnType1, returnType2, returnTypes...);
+
+  parallel_reduce(boundaries, combined_functor, combined_reducer);
+
+  combined_reducer.write_value_back_to_original_references_on_device(
+      combined_value, Impl::_make_reducer_from_arg<mem_space_type>(returnType1),
+      Impl::_make_reducer_from_arg<mem_space_type>(returnType2),
+      Impl::_make_reducer_from_arg<mem_space_type>(returnTypes)...);
+}
+
 }  // end namespace Impl
 
 //==============================================================================
@@ -499,8 +543,8 @@ auto parallel_reduce(std::string const& label, PolicyType const& policy,
   // directly
   using space_type = Kokkos::DefaultHostExecutionSpace::memory_space;
 
-  auto value = Impl::make_combined_reducer_value<space_type>(
-      returnType1, returnType2, returnTypes...);
+  decltype(Impl::make_combined_reducer_value<space_type>(
+      returnType1, returnType2, returnTypes...)) value;
 
   using combined_reducer_type = Impl::CombinedReducer<
       space_type, Impl::_reducer_from_arg_t<space_type, ReturnType1>,
@@ -509,8 +553,8 @@ auto parallel_reduce(std::string const& label, PolicyType const& policy,
   auto combined_reducer = Impl::make_combined_reducer<space_type>(
       value, returnType1, returnType2, returnTypes...);
 
-  auto combined_functor = Impl::make_wrapped_combined_functor(
-      functor, space_type{}, returnType1, returnType2, returnTypes...);
+  auto combined_functor = Impl::make_wrapped_combined_functor<space_type>(
+      functor, returnType1, returnType2, returnTypes...);
 
   using combined_functor_type = decltype(combined_functor);
   static_assert(
@@ -577,66 +621,36 @@ void parallel_reduce(size_t n, Functor const& functor,
 //------------------------------------------------------------------------------
 // <editor-fold desc="Team overloads"> {{{2
 
-// Copied three times because that's the best way we have right now to match
-// Impl::TeamThreadRangeBoundariesStruct,
-// Impl::ThreadVectorRangeBoundariesStruct, and
-// Impl::TeamVectorRangeBoundariesStruct.
-// TODO make these work after restructuring
-
-// template <class iType, class MemberType, class Functor, class ReturnType1,
-//          class ReturnType2, class... ReturnTypes>
-// KOKKOS_INLINE_FUNCTION void parallel_reduce(
-//    std::string const& label,
-//    Impl::TeamThreadRangeBoundariesStruct<iType, MemberType> const&
-//    boundaries, Functor const& functor, ReturnType1&& returnType1,
-//    ReturnType2&& returnType2, ReturnTypes&&... returnTypes) noexcept {
-//  const auto combined_reducer =
-//      Impl::make_combined_reducer<Kokkos::AnonymousSpace>(
-//          returnType1, returnType2, returnTypes...);
-//
-//  auto combined_functor = Impl::make_wrapped_combined_functor(
-//      functor, Kokkos::AnonymousSpace{}, returnType1, returnType2,
-//      returnTypes...);
-//
-//  parallel_reduce(label, boundaries, combined_functor, combined_reducer);
-//}
-//
-// template <class iType, class MemberType, class Functor, class ReturnType1,
-//          class ReturnType2, class... ReturnTypes>
-// KOKKOS_INLINE_FUNCTION void parallel_reduce(
-//    std::string const& label,
-//    Impl::ThreadVectorRangeBoundariesStruct<iType, MemberType> const&
-//        boundaries,
-//    Functor const& functor, ReturnType1&& returnType1,
-//    ReturnType2&& returnType2, ReturnTypes&&... returnTypes) noexcept {
-//  const auto combined_reducer =
-//      Impl::make_combined_reducer<Kokkos::AnonymousSpace>(
-//          returnType1, returnType2, returnTypes...);
-//
-//  auto combined_functor = Impl::make_wrapped_combined_functor(
-//      functor, Kokkos::AnonymousSpace{}, returnType1, returnType2,
-//      returnTypes...);
-//
-//  parallel_reduce(label, boundaries, combined_functor, combined_reducer);
-//}
-
-// template <class iType, class MemberType, class Functor, class ReturnType1,
-//          class ReturnType2, class... ReturnTypes>
-// KOKKOS_INLINE_FUNCTION void parallel_reduce(
-//    std::string const& label,
-//    Impl::TeamVectorRangeBoundariesStruct<iType, MemberType> const&
-//    boundaries, Functor const& functor, ReturnType1&& returnType1,
-//    ReturnType2&& returnType2, ReturnTypes&&... returnTypes) noexcept {
-//  const auto combined_reducer =
-//      Impl::make_combined_reducer<Kokkos::AnonymousSpace>(
-//          returnType1, returnType2, returnTypes...);
-//
-//  auto combined_functor = Impl::make_wrapped_combined_functor(
-//      functor, Kokkos::AnonymousSpace{}, returnType1, returnType2,
-//      returnTypes...);
-//
-//  parallel_reduce(label, boundaries, combined_functor, combined_reducer);
-//}
+template <class iType, class MemberType, class Functor, class ReturnType1,
+          class ReturnType2, class... ReturnTypes>
+KOKKOS_INLINE_FUNCTION void parallel_reduce(
+    Impl::TeamThreadRangeBoundariesStruct<iType, MemberType> const& boundaries,
+    Functor const& functor, ReturnType1&& returnType1,
+    ReturnType2&& returnType2, ReturnTypes&&... returnTypes) noexcept {
+  Impl::parallel_reduce_combined_reducers_impl<MemberType>(
+      boundaries, functor, returnType1, returnType2, returnTypes...);
+}
+
+template <class iType, class MemberType, class Functor, class ReturnType1,
+          class ReturnType2, class... ReturnTypes>
+KOKKOS_INLINE_FUNCTION void parallel_reduce(
+    Impl::ThreadVectorRangeBoundariesStruct<iType, MemberType> const&
+        boundaries,
+    Functor const& functor, ReturnType1&& returnType1,
+    ReturnType2&& returnType2, ReturnTypes&&... returnTypes) noexcept {
+  Impl::parallel_reduce_combined_reducers_impl<MemberType>(
+      boundaries, functor, returnType1, returnType2, returnTypes...);
+}
+
+template <class iType, class MemberType, class Functor, class ReturnType1,
+          class ReturnType2, class... ReturnTypes>
+KOKKOS_INLINE_FUNCTION void parallel_reduce(
+    Impl::TeamVectorRangeBoundariesStruct<iType, MemberType> const& boundaries,
+    Functor const& functor, ReturnType1&& returnType1,
+    ReturnType2&& returnType2, ReturnTypes&&... returnTypes) noexcept {
+  Impl::parallel_reduce_combined_reducers_impl<MemberType>(
+      boundaries, functor, returnType1, returnType2, returnTypes...);
+}
 
 // </editor-fold> end Team overloads }}}2
 //------------------------------------------------------------------------------
diff --git a/packages/kokkos/core/src/impl/Kokkos_Core.cpp b/packages/kokkos/core/src/impl/Kokkos_Core.cpp
index a35510dd64ec7e08e88e751bc2d87ae2702c5b01..5c182db5663a7a4197404d6ea2d8f37defc25476 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Core.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Core.cpp
@@ -165,6 +165,26 @@ bool is_valid_map_device_id_by(std::string const& x) {
 
 }  // namespace
 
+[[nodiscard]] int Kokkos::device_id() noexcept {
+#if defined(KOKKOS_ENABLE_CUDA)
+  return Cuda().cuda_device();
+#elif defined(KOKKOS_ENABLE_HIP)
+  return HIP().hip_device();
+#elif defined(KOKKOS_ENABLE_OPENACC)
+  return Experimental::OpenACC().acc_device_number();
+#elif defined(KOKKOS_ENABLE_OPENMPTARGET)
+  return omp_get_default_device();  // FIXME_OPENMPTARGET
+#elif defined(KOKKOS_ENABLE_SYCL)
+  return Experimental::Impl::SYCLInternal::m_syclDev;
+#else
+  return -1;
+#endif
+}
+
+[[nodiscard]] int Kokkos::num_threads() noexcept {
+  return DefaultHostExecutionSpace().concurrency();
+}
+
 Kokkos::Impl::ExecSpaceManager& Kokkos::Impl::ExecSpaceManager::get_instance() {
   static ExecSpaceManager space_initializer = {};
   return space_initializer;
@@ -500,14 +520,20 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) {
                                  std::to_string(KOKKOS_COMPILER_INTEL));
   declare_configuration_metadata("tools_only", "compiler_family", "intel");
 #endif
+#ifdef KOKKOS_COMPILER_INTEL_LLVM
+  declare_configuration_metadata("compiler_version",
+                                 "KOKKOS_COMPILER_INTEL_LLVM",
+                                 std::to_string(KOKKOS_COMPILER_INTEL_LLVM));
+  declare_configuration_metadata("tools_only", "compiler_family", "intel_llvm");
+#endif
 #ifdef KOKKOS_COMPILER_NVCC
   declare_configuration_metadata("compiler_version", "KOKKOS_COMPILER_NVCC",
                                  std::to_string(KOKKOS_COMPILER_NVCC));
   declare_configuration_metadata("tools_only", "compiler_family", "nvcc");
 #endif
-#ifdef KOKKOS_COMPILER_PGI
-  declare_configuration_metadata("compiler_version", "KOKKOS_COMPILER_PGI",
-                                 std::to_string(KOKKOS_COMPILER_PGI));
+#ifdef KOKKOS_COMPILER_NVHPC
+  declare_configuration_metadata("compiler_version", "KOKKOS_COMPILER_NVHPC",
+                                 std::to_string(KOKKOS_COMPILER_NVHPC));
   declare_configuration_metadata("tools_only", "compiler_family", "pgi");
 #endif
 #ifdef KOKKOS_COMPILER_MSVC
@@ -516,26 +542,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) {
   declare_configuration_metadata("tools_only", "compiler_family", "msvc");
 #endif
 
-#ifdef KOKKOS_ENABLE_GNU_ATOMICS
-  declare_configuration_metadata("atomics", "KOKKOS_ENABLE_GNU_ATOMICS", "yes");
-#else
-  declare_configuration_metadata("atomics", "KOKKOS_ENABLE_GNU_ATOMICS", "no");
-#endif
-#ifdef KOKKOS_ENABLE_INTEL_ATOMICS
-  declare_configuration_metadata("atomics", "KOKKOS_ENABLE_INTEL_ATOMICS",
-                                 "yes");
-#else
-  declare_configuration_metadata("atomics", "KOKKOS_ENABLE_INTEL_ATOMICS",
-                                 "no");
-#endif
-#ifdef KOKKOS_ENABLE_WINDOWS_ATOMICS
-  declare_configuration_metadata("atomics", "KOKKOS_ENABLE_WINDOWS_ATOMICS",
-                                 "yes");
-#else
-  declare_configuration_metadata("atomics", "KOKKOS_ENABLE_WINDOWS_ATOMICS",
-                                 "no");
-#endif
-
 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
   declare_configuration_metadata("vectorization", "KOKKOS_ENABLE_PRAGMA_IVDEP",
                                  "yes");
@@ -748,24 +754,31 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) {
 #elif defined(KOKKOS_ARCH_HOPPER90)
   declare_configuration_metadata("architecture", "GPU architecture",
                                  "HOPPER90");
-#elif defined(KOKKOS_ARCH_VEGA900)
-  declare_configuration_metadata("architecture", "GPU architecture", "VEGA900");
-#elif defined(KOKKOS_ARCH_VEGA906)
-  declare_configuration_metadata("architecture", "GPU architecture", "VEGA906");
-#elif defined(KOKKOS_ARCH_VEGA908)
-  declare_configuration_metadata("architecture", "GPU architecture", "VEGA908");
-#elif defined(KOKKOS_ARCH_VEGA90A)
-  declare_configuration_metadata("architecture", "GPU architecture", "VEGA90A");
-#elif defined(KOKKOS_ARCH_NAVI1030)
+#elif defined(KOKKOS_ARCH_AMD_GFX906)
+  declare_configuration_metadata("architecture", "GPU architecture",
+                                 "AMD_GFX906");
+#elif defined(KOKKOS_ARCH_AMD_GFX908)
   declare_configuration_metadata("architecture", "GPU architecture",
-                                 "NAVI1030");
-#elif defined(KOKKOS_ARCH_NAVI1100)
+                                 "AMD_GFX908");
+#elif defined(KOKKOS_ARCH_AMD_GFX90A)
   declare_configuration_metadata("architecture", "GPU architecture",
-                                 "NAVI1100");
+                                 "AMD_GFX90A");
+#elif defined(KOKKOS_ARCH_AMD_GFX1030)
+  declare_configuration_metadata("architecture", "GPU architecture",
+                                 "AMD_GFX1030");
+#elif defined(KOKKOS_ARCH_AMD_GFX1100)
+  declare_configuration_metadata("architecture", "GPU architecture",
+                                 "AMD_GFX1100");
 
 #else
   declare_configuration_metadata("architecture", "GPU architecture", "none");
 #endif
+
+#ifdef KOKKOS_IMPL_32BIT
+  declare_configuration_metadata("architecture", "platform", "32bit");
+#else
+  declare_configuration_metadata("architecture", "platform", "64bit");
+#endif
 }
 
 void post_initialize_internal(const Kokkos::InitializationSettings& settings) {
@@ -780,8 +793,14 @@ void post_initialize_internal(const Kokkos::InitializationSettings& settings) {
 }
 
 void initialize_internal(const Kokkos::InitializationSettings& settings) {
+  // The tool initialization is only called in post_initialize_internal.
+  // Pausing tools here, so that if someone has set callbacks programmatically
+  // these callbacks are not called inside the backend initialization, before
+  // the tool initialization happened.
+  Kokkos::Tools::Experimental::pause_tools();
   pre_initialize_internal(settings);
   initialize_backends(settings);
+  Kokkos::Tools::Experimental::resume_tools();
   post_initialize_internal(settings);
 }
 
@@ -1273,15 +1292,3 @@ void Kokkos::print_configuration(std::ostream& os, bool verbose) {
 bool Kokkos::show_warnings() noexcept { return g_show_warnings; }
 
 bool Kokkos::tune_internals() noexcept { return g_tune_internals; }
-
-namespace Kokkos {
-
-#ifdef KOKKOS_COMPILER_PGI
-namespace Impl {
-// Bizzarely, an extra jump instruction forces the PGI compiler to not have a
-// bug related to (probably?) empty base optimization and/or aggregate
-// construction.
-void _kokkos_pgi_compiler_bug_workaround() {}
-}  // end namespace Impl
-#endif
-}  // namespace Kokkos
diff --git a/packages/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp b/packages/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp
index d65b448f1f71a7335458e505252e3dd589a86fb1..3693dff3d465e66730cf75f488b65a6b19c84020 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp
@@ -23,8 +23,8 @@
 #include <impl/Kokkos_GraphImpl_fwd.hpp>
 #include <impl/Kokkos_Default_Graph_fwd.hpp>
 
-#include <Kokkos_Serial.hpp>
-#include <Kokkos_OpenMP.hpp>
+#include <Serial/Kokkos_Serial.hpp>
+#include <OpenMP/Kokkos_OpenMP.hpp>
 // FIXME @graph other backends?
 
 #include <impl/Kokkos_OptionalRef.hpp>
@@ -161,8 +161,6 @@ struct GraphImpl : private ExecutionSpaceInstanceStorage<ExecutionSpace> {
 
 }  // end namespace Kokkos
 
-#include <OpenMP/Kokkos_OpenMP_Parallel.hpp>
-
 #include <impl/Kokkos_Default_GraphNodeKernel.hpp>
 #include <impl/Kokkos_Default_GraphNode_Impl.hpp>
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_Error.cpp b/packages/kokkos/core/src/impl/Kokkos_Error.cpp
index efd0fb998e1ac854a7eb483375cfd69aa57a915a..4babe2d72bd148b7101ce8c28c1acd9d6267aa1d 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Error.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Error.cpp
@@ -21,12 +21,11 @@
 #include <cstring>
 #include <cstdlib>
 
-#include <iostream>
+#include <ostream>
 #include <sstream>
 #include <iomanip>
 #include <stdexcept>
 #include <impl/Kokkos_Error.hpp>
-#include <impl/Kokkos_Stacktrace.hpp>
 #include <Cuda/Kokkos_Cuda_Error.hpp>
 
 //----------------------------------------------------------------------------
@@ -34,26 +33,11 @@
 
 namespace Kokkos {
 namespace Impl {
-void traceback_callstack(std::ostream &msg) {
-#ifdef KOKKOS_IMPL_ENABLE_STACKTRACE
-  msg << "\nBacktrace:\n";
-  save_stacktrace();
-  print_demangled_saved_stacktrace(msg);
-#else
-  msg << "\nTraceback functionality not available\n";
-#endif
-}
 
 void throw_runtime_exception(const std::string &msg) {
   throw std::runtime_error(msg);
 }
 
-void host_abort(const char *const message) {
-  std::cerr << message;
-  traceback_callstack(std::cerr);
-  ::abort();
-}
-
 std::string human_memory_size(size_t arg_bytes) {
   double bytes   = arg_bytes;
   const double K = 1024;
@@ -102,11 +86,6 @@ void Experimental::RawMemoryAllocationFailure::print_error_message(
   o << "  (The allocation mechanism was ";
   switch (m_mechanism) {
     case AllocationMechanism::StdMalloc: o << "standard malloc()."; break;
-    case AllocationMechanism::PosixMemAlign: o << "posix_memalign()."; break;
-    case AllocationMechanism::PosixMMap: o << "POSIX mmap()."; break;
-    case AllocationMechanism::IntelMMAlloc:
-      o << "the Intel _mm_malloc() intrinsic.";
-      break;
     case AllocationMechanism::CudaMalloc: o << "cudaMalloc()."; break;
     case AllocationMechanism::CudaMallocManaged:
       o << "cudaMallocManaged().";
@@ -126,6 +105,7 @@ void Experimental::RawMemoryAllocationFailure::print_error_message(
     case AllocationMechanism::SYCLMallocHost:
       o << "sycl::malloc_host().";
       break;
+    default: o << "unsupported.";
   }
   append_additional_error_information(o);
   o << ")" << std::endl;
@@ -151,7 +131,7 @@ namespace Experimental {
 void CudaRawMemoryAllocationFailure::append_additional_error_information(
     std::ostream &o) const {
   if (m_error_code != cudaSuccess) {
-    o << "  The Cuda allocation returned the error code \"\""
+    o << "  The Cuda allocation returned the error code \""
       << cudaGetErrorName(m_error_code) << "\".";
   }
 }
diff --git a/packages/kokkos/core/src/impl/Kokkos_Error.hpp b/packages/kokkos/core/src/impl/Kokkos_Error.hpp
index e2054d4ce076e766c9b6acaee45b86afa9d8517e..3d0b1d3274c83a3178b5d72ae01299183fc4a6a7 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Error.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Error.hpp
@@ -20,82 +20,14 @@
 #include <string>
 #include <iosfwd>
 #include <Kokkos_Macros.hpp>
-#ifdef KOKKOS_ENABLE_CUDA
-#include <Cuda/Kokkos_Cuda_abort.hpp>
-#endif
-#ifdef KOKKOS_ENABLE_HIP
-#include <HIP/Kokkos_HIP_Abort.hpp>
-#endif
-#ifdef KOKKOS_ENABLE_SYCL
-#include <SYCL/Kokkos_SYCL_Abort.hpp>
-#endif
+#include <Kokkos_Abort.hpp>
+#include <Kokkos_Assert.hpp>
 
 namespace Kokkos {
 namespace Impl {
 
-[[noreturn]] void host_abort(const char *const);
-
-#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
-
-#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
-// required to workaround failures in random number generator unit tests with
-// pre-volta architectures
-#define KOKKOS_IMPL_ABORT_NORETURN
-#else
-// cuda_abort aborts when building for other platforms than macOS
-#define KOKKOS_IMPL_ABORT_NORETURN [[noreturn]]
-#endif
-
-#elif defined(KOKKOS_COMPILER_NVHPC)
-
-#define KOKKOS_IMPL_ABORT_NORETURN
-
-#elif defined(KOKKOS_ENABLE_HIP) && defined(__HIP_DEVICE_COMPILE__)
-// HIP aborts
-#define KOKKOS_IMPL_ABORT_NORETURN [[noreturn]]
-#elif defined(KOKKOS_ENABLE_SYCL) && defined(__SYCL_DEVICE_ONLY__)
-// FIXME_SYCL SYCL doesn't abort
-#define KOKKOS_IMPL_ABORT_NORETURN
-#elif !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ENABLE_OPENACC)
-// Host aborts
-#define KOKKOS_IMPL_ABORT_NORETURN [[noreturn]]
-#else
-// Everything else does not abort
-#define KOKKOS_IMPL_ABORT_NORETURN
-#endif
-
-// FIXME_SYCL
-// Accomodate host pass for device functions that are not [[noreturn]]
-#if defined(KOKKOS_ENABLE_SYCL) || \
-    (defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK))
-#define KOKKOS_IMPL_ABORT_NORETURN_DEVICE
-#else
-#define KOKKOS_IMPL_ABORT_NORETURN_DEVICE KOKKOS_IMPL_ABORT_NORETURN
-#endif
-
-#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) ||          \
-    defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMPTARGET) || \
-    defined(KOKKOS_ENABLE_OPENACC)
-KOKKOS_IMPL_ABORT_NORETURN_DEVICE inline KOKKOS_IMPL_DEVICE_FUNCTION void
-device_abort(const char *const msg) {
-#if defined(KOKKOS_ENABLE_CUDA)
-  ::Kokkos::Impl::cuda_abort(msg);
-#elif defined(KOKKOS_ENABLE_HIP)
-  ::Kokkos::Impl::hip_abort(msg);
-#elif defined(KOKKOS_ENABLE_SYCL)
-  ::Kokkos::Impl::sycl_abort(msg);
-#elif defined(KOKKOS_ENABLE_OPENMPTARGET) || defined(KOKKOS_ENABLE_OPENACC)
-  printf("%s", msg);  // FIXME_OPENMPTARGET FIXME_OPENACC
-#else
-#error faulty logic
-#endif
-}
-#endif
-
 [[noreturn]] void throw_runtime_exception(const std::string &msg);
 
-void traceback_callstack(std::ostream &);
-
 std::string human_memory_size(size_t arg_bytes);
 
 }  // namespace Impl
@@ -113,9 +45,11 @@ class RawMemoryAllocationFailure : public std::bad_alloc {
   };
   enum class AllocationMechanism {
     StdMalloc,
-    PosixMemAlign,
-    PosixMMap,
-    IntelMMAlloc,
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
+    PosixMemAlign KOKKOS_DEPRECATED,
+    PosixMMap KOKKOS_DEPRECATED,
+    IntelMMAlloc KOKKOS_DEPRECATED,
+#endif
     CudaMalloc,
     CudaMallocManaged,
     CudaHostAlloc,
@@ -193,72 +127,4 @@ class RawMemoryAllocationFailure : public std::bad_alloc {
 
 }  // namespace Kokkos
 
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-
-KOKKOS_IMPL_ABORT_NORETURN KOKKOS_INLINE_FUNCTION void abort(
-    const char *const message) {
-  KOKKOS_IF_ON_HOST(::Kokkos::Impl::host_abort(message);)
-  KOKKOS_IF_ON_DEVICE(::Kokkos::Impl::device_abort(message);)
-}
-
-#undef KOKKOS_IMPL_ABORT_NORETURN
-
-}  // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#if !defined(NDEBUG) || defined(KOKKOS_ENFORCE_CONTRACTS) || \
-    defined(KOKKOS_ENABLE_DEBUG)
-#define KOKKOS_EXPECTS(...)                                                    \
-  {                                                                            \
-    if (!bool(__VA_ARGS__)) {                                                  \
-      ::Kokkos::abort(                                                         \
-          "Kokkos contract violation:\n  "                                     \
-          "  Expected precondition `" #__VA_ARGS__                             \
-          "` evaluated false.\n"                                               \
-          "Error at " KOKKOS_IMPL_TOSTRING(__FILE__) ":" KOKKOS_IMPL_TOSTRING( \
-              __LINE__) " \n");                                                \
-    }                                                                          \
-  }
-#define KOKKOS_ENSURES(...)                                                    \
-  {                                                                            \
-    if (!bool(__VA_ARGS__)) {                                                  \
-      ::Kokkos::abort(                                                         \
-          "Kokkos contract violation:\n  "                                     \
-          "  Ensured postcondition `" #__VA_ARGS__                             \
-          "` evaluated false.\n"                                               \
-          "Error at " KOKKOS_IMPL_TOSTRING(__FILE__) ":" KOKKOS_IMPL_TOSTRING( \
-              __LINE__) " \n");                                                \
-    }                                                                          \
-  }
-// some projects already define this for themselves, so don't mess
-// them up
-#ifndef KOKKOS_ASSERT
-#define KOKKOS_ASSERT(...)                                                     \
-  {                                                                            \
-    if (!bool(__VA_ARGS__)) {                                                  \
-      ::Kokkos::abort(                                                         \
-          "Kokkos contract violation:\n  "                                     \
-          "  Asserted condition `" #__VA_ARGS__                                \
-          "` evaluated false.\n"                                               \
-          "Error at " KOKKOS_IMPL_TOSTRING(__FILE__) ":" KOKKOS_IMPL_TOSTRING( \
-              __LINE__) " \n");                                                \
-    }                                                                          \
-  }
-#endif  // ifndef KOKKOS_ASSERT
-#else   // not debug mode
-#define KOKKOS_EXPECTS(...)
-#define KOKKOS_ENSURES(...)
-#ifndef KOKKOS_ASSERT
-#define KOKKOS_ASSERT(...)
-#endif  // ifndef KOKKOS_ASSERT
-#endif  // end debug mode ifdefs
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
 #endif /* #ifndef KOKKOS_IMPL_ERROR_HPP */
diff --git a/packages/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp b/packages/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp
index f0edc8ac47d3716b8db70316289cee5f2a4415b7..58ed54275a64d6a8915e89f035d2140996f37ae5 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp
@@ -102,9 +102,7 @@ constexpr bool check_valid_execution_space() {
   static_assert(is_detected_v<initialize_finalize_t, ExecutionSpace>);
   static_assert(is_detected_v<fence_t, ExecutionSpace>);
   static_assert(is_detected_v<concurrency_t, ExecutionSpace>);
-#ifndef KOKKOS_ENABLE_HPX  // FIXME_HPX
   static_assert(sizeof(ExecutionSpace) <= 2 * sizeof(void*));
-#endif
   return true;
 }
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp b/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp
index 5ef017a004fb920e2033738fe65b8a72d5eeedcf..e844a5295e504675dc8c07c7b736e0e28d4226cc 100644
--- a/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp
@@ -42,10 +42,10 @@ struct DeduceFunctorPatternInterface<
   using type = FunctorPatternInterface::FOR;
 };
 
-template <class FunctorType, class ExecPolicy, class ReducerType,
+template <class CombinedFunctorReducerType, class ExecPolicy,
           class ExecutionSpace>
 struct DeduceFunctorPatternInterface<
-    ParallelReduce<FunctorType, ExecPolicy, ReducerType, ExecutionSpace>> {
+    ParallelReduce<CombinedFunctorReducerType, ExecPolicy, ExecutionSpace>> {
   using type = FunctorPatternInterface::REDUCE;
 };
 
@@ -64,14 +64,15 @@ struct DeduceFunctorPatternInterface<ParallelScanWithTotal<
 
 /** \brief  Query Functor and execution policy argument tag for value type.
  *
- *  If 'value_type' is not explicitly declared in the functor
- *  then attempt to deduce the type from FunctorType::operator()
- *  interface used by the pattern and policy.
+ *  If 'value_type' is not explicitly declared in the functor and
+ * OverrideValueType is void, then attempt to deduce the type from
+ * FunctorType::operator() interface used by the pattern and policy.
  *
  *  For the REDUCE pattern generate a Reducer and finalization function
  *  derived from what is available within the functor.
  */
-template <typename PatternInterface, class Policy, class Functor>
+template <typename PatternInterface, class Policy, class Functor,
+          typename OverrideValueType>
 struct FunctorAnalysis {
  private:
   using FOR    = FunctorPatternInterface::FOR;
@@ -124,9 +125,10 @@ struct FunctorAnalysis {
   //----------------------------------------
   // Check for Functor::value_type, which is either a simple type T or T[]
 
+  // If the functor doesn't have a value_type alias, use OverrideValueType.
   template <typename F, typename = std::false_type>
   struct has_value_type {
-    using type = void;
+    using type = OverrideValueType;
   };
 
   template <typename F>
@@ -141,9 +143,9 @@ struct FunctorAnalysis {
   };
 
   //----------------------------------------
-  // If Functor::value_type does not exist then evaluate operator(),
-  // depending upon the pattern and whether the policy has a work tag,
-  // to determine the reduction or scan value_type.
+  // If Functor::value_type does not exist and OverrideValueType is void, then
+  // evaluate operator(), depending upon the pattern and whether the policy has
+  // a work tag, to determine the reduction or scan value_type.
 
   template <typename F, typename P = PatternInterface,
             typename V = typename has_value_type<F>::type,
@@ -320,13 +322,15 @@ struct FunctorAnalysis {
 
  private:
   template <bool IsArray, class FF>
-  KOKKOS_INLINE_FUNCTION static constexpr std::enable_if_t<IsArray, unsigned>
+  KOKKOS_INLINE_FUNCTION static constexpr std::enable_if_t<IsArray,
+                                                           unsigned int>
   get_length(FF const& f) {
     return f.value_count;
   }
 
   template <bool IsArray, class FF>
-  KOKKOS_INLINE_FUNCTION static constexpr std::enable_if_t<!IsArray, unsigned>
+  KOKKOS_INLINE_FUNCTION static constexpr std::enable_if_t<!IsArray,
+                                                           unsigned int>
   get_length(FF const&) {
     return candidate_is_void ? 0 : 1;
   }
@@ -337,12 +341,12 @@ struct FunctorAnalysis {
         !candidate_is_void && !candidate_is_array ? sizeof(ValueType) : 0
   };
 
-  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned value_count(
+  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned int value_count(
       const Functor& f) {
     return FunctorAnalysis::template get_length<candidate_is_array>(f);
   }
 
-  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned value_size(
+  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned int value_size(
       const Functor& f) {
     return FunctorAnalysis::template get_length<candidate_is_array>(f) *
            sizeof(ValueType);
@@ -351,13 +355,13 @@ struct FunctorAnalysis {
   //----------------------------------------
 
   template <class Unknown>
-  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned value_count(
+  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned int value_count(
       const Unknown&) {
     return candidate_is_void ? 0 : 1;
   }
 
   template <class Unknown>
-  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned value_size(
+  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned int value_size(
       const Unknown&) {
     return candidate_is_void ? 0 : sizeof(ValueType);
   }
@@ -627,11 +631,9 @@ struct FunctorAnalysis {
                         detected_volatile_join_no_tag<F>::value)>>
       : public has_volatile_join_no_tag_function<F> {
     enum : bool { value = true };
-#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
     static_assert(Impl::dependent_false_v<F>,
                   "Reducer with a join() operator taking "
                   "volatile-qualified parameters is no longer supported");
-#endif
   };
 
   template <class F = Functor, typename = void>
@@ -650,11 +652,9 @@ struct FunctorAnalysis {
                                          detected_volatile_join_tag<F>::value)>>
       : public has_volatile_join_tag_function<F> {
     enum : bool { value = true };
-#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
     static_assert(Impl::dependent_false_v<F>,
                   "Reducer with a join() operator taking "
                   "volatile-qualified parameters is no longer supported");
-#endif
   };
 
   //----------------------------------------
@@ -903,12 +903,12 @@ struct FunctorAnalysis {
 
   struct Reducer {
    private:
-    Functor const* const m_functor;
+    Functor m_functor;
 
     template <bool IsArray>
     KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t<IsArray, int> len() const
         noexcept {
-      return m_functor->value_count;
+      return m_functor.value_count;
     }
 
     template <bool IsArray>
@@ -924,6 +924,28 @@ struct FunctorAnalysis {
     using reference_type = FunctorAnalysis::reference_type;
     using functor_type   = Functor;  // Adapts a functor
 
+    static constexpr bool has_join_member_function() {
+      return DeduceJoin<>::value;
+    }
+    static constexpr bool has_init_member_function() {
+      return DeduceInit<>::value;
+    }
+    static constexpr bool has_final_member_function() {
+      return DeduceFinal<>::value;
+    }
+
+    KOKKOS_FUNCTION unsigned int value_size() const {
+      return FunctorAnalysis::value_size(m_functor);
+    }
+
+    KOKKOS_FUNCTION unsigned int value_count() const {
+      return FunctorAnalysis::value_count(m_functor);
+    }
+
+    KOKKOS_FUNCTION static constexpr unsigned int static_value_size() {
+      return StaticValueSize;
+    }
+
     template <bool is_array = candidate_is_array>
     KOKKOS_INLINE_FUNCTION static std::enable_if_t<is_array, reference_type>
     reference(ValueType* dst) noexcept {
@@ -948,20 +970,23 @@ struct FunctorAnalysis {
 
     KOKKOS_INLINE_FUNCTION
     void join(ValueType* dst, ValueType const* src) const noexcept {
-      DeduceJoin<>::join(m_functor, dst, src);
+      DeduceJoin<>::join(&m_functor, dst, src);
     }
 
     KOKKOS_INLINE_FUNCTION reference_type init(ValueType* const dst) const
         noexcept {
-      DeduceInit<>::init(m_functor, dst);
+      DeduceInit<>::init(&m_functor, dst);
       return reference(dst);
     }
 
     KOKKOS_INLINE_FUNCTION
     void final(ValueType* dst) const noexcept {
-      DeduceFinal<>::final(m_functor, dst);
+      DeduceFinal<>::final(&m_functor, dst);
     }
 
+    KOKKOS_INLINE_FUNCTION
+    const Functor& get_functor() const { return m_functor; }
+
     Reducer(Reducer const&) = default;
     Reducer(Reducer&&)      = default;
     Reducer& operator=(Reducer const&) = delete;
@@ -969,7 +994,7 @@ struct FunctorAnalysis {
     ~Reducer()                    = default;
 
     KOKKOS_INLINE_FUNCTION explicit constexpr Reducer(
-        Functor const* arg_functor) noexcept
+        Functor const& arg_functor) noexcept
         : m_functor(arg_functor) {}
   };
 };
diff --git a/packages/kokkos/core/src/impl/Kokkos_HBWSpace.cpp b/packages/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
index 7402f1a74476c8184ccbb3873c25d7069ebbde2a..cd640b88cb92ac56a3ec1914d13d95882a6a3a86 100644
--- a/packages/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
@@ -310,41 +310,4 @@ void SharedAllocationRecord<Kokkos::Experimental::HBWSpace, void>::
 }  // namespace Impl
 }  // namespace Kokkos
 
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-namespace Experimental {
-namespace {
-const unsigned HBW_SPACE_ATOMIC_MASK     = 0xFFFF;
-const unsigned HBW_SPACE_ATOMIC_XOR_MASK = 0x5A39;
-static int HBW_SPACE_ATOMIC_LOCKS[HBW_SPACE_ATOMIC_MASK + 1];
-}  // namespace
-
-namespace Impl {
-void init_lock_array_hbw_space() {
-  static int is_initialized = 0;
-  if (!is_initialized)
-    for (int i = 0; i < static_cast<int>(HBW_SPACE_ATOMIC_MASK + 1); i++)
-      HBW_SPACE_ATOMIC_LOCKS[i] = 0;
-}
-
-bool lock_address_hbw_space(void *ptr) {
-  return 0 == atomic_compare_exchange(
-                  &HBW_SPACE_ATOMIC_LOCKS[((size_t(ptr) >> 2) &
-                                           HBW_SPACE_ATOMIC_MASK) ^
-                                          HBW_SPACE_ATOMIC_XOR_MASK],
-                  0, 1);
-}
-
-void unlock_address_hbw_space(void *ptr) {
-  atomic_exchange(
-      &HBW_SPACE_ATOMIC_LOCKS[((size_t(ptr) >> 2) & HBW_SPACE_ATOMIC_MASK) ^
-                              HBW_SPACE_ATOMIC_XOR_MASK],
-      0);
-}
-
-}  // namespace Impl
-}  // namespace Experimental
-}  // namespace Kokkos
 #endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp b/packages/kokkos/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4a22898d168257def9a8e984e97b98216f9e1475
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp
@@ -0,0 +1,1173 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_HALF_FLOATING_POINT_WRAPPER_HPP_
+#define KOKKOS_HALF_FLOATING_POINT_WRAPPER_HPP_
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_BitManipulation.hpp>  // bit_cast
+
+#include <type_traits>
+#include <iosfwd>  // istream & ostream for extraction and insertion ops
+#include <string>
+
+namespace Kokkos::Experimental::Impl {
+/// @brief templated struct for determining if half_t is an alias to float.
+/// @tparam T The type to specialize on.
+template <class T>
+struct is_float16 : std::false_type {};
+
+/// @brief templated struct for determining if bhalf_t is an alias to float.
+/// @tparam T The type to specialize on.
+template <class T>
+struct is_bfloat16 : std::false_type {};
+}  // namespace Kokkos::Experimental::Impl
+
+#ifdef KOKKOS_IMPL_HALF_TYPE_DEFINED
+
+// KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH: A macro to select which
+// floating_pointer_wrapper operator paths should be used. For CUDA, let the
+// compiler conditionally select when device ops are used For SYCL, we have a
+// full half type on both host and device
+#if defined(__CUDA_ARCH__) || defined(KOKKOS_ENABLE_SYCL)
+#define KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
+#endif
+
+/************************* BEGIN forward declarations *************************/
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+template <class FloatType>
+class floating_point_wrapper;
+}
+
+// Declare half_t (binary16)
+using half_t = Kokkos::Experimental::Impl::floating_point_wrapper<
+    Kokkos::Impl::half_impl_t ::type>;
+namespace Impl {
+template <>
+struct is_float16<half_t> : std::true_type {};
+}  // namespace Impl
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(float val);
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(bool val);
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(double val);
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(short val);
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(int val);
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(long val);
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(long long val);
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(unsigned short val);
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(unsigned int val);
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(unsigned long val);
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(unsigned long long val);
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(half_t);
+
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, float>::value, T>
+    cast_from_half(half_t);
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, bool>::value, T>
+    cast_from_half(half_t);
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, double>::value, T>
+    cast_from_half(half_t);
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, short>::value, T>
+    cast_from_half(half_t);
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, int>::value, T>
+    cast_from_half(half_t);
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, long>::value, T>
+    cast_from_half(half_t);
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, long long>::value, T>
+    cast_from_half(half_t);
+template <class T>
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<std::is_same<T, unsigned short>::value, T>
+        cast_from_half(half_t);
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, unsigned int>::value, T>
+    cast_from_half(half_t);
+template <class T>
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<std::is_same<T, unsigned long>::value, T>
+        cast_from_half(half_t);
+template <class T>
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<std::is_same<T, unsigned long long>::value, T>
+        cast_from_half(half_t);
+
+// declare bhalf_t
+#ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED
+using bhalf_t = Kokkos::Experimental::Impl::floating_point_wrapper<
+    Kokkos::Impl ::bhalf_impl_t ::type>;
+namespace Impl {
+template <>
+struct is_bfloat16<bhalf_t> : std::true_type {};
+}  // namespace Impl
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(float val);
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(bool val);
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(double val);
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(short val);
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(int val);
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(long val);
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(long long val);
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(unsigned short val);
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(unsigned int val);
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(unsigned long val);
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(unsigned long long val);
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(bhalf_t val);
+
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, float>::value, T>
+    cast_from_bhalf(bhalf_t);
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, bool>::value, T>
+    cast_from_bhalf(bhalf_t);
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, double>::value, T>
+    cast_from_bhalf(bhalf_t);
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, short>::value, T>
+    cast_from_bhalf(bhalf_t);
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, int>::value, T>
+    cast_from_bhalf(bhalf_t);
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, long>::value, T>
+    cast_from_bhalf(bhalf_t);
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, long long>::value, T>
+    cast_from_bhalf(bhalf_t);
+template <class T>
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<std::is_same<T, unsigned short>::value, T>
+        cast_from_bhalf(bhalf_t);
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, unsigned int>::value, T>
+    cast_from_bhalf(bhalf_t);
+template <class T>
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<std::is_same<T, unsigned long>::value, T>
+        cast_from_bhalf(bhalf_t);
+template <class T>
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<std::is_same<T, unsigned long long>::value, T>
+        cast_from_bhalf(bhalf_t);
+#endif  // KOKKOS_IMPL_BHALF_TYPE_DEFINED
+
+template <class T>
+static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::half_t cast_to_wrapper(
+    T x, const volatile Kokkos::Impl::half_impl_t::type&);
+
+#ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED
+template <class T>
+static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::bhalf_t cast_to_wrapper(
+    T x, const volatile Kokkos::Impl::bhalf_impl_t::type&);
+#endif  // KOKKOS_IMPL_BHALF_TYPE_DEFINED
+
+template <class T>
+static KOKKOS_INLINE_FUNCTION T
+cast_from_wrapper(const Kokkos::Experimental::half_t& x);
+
+#ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED
+template <class T>
+static KOKKOS_INLINE_FUNCTION T
+cast_from_wrapper(const Kokkos::Experimental::bhalf_t& x);
+#endif  // KOKKOS_IMPL_BHALF_TYPE_DEFINED
+/************************** END forward declarations **************************/
+
+namespace Impl {
+
+template <typename FloatType>
+struct BitComparisonWrapper {
+  std::uint16_t value;
+
+  template <typename Number>
+  KOKKOS_FUNCTION friend bool operator==(BitComparisonWrapper a, Number b) {
+    return static_cast<FloatType>(a) == b;
+  }
+
+  template <typename Number>
+  KOKKOS_FUNCTION friend bool operator!=(BitComparisonWrapper a, Number b) {
+    return static_cast<FloatType>(a) != b;
+  }
+
+  template <typename Number>
+  KOKKOS_FUNCTION friend bool operator<(BitComparisonWrapper a, Number b) {
+    return static_cast<FloatType>(a) < b;
+  }
+
+  template <typename Number>
+  KOKKOS_FUNCTION friend bool operator<=(BitComparisonWrapper a, Number b) {
+    return static_cast<FloatType>(a) <= b;
+  }
+
+  template <typename Number>
+  KOKKOS_FUNCTION friend bool operator>(BitComparisonWrapper a, Number b) {
+    return static_cast<FloatType>(a) > b;
+  }
+
+  template <typename Number>
+  KOKKOS_FUNCTION friend bool operator>=(BitComparisonWrapper a, Number b) {
+    return static_cast<FloatType>(a) >= b;
+  }
+};
+
+template <typename FloatType>
+inline constexpr BitComparisonWrapper<FloatType> exponent_mask;
+template <typename FloatType>
+inline constexpr BitComparisonWrapper<FloatType> fraction_mask;
+
+#ifdef KOKKOS_IMPL_HALF_TYPE_DEFINED
+template <>
+inline constexpr BitComparisonWrapper<Kokkos::Experimental::half_t>
+    exponent_mask<Kokkos::Experimental::half_t>{0b0'11111'0000000000};
+template <>
+inline constexpr BitComparisonWrapper<Kokkos::Experimental::half_t>
+    fraction_mask<Kokkos::Experimental::half_t>{0b0'00000'1111111111};
+#endif
+
+#ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED
+template <>
+inline constexpr BitComparisonWrapper<Kokkos::Experimental::bhalf_t>
+    exponent_mask<Kokkos::Experimental::bhalf_t>{0b0'11111111'0000000};
+template <>
+inline constexpr BitComparisonWrapper<Kokkos::Experimental::bhalf_t>
+    fraction_mask<Kokkos::Experimental::bhalf_t>{0b0'00000000'1111111};
+#endif
+
+template <class FloatType>
+class alignas(FloatType) floating_point_wrapper {
+ public:
+  using impl_type           = FloatType;
+  using bit_comparison_type = BitComparisonWrapper<floating_point_wrapper>;
+
+ private:
+  impl_type val;
+  using fixed_width_integer_type = std::conditional_t<
+      sizeof(impl_type) == 2, uint16_t,
+      std::conditional_t<
+          sizeof(impl_type) == 4, uint32_t,
+          std::conditional_t<sizeof(impl_type) == 8, uint64_t, void>>>;
+  static_assert(!std::is_void<fixed_width_integer_type>::value,
+                "Invalid impl_type");
+
+ public:
+  // In-class initialization and defaulted default constructors not used
+  // since Cuda supports half precision initialization via the below constructor
+  KOKKOS_FUNCTION
+  floating_point_wrapper() : val(0.0F) {}
+
+// Copy constructors
+// Getting "C2580: multiple versions of a defaulted special
+// member function are not allowed" with VS 16.11.3 and CUDA 11.4.2
+#if defined(_WIN32) && defined(KOKKOS_ENABLE_CUDA)
+  KOKKOS_FUNCTION
+  floating_point_wrapper(const floating_point_wrapper& rhs) : val(rhs.val) {}
+
+  KOKKOS_FUNCTION
+  floating_point_wrapper& operator=(const floating_point_wrapper& rhs) {
+    val = rhs.val;
+    return *this;
+  }
+#else
+  KOKKOS_DEFAULTED_FUNCTION
+  floating_point_wrapper(const floating_point_wrapper&) noexcept = default;
+
+  KOKKOS_DEFAULTED_FUNCTION
+  floating_point_wrapper& operator=(const floating_point_wrapper&) noexcept =
+      default;
+#endif
+
+  KOKKOS_INLINE_FUNCTION
+  floating_point_wrapper(const volatile floating_point_wrapper& rhs) {
+#if defined(KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH) && !defined(KOKKOS_ENABLE_SYCL)
+    val = rhs.val;
+#else
+    const volatile fixed_width_integer_type* rv_ptr =
+        reinterpret_cast<const volatile fixed_width_integer_type*>(&rhs.val);
+    const fixed_width_integer_type rv_val = *rv_ptr;
+    val       = reinterpret_cast<const impl_type&>(rv_val);
+#endif  // KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
+  }
+
+  KOKKOS_FUNCTION
+  floating_point_wrapper(bit_comparison_type rhs) {
+    val = Kokkos::bit_cast<impl_type>(rhs);
+  }
+
+  // Don't support implicit conversion back to impl_type.
+  // impl_type is a storage only type on host.
+  KOKKOS_FUNCTION
+  explicit operator impl_type() const { return val; }
+  KOKKOS_FUNCTION
+  explicit operator float() const { return cast_from_wrapper<float>(*this); }
+  KOKKOS_FUNCTION
+  explicit operator bool() const { return cast_from_wrapper<bool>(*this); }
+  KOKKOS_FUNCTION
+  explicit operator double() const { return cast_from_wrapper<double>(*this); }
+  KOKKOS_FUNCTION
+  explicit operator short() const { return cast_from_wrapper<short>(*this); }
+  KOKKOS_FUNCTION
+  explicit operator int() const { return cast_from_wrapper<int>(*this); }
+  KOKKOS_FUNCTION
+  explicit operator long() const { return cast_from_wrapper<long>(*this); }
+  KOKKOS_FUNCTION
+  explicit operator long long() const {
+    return cast_from_wrapper<long long>(*this);
+  }
+  KOKKOS_FUNCTION
+  explicit operator unsigned short() const {
+    return cast_from_wrapper<unsigned short>(*this);
+  }
+  KOKKOS_FUNCTION
+  explicit operator unsigned int() const {
+    return cast_from_wrapper<unsigned int>(*this);
+  }
+  KOKKOS_FUNCTION
+  explicit operator unsigned long() const {
+    return cast_from_wrapper<unsigned long>(*this);
+  }
+  KOKKOS_FUNCTION
+  explicit operator unsigned long long() const {
+    return cast_from_wrapper<unsigned long long>(*this);
+  }
+
+  /**
+   * Conversion constructors.
+   *
+   * Support implicit conversions from impl_type, float, double ->
+   * floating_point_wrapper. Mixed precision expressions require upcasting which
+   * is done in the
+   * "// Binary Arithmetic" operator overloads below.
+   *
+   * Support implicit conversions from integral types -> floating_point_wrapper.
+   * Expressions involving floating_point_wrapper with integral types require
+   * downcasting the integral types to floating_point_wrapper. Existing operator
+   * overloads can handle this with the addition of the below implicit
+   * conversion constructors.
+   */
+  KOKKOS_FUNCTION
+  constexpr floating_point_wrapper(impl_type rhs) : val(rhs) {}
+  KOKKOS_FUNCTION
+  floating_point_wrapper(float rhs) : val(cast_to_wrapper(rhs, val).val) {}
+  KOKKOS_FUNCTION
+  floating_point_wrapper(double rhs) : val(cast_to_wrapper(rhs, val).val) {}
+  KOKKOS_FUNCTION
+  explicit floating_point_wrapper(bool rhs)
+      : val(cast_to_wrapper(rhs, val).val) {}
+  KOKKOS_FUNCTION
+  floating_point_wrapper(short rhs) : val(cast_to_wrapper(rhs, val).val) {}
+  KOKKOS_FUNCTION
+  floating_point_wrapper(int rhs) : val(cast_to_wrapper(rhs, val).val) {}
+  KOKKOS_FUNCTION
+  floating_point_wrapper(long rhs) : val(cast_to_wrapper(rhs, val).val) {}
+  KOKKOS_FUNCTION
+  floating_point_wrapper(long long rhs) : val(cast_to_wrapper(rhs, val).val) {}
+  KOKKOS_FUNCTION
+  floating_point_wrapper(unsigned short rhs)
+      : val(cast_to_wrapper(rhs, val).val) {}
+  KOKKOS_FUNCTION
+  floating_point_wrapper(unsigned int rhs)
+      : val(cast_to_wrapper(rhs, val).val) {}
+  KOKKOS_FUNCTION
+  floating_point_wrapper(unsigned long rhs)
+      : val(cast_to_wrapper(rhs, val).val) {}
+  KOKKOS_FUNCTION
+  floating_point_wrapper(unsigned long long rhs)
+      : val(cast_to_wrapper(rhs, val).val) {}
+
+  // Unary operators
+  KOKKOS_FUNCTION
+  floating_point_wrapper operator+() const {
+    floating_point_wrapper tmp = *this;
+#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
+    tmp.val = +tmp.val;
+#else
+    tmp.val   = cast_to_wrapper(+cast_from_wrapper<float>(tmp), val).val;
+#endif
+    return tmp;
+  }
+
+  KOKKOS_FUNCTION
+  floating_point_wrapper operator-() const {
+    floating_point_wrapper tmp = *this;
+#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
+    tmp.val = -tmp.val;
+#else
+    tmp.val   = cast_to_wrapper(-cast_from_wrapper<float>(tmp), val).val;
+#endif
+    return tmp;
+  }
+
+  // Prefix operators
+  KOKKOS_FUNCTION
+  floating_point_wrapper& operator++() {
+#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
+    val = val + impl_type(1.0F);  // cuda has no operator++ for __nv_bfloat
+#else
+    float tmp = cast_from_wrapper<float>(*this);
+    ++tmp;
+    val       = cast_to_wrapper(tmp, val).val;
+#endif
+    return *this;
+  }
+
+  KOKKOS_FUNCTION
+  floating_point_wrapper& operator--() {
+#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
+    val = val - impl_type(1.0F);  // cuda has no operator-- for __nv_bfloat
+#else
+    float tmp = cast_from_wrapper<float>(*this);
+    --tmp;
+    val = cast_to_wrapper(tmp, val).val;
+#endif
+    return *this;
+  }
+
+  // Postfix operators
+  KOKKOS_FUNCTION
+  floating_point_wrapper operator++(int) {
+    floating_point_wrapper tmp = *this;
+    operator++();
+    return tmp;
+  }
+
+  KOKKOS_FUNCTION
+  floating_point_wrapper operator--(int) {
+    floating_point_wrapper tmp = *this;
+    operator--();
+    return tmp;
+  }
+
+  // Binary operators
+  KOKKOS_FUNCTION
+  floating_point_wrapper& operator=(impl_type rhs) {
+    val = rhs;
+    return *this;
+  }
+
+  template <class T>
+  KOKKOS_FUNCTION floating_point_wrapper& operator=(T rhs) {
+    val = cast_to_wrapper(rhs, val).val;
+    return *this;
+  }
+
+  template <class T>
+  KOKKOS_FUNCTION void operator=(T rhs) volatile {
+    impl_type new_val = cast_to_wrapper(rhs, val).val;
+    volatile fixed_width_integer_type* val_ptr =
+        reinterpret_cast<volatile fixed_width_integer_type*>(
+            const_cast<impl_type*>(&val));
+    *val_ptr = reinterpret_cast<fixed_width_integer_type&>(new_val);
+  }
+
+  // Compound operators
+  KOKKOS_FUNCTION
+  floating_point_wrapper& operator+=(floating_point_wrapper rhs) {
+#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
+    val = val + rhs.val;  // cuda has no operator+= for __nv_bfloat
+#else
+    val = cast_to_wrapper(
+              cast_from_wrapper<float>(*this) + cast_from_wrapper<float>(rhs),
+              val)
+              .val;
+#endif
+    return *this;
+  }
+
+  KOKKOS_FUNCTION
+  void operator+=(const volatile floating_point_wrapper& rhs) volatile {
+    floating_point_wrapper tmp_rhs = rhs;
+    floating_point_wrapper tmp_lhs = *this;
+
+    tmp_lhs += tmp_rhs;
+    *this = tmp_lhs;
+  }
+
+  // Compound operators: upcast overloads for +=
+  template <class T>
+  KOKKOS_FUNCTION friend std::enable_if_t<
+      std::is_same<T, float>::value || std::is_same<T, double>::value, T>
+  operator+=(T& lhs, floating_point_wrapper rhs) {
+    lhs += static_cast<T>(rhs);
+    return lhs;
+  }
+
+  KOKKOS_FUNCTION
+  floating_point_wrapper& operator+=(float rhs) {
+    float result = static_cast<float>(val) + rhs;
+    val          = static_cast<impl_type>(result);
+    return *this;
+  }
+
+  KOKKOS_FUNCTION
+  floating_point_wrapper& operator+=(double rhs) {
+    double result = static_cast<double>(val) + rhs;
+    val           = static_cast<impl_type>(result);
+    return *this;
+  }
+
+  KOKKOS_FUNCTION
+  floating_point_wrapper& operator-=(floating_point_wrapper rhs) {
+#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
+    val = val - rhs.val;  // cuda has no operator-= for __nv_bfloat
+#else
+    val = cast_to_wrapper(
+              cast_from_wrapper<float>(*this) - cast_from_wrapper<float>(rhs),
+              val)
+              .val;
+#endif
+    return *this;
+  }
+
+  KOKKOS_FUNCTION
+  void operator-=(const volatile floating_point_wrapper& rhs) volatile {
+    floating_point_wrapper tmp_rhs = rhs;
+    floating_point_wrapper tmp_lhs = *this;
+
+    tmp_lhs -= tmp_rhs;
+    *this = tmp_lhs;
+  }
+
+  // Compund operators: upcast overloads for -=
+  template <class T>
+  KOKKOS_FUNCTION friend std::enable_if_t<
+      std::is_same<T, float>::value || std::is_same<T, double>::value, T>
+  operator-=(T& lhs, floating_point_wrapper rhs) {
+    lhs -= static_cast<T>(rhs);
+    return lhs;
+  }
+
+  KOKKOS_FUNCTION
+  floating_point_wrapper& operator-=(float rhs) {
+    float result = static_cast<float>(val) - rhs;
+    val          = static_cast<impl_type>(result);
+    return *this;
+  }
+
+  KOKKOS_FUNCTION
+  floating_point_wrapper& operator-=(double rhs) {
+    double result = static_cast<double>(val) - rhs;
+    val           = static_cast<impl_type>(result);
+    return *this;
+  }
+
+  KOKKOS_FUNCTION
+  floating_point_wrapper& operator*=(floating_point_wrapper rhs) {
+#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
+    val = val * rhs.val;  // cuda has no operator*= for __nv_bfloat
+#else
+    val = cast_to_wrapper(
+              cast_from_wrapper<float>(*this) * cast_from_wrapper<float>(rhs),
+              val)
+              .val;
+#endif
+    return *this;
+  }
+
+  KOKKOS_FUNCTION
+  void operator*=(const volatile floating_point_wrapper& rhs) volatile {
+    floating_point_wrapper tmp_rhs = rhs;
+    floating_point_wrapper tmp_lhs = *this;
+
+    tmp_lhs *= tmp_rhs;
+    *this = tmp_lhs;
+  }
+
+  // Compund operators: upcast overloads for *=
+  template <class T>
+  KOKKOS_FUNCTION friend std::enable_if_t<
+      std::is_same<T, float>::value || std::is_same<T, double>::value, T>
+  operator*=(T& lhs, floating_point_wrapper rhs) {
+    lhs *= static_cast<T>(rhs);
+    return lhs;
+  }
+
+  KOKKOS_FUNCTION
+  floating_point_wrapper& operator*=(float rhs) {
+    float result = static_cast<float>(val) * rhs;
+    val          = static_cast<impl_type>(result);
+    return *this;
+  }
+
+  KOKKOS_FUNCTION
+  floating_point_wrapper& operator*=(double rhs) {
+    double result = static_cast<double>(val) * rhs;
+    val           = static_cast<impl_type>(result);
+    return *this;
+  }
+
+  KOKKOS_FUNCTION
+  floating_point_wrapper& operator/=(floating_point_wrapper rhs) {
+#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
+    val = val / rhs.val;  // cuda has no operator/= for __nv_bfloat
+#else
+    val = cast_to_wrapper(
+              cast_from_wrapper<float>(*this) / cast_from_wrapper<float>(rhs),
+              val)
+              .val;
+#endif
+    return *this;
+  }
+
+  KOKKOS_FUNCTION
+  void operator/=(const volatile floating_point_wrapper& rhs) volatile {
+    floating_point_wrapper tmp_rhs = rhs;
+    floating_point_wrapper tmp_lhs = *this;
+
+    tmp_lhs /= tmp_rhs;
+    *this = tmp_lhs;
+  }
+
+  // Compund operators: upcast overloads for /=
+  template <class T>
+  KOKKOS_FUNCTION friend std::enable_if_t<
+      std::is_same<T, float>::value || std::is_same<T, double>::value, T>
+  operator/=(T& lhs, floating_point_wrapper rhs) {
+    lhs /= static_cast<T>(rhs);
+    return lhs;
+  }
+
+  KOKKOS_FUNCTION
+  floating_point_wrapper& operator/=(float rhs) {
+    float result = static_cast<float>(val) / rhs;
+    val          = static_cast<impl_type>(result);
+    return *this;
+  }
+
+  KOKKOS_FUNCTION
+  floating_point_wrapper& operator/=(double rhs) {
+    double result = static_cast<double>(val) / rhs;
+    val           = static_cast<impl_type>(result);
+    return *this;
+  }
+
+  // Binary Arithmetic
+  KOKKOS_FUNCTION
+  friend floating_point_wrapper operator+(floating_point_wrapper lhs,
+                                          floating_point_wrapper rhs) {
+#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
+    lhs += rhs;
+#else
+    lhs.val = cast_to_wrapper(
+                  cast_from_wrapper<float>(lhs) + cast_from_wrapper<float>(rhs),
+                  lhs.val)
+                  .val;
+#endif
+    return lhs;
+  }
+
+  // Binary Arithmetic upcast operators for +
+  template <class T>
+  KOKKOS_FUNCTION friend std::enable_if_t<
+      std::is_same<T, float>::value || std::is_same<T, double>::value, T>
+  operator+(floating_point_wrapper lhs, T rhs) {
+    return T(lhs) + rhs;
+  }
+
+  template <class T>
+  KOKKOS_FUNCTION friend std::enable_if_t<
+      std::is_same<T, float>::value || std::is_same<T, double>::value, T>
+  operator+(T lhs, floating_point_wrapper rhs) {
+    return lhs + T(rhs);
+  }
+
+  KOKKOS_FUNCTION
+  friend floating_point_wrapper operator-(floating_point_wrapper lhs,
+                                          floating_point_wrapper rhs) {
+#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
+    lhs -= rhs;
+#else
+    lhs.val = cast_to_wrapper(
+                  cast_from_wrapper<float>(lhs) - cast_from_wrapper<float>(rhs),
+                  lhs.val)
+                  .val;
+#endif
+    return lhs;
+  }
+
+  // Binary Arithmetic upcast operators for -
+  template <class T>
+  KOKKOS_FUNCTION friend std::enable_if_t<
+      std::is_same<T, float>::value || std::is_same<T, double>::value, T>
+  operator-(floating_point_wrapper lhs, T rhs) {
+    return T(lhs) - rhs;
+  }
+
+  template <class T>
+  KOKKOS_FUNCTION friend std::enable_if_t<
+      std::is_same<T, float>::value || std::is_same<T, double>::value, T>
+  operator-(T lhs, floating_point_wrapper rhs) {
+    return lhs - T(rhs);
+  }
+
+  KOKKOS_FUNCTION
+  friend floating_point_wrapper operator*(floating_point_wrapper lhs,
+                                          floating_point_wrapper rhs) {
+#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
+    lhs *= rhs;
+#else
+    lhs.val = cast_to_wrapper(
+                  cast_from_wrapper<float>(lhs) * cast_from_wrapper<float>(rhs),
+                  lhs.val)
+                  .val;
+#endif
+    return lhs;
+  }
+
+  // Binary Arithmetic upcast operators for *
+  template <class T>
+  KOKKOS_FUNCTION friend std::enable_if_t<
+      std::is_same<T, float>::value || std::is_same<T, double>::value, T>
+  operator*(floating_point_wrapper lhs, T rhs) {
+    return T(lhs) * rhs;
+  }
+
+  template <class T>
+  KOKKOS_FUNCTION friend std::enable_if_t<
+      std::is_same<T, float>::value || std::is_same<T, double>::value, T>
+  operator*(T lhs, floating_point_wrapper rhs) {
+    return lhs * T(rhs);
+  }
+
+  KOKKOS_FUNCTION
+  friend floating_point_wrapper operator/(floating_point_wrapper lhs,
+                                          floating_point_wrapper rhs) {
+#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
+    lhs /= rhs;
+#else
+    lhs.val = cast_to_wrapper(
+                  cast_from_wrapper<float>(lhs) / cast_from_wrapper<float>(rhs),
+                  lhs.val)
+                  .val;
+#endif
+    return lhs;
+  }
+
+  // Binary Arithmetic upcast operators for /
+  template <class T>
+  KOKKOS_FUNCTION friend std::enable_if_t<
+      std::is_same<T, float>::value || std::is_same<T, double>::value, T>
+  operator/(floating_point_wrapper lhs, T rhs) {
+    return T(lhs) / rhs;
+  }
+
+  template <class T>
+  KOKKOS_FUNCTION friend std::enable_if_t<
+      std::is_same<T, float>::value || std::is_same<T, double>::value, T>
+  operator/(T lhs, floating_point_wrapper rhs) {
+    return lhs / T(rhs);
+  }
+
+  // Logical operators
+  KOKKOS_FUNCTION
+  bool operator!() const {
+#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
+    return static_cast<bool>(!val);
+#else
+    return !cast_from_wrapper<float>(*this);
+#endif
+  }
+
+  // NOTE: Loses short-circuit evaluation
+  KOKKOS_FUNCTION
+  bool operator&&(floating_point_wrapper rhs) const {
+#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
+    return static_cast<bool>(val && rhs.val);
+#else
+    return cast_from_wrapper<float>(*this) && cast_from_wrapper<float>(rhs);
+#endif
+  }
+
+  // NOTE: Loses short-circuit evaluation
+  KOKKOS_FUNCTION
+  bool operator||(floating_point_wrapper rhs) const {
+#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
+    return static_cast<bool>(val || rhs.val);
+#else
+    return cast_from_wrapper<float>(*this) || cast_from_wrapper<float>(rhs);
+#endif
+  }
+
+  // Comparison operators
+  KOKKOS_FUNCTION
+  bool operator==(floating_point_wrapper rhs) const {
+#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
+    return static_cast<bool>(val == rhs.val);
+#else
+    return cast_from_wrapper<float>(*this) == cast_from_wrapper<float>(rhs);
+#endif
+  }
+
+  KOKKOS_FUNCTION
+  bool operator!=(floating_point_wrapper rhs) const {
+#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
+    return static_cast<bool>(val != rhs.val);
+#else
+    return cast_from_wrapper<float>(*this) != cast_from_wrapper<float>(rhs);
+#endif
+  }
+
+  KOKKOS_FUNCTION
+  bool operator<(floating_point_wrapper rhs) const {
+#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
+    return static_cast<bool>(val < rhs.val);
+#else
+    return cast_from_wrapper<float>(*this) < cast_from_wrapper<float>(rhs);
+#endif
+  }
+
+  KOKKOS_FUNCTION
+  bool operator>(floating_point_wrapper rhs) const {
+#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
+    return static_cast<bool>(val > rhs.val);
+#else
+    return cast_from_wrapper<float>(*this) > cast_from_wrapper<float>(rhs);
+#endif
+  }
+
+  KOKKOS_FUNCTION
+  bool operator<=(floating_point_wrapper rhs) const {
+#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
+    return static_cast<bool>(val <= rhs.val);
+#else
+    return cast_from_wrapper<float>(*this) <= cast_from_wrapper<float>(rhs);
+#endif
+  }
+
+  KOKKOS_FUNCTION
+  bool operator>=(floating_point_wrapper rhs) const {
+#ifdef KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH
+    return static_cast<bool>(val >= rhs.val);
+#else
+    return cast_from_wrapper<float>(*this) >= cast_from_wrapper<float>(rhs);
+#endif
+  }
+
+  KOKKOS_FUNCTION
+  friend bool operator==(const volatile floating_point_wrapper& lhs,
+                         const volatile floating_point_wrapper& rhs) {
+    floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs;
+    return tmp_lhs == tmp_rhs;
+  }
+
+  KOKKOS_FUNCTION
+  friend bool operator!=(const volatile floating_point_wrapper& lhs,
+                         const volatile floating_point_wrapper& rhs) {
+    floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs;
+    return tmp_lhs != tmp_rhs;
+  }
+
+  KOKKOS_FUNCTION
+  friend bool operator<(const volatile floating_point_wrapper& lhs,
+                        const volatile floating_point_wrapper& rhs) {
+    floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs;
+    return tmp_lhs < tmp_rhs;
+  }
+
+  template <class T>
+  KOKKOS_FUNCTION friend std::enable_if_t<std::is_convertible_v<T, float> &&
+                                              (std::is_same_v<T, float> ||
+                                               std::is_same_v<T, double>),
+                                          bool>
+  operator<(floating_point_wrapper lhs, T rhs) {
+    return static_cast<float>(lhs) < rhs;
+  }
+
+  template <class T>
+  KOKKOS_FUNCTION friend std::enable_if_t<std::is_convertible_v<T, float> &&
+                                              (std::is_same_v<T, float> ||
+                                               std::is_same_v<T, double>),
+                                          bool>
+  operator<(T lhs, floating_point_wrapper rhs) {
+    return lhs < static_cast<float>(rhs);
+  }
+
+  KOKKOS_FUNCTION
+  friend bool operator>(const volatile floating_point_wrapper& lhs,
+                        const volatile floating_point_wrapper& rhs) {
+    floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs;
+    return tmp_lhs > tmp_rhs;
+  }
+
+  template <class T>
+  KOKKOS_FUNCTION friend std::enable_if_t<std::is_convertible_v<T, float> &&
+                                              (std::is_same_v<T, float> ||
+                                               std::is_same_v<T, double>),
+                                          bool>
+  operator>(floating_point_wrapper lhs, T rhs) {
+    return static_cast<float>(lhs) > rhs;
+  }
+
+  template <class T>
+  KOKKOS_FUNCTION friend std::enable_if_t<std::is_convertible_v<T, float> &&
+                                              (std::is_same_v<T, float> ||
+                                               std::is_same_v<T, double>),
+                                          bool>
+  operator>(T lhs, floating_point_wrapper rhs) {
+    return lhs > static_cast<float>(rhs);
+  }
+
+  KOKKOS_FUNCTION
+  friend bool operator<=(const volatile floating_point_wrapper& lhs,
+                         const volatile floating_point_wrapper& rhs) {
+    floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs;
+    return tmp_lhs <= tmp_rhs;
+  }
+
+  template <class T>
+  KOKKOS_FUNCTION friend std::enable_if_t<std::is_convertible_v<T, float> &&
+                                              (std::is_same_v<T, float> ||
+                                               std::is_same_v<T, double>),
+                                          bool>
+  operator<=(floating_point_wrapper lhs, T rhs) {
+    return static_cast<float>(lhs) <= rhs;
+  }
+
+  template <class T>
+  KOKKOS_FUNCTION friend std::enable_if_t<std::is_convertible_v<T, float> &&
+                                              (std::is_same_v<T, float> ||
+                                               std::is_same_v<T, double>),
+                                          bool>
+  operator<=(T lhs, floating_point_wrapper rhs) {
+    return lhs <= static_cast<float>(rhs);
+  }
+
+  KOKKOS_FUNCTION
+  friend bool operator>=(const volatile floating_point_wrapper& lhs,
+                         const volatile floating_point_wrapper& rhs) {
+    floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs;
+    return tmp_lhs >= tmp_rhs;
+  }
+
+  template <class T>
+  KOKKOS_FUNCTION friend std::enable_if_t<std::is_convertible_v<T, float> &&
+                                              (std::is_same_v<T, float> ||
+                                               std::is_same_v<T, double>),
+                                          bool>
+  operator>=(floating_point_wrapper lhs, T rhs) {
+    return static_cast<float>(lhs) >= rhs;
+  }
+
+  template <class T>
+  KOKKOS_FUNCTION friend std::enable_if_t<std::is_convertible_v<T, float> &&
+                                              (std::is_same_v<T, float> ||
+                                               std::is_same_v<T, double>),
+                                          bool>
+  operator>=(T lhs, floating_point_wrapper rhs) {
+    return lhs >= static_cast<float>(rhs);
+  }
+
+  // Insertion and extraction operators
+  friend std::ostream& operator<<(std::ostream& os,
+                                  const floating_point_wrapper& x) {
+    const std::string out = std::to_string(static_cast<double>(x));
+    os << out;
+    return os;
+  }
+
+  friend std::istream& operator>>(std::istream& is, floating_point_wrapper& x) {
+    std::string in;
+    is >> in;
+    x = std::stod(in);
+    return is;
+  }
+};
+}  // namespace Impl
+
+// Declare wrapper overloads now that floating_point_wrapper is declared
+template <class T>
+static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::half_t cast_to_wrapper(
+    T x, const volatile Kokkos::Impl::half_impl_t::type&) {
+  return Kokkos::Experimental::cast_to_half(x);
+}
+
+#ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED
+template <class T>
+static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::bhalf_t cast_to_wrapper(
+    T x, const volatile Kokkos::Impl::bhalf_impl_t::type&) {
+  return Kokkos::Experimental::cast_to_bhalf(x);
+}
+#endif  // KOKKOS_IMPL_BHALF_TYPE_DEFINED
+
+template <class T>
+static KOKKOS_INLINE_FUNCTION T
+cast_from_wrapper(const Kokkos::Experimental::half_t& x) {
+  return Kokkos::Experimental::cast_from_half<T>(x);
+}
+
+#ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED
+template <class T>
+static KOKKOS_INLINE_FUNCTION T
+cast_from_wrapper(const Kokkos::Experimental::bhalf_t& x) {
+  return Kokkos::Experimental::cast_from_bhalf<T>(x);
+}
+#endif  // KOKKOS_IMPL_BHALF_TYPE_DEFINED
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif  // KOKKOS_IMPL_HALF_TYPE_DEFINED
+
+// If none of the above actually did anything and defined a half precision type
+// define a fallback implementation here using float
+#ifndef KOKKOS_IMPL_HALF_TYPE_DEFINED
+#define KOKKOS_IMPL_HALF_TYPE_DEFINED
+#define KOKKOS_HALF_T_IS_FLOAT true
+namespace Kokkos {
+namespace Impl {
+struct half_impl_t {
+  using type = float;
+};
+}  // namespace Impl
+namespace Experimental {
+
+using half_t = Kokkos::Impl::half_impl_t::type;
+
+// cast_to_half
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(float val) { return half_t(val); }
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(bool val) { return half_t(val); }
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(double val) { return half_t(val); }
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(short val) { return half_t(val); }
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(unsigned short val) { return half_t(val); }
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(int val) { return half_t(val); }
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(unsigned int val) { return half_t(val); }
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(long val) { return half_t(val); }
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(unsigned long val) { return half_t(val); }
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(long long val) { return half_t(val); }
+KOKKOS_INLINE_FUNCTION
+half_t cast_to_half(unsigned long long val) { return half_t(val); }
+
+// cast_from_half
+// Using an explicit list here too, since the other ones are explicit and for
+// example don't include char
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<
+    std::is_same<T, float>::value || std::is_same<T, bool>::value ||
+        std::is_same<T, double>::value || std::is_same<T, short>::value ||
+        std::is_same<T, unsigned short>::value || std::is_same<T, int>::value ||
+        std::is_same<T, unsigned int>::value || std::is_same<T, long>::value ||
+        std::is_same<T, unsigned long>::value ||
+        std::is_same<T, long long>::value ||
+        std::is_same<T, unsigned long long>::value,
+    T>
+cast_from_half(half_t val) {
+  return T(val);
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#else
+#define KOKKOS_HALF_T_IS_FLOAT false
+#endif  // KOKKOS_IMPL_HALF_TYPE_DEFINED
+
+#ifndef KOKKOS_IMPL_BHALF_TYPE_DEFINED
+#define KOKKOS_IMPL_BHALF_TYPE_DEFINED
+#define KOKKOS_BHALF_T_IS_FLOAT true
+namespace Kokkos {
+namespace Impl {
+struct bhalf_impl_t {
+  using type = float;
+};
+}  // namespace Impl
+
+namespace Experimental {
+
+using bhalf_t = Kokkos::Impl::bhalf_impl_t::type;
+
+// cast_to_bhalf
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(float val) { return bhalf_t(val); }
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(bool val) { return bhalf_t(val); }
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(double val) { return bhalf_t(val); }
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(short val) { return bhalf_t(val); }
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(unsigned short val) { return bhalf_t(val); }
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(int val) { return bhalf_t(val); }
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(unsigned int val) { return bhalf_t(val); }
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(long val) { return bhalf_t(val); }
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(unsigned long val) { return bhalf_t(val); }
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(long long val) { return bhalf_t(val); }
+KOKKOS_INLINE_FUNCTION
+bhalf_t cast_to_bhalf(unsigned long long val) { return bhalf_t(val); }
+
+// cast_from_bhalf
+template <class T>
+KOKKOS_INLINE_FUNCTION std::enable_if_t<
+    std::is_same<T, float>::value || std::is_same<T, bool>::value ||
+        std::is_same<T, double>::value || std::is_same<T, short>::value ||
+        std::is_same<T, unsigned short>::value || std::is_same<T, int>::value ||
+        std::is_same<T, unsigned int>::value || std::is_same<T, long>::value ||
+        std::is_same<T, unsigned long>::value ||
+        std::is_same<T, long long>::value ||
+        std::is_same<T, unsigned long long>::value,
+    T>
+cast_from_bhalf(bhalf_t val) {
+  return T(val);
+}
+}  // namespace Experimental
+}  // namespace Kokkos
+#else
+#define KOKKOS_BHALF_T_IS_FLOAT false
+#endif  // KOKKOS_IMPL_BHALF_TYPE_DEFINED
+
+#endif  // KOKKOS_HALF_FLOATING_POINT_WRAPPER_HPP_
diff --git a/packages/kokkos/core/src/impl/Kokkos_Half_MathematicalFunctions.hpp b/packages/kokkos/core/src/impl/Kokkos_Half_MathematicalFunctions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e6a5cadc67cbab0356d8253b88c2b94048ece9fe
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Half_MathematicalFunctions.hpp
@@ -0,0 +1,259 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_HALF_MATHEMATICAL_FUNCTIONS_HPP_
+#define KOKKOS_HALF_MATHEMATICAL_FUNCTIONS_HPP_
+
+#include <Kokkos_MathematicalFunctions.hpp>  // For the float overloads
+#include <Kokkos_BitManipulation.hpp>        // bit_cast
+
+// clang-format off
+namespace Kokkos {
+// BEGIN macro definitions
+#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
+  #define KOKKOS_IMPL_MATH_H_FUNC_WRAPPER(MACRO, FUNC) \
+    MACRO(FUNC, Kokkos::Experimental::half_t)
+#else
+  #define KOKKOS_IMPL_MATH_H_FUNC_WRAPPER(MACRO, FUNC)
+#endif
+
+#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT
+  #define KOKKOS_IMPL_MATH_B_FUNC_WRAPPER(MACRO, FUNC) \
+    MACRO(FUNC, Kokkos::Experimental::bhalf_t)
+#else
+  #define KOKKOS_IMPL_MATH_B_FUNC_WRAPPER(MACRO, FUNC)
+#endif
+
+#define KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(MACRO, FUNC) \
+  KOKKOS_IMPL_MATH_H_FUNC_WRAPPER(MACRO, FUNC)          \
+  KOKKOS_IMPL_MATH_B_FUNC_WRAPPER(MACRO, FUNC)
+
+
+#define KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE(FUNC, HALF_TYPE)      \
+  KOKKOS_INLINE_FUNCTION HALF_TYPE FUNC(HALF_TYPE x) {                  \
+    return static_cast<HALF_TYPE>(Kokkos::FUNC(static_cast<float>(x))); \
+  }
+
+#define KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF_MIXED(FUNC, HALF_TYPE, MIXED_TYPE) \
+  KOKKOS_INLINE_FUNCTION double FUNC(HALF_TYPE x, MIXED_TYPE y) {  \
+    return Kokkos::FUNC(static_cast<double>(x), static_cast<double>(y)); \
+  } \
+  KOKKOS_INLINE_FUNCTION double FUNC(MIXED_TYPE x, HALF_TYPE y) {  \
+    return Kokkos::FUNC(static_cast<double>(x), static_cast<double>(y)); \
+  }
+
+#define KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF(FUNC, HALF_TYPE)       \
+  KOKKOS_INLINE_FUNCTION HALF_TYPE FUNC(HALF_TYPE x, HALF_TYPE y) {  \
+    return static_cast<HALF_TYPE>(                                   \
+        Kokkos::FUNC(static_cast<float>(x), static_cast<float>(y))); \
+  } \
+  KOKKOS_INLINE_FUNCTION float FUNC(float x, HALF_TYPE y) {  \
+    return Kokkos::FUNC(static_cast<float>(x), static_cast<float>(y)); \
+  } \
+  KOKKOS_INLINE_FUNCTION float FUNC(HALF_TYPE x, float y) {  \
+    return Kokkos::FUNC(static_cast<float>(x), static_cast<float>(y)); \
+  } \
+  KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF_MIXED(FUNC, HALF_TYPE, double) \
+  KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF_MIXED(FUNC, HALF_TYPE, short) \
+  KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF_MIXED(FUNC, HALF_TYPE, unsigned short) \
+  KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF_MIXED(FUNC, HALF_TYPE, int) \
+  KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF_MIXED(FUNC, HALF_TYPE, unsigned int) \
+  KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF_MIXED(FUNC, HALF_TYPE, long) \
+  KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF_MIXED(FUNC, HALF_TYPE, unsigned long) \
+  KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF_MIXED(FUNC, HALF_TYPE, long long) \
+  KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF_MIXED(FUNC, HALF_TYPE, unsigned long long)
+
+
+#define KOKKOS_IMPL_MATH_UNARY_PREDICATE_HALF(FUNC, HALF_TYPE) \
+  KOKKOS_INLINE_FUNCTION bool FUNC(HALF_TYPE x) {              \
+    return Kokkos::FUNC(static_cast<float>(x));                \
+  }
+
+// END macros definitions
+
+
+// Basic operations
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, abs)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, fabs)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF, fmod)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF, remainder)
+// remquo
+// fma
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF, fmax)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF, fmin)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF, fdim)
+// nanq
+// Exponential functions
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, exp)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, exp2)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, expm1)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, log)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, log10)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, log2)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, log1p)
+// Power functions
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF, pow)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, sqrt)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, cbrt)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF, hypot)
+// Trigonometric functions
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, sin)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, cos)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, tan)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, asin)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, acos)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, atan)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF, atan2)
+// Hyperbolic functions
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, sinh)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, cosh)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, tanh)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, asinh)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, acosh)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, atanh)
+// Error and gamma functions
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, erf)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, erfc)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, tgamma)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, lgamma)
+// Nearest integer floating point functions
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, ceil)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, floor)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, trunc)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, round)
+// lround
+// llround
+// FIXME_SYCL not available as of current SYCL 2020 specification (revision 4)
+#ifndef KOKKOS_ENABLE_SYCL // FIXME_SYCL
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, nearbyint)
+#endif
+// rint
+// lrint
+// llrint
+// Floating point manipulation functions
+// frexp
+// ldexp
+// modf
+// scalbn
+// scalbln
+// ilog
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, logb)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF, nextafter)
+// nexttoward
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF, copysign)
+// Classification and comparison functions
+// fpclassify
+
+#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
+KOKKOS_INLINE_FUNCTION bool isfinite(Kokkos::Experimental::half_t x) {
+  using bit_type = Kokkos::Experimental::half_t::bit_comparison_type;
+  constexpr bit_type exponent_mask = Kokkos::Experimental::Impl::exponent_mask<Kokkos::Experimental::half_t>;
+  const bit_type bit_pattern_x = bit_cast<bit_type>(
+      static_cast<Kokkos::Experimental::half_t::impl_type>(x));
+  return (bit_pattern_x.value & exponent_mask.value) != exponent_mask.value;
+}
+#endif
+
+#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT
+KOKKOS_INLINE_FUNCTION bool isfinite(Kokkos::Experimental::bhalf_t x) {
+  using bit_type = Kokkos::Experimental::bhalf_t::bit_comparison_type;
+  constexpr bit_type exponent_mask = Kokkos::Experimental::Impl::exponent_mask<Kokkos::Experimental::bhalf_t>;
+  const bit_type bit_pattern_x = bit_cast<bit_type>(
+      static_cast<Kokkos::Experimental::bhalf_t::impl_type>(x));
+  return (bit_pattern_x.value & exponent_mask.value) != exponent_mask.value;
+}
+#endif
+
+#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
+KOKKOS_INLINE_FUNCTION bool isinf(Kokkos::Experimental::half_t x) {
+  using bit_type = Kokkos::Experimental::half_t::bit_comparison_type;
+  constexpr bit_type exponent_mask = Kokkos::Experimental::Impl::exponent_mask<Kokkos::Experimental::half_t>;
+  constexpr bit_type fraction_mask = Kokkos::Experimental::Impl::fraction_mask<Kokkos::Experimental::half_t>;
+  const bit_type bit_pattern_x = bit_cast<bit_type>(
+      static_cast<Kokkos::Experimental::half_t::impl_type>(x));
+  return (
+      ((bit_pattern_x.value & exponent_mask.value) == exponent_mask.value) &&
+      ((bit_pattern_x.value & fraction_mask.value) == 0));
+}
+#endif
+
+#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT
+KOKKOS_INLINE_FUNCTION bool isinf(Kokkos::Experimental::bhalf_t x) {
+  using bit_type = Kokkos::Experimental::bhalf_t::bit_comparison_type;
+  constexpr bit_type exponent_mask = Kokkos::Experimental::Impl::exponent_mask<Kokkos::Experimental::bhalf_t>;
+  constexpr bit_type fraction_mask = Kokkos::Experimental::Impl::fraction_mask<Kokkos::Experimental::bhalf_t>;
+  const bit_type bit_pattern_x = bit_cast<bit_type>(
+      static_cast<Kokkos::Experimental::bhalf_t::impl_type>(x));
+  return (
+      ((bit_pattern_x.value & exponent_mask.value) == exponent_mask.value) &&
+      ((bit_pattern_x.value & fraction_mask.value) == 0));
+}
+#endif
+
+#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
+KOKKOS_INLINE_FUNCTION bool isnan(Kokkos::Experimental::half_t x) {
+  using bit_type = Kokkos::Experimental::half_t::bit_comparison_type;
+  constexpr bit_type exponent_mask = Kokkos::Experimental::Impl::exponent_mask<Kokkos::Experimental::half_t>;
+  constexpr bit_type fraction_mask = Kokkos::Experimental::Impl::fraction_mask<Kokkos::Experimental::half_t>;
+  const bit_type bit_pattern_x = bit_cast<bit_type>(
+      static_cast<Kokkos::Experimental::half_t::impl_type>(x));
+  return (
+      ((bit_pattern_x.value & exponent_mask.value) == exponent_mask.value) &&
+      ((bit_pattern_x.value & fraction_mask.value) != 0));
+}
+#endif
+
+#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT
+KOKKOS_INLINE_FUNCTION bool isnan(Kokkos::Experimental::bhalf_t x) {
+  using bit_type = Kokkos::Experimental::bhalf_t::bit_comparison_type;
+  constexpr bit_type exponent_mask = Kokkos::Experimental::Impl::exponent_mask<Kokkos::Experimental::bhalf_t>;
+  constexpr bit_type fraction_mask = Kokkos::Experimental::Impl::fraction_mask<Kokkos::Experimental::bhalf_t>;
+  const bit_type bit_pattern_x = bit_cast<bit_type>(
+      static_cast<Kokkos::Experimental::bhalf_t::impl_type>(x));
+  return (
+      ((bit_pattern_x.value & exponent_mask.value) == exponent_mask.value) &&
+      ((bit_pattern_x.value & fraction_mask.value) != 0));
+}
+#endif
+// isnormal
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_PREDICATE_HALF, signbit)
+// isgreater
+// isgreaterequal
+// isless
+// islessequal
+// islessgreater
+// isunordered
+// Complex number functions
+#define KOKKOS_IMPL_MATH_COMPLEX_REAL_HALF(FUNC, HALF_TYPE) \
+  KOKKOS_INLINE_FUNCTION HALF_TYPE FUNC(HALF_TYPE x) { return x; }
+
+#define KOKKOS_IMPL_MATH_COMPLEX_IMAG_HALF(FUNC, HALF_TYPE) \
+  KOKKOS_INLINE_FUNCTION HALF_TYPE FUNC(HALF_TYPE) { return 0; }
+
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_COMPLEX_REAL_HALF, real)
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_COMPLEX_IMAG_HALF, imag)
+
+#undef KOKKOS_IMPL_MATH_COMPLEX_REAL_HALF
+#undef KOKKOS_IMPL_MATH_COMPLEX_IMAG_HALF
+#undef KOKKOS_IMPL_MATH_UNARY_PREDICATE_HALF
+#undef KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF
+#undef KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE
+#undef KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER
+#undef KOKKOS_IMPL_MATH_B_FUNC_WRAPPER
+#undef KOKKOS_IMPL_MATH_H_FUNC_WRAPPER
+}  // namespace Kokkos
+// clang-format on
+#endif  // KOKKOS_HALF_MATHEMATICAL_FUNCTIONS_HPP_
diff --git a/packages/kokkos/core/src/impl/Kokkos_Half_NumericTraits.hpp b/packages/kokkos/core/src/impl/Kokkos_Half_NumericTraits.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4779c2a6e109af102a7474af82d2ffba38f4d1f6
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Half_NumericTraits.hpp
@@ -0,0 +1,349 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_HALF_NUMERIC_TRAITS_HPP_
+#define KOKKOS_HALF_NUMERIC_TRAITS_HPP_
+
+#include <Kokkos_NumericTraits.hpp>
+
+////////////// BEGIN HALF_T (binary16) limits //////////////
+// clang-format off
+// '\brief:' below are from the libc definitions for float and double:
+// https://www.gnu.org/software/libc/manual/html_node/Floating-Point-Parameters.html
+//
+// The arithmetic encoding and equations below are derived from:
+// Ref1: https://en.wikipedia.org/wiki/Single-precision_floating-point_format
+// Ref2: https://en.wikipedia.org/wiki/Exponent_bias
+// Ref3; https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html
+//
+// Some background on the magic numbers 2**10=1024 and 2**15=32768 used below:
+//
+// IMPORTANT: For IEEE754 encodings, see Ref1.
+//
+// For binary16, we have B = 2 and p = 16 with 2**16 possible significands.
+// The binary16 format is: [s  e  e  e  e  e  f f f f f f f f f f]
+//              bit index:  15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+// s: signed bit (1 bit)
+// e: exponent bits (5 bits)
+// f: fractional bits (10 bits)
+//
+// E_bias      = 2**(n_exponent_bits - 1) - 1 = 2**(5 - 1) - 1 = 15
+// E_subnormal = 00000 (base2)
+// E_infinity  = 11111 (base2)
+// E_min       = 1 - E_bias = 1 - 15
+// E_max       = 2**5 - 1 - E_bias = 2**5 - 1 - 15 = 16
+//
+// 2**10=1024 is the smallest denominator that is representable in binary16:
+// [s  e  e  e  e  e  f f f f f f f f f f]
+// [0  0  0  0  0  0  0 0 0 0 0 0 0 0 0 1]
+// which is: 1 / 2**-10
+//
+//
+// 2**15 is the largest exponent factor representable in binary16, for example the
+// largest integer value representable in binary16 is:
+// [s  e  e  e  e  e  f f f f f f f f f f]
+// [0  1  1  1  1  0  1 1 1 1 1 1 1 1 1 1]
+// which is: 2**(2**4 + 2**3 + 2**2 + 2**1 - 15) * (1 + 2**-10 + 2**-9 + 2**-8 + 2**-7 + 2**-6 + 2**-5 + 2**-4 + 2**-3 + 2**-2 + 2**-1)) =
+//           2**15 * (1 + 0.9990234375) =
+//           65504.0
+//
+#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
+/// \brief: Infinity
+///
+/// Binary16 encoding:
+///             [s  e  e  e  e  e  f f f f f f f f f f]
+///             [0  1  1  1  1  1  0 0 0 0 0 0 0 0 0 0]
+/// bit index:   15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+///
+template <>
+struct Kokkos::Experimental::Impl::infinity_helper<Kokkos::Experimental::half_t> {
+  static constexpr Kokkos::Experimental::half_t::bit_comparison_type value{0b0'11111'0000000000};
+};
+
+/// \brief: Minimum normalized number
+///
+/// Stdc defines this as the smallest number (representable in binary16).
+///
+/// Binary16 encoding:
+///             [s  e  e  e  e  e  f f f f f f f f f f]
+///             [1  1  1  1  1  0  1 1 1 1 1 1 1 1 1 1]
+/// bit index:   15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+///
+/// and in base10: -1 * 2**(2**4 + 2**3 + 2**2 + 2**1 - 15) * (1 + 2**-10 + 2**-9 + 2**-8 + 2**-7 + 2**-6 + 2**-5 + 2**-4 + 2**-3 + 2**-2 + 2**-1)
+///              = -2**15 * (1 + (2**10 - 1) / 2**10)
+template <>
+struct Kokkos::Experimental::Impl::finite_min_helper<
+    Kokkos::Experimental::half_t> {
+  static constexpr Kokkos::Experimental::half_t::bit_comparison_type value{0b1'11110'1111111111}; // -65504
+};
+
+/// \brief: Maximum normalized number
+///
+/// Stdc defines this as the maximum number (representable in binary16).
+///
+/// Binary16 encoding:
+///             [s  e  e  e  e  e  f f f f f f f f f f]
+///             [0  1  1  1  1  0  1 1 1 1 1 1 1 1 1 1]
+/// bit index:   15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+///
+/// and in base10: 1 * 2**(2**4 + 2**3 + 2**2 + 2**1 - 15) * (1 + 2**-10 + 2**-9 + 2**-8 + 2**-7 + 2**-6 + 2**-5 + 2**-4 + 2**-3 + 2**-2 + 2**-1)
+///              = 2**15 * (1 + (2**10 - 1) / 2**10)
+template <>
+struct Kokkos::Experimental::Impl::finite_max_helper<
+    Kokkos::Experimental::half_t> {
+  static constexpr Kokkos::Experimental::half_t::bit_comparison_type value{0b0'11110'1111111111}; // +65504
+};
+
+/// \brief: This is the difference between 1 and the smallest floating point
+///         number of type binary16 that is greater than 1
+///
+/// Smallest number in binary16 that is greater than 1 encoding:
+///             [s  e  e  e  e  e  f f f f f f f f f f]
+///             [0  0  1  1  1  1  0 0 0 0 0 0 0 0 0 1]
+/// bit index:   15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+///
+/// and in base10: 1 * 2**(2**3 + 2**2 + 2**1 + 2**0 - 15) * (1 + 2**-10)
+///                = 2**0 * (1 + 2**-10)
+///                = 1.0009765625
+///
+/// Lastly, 1 - 1.0009765625 = 0.0009765625.
+template <>
+struct Kokkos::Experimental::Impl::epsilon_helper<
+    Kokkos::Experimental::half_t> {
+  static constexpr Kokkos::Experimental::half_t::bit_comparison_type value{0b0'00101'0000000000}; // 0.0009765625
+};
+
+/// @brief: The largest possible rounding error in ULPs
+///
+/// This simply uses the maximum rounding error.
+///
+/// Reference: https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html#689
+template <>
+struct Kokkos::Experimental::Impl::round_error_helper<
+    Kokkos::Experimental::half_t> {
+  static constexpr Kokkos::Experimental::half_t::bit_comparison_type value{0b0'01110'0000000000}; // 0.5
+};
+
+/// \brief: Minimum normalized positive half precision number
+///
+/// Stdc defines this as the minimum normalized positive floating
+/// point number that is representable in type binary16
+///
+/// Smallest number in binary16 that is greater than 1 encoding:
+///             [s  e  e  e  e  e  f f f f f f f f f f]
+///             [0  0  0  0  0  1  0 0 0 0 0 0 0 0 0 0]
+/// bit index:   15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+///
+/// and in base10: 1 * 2**(2**0 - 15) * (1)
+///                = 2**-14
+template <>
+struct Kokkos::Experimental::Impl::norm_min_helper<
+    Kokkos::Experimental::half_t> {
+  static constexpr Kokkos::Experimental::half_t::bit_comparison_type value{0b0'00001'0000000000}; // 0.00006103515625
+};
+
+/// \brief: Quiet not a half precision number
+///
+/// IEEE 754 defines this as all exponent bits and the first fraction bit high.
+///
+/// Quiet NaN in binary16:
+///             [s  e  e  e  e  e  f f f f f f f f f f]
+///             [0  1  1  1  1  1  1 0 0 0 0 0 0 0 0 0]
+/// bit index:   15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+template <>
+struct Kokkos::Experimental::Impl::quiet_NaN_helper<
+    Kokkos::Experimental::half_t> {
+  static constexpr Kokkos::Experimental::half_t::bit_comparison_type value{0b0'11111'1000000000};
+};
+
+/// \brief: Signaling not a half precision number
+///
+/// IEEE 754 defines this as all exponent bits and the second fraction bit high.
+///
+/// Quiet NaN in binary16:
+///             [s  e  e  e  e  e  f f f f f f f f f f]
+///             [0  1  1  1  1  1  0 1 0 0 0 0 0 0 0 0]
+/// bit index:   15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+template <>
+struct Kokkos::Experimental::Impl::signaling_NaN_helper<
+    Kokkos::Experimental::half_t> {
+  static constexpr Kokkos::Experimental::half_t::bit_comparison_type value{0b0'11111'0100000000};
+};
+
+/// \brief: Number of digits in the matissa that can be represented
+///         without losing precision.
+///
+/// Stdc defines this as the number of base-RADIX digits in the floating point mantissa for the binary16 data type.
+///
+/// In binary16, we have 10 fractional bits plus the implicit leading 1.
+template <>
+struct Kokkos::Experimental::Impl::digits_helper<Kokkos::Experimental::half_t> {
+  static constexpr int value = 11;
+};
+
+/// \brief: "The number of base-10 digits that can be represented by the type T without change"
+/// Reference: https://en.cppreference.com/w/cpp/types/numeric_limits/digits10.
+///
+/// "For base-radix types, it is the value of digits() (digits - 1 for floating-point types) multiplied by log10(radix) and rounded down."
+/// Reference: https://en.cppreference.com/w/cpp/types/numeric_limits/digits10.
+///
+/// This is: floor(11 - 1 * log10(2))
+template <>
+struct Kokkos::Experimental::Impl::digits10_helper<
+    Kokkos::Experimental::half_t> {
+  static constexpr int value = 3;
+};
+
+/// \brief: Value of the base of the exponent representation.
+///
+/// Stdc defined this as the value of the base, or radix, of the exponent representation.
+template <>
+struct Kokkos::Experimental::Impl::radix_helper<Kokkos::Experimental::half_t> {
+  static constexpr int value = 2;
+};
+
+/// \brief: This is the smallest possible exponent value
+///
+/// Stdc defines this as the smallest possible exponent value for type binary16. 
+/// More precisely, it is the minimum negative integer such that the value min_exponent_helper
+/// raised to this power minus 1 can be represented as a normalized floating point number of type float.
+///
+/// In binary16:
+///             [s  e  e  e  e  e  f f f f f f f f f f]
+///             [0  0  0  0  0  1  0 0 0 0 0 0 0 0 0 0]
+/// bit index:   15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+/// 
+/// and in base10: 1 * 2**(2**0 - 15) * (1 + 0)
+///                = 2**-14
+/// 
+/// with a bias of one from (C11 5.2.4.2.2), gives -13;
+template <>
+struct Kokkos::Experimental::Impl::min_exponent_helper<
+    Kokkos::Experimental::half_t> {
+  static constexpr int value = -13;
+};
+
+/// \brief: This is the largest possible exponent value
+///
+/// In binary16:
+///             [s  e  e  e  e  e  f f f f f f f f f f]
+///             [0  1  1  1  1  0  0 0 0 0 0 0 0 0 0 0]
+/// bit index:   15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+/// 
+/// and in base10: 1 * 2**(2**4 + 2**3 + 2**2 + 2**1 - 15) * (1 + 0)
+///                = 2**(30 - 15)
+///                = 2**15
+/// 
+/// with a bias of one from (C11 5.2.4.2.2), gives 16;
+template <>
+struct Kokkos::Experimental::Impl::max_exponent_helper<
+    Kokkos::Experimental::half_t> {
+  static constexpr int value = 16;
+};
+#endif
+////////////// END HALF_T (binary16) limits //////////////
+
+////////////// BEGIN BHALF_T (bfloat16) limits //////////////
+#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT
+/// \brief: Infinity
+///
+/// Bfloat16 encoding:
+///             [s  e  e  e  e  e  e e e f f f f f f f]
+///             [0  1  1  1  1  1  1 1 1 0 0 0 0 0 0 0]
+/// bit index:   15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+///
+template <>
+struct Kokkos::Experimental::Impl::infinity_helper<Kokkos::Experimental::bhalf_t> {
+  static constexpr Kokkos::Experimental::bhalf_t::bit_comparison_type value{0b0'11111111'0000000};
+};
+
+// Minimum normalized number
+template <>
+struct Kokkos::Experimental::Impl::finite_min_helper<
+    Kokkos::Experimental::bhalf_t> {
+  static constexpr Kokkos::Experimental::bhalf_t::bit_comparison_type value{0b1'11111110'1111111}; // -3.38953139e38
+};
+// Maximum normalized number
+template <>
+struct Kokkos::Experimental::Impl::finite_max_helper<
+    Kokkos::Experimental::bhalf_t> {
+  static constexpr Kokkos::Experimental::bhalf_t::bit_comparison_type value{0b0'11111110'1111111}; // +3.38953139e3
+};
+// 1/2^7
+template <>
+struct Kokkos::Experimental::Impl::epsilon_helper<
+    Kokkos::Experimental::bhalf_t> {
+  static constexpr Kokkos::Experimental::bhalf_t::bit_comparison_type value{0b0'01111000'0000000}; // 0.0078125
+};
+template <>
+struct Kokkos::Experimental::Impl::round_error_helper<
+    Kokkos::Experimental::bhalf_t> {
+  static constexpr Kokkos::Experimental::bhalf_t::bit_comparison_type value{0b0'01111110'0000000}; // 0.5
+};
+// Minimum normalized positive bhalf number
+template <>
+struct Kokkos::Experimental::Impl::norm_min_helper<
+    Kokkos::Experimental::bhalf_t> {
+  static constexpr Kokkos::Experimental::bhalf_t::bit_comparison_type value{0b0'00000001'0000000}; // 1.175494351e-38
+};
+// Quiet not a bhalf number
+template <>
+struct Kokkos::Experimental::Impl::quiet_NaN_helper<
+    Kokkos::Experimental::bhalf_t> {
+  static constexpr Kokkos::Experimental::bhalf_t::bit_comparison_type value{0b0'11111111'1000000};
+};
+// Signaling not a bhalf number
+template <>
+struct Kokkos::Experimental::Impl::signaling_NaN_helper<
+    Kokkos::Experimental::bhalf_t> {
+  static constexpr Kokkos::Experimental::bhalf_t::bit_comparison_type value{0b0'11111111'0100000};
+};
+// Number of digits in the matissa that can be represented
+// without losing precision.
+template <>
+struct Kokkos::Experimental::Impl::digits_helper<
+    Kokkos::Experimental::bhalf_t> {
+  static constexpr int value = 2;
+};
+// 7 - 1 * log10(2)
+template <>
+struct Kokkos::Experimental::Impl::digits10_helper<
+    Kokkos::Experimental::bhalf_t> {
+  static constexpr int value = 1;
+};
+// Value of the base of the exponent representation.
+template <>
+struct Kokkos::Experimental::Impl::radix_helper<Kokkos::Experimental::bhalf_t> {
+  static constexpr int value = 2;
+};
+// This is the smallest possible exponent value
+// with a bias of one (C11 5.2.4.2.2).
+template <>
+struct Kokkos::Experimental::Impl::min_exponent_helper<
+    Kokkos::Experimental::bhalf_t> {
+  static constexpr int value = -125;
+};
+// This is the largest possible exponent value
+// with a bias of one (C11 5.2.4.2.2).
+template <>
+struct Kokkos::Experimental::Impl::max_exponent_helper<
+    Kokkos::Experimental::bhalf_t> {
+  static constexpr int value = 128;
+};
+#endif
+////////////// END BHALF_T (bfloat16) limits //////////
+
+#endif  // KOKKOS_HALF_NUMERIC_TRAITS_HPP_
diff --git a/packages/kokkos/core/src/impl/Kokkos_HostSpace.cpp b/packages/kokkos/core/src/impl/Kokkos_HostSpace.cpp
index b47ce3beecd797dcfab7af1a075dd1fde08896ca..a9d72160593741b5334134fc9af847a55a9e5e8f 100644
--- a/packages/kokkos/core/src/impl/Kokkos_HostSpace.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_HostSpace.cpp
@@ -26,7 +26,8 @@
 
 /*--------------------------------------------------------------------------*/
 
-#if defined(KOKKOS_COMPILER_INTEL) && !defined(KOKKOS_ENABLE_CUDA)
+#if (defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM)) && \
+    !defined(KOKKOS_ENABLE_CUDA)
 
 // Intel specialized allocator does not interoperate with CUDA memory allocation
 
@@ -45,6 +46,10 @@
 #include <sstream>
 #include <cstring>
 
+#ifdef KOKKOS_COMPILER_INTEL
+#include <aligned_new>
+#endif
+
 #include <Kokkos_HostSpace.hpp>
 #include <impl/Kokkos_Error.hpp>
 #include <Kokkos_Atomic.hpp>
@@ -54,41 +59,10 @@
 
 namespace Kokkos {
 
-/* Default allocation mechanism */
-HostSpace::HostSpace()
-    : m_alloc_mech(
-#if defined(KOKKOS_ENABLE_INTEL_MM_ALLOC)
-          HostSpace::INTEL_MM_ALLOC
-#else
-          HostSpace::STD_MALLOC
-#endif
-      ) {
-}
-
-/* Default allocation mechanism */
-HostSpace::HostSpace(const HostSpace::AllocationMechanism &arg_alloc_mech)
-    : m_alloc_mech(HostSpace::STD_MALLOC) {
-  if (arg_alloc_mech == STD_MALLOC) {
-    m_alloc_mech = HostSpace::STD_MALLOC;
-  }
-#if defined(KOKKOS_ENABLE_INTEL_MM_ALLOC)
-  else if (arg_alloc_mech == HostSpace::INTEL_MM_ALLOC) {
-    m_alloc_mech = HostSpace::INTEL_MM_ALLOC;
-  }
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
+KOKKOS_DEPRECATED HostSpace::HostSpace(const HostSpace::AllocationMechanism &)
+    : HostSpace() {}
 #endif
-  else {
-    const char *const mech =
-        (arg_alloc_mech == HostSpace::INTEL_MM_ALLOC)
-            ? "INTEL_MM_ALLOC"
-            : ((arg_alloc_mech == HostSpace::POSIX_MMAP) ? "POSIX_MMAP" : "");
-
-    std::string msg;
-    msg.append("Kokkos::HostSpace ");
-    msg.append(mech);
-    msg.append(" is not available");
-    Kokkos::Impl::throw_runtime_exception(msg);
-  }
-}
 
 void *HostSpace::allocate(const size_t arg_alloc_size) const {
   return allocate("[unlabeled]", arg_alloc_size);
@@ -117,33 +91,9 @@ void *HostSpace::impl_allocate(
 
   void *ptr = nullptr;
 
-  if (arg_alloc_size) {
-    if (m_alloc_mech == STD_MALLOC) {
-      // Over-allocate to and round up to guarantee proper alignment.
-      size_t size_padded = arg_alloc_size + sizeof(void *) + alignment;
-
-      void *alloc_ptr = malloc(size_padded);
-
-      if (alloc_ptr) {
-        auto address = reinterpret_cast<uintptr_t>(alloc_ptr);
-
-        // offset enough to record the alloc_ptr
-        address += sizeof(void *);
-        uintptr_t rem    = address % alignment;
-        uintptr_t offset = rem ? (alignment - rem) : 0u;
-        address += offset;
-        ptr = reinterpret_cast<void *>(address);
-        // record the alloc'd pointer
-        address -= sizeof(void *);
-        *reinterpret_cast<void **>(address) = alloc_ptr;
-      }
-    }
-#if defined(KOKKOS_ENABLE_INTEL_MM_ALLOC)
-    else if (m_alloc_mech == INTEL_MM_ALLOC) {
-      ptr = _mm_malloc(arg_alloc_size, alignment);
-    }
-#endif
-  }
+  if (arg_alloc_size)
+    ptr = operator new (arg_alloc_size, std::align_val_t(alignment),
+                        std::nothrow_t{});
 
   if ((ptr == nullptr) || (reinterpret_cast<uintptr_t>(ptr) == ~uintptr_t(0)) ||
       (reinterpret_cast<uintptr_t>(ptr) & alignment_mask)) {
@@ -158,21 +108,6 @@ void *HostSpace::impl_allocate(
     Experimental::RawMemoryAllocationFailure::AllocationMechanism alloc_mec =
         Experimental::RawMemoryAllocationFailure::AllocationMechanism::
             StdMalloc;
-    switch (m_alloc_mech) {
-      case STD_MALLOC: break;  // default
-      case POSIX_MEMALIGN:
-        alloc_mec = Experimental::RawMemoryAllocationFailure::
-            AllocationMechanism::PosixMemAlign;
-        break;
-      case POSIX_MMAP:
-        alloc_mec = Experimental::RawMemoryAllocationFailure::
-            AllocationMechanism::PosixMMap;
-        break;
-      case INTEL_MM_ALLOC:
-        alloc_mec = Experimental::RawMemoryAllocationFailure::
-            AllocationMechanism::IntelMMAlloc;
-        break;
-    }
 
     throw Kokkos::Experimental::RawMemoryAllocationFailure(
         arg_alloc_size, alignment, failure_mode, alloc_mec);
@@ -207,15 +142,9 @@ void HostSpace::impl_deallocate(
       Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr,
                                         reported_size);
     }
-    if (m_alloc_mech == STD_MALLOC) {
-      void *alloc_ptr = *(reinterpret_cast<void **>(arg_alloc_ptr) - 1);
-      free(alloc_ptr);
-    }
-#if defined(KOKKOS_ENABLE_INTEL_MM_ALLOC)
-    else if (m_alloc_mech == INTEL_MM_ALLOC) {
-      _mm_free(arg_alloc_ptr);
-    }
-#endif
+    constexpr uintptr_t alignment = Kokkos::Impl::MEMORY_ALIGNMENT;
+    operator delete (arg_alloc_ptr, std::align_val_t(alignment),
+                     std::nothrow_t{});
   }
 }
 
@@ -284,42 +213,6 @@ SharedAllocationRecord<Kokkos::HostSpace, void>::SharedAllocationRecord(
 }  // namespace Impl
 }  // namespace Kokkos
 
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-namespace {
-const unsigned HOST_SPACE_ATOMIC_MASK     = 0xFFFF;
-const unsigned HOST_SPACE_ATOMIC_XOR_MASK = 0x5A39;
-static int HOST_SPACE_ATOMIC_LOCKS[HOST_SPACE_ATOMIC_MASK + 1];
-}  // namespace
-
-namespace Impl {
-void init_lock_array_host_space() {
-  static int is_initialized = 0;
-  if (!is_initialized)
-    for (int i = 0; i < static_cast<int>(HOST_SPACE_ATOMIC_MASK + 1); i++)
-      HOST_SPACE_ATOMIC_LOCKS[i] = 0;
-}
-
-bool lock_address_host_space(void *ptr) {
-  return 0 == atomic_compare_exchange(
-                  &HOST_SPACE_ATOMIC_LOCKS[((size_t(ptr) >> 2) &
-                                            HOST_SPACE_ATOMIC_MASK) ^
-                                           HOST_SPACE_ATOMIC_XOR_MASK],
-                  0, 1);
-}
-
-void unlock_address_host_space(void *ptr) {
-  atomic_exchange(
-      &HOST_SPACE_ATOMIC_LOCKS[((size_t(ptr) >> 2) & HOST_SPACE_ATOMIC_MASK) ^
-                               HOST_SPACE_ATOMIC_XOR_MASK],
-      0);
-}
-
-}  // namespace Impl
-}  // namespace Kokkos
-
 //==============================================================================
 // <editor-fold desc="Explicit instantiations of CRTP Base classes"> {{{1
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp b/packages/kokkos/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f740c408fb8fddefff501f46937e1942aa691bff
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp
@@ -0,0 +1,50 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_HOSTSPACE_ZEROMEMSET_HPP
+#define KOKKOS_HOSTSPACE_ZEROMEMSET_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_HostSpace.hpp>
+#include <impl/Kokkos_ZeroMemset_fwd.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+template <class T, class... P>
+struct ZeroMemset<HostSpace::execution_space, View<T, P...>> {
+  ZeroMemset(const HostSpace::execution_space& exec, const View<T, P...>& dst,
+             typename View<T, P...>::const_value_type&) {
+    // Host spaces, except for HPX, are synchronous and we need to fence for HPX
+    // since we can't properly enqueue a std::memset otherwise.
+    // We can't use exec.fence() directly since we don't have a full definition
+    // of HostSpace here.
+    hostspace_fence(exec);
+    using ValueType = typename View<T, P...>::value_type;
+    std::memset(dst.data(), 0, sizeof(ValueType) * dst.size());
+  }
+
+  ZeroMemset(const View<T, P...>& dst,
+             typename View<T, P...>::const_value_type&) {
+    using ValueType = typename View<T, P...>::value_type;
+    std::memset(dst.data(), 0, sizeof(ValueType) * dst.size());
+  }
+};
+
+}  // end namespace Impl
+}  // end namespace Kokkos
+
+#endif  // KOKKOS_HOSTSPACE_ZEROMEMSET_HPP
diff --git a/packages/kokkos/core/src/impl/Kokkos_HostSpace_deepcopy.cpp b/packages/kokkos/core/src/impl/Kokkos_HostSpace_deepcopy.cpp
index 096dfd6b7f3f8a5f9550fd5793f20bc0658bddcd..84f525061ee3470bdbe716bb228b9ab00c8e26cd 100644
--- a/packages/kokkos/core/src/impl/Kokkos_HostSpace_deepcopy.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_HostSpace_deepcopy.cpp
@@ -52,11 +52,12 @@ void hostspace_parallel_deepcopy_async(const DefaultHostExecutionSpace& exec,
   // synchronously. The deep copy must be correctly sequenced with respect to
   // other kernels submitted to the same instance, so we only use the fallback
   // parallel_for version in this case.
-#if !(defined(KOKKOS_ENABLE_HPX) && defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH))
+#if !(defined(KOKKOS_ENABLE_HPX) && \
+      defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH))
   constexpr int host_deep_copy_serial_limit = 10 * 8192;
   if ((n < host_deep_copy_serial_limit) ||
       (DefaultHostExecutionSpace().concurrency() == 1)) {
-    std::memcpy(dst, src, n);
+    if (0 < n) std::memcpy(dst, src, n);
     return;
   }
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp
index 1fec93237ac2c49b1d961d68fcbc45eca15db4a6..51f25a8b60f141e8edac0082a34379e2d596f20b 100644
--- a/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp
@@ -155,8 +155,7 @@ class HostThreadTeamData {
 
   //----------------------------------------
 
-#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC bug in NVHPC regarding constexpr
-                               // constructors used in device code
+#if !defined(KOKKOS_COMPILER_NVHPC) || (KOKKOS_COMPILER_NVHPC >= 230700)
   constexpr
 #endif
       HostThreadTeamData() noexcept
@@ -481,14 +480,14 @@ class HostThreadTeamMember {
         // with a return value of 'true'
 
         Kokkos::Impl::atomic_store(shared_value, value,
-                                   Kokkos::Impl::memory_order_release);
+                                   desul::MemoryOrderRelease());
 
         m_data.team_rendezvous_release();
         // This thread released all other threads from 'team_rendezvous'
         // with a return value of 'false'
       } else {
         value = Kokkos::Impl::atomic_load(shared_value,
-                                          Kokkos::Impl::memory_order_acquire);
+                                          desul::MemoryOrderAcquire());
       }
     }))
 
@@ -516,7 +515,7 @@ class HostThreadTeamMember {
 
           if (1 < m_data.m_team_size) {
             Kokkos::Impl::atomic_store(shared_value, value,
-                                       Kokkos::Impl::memory_order_release);
+                                       desul::MemoryOrderRelease());
           }
 
           m_data.team_rendezvous_release();
@@ -524,7 +523,7 @@ class HostThreadTeamMember {
           // with a return value of 'false'
         } else {
           value = Kokkos::Impl::atomic_load(shared_value,
-                                            Kokkos::Impl::memory_order_acquire);
+                                            desul::MemoryOrderAcquire());
         }))
 
     KOKKOS_IF_ON_DEVICE(
@@ -864,18 +863,21 @@ KOKKOS_INLINE_FUNCTION
 
 //----------------------------------------------------------------------------
 
-template <typename iType, class Closure, class Member>
+template <typename iType, class Closure, class Member, typename ValueType>
 KOKKOS_INLINE_FUNCTION
-    std::enable_if_t<Impl::is_host_thread_team_member<Member>::value>
+    std::enable_if_t<!Kokkos::is_reducer<ValueType>::value &&
+                     Impl::is_host_thread_team_member<Member>::value>
     parallel_scan(Impl::TeamThreadRangeBoundariesStruct<iType, Member> const&
                       loop_boundaries,
-                  Closure const& closure) {
-  // Extract ValueType from the closure
-
-  using value_type = typename Kokkos::Impl::FunctorAnalysis<
-      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;
+                  Closure const& closure, ValueType& return_val) {
+  // Extract ValueType from the Closure
+  using ClosureValueType = typename Kokkos::Impl::FunctorAnalysis<
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure,
+      void>::value_type;
+  static_assert(std::is_same<ClosureValueType, ValueType>::value,
+                "Non-matching value types of closure and return type");
 
-  value_type accum = 0;
+  ValueType accum = ValueType();
 
   // Intra-member scan
   for (iType i = loop_boundaries.start; i < loop_boundaries.end;
@@ -883,25 +885,51 @@ KOKKOS_INLINE_FUNCTION
     closure(i, accum, false);
   }
 
+  auto team_member = loop_boundaries.thread;
+
   // 'accum' output is the exclusive prefix sum
-  accum = loop_boundaries.thread.team_scan(accum);
+  accum = team_member.team_scan(accum);
 
   for (iType i = loop_boundaries.start; i < loop_boundaries.end;
        i += loop_boundaries.increment) {
     closure(i, accum, true);
   }
+
+  team_member.team_broadcast(accum, team_member.team_size() - 1);
+
+  return_val = accum;
 }
 
-template <typename iType, class ClosureType, class Member>
+template <typename iType, class Closure, class Member>
 KOKKOS_INLINE_FUNCTION
     std::enable_if_t<Impl::is_host_thread_team_member<Member>::value>
+    parallel_scan(Impl::TeamThreadRangeBoundariesStruct<iType, Member> const&
+                      loop_boundaries,
+                  Closure const& closure) {
+  // Extract ValueType from the closure
+  using ValueType = typename Kokkos::Impl::FunctorAnalysis<
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure,
+      void>::value_type;
+
+  ValueType scan_val;
+  parallel_scan(loop_boundaries, closure, scan_val);
+}
+
+template <typename iType, class ClosureType, class Member, typename ValueType>
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<!Kokkos::is_reducer<ValueType>::value &&
+                     Impl::is_host_thread_team_member<Member>::value>
     parallel_scan(Impl::ThreadVectorRangeBoundariesStruct<iType, Member> const&
                       loop_boundaries,
-                  ClosureType const& closure) {
-  using value_type = typename Kokkos::Impl::FunctorAnalysis<
-      Impl::FunctorPatternInterface::SCAN, void, ClosureType>::value_type;
+                  ClosureType const& closure, ValueType& return_val) {
+  // Extract ValueType from the Closure
+  using ClosureValueType = typename Kokkos::Impl::FunctorAnalysis<
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void, ClosureType,
+      void>::value_type;
+  static_assert(std::is_same<ClosureValueType, ValueType>::value,
+                "Non-matching value types of closure and return type");
 
-  value_type scan_val = value_type();
+  ValueType scan_val = ValueType();
 
 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
 #pragma ivdep
@@ -910,6 +938,22 @@ KOKKOS_INLINE_FUNCTION
        i += loop_boundaries.increment) {
     closure(i, scan_val, true);
   }
+
+  return_val = scan_val;
+}
+
+template <typename iType, class ClosureType, class Member>
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<Impl::is_host_thread_team_member<Member>::value>
+    parallel_scan(Impl::ThreadVectorRangeBoundariesStruct<iType, Member> const&
+                      loop_boundaries,
+                  ClosureType const& closure) {
+  // Extract ValueType from the closure
+  using ValueType = typename Kokkos::Impl::FunctorAnalysis<
+      Impl::FunctorPatternInterface::SCAN, void, ClosureType, void>::value_type;
+
+  ValueType scan_val;
+  parallel_scan(loop_boundaries, closure, scan_val);
 }
 
 template <typename iType, class Lambda, typename ReducerType, typename Member>
diff --git a/packages/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp b/packages/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
index 392116a56e50b83f03dc23c4eda6625400b2b542..42a53b04fb2a940ae466dd1aa90bef90ad6c42b1 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
@@ -19,35 +19,6 @@
 #define KOKKOS_MEMORY_FENCE_HPP
 namespace Kokkos {
 
-//----------------------------------------------------------------------------
-#ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
-KOKKOS_FORCEINLINE_FUNCTION
-void memory_fence() {
-#if defined(__CUDA_ARCH__)
-  __threadfence();
-#elif defined(KOKKOS_ENABLE_OPENMPTARGET)
-#pragma omp flush
-#elif defined(__HIP_DEVICE_COMPILE__)
-  __threadfence();
-#elif defined(KOKKOS_ENABLE_SYCL) && defined(__SYCL_DEVICE_ONLY__)
-  sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
-#elif defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64)
-  asm volatile("mfence" ::: "memory");
-#elif defined(KOKKOS_ENABLE_GNU_ATOMICS) || \
-    (defined(KOKKOS_COMPILER_NVCC) && defined(KOKKOS_ENABLE_INTEL_ATOMICS))
-  __sync_synchronize();
-#elif defined(KOKKOS_ENABLE_INTEL_ATOMICS)
-  _mm_mfence();
-#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS)
-#pragma omp flush
-#elif defined(KOKKOS_ENABLE_WINDOWS_ATOMICS)
-  MemoryBarrier();
-#elif !defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
-#error "Error: memory_fence() not defined"
-#endif
-}
-#endif
-
 //////////////////////////////////////////////////////
 // store_fence()
 //
diff --git a/packages/kokkos/core/src/impl/Kokkos_NvidiaGpuArchitectures.hpp b/packages/kokkos/core/src/impl/Kokkos_NvidiaGpuArchitectures.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..956b6dffeae2526e69a38a610d9eebb14c03bc2c
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_NvidiaGpuArchitectures.hpp
@@ -0,0 +1,58 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_CUDA_NVIDIA_GPU_ARCHITECTURES_HPP
+#define KOKKOS_CUDA_NVIDIA_GPU_ARCHITECTURES_HPP
+
+#if defined(KOKKOS_ARCH_KEPLER30)
+#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 30
+#elif defined(KOKKOS_ARCH_KEPLER32)
+#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 32
+#elif defined(KOKKOS_ARCH_KEPLER35)
+#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 35
+#elif defined(KOKKOS_ARCH_KEPLER37)
+#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 37
+#elif defined(KOKKOS_ARCH_MAXWELL50)
+#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 50
+#elif defined(KOKKOS_ARCH_MAXWELL52)
+#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 52
+#elif defined(KOKKOS_ARCH_MAXWELL53)
+#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 53
+#elif defined(KOKKOS_ARCH_PASCAL60)
+#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 60
+#elif defined(KOKKOS_ARCH_PASCAL61)
+#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 61
+#elif defined(KOKKOS_ARCH_VOLTA70)
+#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 70
+#elif defined(KOKKOS_ARCH_VOLTA72)
+#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 72
+#elif defined(KOKKOS_ARCH_TURING75)
+#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 75
+#elif defined(KOKKOS_ARCH_AMPERE80)
+#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 80
+#elif defined(KOKKOS_ARCH_AMPERE86)
+#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 86
+#elif defined(KOKKOS_ARCH_ADA89)
+#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 89
+#elif defined(KOKKOS_ARCH_HOPPER90)
+#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 90
+#elif defined(KOKKOS_ENABLE_CUDA)
+// do not raise an error on other backends that may run on NVIDIA GPUs such as
+// OpenACC, OpenMPTarget, or SYCL
+#error NVIDIA GPU arch not recognized
+#endif
+
+#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_Profiling.cpp b/packages/kokkos/core/src/impl/Kokkos_Profiling.cpp
index e3cfcb6a29e4cff19f398845e41774cbb1fe66f4..bc6197753c32d3eb7e569f21c996fa26b23c2166 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Profiling.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Profiling.cpp
@@ -177,7 +177,8 @@ Kokkos::Tools::Impl::InitializationStatus parse_environment_variables(
     args = env_tools_args;
   }
   return {
-      Kokkos::Tools::Impl::InitializationStatus::InitializationResult::success};
+      Kokkos::Tools::Impl::InitializationStatus::InitializationResult::success,
+      ""};
 }
 InitializationStatus initialize_tools_subsystem(
     const Kokkos::Tools::InitArguments& args) {
@@ -192,13 +193,13 @@ InitializationStatus initialize_tools_subsystem(
     if (!Kokkos::Tools::printHelp(final_args)) {
       std::cerr << "Tool has not provided a help message" << std::endl;
     }
-    return {InitializationStatus::InitializationResult::help_request};
+    return {InitializationStatus::InitializationResult::help_request, ""};
   }
   Kokkos::Tools::parseArgs(final_args);
 #else
   (void)args;
 #endif
-  return {InitializationStatus::InitializationResult::success};
+  return {InitializationStatus::InitializationResult::success, ""};
 }
 
 }  // namespace Impl
@@ -625,17 +626,11 @@ void initialize(const std::string& profileLibrary) {
     return;
   }
 
-  char* envProfileLibrary = const_cast<char*>(profileLibrary.c_str());
-
-  const size_t envProfileLen = strlen(envProfileLibrary) + 1;
-  const auto envProfileCopy  = std::make_unique<char[]>(envProfileLen);
-  snprintf(envProfileCopy.get(), envProfileLen, "%s", envProfileLibrary);
-
-  char* profileLibraryName = strtok(envProfileCopy.get(), ";");
-
-  if ((profileLibraryName != nullptr) &&
-      (strcmp(profileLibraryName, "") != 0)) {
-    firstProfileLibrary = dlopen(profileLibraryName, RTLD_NOW | RTLD_GLOBAL);
+  if (auto end_first_library = profileLibrary.find(';');
+      end_first_library != 0) {
+    auto profileLibraryName = profileLibrary.substr(0, end_first_library);
+    firstProfileLibrary =
+        dlopen(profileLibraryName.c_str(), RTLD_NOW | RTLD_GLOBAL);
 
     if (firstProfileLibrary == nullptr) {
       std::cerr << "Error: Unable to load KokkosP library: "
@@ -644,10 +639,6 @@ void initialize(const std::string& profileLibrary) {
                 << ", RTLD_NOW | RTLD_GLOBAL) failed with " << dlerror()
                 << '\n';
     } else {
-#ifdef KOKKOS_ENABLE_PROFILING_LOAD_PRINT
-      std::cout << "KokkosP: Library Loaded: " << profileLibraryName
-                << std::endl;
-#endif
       lookup_function(firstProfileLibrary, "kokkosp_begin_parallel_scan",
                       Experimental::current_callbacks.begin_parallel_scan);
       lookup_function(firstProfileLibrary, "kokkosp_begin_parallel_for",
diff --git a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp
index ccc5b60f5fb32c230759b5ce23ae98eacaa55cce..255f5125f4abf9d81c96eb6a5a102868a510dc15 100644
--- a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp
@@ -19,6 +19,7 @@
 #endif
 
 #include <Kokkos_Core.hpp>
+#include <iomanip>
 
 namespace Kokkos {
 namespace Impl {
@@ -285,51 +286,30 @@ void SharedAllocationRecord<void, void>::print_host_accessible_records(
   // allocation.
   const SharedAllocationRecord<void, void>* r = root->m_next;
 
-  char buffer[256];
-
+  std::ios_base::fmtflags saved_flags = s.flags();
+#define KOKKOS_PAD_HEX(ptr)                              \
+  "0x" << std::hex << std::setw(12) << std::setfill('0') \
+       << reinterpret_cast<uintptr_t>(ptr)
   if (detail) {
     while (r != root) {
-      // Formatting dependent on sizeof(uintptr_t)
-      const char* format_string;
-
-      if (sizeof(uintptr_t) == sizeof(unsigned long)) {
-        format_string =
-            "%s addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + "
-            "%.8ld ] count(%d) dealloc(0x%.12lx) %s\n";
-      } else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
-        format_string =
-            "%s addr( 0x%.12llx ) list( 0x%.12llx 0x%.12llx ) extent[ "
-            "0x%.12llx + %.8ld ] count(%d) dealloc(0x%.12llx) %s\n";
-      }
+      s << space_name << " addr( " << KOKKOS_PAD_HEX(r) << " ) list ( "
+        << KOKKOS_PAD_HEX(r->m_prev) << ' ' << KOKKOS_PAD_HEX(r->m_next)
+        << " ) extent[ " << KOKKOS_PAD_HEX(r->m_alloc_ptr) << " + " << std::dec
+        << std::setw(8) << r->m_alloc_size << " ] count(" << r->use_count()
+        << ") dealloc(" << KOKKOS_PAD_HEX(r->m_dealloc) << ") "
+        << r->m_alloc_ptr->m_label << '\n';
 
-      snprintf(buffer, 256, format_string, space_name,
-               reinterpret_cast<uintptr_t>(r),
-               reinterpret_cast<uintptr_t>(r->m_prev),
-               reinterpret_cast<uintptr_t>(r->m_next),
-               reinterpret_cast<uintptr_t>(r->m_alloc_ptr), r->m_alloc_size,
-               r->use_count(), reinterpret_cast<uintptr_t>(r->m_dealloc),
-               r->m_alloc_ptr->m_label);
-      s << buffer;
       r = r->m_next;
     }
   } else {
     while (r != root) {
-      // Formatting dependent on sizeof(uintptr_t)
-      const char* format_string;
-
-      if (sizeof(uintptr_t) == sizeof(unsigned long)) {
-        format_string = "%s [ 0x%.12lx + %ld ] %s\n";
-      } else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
-        format_string = "%s [ 0x%.12llx + %ld ] %s\n";
-      }
-
-      snprintf(buffer, 256, format_string, space_name,
-               reinterpret_cast<uintptr_t>(r->data()), r->size(),
-               r->m_alloc_ptr->m_label);
-      s << buffer;
+      s << space_name << " [ " << KOKKOS_PAD_HEX(r->data()) << " + " << std::dec
+        << r->size() << " ] " << r->m_alloc_ptr->m_label << '\n';
       r = r->m_next;
     }
   }
+#undef KOKKOS_PAD_HEX
+  s.flags(saved_flags);
 }
 #else
 void SharedAllocationRecord<void, void>::print_host_accessible_records(
diff --git a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp
index 6bb87ca84b2f5089f1d7568fb09ebf3dcc2482c2..043505a158e99a8dbd5f82a598194d9ca6562433 100644
--- a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp
@@ -37,7 +37,7 @@ class SharedAllocationHeader {
  private:
   using Record = SharedAllocationRecord<void, void>;
 
-#if defined(KOKKOS_ARCH_VEGA) || defined(KOKKOS_ARCH_NAVI)
+#if defined(KOKKOS_ARCH_AMD_GPU)
   static constexpr unsigned maximum_label_length =
       (1u << 8 /* 256 */) - sizeof(Record*);
 #else
@@ -70,7 +70,7 @@ class SharedAllocationHeader {
 template <>
 class SharedAllocationRecord<void, void> {
  protected:
-#if defined(KOKKOS_ARCH_VEGA) || defined(KOKKOS_ARCH_NAVI)
+#if defined(KOKKOS_ARCH_AMD_GPU)
   static_assert(sizeof(SharedAllocationHeader) == (1u << 8 /* 256 */),
                 "sizeof(SharedAllocationHeader) != 256");
 #else
diff --git a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc_timpl.hpp b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc_timpl.hpp
index 9aa96e27d150140c25a17a5e18e32e98303f095c..d403ef9db064c0d99c63ce03d3150da22de720a9 100644
--- a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc_timpl.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc_timpl.hpp
@@ -24,9 +24,9 @@
 
 #include <Kokkos_HostSpace.hpp>  // used with HostInaccessible specializations
 
-#include <string>    // std::string
-#include <cstring>   // strncpy
-#include <iostream>  // ostream
+#include <cstring>
+#include <ostream>
+#include <string>
 
 namespace Kokkos {
 namespace Impl {
diff --git a/packages/kokkos/core/src/impl/Kokkos_Stacktrace.cpp b/packages/kokkos/core/src/impl/Kokkos_Stacktrace.cpp
index 3b8dc4efc356d8beb8928b3d01a4577b6fdf9b86..b287510b386685822302b32918b76442af76f5f2 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Stacktrace.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Stacktrace.cpp
@@ -134,7 +134,7 @@ void for_each_token(const std::string& s, Callback c) {
     const size_t end   = find_first_whitespace(s, cur);
     const bool last    = (end == std::string::npos);
     const size_t count = last ? end : size_t(end - cur);
-    c(s.substr(cur, count), last);
+    c(s.substr(cur, count));
     cur = find_first_non_whitespace(s, end);
   }
 }
@@ -147,7 +147,6 @@ void for_each_token(const std::string& s, Callback c) {
 struct main_column_info {
   bool found_main;
   size_t main_col;
-  std::vector<size_t> main_col_lens;
 };
 
 main_column_info find_main_column(const std::vector<std::string>& traceback) {
@@ -155,7 +154,7 @@ main_column_info find_main_column(const std::vector<std::string>& traceback) {
   size_t main_col = 0;
   for (auto&& entry : traceback) {
     size_t col_count = 0;
-    for_each_token(entry, [&](const std::string& s, bool) {
+    for_each_token(entry, [&](const std::string& s) {
       const size_t pos = s.find("main");
       if (pos != std::string::npos) {
         found_main = true;
@@ -168,52 +167,33 @@ main_column_info find_main_column(const std::vector<std::string>& traceback) {
     }
   }
 
-  // Make another pass to get the column lengths.
-  // Only demangle the column of functions.
-  std::vector<size_t> max_col_lengths;
-  for (auto&& entry : traceback) {
-    size_t col_count = 0;
-    for_each_token(entry, [&](const std::string& s, bool) {
-      const size_t cur_col_len =
-          (found_main && col_count == main_col) ? demangle(s).size() : s.size();
-      ++col_count;
-      if (max_col_lengths.size() < col_count) {
-        max_col_lengths.push_back(cur_col_len);
-      } else {
-        const size_t old_max_len = max_col_lengths[col_count - 1];
-        if (old_max_len < cur_col_len) {
-          max_col_lengths[col_count - 1] = cur_col_len;
-        }
-      }
-    });
-  }
-  return main_column_info{found_main, main_col, max_col_lengths};
+  return main_column_info{found_main, main_col};
 }
 
-void demangle_and_print_traceback_entry(
-    std::ostream& out, const std::string& traceback_entry,
-    const bool found_main, const size_t main_col,
-    const std::vector<size_t>& max_col_lens) {
+void demangle_and_print_traceback_entry(std::ostream& out,
+                                        const std::string& traceback_entry,
+                                        const bool found_main,
+                                        const size_t main_col) {
   std::vector<std::string> tokens;
   size_t cur_col = 0;
-  for_each_token(traceback_entry, [&](const std::string& s, bool last) {
-    const size_t old_width(out.width());
-    out.width(max_col_lens[cur_col]);
-    try {
-      if (found_main && cur_col == main_col) {
-        out << demangle(s);
-      } else {
-        out << s;
-      }
-      if (!last) {
-        out << " ";
-      }
-      ++cur_col;
-    } catch (...) {
-      out.width(old_width);
-      throw;
+
+  // Print the address column first
+  for_each_token(traceback_entry, [&](const std::string& s) {
+    if (!(found_main && cur_col == main_col)) {
+      out << s;
+    }
+    ++cur_col;
+  });
+
+  out << " ";
+
+  // Then the function name
+  cur_col = 0;
+  for_each_token(traceback_entry, [&](const std::string& s) {
+    if (found_main && cur_col == main_col) {
+      out << demangle(s);
     }
-    out.width(old_width);
+    ++cur_col;
   });
 }
 
@@ -222,7 +202,7 @@ void demangle_and_print_traceback(std::ostream& out,
   const auto result = find_main_column(traceback);
   for (auto&& entry : traceback) {
     demangle_and_print_traceback_entry(out, entry, result.found_main,
-                                       result.main_col, result.main_col_lens);
+                                       result.main_col);
     out << std::endl;
   }
 }
diff --git a/packages/kokkos/core/src/impl/Kokkos_StringManipulation.hpp b/packages/kokkos/core/src/impl/Kokkos_StringManipulation.hpp
index c7baf6d0c3df9016c90963e7c4355b908078ee5a..231cc2c39c44980bbfc2a49ee3abca5280b87c5b 100644
--- a/packages/kokkos/core/src/impl/Kokkos_StringManipulation.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_StringManipulation.hpp
@@ -173,7 +173,7 @@ KOKKOS_FUNCTION constexpr to_chars_result to_chars_i(char *first, char *last,
       unsigned_val = Unsigned(~value) + Unsigned(1);
     }
   }
-  unsigned int const len = to_chars_len(unsigned_val);
+  std::ptrdiff_t const len = to_chars_len(unsigned_val);
   if (last - first < len) {
     return {last, errc::value_too_large};
   }
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskBase.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskBase.hpp
index ac99d071594958a16689f9ebb435a8adeb164559..ed548e99a89f9764033bc9dff565c56f86b6f464 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskBase.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskBase.hpp
@@ -174,17 +174,15 @@ class TaskBase {
 
     // Assign dependence to m_next.  It will be processed in the subsequent
     // call to schedule.  Error if the dependence is reset.
-    if (lock != Kokkos::Impl::desul_atomic_exchange(
-                    &m_next, dep, Kokkos::Impl::MemoryOrderSeqCst(),
-                    Kokkos::Impl::MemoryScopeDevice())) {
+    if (lock != desul::atomic_exchange(&m_next, dep, desul::MemoryOrderSeqCst(),
+                                       desul::MemoryScopeDevice())) {
       Kokkos::abort("TaskScheduler ERROR: resetting task dependence");
     }
     if (nullptr != dep) {
       // The future may be destroyed upon returning from this call
       // so increment reference count to track this assignment.
-      Kokkos::Impl::desul_atomic_inc(&(dep->m_ref_count),
-                                     Kokkos::Impl::MemoryOrderSeqCst(),
-                                     Kokkos::Impl::MemoryScopeDevice());
+      desul::atomic_inc(&(dep->m_ref_count), desul::MemoryOrderSeqCst(),
+                        desul::MemoryScopeDevice());
     }
   }
 
@@ -208,6 +206,7 @@ class TaskBase {
 // the number of full task types that fit into a cache line.  We'll leave it
 // here for now, though, since we're probably going to be ripping all of the
 // old TaskBase stuff out eventually anyway.
+#ifndef KOKKOS_IMPL_32BIT
 constexpr size_t unpadded_task_base_size = 44 + 2 * sizeof(int16_t);
 // don't forget padding:
 constexpr size_t task_base_misalignment =
@@ -231,7 +230,7 @@ static constexpr
 
 static_assert(sizeof(TaskBase) == expected_task_base_size,
               "Verifying expected sizeof(TaskBase)");
-
+#endif
 // </editor-fold> end Verify the size of TaskBase is as expected }}}2
 //------------------------------------------------------------------------------
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskNode.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskNode.hpp
index 789ba1469697a50dd85707119c0daeb346ca0593..a81f298bbf260f62997af5319c816816750367ec 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskNode.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskNode.hpp
@@ -42,14 +42,6 @@
 namespace Kokkos {
 namespace Impl {
 
-#ifdef KOKKOS_COMPILER_PGI
-// Bizzarely, an extra jump instruction forces the PGI compiler to not have a
-// bug related to (probably?) empty base optimization and/or aggregate
-// construction.  This must be defined out-of-line to generate a jump
-// jump instruction
-void _kokkos_pgi_compiler_bug_workaround();
-#endif
-
 enum TaskType : int16_t {
   TaskTeam    = 0,
   TaskSingle  = 1,
@@ -101,17 +93,11 @@ class ReferenceCountedBase {
 
  public:
   KOKKOS_INLINE_FUNCTION
-#ifndef KOKKOS_COMPILER_PGI
-  constexpr
-#endif
-      explicit ReferenceCountedBase(
-          reference_count_size_type initial_reference_count)
+  constexpr explicit ReferenceCountedBase(
+      reference_count_size_type initial_reference_count)
       : m_ref_count(initial_reference_count) {
     // This can't be here because it breaks constexpr
     // KOKKOS_EXPECTS(initial_reference_count > 0);
-#ifdef KOKKOS_COMPILER_PGI
-    Impl::_kokkos_pgi_compiler_bug_workaround();
-#endif
   }
 
   /** Decrement the reference count,
@@ -131,9 +117,8 @@ class ReferenceCountedBase {
 
   KOKKOS_INLINE_FUNCTION
   void increment_reference_count() {
-    Kokkos::Impl::desul_atomic_inc(&m_ref_count,
-                                   Kokkos::Impl::MemoryOrderSeqCst(),
-                                   Kokkos::Impl::MemoryScopeDevice());
+    desul::atomic_inc(&m_ref_count, desul::MemoryOrderSeqCst(),
+                      desul::MemoryScopeDevice());
   }
 };
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueue.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueue.hpp
index 5f182dc33f2580ff9806ca612d071b8e07343fca..8312fbc1036737938313fd457c2711421b48d66e 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskQueue.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueue.hpp
@@ -160,9 +160,8 @@ class TaskQueue : public TaskQueueBase {
                                      task_root_type* const rhs) {
     if (*lhs) decrement(*lhs);
     if (rhs) {
-      Kokkos::Impl::desul_atomic_inc(&rhs->m_ref_count,
-                                     Kokkos::Impl::MemoryOrderSeqCst(),
-                                     Kokkos::Impl::MemoryScopeDevice());
+      desul::atomic_inc(&rhs->m_ref_count, desul::MemoryOrderSeqCst(),
+                        desul::MemoryScopeDevice());
     }
 
     // Force write of *lhs
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueueCommon.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueueCommon.hpp
index 18dc3c757bf827766a06278caac8848b069ee614..3709d6e7209e30666404acdb115577067bd7ab08 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskQueueCommon.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueueCommon.hpp
@@ -129,30 +129,28 @@ class TaskQueueCommonMixin {
   KOKKOS_INLINE_FUNCTION
   void _increment_ready_count() {
     // TODO @tasking @memory_order DSH memory order
-    Kokkos::Impl::desul_atomic_inc(&this->m_ready_count,
-                                   Kokkos::Impl::MemoryOrderSeqCst(),
-                                   Kokkos::Impl::MemoryScopeDevice());
+    desul::atomic_inc(&this->m_ready_count, desul::MemoryOrderSeqCst(),
+                      desul::MemoryScopeDevice());
   }
 
   KOKKOS_INLINE_FUNCTION
   void _decrement_ready_count() {
     // TODO @tasking @memory_order DSH memory order
-    Kokkos::Impl::desul_atomic_dec(&this->m_ready_count,
-                                   Kokkos::Impl::MemoryOrderSeqCst(),
-                                   Kokkos::Impl::MemoryScopeDevice());
+    desul::atomic_dec(&this->m_ready_count, desul::MemoryOrderSeqCst(),
+                      desul::MemoryScopeDevice());
   }
 
  public:
   KOKKOS_INLINE_FUNCTION
   bool is_done() const noexcept {
-    // TODO @tasking @memory_order DSH Memory order, instead of volatile
-    return (*(volatile int*)(&m_ready_count)) == 0;
+    return desul::atomic_load(&m_ready_count, desul::MemoryOrderAcquire(),
+                              desul::MemoryScopeDevice()) == 0;
   }
 
   KOKKOS_INLINE_FUNCTION
   int32_t ready_count() const noexcept {
-    // TODO @tasking @memory_order DSH Memory order, instead of volatile
-    return (*(volatile int*)(&m_ready_count));
+    return desul::atomic_load(&m_ready_count, desul::MemoryOrderAcquire(),
+                              desul::MemoryScopeDevice());
   }
 
   template <class TaskQueueTraits, class TeamSchedulerInfo>
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp
index aef919e8346e2456b61315a9f3f0736cd2b681d1..e2bb9d2b61a10943e00e4c29e08b2c849cbb44c6 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp
@@ -73,9 +73,9 @@ class TaskQueueMemoryManager : public TaskQueueBase {
     } else {
       void* data = m_pool.allocate(static_cast<size_t>(requested_size));
 
-      Kokkos::Impl::desul_atomic_inc(
-          &m_count_alloc, Kokkos::Impl::MemoryOrderSeqCst(),
-          Kokkos::Impl::MemoryScopeDevice());  // TODO? memory_order_relaxed
+      desul::atomic_inc(
+          &m_count_alloc, desul::MemoryOrderSeqCst(),
+          desul::MemoryScopeDevice());  // TODO? memory_order_relaxed
       // TODO @tasking @minor DSH make this thread safe? (otherwise, it's just
       // an approximation, which is probably fine...)
       if (m_max_alloc < m_count_alloc) m_max_alloc = m_count_alloc;
@@ -171,9 +171,9 @@ class TaskQueueMemoryManager : public TaskQueueBase {
   KOKKOS_INLINE_FUNCTION void deallocate(
       PoolAllocatedObjectBase<CountType>&& obj) {
     m_pool.deallocate((void*)&obj, 1);
-    Kokkos::Impl::desul_atomic_dec(
-        &m_count_alloc, Kokkos::Impl::MemoryOrderSeqCst(),
-        Kokkos::Impl::MemoryScopeDevice());  // TODO? memory_order_relaxed
+    desul::atomic_dec(
+        &m_count_alloc, desul::MemoryOrderSeqCst(),
+        desul::MemoryScopeDevice());  // TODO? memory_order_relaxed
   }
 
   KOKKOS_INLINE_FUNCTION
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueueMultiple.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueueMultiple.hpp
index e653f70fe91bfed21e92798f67319f040388c5c3..4ed057a689727e9df8d16291d1b54f78237e1a6e 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskQueueMultiple.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueueMultiple.hpp
@@ -119,7 +119,9 @@ class TaskQueueMultiple : public TaskQueue<ExecSpace, MemorySpace> {
         for (int iteam = 0; iteam < m_other_queues->size(); ++iteam) {
           if (iteam == m_league_rank) continue;
           auto& steal_from = get_team_queue(iteam);
-          if (*((volatile int*)&steal_from.m_ready_count) > 0) {
+          if (desul::atomic_load(&steal_from.m_ready_count,
+                                 desul::MemoryOrderAcquire(),
+                                 desul::MemoryScopeDevice()) > 0) {
             // we've found at least one queue that's not done, so even if we
             // can't pop something off of it we shouldn't return a nullptr
             // indicating completion.  rv will be end_tag when the pop fails
@@ -128,14 +130,12 @@ class TaskQueueMultiple : public TaskQueue<ExecSpace, MemorySpace> {
               // task stolen.
               // first increment our ready count, then decrement the ready count
               // on the other queue:
-              Kokkos::Impl::desul_atomic_inc(
-                  &this->m_ready_count, Kokkos::Impl::MemoryOrderSeqCst(),
-                  Kokkos::Impl::MemoryScopeDevice());  // TODO?
-                                                       // memory_order_relaxed
-              Kokkos::Impl::desul_atomic_dec(
-                  &steal_from.m_ready_count, Kokkos::Impl::MemoryOrderSeqCst(),
-                  Kokkos::Impl::MemoryScopeDevice());  // TODO?
-                                                       // memory_order_relaxed
+              desul::atomic_inc(
+                  &this->m_ready_count, desul::MemoryOrderSeqCst(),
+                  desul::MemoryScopeDevice());  // TODO? memory_order_relaxed
+              desul::atomic_dec(
+                  &steal_from.m_ready_count, desul::MemoryOrderSeqCst(),
+                  desul::MemoryScopeDevice());  // TODO? memory_order_relaxed
               return rv;
             }
           }
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp
index 68ff36579fd7226cce83f9b4c04fa2d881cfa684..074dc7bb983f01f7e73a97a83881dc153f45a9ea 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp
@@ -119,9 +119,9 @@ KOKKOS_FUNCTION void *TaskQueue<ExecSpace, MemorySpace>::allocate(size_t n) {
   void *const p = m_memory.allocate(n);
 
   if (p) {
-    Kokkos::Impl::desul_atomic_inc(
-        &m_count_alloc, Kokkos::Impl::MemoryOrderSeqCst(),
-        Kokkos::Impl::MemoryScopeDevice());  // TODO? memory_order_relaxed
+    desul::atomic_inc(
+        &m_count_alloc, desul::MemoryOrderSeqCst(),
+        desul::MemoryScopeDevice());  // TODO? memory_order_relaxed
 
     // if ( m_max_alloc < m_count_alloc ) m_max_alloc = m_count_alloc ;
   }
@@ -133,9 +133,8 @@ template <typename ExecSpace, typename MemorySpace>
 KOKKOS_FUNCTION void TaskQueue<ExecSpace, MemorySpace>::deallocate(void *p,
                                                                    size_t n) {
   m_memory.deallocate(p, n);
-  Kokkos::Impl::desul_atomic_dec(
-      &m_count_alloc, Kokkos::Impl::MemoryOrderSeqCst(),
-      Kokkos::Impl::MemoryScopeDevice());  // TODO? memory_order_relaxed
+  desul::atomic_dec(&m_count_alloc, desul::MemoryOrderSeqCst(),
+                    desul::MemoryScopeDevice());  // TODO? memory_order_relaxed
 }
 
 //----------------------------------------------------------------------------
@@ -186,9 +185,9 @@ KOKKOS_FUNCTION bool TaskQueue<ExecSpace, MemorySpace>::push_task(
     //     *queue = task;
     //   }
     //   old_head = *queue;
-    old_head = Kokkos::Impl::desul_atomic_compare_exchange(
+    old_head = desul::atomic_compare_exchange(
         const_cast<task_root_type **>(queue), old_head, task,
-        Kokkos::Impl::MemoryOrderSeqCst(), Kokkos::Impl::MemoryScopeDevice());
+        desul::MemoryOrderSeqCst(), desul::MemoryScopeDevice());
 
     if (old_head_tmp == old_head) return true;
   }
@@ -237,9 +236,9 @@ TaskQueue<ExecSpace, MemorySpace>::pop_ready_task(
     task_root_type *const x = task;
 
     //    task = Kokkos::atomic_compare_exchange(queue, x, lock);
-    task = Kokkos::Impl::desul_atomic_compare_exchange(
-        const_cast<task_root_type **>(queue), x, lock,
-        Kokkos::Impl::MemoryOrderSeqCst(), Kokkos::Impl::MemoryScopeDevice());
+    task = desul::atomic_compare_exchange(const_cast<task_root_type **>(queue),
+                                          x, lock, desul::MemoryOrderSeqCst(),
+                                          desul::MemoryScopeDevice());
 
     if (x == task) {
       // CAS succeeded and queue is locked
@@ -383,9 +382,9 @@ KOKKOS_FUNCTION void TaskQueue<ExecSpace, MemorySpace>::schedule_runnable(
     // to track number of ready + executing tasks.
     // The ready count will be decremented when the task is complete.
 
-    Kokkos::Impl::desul_atomic_inc(
-        &m_ready_count, Kokkos::Impl::MemoryOrderSeqCst(),
-        Kokkos::Impl::MemoryScopeDevice());  // TODO? memory_order_relaxed
+    desul::atomic_inc(
+        &m_ready_count, desul::MemoryOrderSeqCst(),
+        desul::MemoryScopeDevice());  // TODO? memory_order_relaxed
 
     task_root_type *volatile *const ready_queue =
         &m_ready[t.m_priority][t.m_task_type];
@@ -538,9 +537,9 @@ KOKKOS_FUNCTION void TaskQueue<ExecSpace, MemorySpace>::reschedule(
 
   task_root_type *const zero = nullptr;
   task_root_type *const lock = (task_root_type *)task_root_type::LockTag;
-  if (lock != Kokkos::Impl::desul_atomic_exchange(
-                  &task->m_next, zero, Kokkos::Impl::MemoryOrderSeqCst(),
-                  Kokkos::Impl::MemoryScopeDevice())) {
+  if (lock != desul::atomic_exchange(&task->m_next, zero,
+                                     desul::MemoryOrderSeqCst(),
+                                     desul::MemoryScopeDevice())) {
     Kokkos::abort("TaskScheduler::respawn ERROR: already respawned");
   }
 }
@@ -587,9 +586,9 @@ KOKKOS_FUNCTION void TaskQueue<ExecSpace, MemorySpace>::complete(
 
     // Stop other tasks from adding themselves to this task's wait queue
     // by locking the head of this task's wait queue.
-    task_root_type *x = Kokkos::Impl::desul_atomic_exchange(
+    task_root_type *x = desul::atomic_exchange(
         const_cast<task_root_type **>(&t.m_wait), lock,
-        Kokkos::Impl::MemoryOrderSeqCst(), Kokkos::Impl::MemoryScopeDevice());
+        desul::MemoryOrderSeqCst(), desul::MemoryScopeDevice());
 
     if (x != (task_root_type *)lock) {
       // This thread has transitioned this 'task' to complete.
@@ -632,9 +631,9 @@ KOKKOS_FUNCTION void TaskQueue<ExecSpace, MemorySpace>::complete(
     // A runnable task was popped from a ready queue and executed.
     // If respawned into a ready queue then the ready count was incremented
     // so decrement whether respawned or not.
-    Kokkos::Impl::desul_atomic_dec(
-        &m_ready_count, Kokkos::Impl::MemoryOrderSeqCst(),
-        Kokkos::Impl::MemoryScopeDevice());  // TODO? memory_order_relaxed
+    desul::atomic_dec(
+        &m_ready_count, desul::MemoryOrderSeqCst(),
+        desul::MemoryScopeDevice());  // TODO? memory_order_relaxed
   }
 }
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_Tools_Generic.hpp b/packages/kokkos/core/src/impl/Kokkos_Tools_Generic.hpp
index 4ccb64ce4f410ebfbf32341b2fd1272a9e954686..a77e139ec3051eca1b0553a50a21f5c850030b55 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Tools_Generic.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Tools_Generic.hpp
@@ -18,6 +18,7 @@
 #define KOKKOS_IMPL_KOKKOS_TOOLS_GENERIC_HPP
 
 #include <impl/Kokkos_Profiling.hpp>
+#include <impl/Kokkos_FunctorAnalysis.hpp>
 
 #include <Kokkos_Core_fwd.hpp>
 #include <Kokkos_ExecPolicy.hpp>
@@ -99,9 +100,12 @@ struct SimpleTeamSizeCalculator {
                                         const Functor& functor,
                                         const Kokkos::ParallelReduceTag&) {
     using exec_space = typename Policy::execution_space;
-    using driver =
-        Kokkos::Impl::ParallelReduce<Functor, Policy, Kokkos::InvalidType,
-                                     exec_space>;
+    using analysis   = Kokkos::Impl::FunctorAnalysis<
+        Kokkos::Impl::FunctorPatternInterface::REDUCE, Policy, Functor, void>;
+    using driver = typename Kokkos::Impl::ParallelReduce<
+        Kokkos::Impl::CombinedFunctorReducer<Functor,
+                                             typename analysis::Reducer>,
+        Policy, exec_space>;
     return driver::max_tile_size_product(policy, functor);
   }
 };
@@ -120,7 +124,13 @@ struct ComplexReducerSizeCalculator {
     using value_type = typename ReducerType::value_type;
     value_type value;
     ReducerType reducer_example = ReducerType(value);
-    return policy.team_size_max(functor, reducer_example, tag);
+
+    using Analysis = Kokkos::Impl::FunctorAnalysis<
+        Kokkos::Impl::FunctorPatternInterface::REDUCE, Policy, ReducerType,
+        value_type>;
+    typename Analysis::Reducer final_reducer(reducer_example);
+
+    return policy.team_size_max(functor, final_reducer, tag);
   }
   template <typename Policy, typename Functor, typename Tag>
   int get_recommended_team_size(const Policy& policy, const Functor& functor,
@@ -128,15 +138,26 @@ struct ComplexReducerSizeCalculator {
     using value_type = typename ReducerType::value_type;
     value_type value;
     ReducerType reducer_example = ReducerType(value);
-    return policy.team_size_recommended(functor, reducer_example, tag);
+
+    using Analysis = Kokkos::Impl::FunctorAnalysis<
+        Kokkos::Impl::FunctorPatternInterface::REDUCE, Policy, ReducerType,
+        value_type>;
+    typename Analysis::Reducer final_reducer(reducer_example);
+
+    return policy.team_size_recommended(functor, final_reducer, tag);
   }
   template <typename Policy, typename Functor>
   int get_mdrange_max_tile_size_product(const Policy& policy,
                                         const Functor& functor,
                                         const Kokkos::ParallelReduceTag&) {
     using exec_space = typename Policy::execution_space;
-    using driver =
-        Kokkos::Impl::ParallelReduce<Functor, Policy, ReducerType, exec_space>;
+    using Analysis   = Kokkos::Impl::FunctorAnalysis<
+        Kokkos::Impl::FunctorPatternInterface::REDUCE, Policy, ReducerType,
+        void>;
+    using driver = typename Kokkos::Impl::ParallelReduce<
+        Kokkos::Impl::CombinedFunctorReducer<Functor,
+                                             typename Analysis::Reducer>,
+        Policy, exec_space>;
     return driver::max_tile_size_product(policy, functor);
   }
 };
diff --git a/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp b/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp
index c3504ffce5a783f9523cbaf40bb3149ace6934d4..7e2f130564fefa47d8591c69de5e1cee29e5f617 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp
@@ -29,6 +29,23 @@
 namespace Kokkos {
 namespace Impl {
 
+// same as std::integral_constant but with __host__ __device__ annotations on
+// the implicit conversion function and the call operator
+template <class T, T v>
+struct integral_constant {
+  using value_type         = T;
+  using type               = integral_constant<T, v>;
+  static constexpr T value = v;
+  KOKKOS_FUNCTION constexpr operator value_type() const noexcept {
+    return value;
+  }
+  KOKKOS_FUNCTION constexpr value_type operator()() const noexcept {
+    return value;
+  }
+};
+
+//==============================================================================
+
 template <typename... Is>
 struct always_true : std::true_type {};
 
@@ -62,6 +79,33 @@ template <class T>
 using remove_cvref_t = typename remove_cvref<T>::type;
 #endif
 
+// same as C++23 std::to_underlying but with __host__ __device__ annotations
+template <typename E>
+KOKKOS_FUNCTION constexpr std::underlying_type_t<E> to_underlying(
+    E e) noexcept {
+  return static_cast<std::underlying_type_t<E>>(e);
+}
+
+#if defined(__cpp_lib_is_scoped_enum)
+// since C++23
+using std::is_scoped_enum;
+using std::is_scoped_enum_v;
+#else
+template <typename E, bool = std::is_enum_v<E>>
+struct is_scoped_enum_impl : std::false_type {};
+
+template <typename E>
+struct is_scoped_enum_impl<E, true>
+    : std::bool_constant<!std::is_convertible_v<E, std::underlying_type_t<E>>> {
+};
+
+template <typename E>
+struct is_scoped_enum : is_scoped_enum_impl<E>::type {};
+
+template <typename E>
+inline constexpr bool is_scoped_enum_v = is_scoped_enum<E>::value;
+#endif
+
 //==============================================================================
 // <editor-fold desc="is_specialization_of"> {{{1
 
@@ -74,25 +118,6 @@ struct is_specialization_of<Template<Args...>, Template> : std::true_type {};
 // </editor-fold> end is_specialization_of }}}1
 //==============================================================================
 
-//==============================================================================
-// destruct_delete is a unique_ptr deleter for objects
-// created by placement new into already allocated memory
-// by only calling the destructor on the object.
-//
-// Because unique_ptr never calls its deleter with a nullptr value,
-// no need to check if p == nullptr.
-//
-// Note:  This differs in interface from std::default_delete in that the
-// function call operator is templated instead of the class, to make
-// it easier to use and disallow specialization.
-struct destruct_delete {
-  template <typename T>
-  KOKKOS_INLINE_FUNCTION constexpr void operator()(T* p) const noexcept {
-    p->~T();
-  }
-};
-//==============================================================================
-
 //==============================================================================
 // <editor-fold desc="type_list"> {{{1
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewArray.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewArray.hpp
index c76bde49933442b9a6ef24905aeea5ab8dea41cf..725ba5de092a82ab9c486c43d028c57192eaeadf 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ViewArray.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ViewArray.hpp
@@ -123,7 +123,7 @@ class ViewMapping<Traits, Kokkos::Array<>> {
   //----------------------------------------
   // Domain dimensions
 
-  enum { Rank = Traits::dimension::rank };
+  static constexpr unsigned Rank = Traits::dimension::rank;
 
   template <typename iType>
   KOKKOS_INLINE_FUNCTION constexpr size_t extent(const iType &r) const {
diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp
index 92c17d0cf568b9a3dc179dec07f2dcd7c7c33b8d..e1b8ba86a5b5b58721071548d1c68d6986227257 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp
@@ -234,6 +234,7 @@ struct ViewCtorProp : public ViewCtorProp<void, P>... {
   }
 };
 
+#if !defined(KOKKOS_COMPILER_MSVC) || !defined(KOKKOS_COMPILER_NVCC)
 template <typename... P>
 auto with_properties_if_unset(const ViewCtorProp<P...> &view_ctor_prop) {
   return view_ctor_prop;
@@ -274,6 +275,52 @@ auto with_properties_if_unset(const ViewCtorProp<P...> &view_ctor_prop,
 #endif
 #endif
 }
+#else
+
+template <class ViewCtorP, class... Properties>
+struct WithPropertiesIfUnset;
+
+template <class ViewCtorP>
+struct WithPropertiesIfUnset<ViewCtorP> {
+  static constexpr auto apply_prop(const ViewCtorP &view_ctor_prop) {
+    return view_ctor_prop;
+  }
+};
+
+template <class... P, class Property, class... Properties>
+struct WithPropertiesIfUnset<ViewCtorProp<P...>, Property, Properties...> {
+  static constexpr auto apply_prop(const ViewCtorProp<P...> &view_ctor_prop,
+                                   const Property &prop,
+                                   const Properties &... properties) {
+    if constexpr ((is_execution_space<Property>::value &&
+                   !ViewCtorProp<P...>::has_execution_space) ||
+                  (is_memory_space<Property>::value &&
+                   !ViewCtorProp<P...>::has_memory_space) ||
+                  (is_view_label<Property>::value &&
+                   !ViewCtorProp<P...>::has_label) ||
+                  (std::is_same_v<Property, WithoutInitializing_t> &&
+                   ViewCtorProp<P...>::initialize)) {
+      using NewViewCtorProp = ViewCtorProp<P..., Property>;
+      NewViewCtorProp new_view_ctor_prop(view_ctor_prop);
+      static_cast<ViewCtorProp<void, Property> &>(new_view_ctor_prop).value =
+          prop;
+      return WithPropertiesIfUnset<NewViewCtorProp, Properties...>::apply_prop(
+          new_view_ctor_prop, properties...);
+    } else
+      return WithPropertiesIfUnset<ViewCtorProp<P...>,
+                                   Properties...>::apply_prop(view_ctor_prop,
+                                                              properties...);
+  }
+};
+
+template <typename... P, class... Properties>
+auto with_properties_if_unset(const ViewCtorProp<P...> &view_ctor_prop,
+                              const Properties &... properties) {
+  return WithPropertiesIfUnset<ViewCtorProp<P...>, Properties...>::apply_prop(
+      view_ctor_prop, properties...);
+}
+
+#endif
 
 struct ExecutionSpaceTag {};
 struct MemorySpaceTag {};
diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
index 3ab8237cd1501d66c37a8fb8093fb53e676ead91..01d0dc4f68112bea94818ac75c5d0ca61716a16f 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
@@ -32,6 +32,7 @@
 #include <impl/Kokkos_Atomic_View.hpp>
 #include <impl/Kokkos_Tools.hpp>
 #include <impl/Kokkos_StringManipulation.hpp>
+#include <impl/Kokkos_ZeroMemset_fwd.hpp>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -159,8 +160,8 @@ struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION ViewDimension
   using D6::N6;
   using D7::N7;
 
-  enum : unsigned { rank = sizeof...(Vals) };
-  enum : unsigned { rank_dynamic = Impl::rank_dynamic<Vals...>::value };
+  static constexpr unsigned rank         = sizeof...(Vals);
+  static constexpr unsigned rank_dynamic = Impl::rank_dynamic<Vals...>::value;
 
   ViewDimension()                     = default;
   ViewDimension(const ViewDimension&) = default;
@@ -286,7 +287,6 @@ struct ViewDimensionAssignable<ViewDimension<DstArgs...>,
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
-namespace Impl {
 
 struct ALL_t {
   KOKKOS_INLINE_FUNCTION
@@ -296,7 +296,15 @@ struct ALL_t {
   constexpr bool operator==(const ALL_t&) const { return true; }
 };
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
+namespace Impl {
+// TODO This alias declaration forces us to fully qualify ALL_t inside the
+// Kokkos::Impl namespace to avoid deprecation warnings. Replace the
+// fully-qualified name when we remove Kokkos::Impl::ALL_t.
+using ALL_t KOKKOS_DEPRECATED_WITH_COMMENT("Use Kokkos::ALL_t instead!") =
+    Kokkos::ALL_t;
 }  // namespace Impl
+#endif
 }  // namespace Kokkos
 
 namespace Kokkos {
@@ -304,7 +312,7 @@ namespace Impl {
 
 template <class T>
 struct is_integral_extent_type {
-  enum : bool { value = std::is_same<T, Kokkos::Impl::ALL_t>::value ? 1 : 0 };
+  enum : bool { value = std::is_same<T, Kokkos::ALL_t>::value ? 1 : 0 };
 };
 
 template <class iType>
@@ -354,7 +362,7 @@ struct SubviewLegalArgsCompileTime<Kokkos::LayoutLeft, Kokkos::LayoutLeft,
               (Kokkos::Impl::is_integral_extent_type<Arg>::value)) ||
              ((CurrentArg >= RankDest) && (std::is_integral<Arg>::value)) ||
              ((CurrentArg < RankDest) &&
-              (std::is_same<Arg, Kokkos::Impl::ALL_t>::value)) ||
+              (std::is_same<Arg, Kokkos::ALL_t>::value)) ||
              ((CurrentArg == 0) &&
               (Kokkos::Impl::is_integral_extent_type<Arg>::value))) &&
             (SubviewLegalArgsCompileTime<Kokkos::LayoutLeft, Kokkos::LayoutLeft,
@@ -385,7 +393,7 @@ struct SubviewLegalArgsCompileTime<Kokkos::LayoutRight, Kokkos::LayoutRight,
              ((CurrentArg < RankSrc - RankDest) &&
               (std::is_integral<Arg>::value)) ||
              ((CurrentArg >= RankSrc - RankDest) &&
-              (std::is_same<Arg, Kokkos::Impl::ALL_t>::value))) &&
+              (std::is_same<Arg, Kokkos::ALL_t>::value))) &&
             (SubviewLegalArgsCompileTime<Kokkos::LayoutRight,
                                          Kokkos::LayoutRight, RankDest, RankSrc,
                                          CurrentArg + 1, SubViewArgs...>::value)
@@ -397,7 +405,7 @@ struct SubviewLegalArgsCompileTime<Kokkos::LayoutRight, Kokkos::LayoutRight,
                                    RankDest, RankSrc, CurrentArg, Arg> {
   enum {
     value = ((CurrentArg == RankSrc - 1) &&
-             (std::is_same<Arg, Kokkos::Impl::ALL_t>::value))
+             (std::is_same<Arg, Kokkos::ALL_t>::value))
   };
 };
 
@@ -463,8 +471,7 @@ struct SubviewExtents {
   KOKKOS_FORCEINLINE_FUNCTION bool set(unsigned domain_rank,
                                        unsigned range_rank,
                                        const ViewDimension<DimArgs...>& dim,
-                                       const Kokkos::Impl::ALL_t,
-                                       Args... args) {
+                                       Kokkos::ALL_t, Args... args) {
     m_begin[domain_rank] = 0;
     m_length[range_rank] = dim.extent(domain_rank);
     m_index[range_rank]  = domain_rank;
@@ -559,7 +566,7 @@ struct SubviewExtents {
   // std::pair range
   template <size_t... DimArgs, class... Args>
   void error(char* buf, int buf_len, unsigned domain_rank, unsigned range_rank,
-             const ViewDimension<DimArgs...>& dim, const Kokkos::Impl::ALL_t,
+             const ViewDimension<DimArgs...>& dim, Kokkos::ALL_t,
              Args... args) const {
     const int n = std::min(buf_len, snprintf(buf, buf_len, " Kokkos::ALL %c",
                                              int(sizeof...(Args) ? ',' : ')')));
@@ -2183,7 +2190,8 @@ struct ViewStride;
 
 template <>
 struct ViewStride<0> {
-  enum { S0 = 0, S1 = 0, S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0 };
+  static constexpr size_t S0 = 0, S1 = 0, S2 = 0, S3 = 0, S4 = 0, S5 = 0,
+                          S6 = 0, S7 = 0;
 
   ViewStride()                  = default;
   ViewStride(const ViewStride&) = default;
@@ -2197,7 +2205,8 @@ struct ViewStride<0> {
 template <>
 struct ViewStride<1> {
   size_t S0;
-  enum { S1 = 0, S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0 };
+  static constexpr size_t S1 = 0, S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0,
+                          S7 = 0;
 
   ViewStride()                  = default;
   ViewStride(const ViewStride&) = default;
@@ -2212,7 +2221,7 @@ struct ViewStride<1> {
 template <>
 struct ViewStride<2> {
   size_t S0, S1;
-  enum { S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0 };
+  static constexpr size_t S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0;
 
   ViewStride()                  = default;
   ViewStride(const ViewStride&) = default;
@@ -2227,7 +2236,7 @@ struct ViewStride<2> {
 template <>
 struct ViewStride<3> {
   size_t S0, S1, S2;
-  enum { S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0 };
+  static constexpr size_t S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0;
 
   ViewStride()                  = default;
   ViewStride(const ViewStride&) = default;
@@ -2242,7 +2251,7 @@ struct ViewStride<3> {
 template <>
 struct ViewStride<4> {
   size_t S0, S1, S2, S3;
-  enum { S4 = 0, S5 = 0, S6 = 0, S7 = 0 };
+  static constexpr size_t S4 = 0, S5 = 0, S6 = 0, S7 = 0;
 
   ViewStride()                  = default;
   ViewStride(const ViewStride&) = default;
@@ -2257,7 +2266,7 @@ struct ViewStride<4> {
 template <>
 struct ViewStride<5> {
   size_t S0, S1, S2, S3, S4;
-  enum { S5 = 0, S6 = 0, S7 = 0 };
+  static constexpr size_t S5 = 0, S6 = 0, S7 = 0;
 
   ViewStride()                  = default;
   ViewStride(const ViewStride&) = default;
@@ -2272,7 +2281,7 @@ struct ViewStride<5> {
 template <>
 struct ViewStride<6> {
   size_t S0, S1, S2, S3, S4, S5;
-  enum { S6 = 0, S7 = 0 };
+  static constexpr size_t S6 = 0, S7 = 0;
 
   ViewStride()                  = default;
   ViewStride(const ViewStride&) = default;
@@ -2287,7 +2296,7 @@ struct ViewStride<6> {
 template <>
 struct ViewStride<7> {
   size_t S0, S1, S2, S3, S4, S5, S6;
-  enum { S7 = 0 };
+  static constexpr size_t S7 = 0;
 
   ViewStride()                  = default;
   ViewStride(const ViewStride&) = default;
@@ -2707,14 +2716,8 @@ struct ViewDataHandle<
     Traits,
     std::enable_if_t<(std::is_void<typename Traits::specialize>::value &&
                       (!Traits::memory_traits::is_aligned) &&
-                      Traits::memory_traits::is_restrict
-#ifdef KOKKOS_ENABLE_CUDA
-                      && (!(std::is_same<typename Traits::memory_space,
-                                         Kokkos::CudaSpace>::value ||
-                            std::is_same<typename Traits::memory_space,
-                                         Kokkos::CudaUVMSpace>::value))
-#endif
-                      && (!Traits::memory_traits::is_atomic))>> {
+                      Traits::memory_traits::is_restrict &&
+                      (!Traits::memory_traits::is_atomic))>> {
   using value_type  = typename Traits::value_type;
   using handle_type = typename Traits::value_type* KOKKOS_RESTRICT;
   using return_type = typename Traits::value_type& KOKKOS_RESTRICT;
@@ -2737,14 +2740,8 @@ struct ViewDataHandle<
     Traits,
     std::enable_if_t<(std::is_void<typename Traits::specialize>::value &&
                       Traits::memory_traits::is_aligned &&
-                      (!Traits::memory_traits::is_restrict)
-#ifdef KOKKOS_ENABLE_CUDA
-                      && (!(std::is_same<typename Traits::memory_space,
-                                         Kokkos::CudaSpace>::value ||
-                            std::is_same<typename Traits::memory_space,
-                                         Kokkos::CudaUVMSpace>::value))
-#endif
-                      && (!Traits::memory_traits::is_atomic))>> {
+                      (!Traits::memory_traits::is_restrict) &&
+                      (!Traits::memory_traits::is_atomic))>> {
   using value_type = typename Traits::value_type;
   // typedef work-around for intel compilers error #3186: expected typedef
   // declaration
@@ -2782,14 +2779,8 @@ struct ViewDataHandle<
     Traits,
     std::enable_if_t<(std::is_void<typename Traits::specialize>::value &&
                       Traits::memory_traits::is_aligned &&
-                      Traits::memory_traits::is_restrict
-#ifdef KOKKOS_ENABLE_CUDA
-                      && (!(std::is_same<typename Traits::memory_space,
-                                         Kokkos::CudaSpace>::value ||
-                            std::is_same<typename Traits::memory_space,
-                                         Kokkos::CudaUVMSpace>::value))
-#endif
-                      && (!Traits::memory_traits::is_atomic))>> {
+                      Traits::memory_traits::is_restrict &&
+                      (!Traits::memory_traits::is_atomic))>> {
   using value_type = typename Traits::value_type;
   // typedef work-around for intel compilers error #3186: expected typedef
   // declaration
@@ -2928,8 +2919,9 @@ struct ViewValueFunctor<DeviceType, ValueType, false /* is_scalar */> {
             "Kokkos::View::initialization [" + name + "] via memset",
             Kokkos::Profiling::Experimental::device_id(space), &kpID);
       }
-      (void)ZeroMemset<ExecSpace, ValueType*, typename DeviceType::memory_space,
-                       Kokkos::MemoryTraits<Kokkos::Unmanaged>>(
+      (void)ZeroMemset<
+          ExecSpace, Kokkos::View<ValueType*, typename DeviceType::memory_space,
+                                  Kokkos::MemoryTraits<Kokkos::Unmanaged>>>(
           space,
           Kokkos::View<ValueType*, typename DeviceType::memory_space,
                        Kokkos::MemoryTraits<Kokkos::Unmanaged>>(ptr, n),
@@ -3065,8 +3057,9 @@ struct ViewValueFunctor<DeviceType, ValueType, true /* is_scalar */> {
             Kokkos::Profiling::Experimental::device_id(space), &kpID);
       }
 
-      (void)ZeroMemset<ExecSpace, ValueType*, typename DeviceType::memory_space,
-                       Kokkos::MemoryTraits<Kokkos::Unmanaged>>(
+      (void)ZeroMemset<
+          ExecSpace, Kokkos::View<ValueType*, typename DeviceType::memory_space,
+                                  Kokkos::MemoryTraits<Kokkos::Unmanaged>>>(
           space,
           Kokkos::View<ValueType*, typename DeviceType::memory_space,
                        Kokkos::MemoryTraits<Kokkos::Unmanaged>>(ptr, n),
@@ -3158,7 +3151,7 @@ class ViewMapping<
   //----------------------------------------
   // Domain dimensions
 
-  enum { Rank = Traits::dimension::rank };
+  static constexpr unsigned Rank = Traits::dimension::rank;
 
   template <typename iType>
   KOKKOS_INLINE_FUNCTION constexpr size_t extent(const iType& r) const {
@@ -3674,7 +3667,7 @@ class ViewMapping<
     size_t exp_stride = 1;
     if (std::is_same<typename DstTraits::array_layout,
                      Kokkos::LayoutLeft>::value) {
-      for (int i = 0; i < src.Rank; i++) {
+      for (int i = 0; i < (int)src.Rank; i++) {
         if (i > 0) exp_stride *= src.extent(i - 1);
         if (strides[i] != exp_stride) {
           assignable = false;
@@ -3683,9 +3676,9 @@ class ViewMapping<
       }
     } else if (std::is_same<typename DstTraits::array_layout,
                             Kokkos::LayoutRight>::value) {
-      for (int i = src.Rank - 1; i >= 0; i--) {
-        if (i < src.Rank - 1) exp_stride *= src.extent(i + 1);
-        if (strides[i] != exp_stride) {
+      for (int i = 0; i < (int)src.Rank; i++) {
+        if (i > 0) exp_stride *= src.extent(src.Rank - i);
+        if (strides[src.Rank - 1 - i] != exp_stride) {
           assignable = false;
           break;
         }
@@ -3786,8 +3779,8 @@ struct SubViewDataTypeImpl<
 /* for ALL slice, subview has the same dimension */
 template <class ValueType, ptrdiff_t Ext, ptrdiff_t... Exts, class... Args>
 struct SubViewDataTypeImpl<void, ValueType,
-                           Kokkos::Experimental::Extents<Ext, Exts...>, ALL_t,
-                           Args...>
+                           Kokkos::Experimental::Extents<Ext, Exts...>,
+                           Kokkos::ALL_t, Args...>
     : SubViewDataTypeImpl<void, typename ApplyExtent<ValueType, Ext>::type,
                           Kokkos::Experimental::Extents<Exts...>, Args...> {};
 
diff --git a/packages/kokkos/algorithms/unit_tests/TestSortCommon.hpp b/packages/kokkos/core/src/impl/Kokkos_ZeroMemset_fwd.hpp
similarity index 67%
rename from packages/kokkos/algorithms/unit_tests/TestSortCommon.hpp
rename to packages/kokkos/core/src/impl/Kokkos_ZeroMemset_fwd.hpp
index b8e2e17e4f353994a013d3cd9b8305135d2b84ca..f36e72e91451888139d48014d833b66134a0eeb6 100644
--- a/packages/kokkos/algorithms/unit_tests/TestSortCommon.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ZeroMemset_fwd.hpp
@@ -14,14 +14,16 @@
 //
 //@HEADER
 
-#ifndef KOKKOS_ALGORITHMS_UNITTESTS_TESTSORT_COMMON_HPP
-#define KOKKOS_ALGORITHMS_UNITTESTS_TESTSORT_COMMON_HPP
+#ifndef KOKKOS_ZEROMEMSET_FWD_HPP
+#define KOKKOS_ZEROMEMSET_FWD_HPP
 
-#include <TestSort.hpp>
+namespace Kokkos {
+namespace Impl {
 
-namespace Test {
-TEST(TEST_CATEGORY, SortUnsigned) {
-  Impl::test_sort<TEST_EXECSPACE, unsigned>(171);
-}
-}  // namespace Test
-#endif
+template <typename ExecutionSpace, class ViewType>
+struct ZeroMemset;
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif  // #ifndef KOKKOS_ZEROMEMSET_FWD_HPP
diff --git a/packages/kokkos/core/src/setup/Kokkos_Setup_Cuda.hpp b/packages/kokkos/core/src/setup/Kokkos_Setup_Cuda.hpp
index d774914d9f4fa908d67d343ee2d539a871269f7e..1130485e841d9ba37e28044ac37df35d9d108afc 100644
--- a/packages/kokkos/core/src/setup/Kokkos_Setup_Cuda.hpp
+++ b/packages/kokkos/core/src/setup/Kokkos_Setup_Cuda.hpp
@@ -53,28 +53,9 @@
 #error "Cuda device capability >= 3.0 is required."
 #endif
 
-#ifdef KOKKOS_ENABLE_CUDA_LAMBDA
 #define KOKKOS_LAMBDA [=] __host__ __device__
-
 #define KOKKOS_CLASS_LAMBDA [ =, *this ] __host__ __device__
 
-#else  // !defined(KOKKOS_ENABLE_CUDA_LAMBDA)
-#undef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
-#endif  // !defined(KOKKOS_ENABLE_CUDA_LAMBDA)
-
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700)
-// PTX atomics with memory order semantics are only available on volta and later
-#if !defined(KOKKOS_DISABLE_CUDA_ASM)
-#if !defined(KOKKOS_ENABLE_CUDA_ASM)
-#define KOKKOS_ENABLE_CUDA_ASM
-#if !defined(KOKKOS_DISABLE_CUDA_ASM_ATOMICS) && \
-    defined(KOKKOS_ENABLE_GNU_ATOMICS)
-#define KOKKOS_ENABLE_CUDA_ASM_ATOMICS
-#endif
-#endif
-#endif
-#endif
-
 #define KOKKOS_IMPL_FORCEINLINE_FUNCTION __device__ __host__ __forceinline__
 #define KOKKOS_IMPL_FORCEINLINE __forceinline__
 #define KOKKOS_IMPL_INLINE_FUNCTION __device__ __host__ inline
diff --git a/packages/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp b/packages/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp
index 417f80f7f4df258679e03ed354ef1d51df4446f5..7f7957bc61f21a337179273ed9f882f93a969d63 100644
--- a/packages/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp
+++ b/packages/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp
@@ -17,6 +17,11 @@
 #ifndef KOKKOS_SETUP_SYCL_HPP_
 #define KOKKOS_SETUP_SYCL_HPP_
 
+// FIXME_SYCL Using in-order queues currently gives better performance on Intel
+// GPUs and we run into correctness issues with out-of-order queues on NVIDIA
+// GPUs.
+#define KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
+
 // FIXME_SYCL the fallback assert is temporarily disabled by default in the
 // compiler so we need to force it
 #ifndef SYCL_ENABLE_FALLBACK_ASSERT
diff --git a/packages/kokkos/core/unit_test/CMakeLists.txt b/packages/kokkos/core/unit_test/CMakeLists.txt
index 8019e5f3bbfb38740cf8213bc8a19f7e09f941cf..b71c72c3c9f7f96545ff273f53d4066dfe507167 100644
--- a/packages/kokkos/core/unit_test/CMakeLists.txt
+++ b/packages/kokkos/core/unit_test/CMakeLists.txt
@@ -3,7 +3,7 @@
 #
 
 IF(NOT GTest_FOUND)  # fallback to internal gtest
-  SET(GTEST_SOURCE_DIR ${${PARENT_PACKAGE_NAME}_SOURCE_DIR}/tpls/gtest)
+  SET(GTEST_SOURCE_DIR ${Kokkos_SOURCE_DIR}/tpls/gtest)
 
   #need here for tribits
   KOKKOS_INCLUDE_DIRECTORIES(${GTEST_SOURCE_DIR})
@@ -45,8 +45,11 @@ SET(KOKKOS_OPENMP_FEATURE_LEVEL 999)
 SET(KOKKOS_OPENMP_NAME OpenMP)
 
 # FIXME_OPENMPTARGET - The NVIDIA HPC compiler nvc++ only compiles the first 8 incremental tests for the OpenMPTarget backend.
+# FIXME_OPENMPTARGET - Clang version 17 fails to compile incremental tests past 12 with verion 17. There is PR for this in upstream already. So it should be fixed by version 18.
 IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
   SET(KOKKOS_OPENMPTARGET_FEATURE_LEVEL 10)
+ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 17.0.0)
+  SET(KOKKOS_OPENMPTARGET_FEATURE_LEVEL 12)
 ELSE()
   SET(KOKKOS_OPENMPTARGET_FEATURE_LEVEL 14)
 ENDIF()
@@ -58,7 +61,13 @@ SET(KOKKOS_SYCL_FEATURE_LEVEL 999)
 SET(KOKKOS_SYCL_NAME Experimental::SYCL)
 SET(KOKKOS_THREADS_FEATURE_LEVEL 999)
 SET(KOKKOS_THREADS_NAME Threads)
-SET(KOKKOS_OPENACC_FEATURE_LEVEL 11)
+# FIXME_OPENACC - The Clang compiler only compiles the first 9 incremental tests for the OpenACC backend.
+IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
+  SET(KOKKOS_OPENACC_FEATURE_LEVEL 9)
+ELSE()
+  SET(KOKKOS_OPENACC_FEATURE_LEVEL 16)
+ENDIF()
+
 SET(KOKKOS_OPENACC_NAME Experimental::OpenACC)
 
 
@@ -75,8 +84,11 @@ SET(COMPILE_ONLY_SOURCES
   TestArray.cpp
   TestCreateMirror.cpp
   TestDetectionIdiom.cpp
+  TestBitManipulation.cpp
   TestInterOp.cpp
   TestStringManipulation.cpp
+  TestVersionMacros.cpp
+  TestViewRank.cpp
   TestViewTypeTraits.cpp
   TestTypeList.cpp
   view/TestExtentsDatatypeConversion.cpp
@@ -92,7 +104,7 @@ IF(KOKKOS_HAS_TRILINOS)
   LIST(REMOVE_ITEM COMPILE_ONLY_SOURCES TestInterOp.cpp)
 ENDIF()
 KOKKOS_ADD_EXECUTABLE(
-  TestCompileOnly
+  CoreTestCompileOnly
   SOURCES
   TestCompileMain.cpp
   ${COMPILE_ONLY_SOURCES}
@@ -110,39 +122,41 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL)
     # file. That then exceeded the shell command line max length.
     set(${Tag}_SOURCES1A)
     foreach(Name
+        Abort
+        ArrayOps
+        AtomicOperations_complexdouble
+        AtomicOperations_complexfloat
+        AtomicOperations_double
+        AtomicOperations_float
         AtomicOperations_int
-        AtomicOperations_unsignedint
         AtomicOperations_longint
-        AtomicOperations_unsignedlongint
         AtomicOperations_longlongint
-        AtomicOperations_double
-        AtomicOperations_float
-        AtomicOperations_complexdouble
-        AtomicOperations_complexfloat
         AtomicOperations_shared
-        AtomicViews
+        AtomicOperations_unsignedint
+        AtomicOperations_unsignedlongint
         Atomics
+        AtomicViews
+        BitManipulationBuiltins
         BlockSizeDeduction
-        Concepts
+        CheckedIntegerOps
+        CommonPolicyConstructors
+        CommonPolicyInterface
         Complex
+        Concepts
         Crs
         DeepCopyAlignment
+        ExecSpacePartitioning
         ExecutionSpace
         FunctorAnalysis
+        HostSharedPtr
+        HostSharedPtrAccessOnDevice
         Init
         JoinBackwardCompatibility
         LocalDeepCopy
-        MinMaxClamp
         MathematicalConstants
         MathematicalFunctions1
         MathematicalFunctions2
-        MDRange_a
-        MDRange_b
-        MDRange_c
-        HostSharedPtr
-        HostSharedPtrAccessOnDevice
-        QuadPrecisionMath
-        ExecSpacePartitioning
+        MathematicalFunctions3
         MathematicalSpecialFunctions
         )
       set(file ${dir}/Test${Tag}_${Name}.cpp)
@@ -158,31 +172,34 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL)
 
     set(${Tag}_SOURCES1B)
     foreach(Name
+        MDRange_a
+        MDRange_b
+        MDRange_c
         MDRange_d
         MDRange_e
         MDRange_f
         MDRange_g
+        MDRangePolicyConstructors
+        MDRangeReduce
+        MDSpan
+        MinMaxClamp
         NumericTraits
         Other
+        ParallelScanRangePolicy
+        Printf
+        QuadPrecisionMath
         RangePolicy
-        RangePolicyRequire
-        CommonPolicyConstructors
         RangePolicyConstructors
-        TeamPolicyConstructors
-        MDRangePolicyConstructors
-        CommonPolicyInterface
-        Reductions
+        RangePolicyRequire
+        ReducerCTADs
         Reducers_a
         Reducers_b
         Reducers_c
         Reducers_d
         Reducers_e
-        ReducerCTADs
+        Reductions
         Reductions_DeviceView
-        Scan
         SharedAlloc
-        TeamMDRange
-        ViewMapping_a
         )
       set(file ${dir}/Test${Tag}_${Name}.cpp)
       # Write to a temporary intermediate file and call configure_file to avoid
@@ -197,15 +214,17 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL)
 
     SET(${Tag}_SOURCES2A)
     foreach(Name
-      Abort
-      MDSpan
       TeamBasic
+      TeamCombinedReducers
+      TeamMDRange
+      TeamPolicyConstructors
       TeamReductionScan
       TeamScan
       TeamScratch
       TeamTeamSize
       TeamVectorRange
       UniqueToken
+      View_64bit
       ViewAPI_a
       ViewAPI_b
       ViewAPI_c
@@ -216,12 +235,12 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL)
       ViewCtorDimMatch
       ViewHooks
       ViewLayoutStrideAssignment
+      ViewMapping_a
       ViewMapping_b
       ViewMapping_subview
       ViewMemoryAccessViolation
       ViewOfClass
       ViewResize
-      View_64bit
       WorkGraph
       WithoutInitializing
       )
@@ -346,43 +365,75 @@ foreach(PairDeviceSpace HIP-HostPinned;HIP-Managed;Cuda-HostPinned;Cuda-UVM;SYCL
   endif()
 endforeach()
 
-
+# Disable non-compiling tests based on clang version.
 if(Kokkos_ENABLE_OPENMPTARGET)
   list(REMOVE_ITEM OpenMPTarget_SOURCES
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Other.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamCombinedReducers.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamReductionScan.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_WorkGraph.cpp
+    IF (KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0)
+        ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c01.cpp
+        ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c02.cpp
+        ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c03.cpp
+        ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_d.cpp
+    endif()
+    IF (KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0)
+        ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_shared.cpp
+        ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MinMaxClamp.cpp
+        ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamVectorRange.cpp
+        ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_LocalDeepCopy.cpp
+        ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamScan.cpp
+        ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamBasic.cpp
+        ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_e.cpp
+    endif()
+    # FIXME_OPENMPTARGET_CRAY: The following tests fail at compile time when the OpenMPTarget backend is enabled with the Cray compiler.
+    # Atomic compare/exchange is used in these tests which can be one of the reasons for the compilation failures.
+    IF(KOKKOS_CXX_COMPILER_ID STREQUAL Cray)
+        ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_complexdouble.cpp
+        ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_UniqueToken.cpp
+        ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SharedAlloc.cpp
+    ENDIF()
     )
 endif()
 
+# FIXME_OPENMPTARGET - MinMaxClamp fails even with the host backend when OpenMPTarget backend is enabled.
+# FIXME_OPENMPTARGET - Unsure of the reason as of now.
+IF (KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0)
+    IF(Kokkos_ENABLE_OPENMPTARGET AND Kokkos_ENABLE_OPENMP)
+      list(REMOVE_ITEM OpenMP_SOURCES
+          ${CMAKE_CURRENT_BINARY_DIR}/openmp/TestOpenMP_MinMaxClamp.cpp
+            )
+    ENDIF()
+    IF(Kokkos_ENABLE_OPENMPTARGET AND Kokkos_ENABLE_SERIAL)
+        list(REMOVE_ITEM Serial_SOURCES1
+            ${CMAKE_CURRENT_BINARY_DIR}/serial/TestSerial_MinMaxClamp.cpp
+            )
+    ENDIF()
+ENDIF()
+
 if(Kokkos_ENABLE_OPENACC)
   list(REMOVE_ITEM OpenACC_SOURCES
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_complexfloat.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_complexdouble.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_complexfloat.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Crs.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_JoinBackwardCompatibility.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_LocalDeepCopy.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Other.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamCombinedReducers.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamMDRange.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamReductionScan.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamScan.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamVectorRange.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_e.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewCopy_a.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewCopy_b.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewMapping_subview.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewOfClass.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_WorkGraph.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamVectorRange.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_JoinBackwardCompatibility.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_CommonPolicyConstructors.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicyConstructors.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamPolicyConstructors.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRangePolicyConstructors.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_CommonPolicyInterface.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamMDRange.cpp
   )
 endif()
 
 # FIXME_OPENMPTARGET - Comment non-passing tests with amdclang++
-# FIXME_OPENMPTARGET - Need to check on NAVI architecture
+# FIXME_OPENMPTARGET - Need to check on GFX1030 and GFX1100 architectures
 IF(KOKKOS_ARCH_VEGA)
   SET(KOKKOS_AMDGPU_ARCH TRUE)
 ENDIF()
@@ -402,43 +453,40 @@ ENDIF()
 # FIXME_OPENMPTARGET - Comment non-passing tests with the NVIDIA HPC compiler nvc++
 IF(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
   list(REMOVE_ITEM OpenMPTarget_SOURCES
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_int64_t_reduce.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_int64_t_reduce_dynamic.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_int64_t_reduce_dynamic_view.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_double_reduce.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_double_reduce_dynamic.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamTeamSize.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reductions_DeviceView.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamVectorRange.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_UniqueToken.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_HostSharedPtr.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_HostSharedPtrAccessOnDevice.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamScratch.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TestScan.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TestTeamScan.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TestTeamReductionScan.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Atomics.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_a1.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_b1.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_double.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_float.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_int.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_longint.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_longlongint.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_double.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_unsignedint.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_unsignedlongint.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Atomics.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicViews.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_BlockSizeDeduction.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_CommonPolicyConstructors.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_CommonPolicyInterface.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_DeepCopyAlignment.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_HostSharedPtr.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_HostSharedPtrAccessOnDevice.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MathematicalFunctions.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRange_a.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRange_b.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRange_c.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRange_d.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRangePolicyConstructors.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_NumericTraits.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_RangePolicy.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_RangePolicyConstructors.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_RangePolicyRequire.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_a.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_b.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_c.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_d.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_e.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewMapping_b.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamBasic.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Scan.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_NumericTraits.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_DeepCopyAlignment.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MathematicalFunctions.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reductions_DeviceView.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_b.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c01.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c02.cpp
@@ -453,78 +501,131 @@ IF(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c11.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c12.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c13.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRange_a.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRange_b.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRange_c.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRange_d.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamBasic.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamPolicyConstructors.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamScratch.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamTeamSize.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamVectorRange.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_UniqueToken.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_a.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_b.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_c.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_d.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_f.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewMapping_b.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewResize.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_RangePolicyRequire.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_RangePolicy.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_CommonPolicyConstructors.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_RangePolicyConstructors.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamPolicyConstructors.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRangePolicyConstructors.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_CommonPolicyInterface.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_a1.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_b1.cpp
     )
 endif()
 
 # FIXME_OPENACC - Comment non-passing tests with the NVIDIA HPC compiler nvc++
 IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
   list(REMOVE_ITEM OpenACC_SOURCES
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_int64_t_reduce.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_int64_t_reduce_dynamic.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_int64_t_reduce_dynamic_view.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_double_reduce.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_double_reduce_dynamic.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamTeamSize.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions_DeviceView.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamVectorRange.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_UniqueToken.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_HostSharedPtr.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_HostSharedPtrAccessOnDevice.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamScratch.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TestScan.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TestTeamScan.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TestTeamReductionScan.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Atomics.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_a1.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_b1.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_double.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_float.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_int.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_longint.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_longlongint.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_double.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_shared.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_unsignedint.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_unsignedlongint.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_shared.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Atomics.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicViews.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_BlockSizeDeduction.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reduce.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_a.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_b.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_c.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_d.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_e.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewMapping_b.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamBasic.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Scan.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_NumericTraits.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_DeepCopyAlignment.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_HostSharedPtr.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_HostSharedPtrAccessOnDevice.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions1.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions2.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_a.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions3.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_c.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_f.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_NumericTraits.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicy.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicyRequire.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_a.cpp #fails if NVHPC V22.5 or lower.
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_d.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions_DeviceView.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_b.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c01.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c02.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c03.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c05.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c08.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c11.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamBasic.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamScratch.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamTeamSize.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_UniqueToken.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewMapping_b.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewResize.cpp
+    )
+endif()
+
+# FIXME_OPENACC - Comment non-passing tests with the Clang compiler
+IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
+  list(REMOVE_ITEM OpenACC_SOURCES
+    ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_a1.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_b1.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_double.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_float.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_int.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_longint.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_longlongint.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_shared.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_unsignedint.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_unsignedlongint.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Atomics.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicViews.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_BlockSizeDeduction.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_DeepCopyAlignment.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_HostSharedPtrAccessOnDevice.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions1.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions2.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_c.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_f.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_NumericTraits.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicy.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicyRequire.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_a.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_d.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions_DeviceView.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamBasic.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamScratch.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamTeamSize.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_UniqueToken.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewMapping_b.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewResize.cpp
+    # This test is not removed above for OpenACC+NVHPC but all its TEST
+    # functions are not compiled for the case of KOKKOS_COMPILER_NVHPC.
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewCtorDimMatch.cpp
+    # These tests are not removed above for OpenACC+NVHPC.
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Abort.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Complex.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ExecutionSpace.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ExecSpacePartitioning.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Init.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalConstants.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalSpecialFunctions.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MinMaxClamp.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewLayoutStrideAssignment.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewMapping_a.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewMemoryAccessViolation.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_WithoutInitializing.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_d.cpp
+    )
+  # When tested on a systme with AMD MI60 GPU and ROCm V5.4.0, these cause
+  # clang-linker-wrapper to hang for a long time while building the unit tests.
+  # In some cases, including them caused the build not to complete after an hour,
+  # but excluding them permitted the build to finish in 1.5 mins or less.
+  IF(KOKKOS_AMDGPU_ARCH)
+    list(REMOVE_ITEM OpenACC_SOURCES
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_BitManipulationBuiltins.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions3.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ParallelScanRangePolicy.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c04.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c05.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c06.cpp
@@ -534,48 +635,31 @@ IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c10.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c11.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c12.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c13.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_a.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_b.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_c.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_d.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_e.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_f.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_g.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_a.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_b.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_c.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_d.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_f.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewResize.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicyRequire.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicy.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_CommonPolicyConstructors.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicyConstructors.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamPolicyConstructors.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRangePolicyConstructors.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_CommonPolicyInterface.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_a1.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_b1.cpp
     )
+  endif()
+  # Fails serial.atomics_tpetra_max_abs when we test with Clacc.
+  list(REMOVE_ITEM Serial_SOURCES1
+    ${CMAKE_CURRENT_BINARY_DIR}/serial/TestSerial_Atomics.cpp)
 endif()
 
 if(Kokkos_ENABLE_SERIAL)
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_Serial1
+    CoreUnitTest_Serial1
     SOURCES
     UnitTestMainInit.cpp
     ${Serial_SOURCES1}
     serial/TestSerial_Task.cpp
   )
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_Serial2
+    CoreUnitTest_Serial2
     SOURCES
     UnitTestMainInit.cpp
     ${Serial_SOURCES2}
   )
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_SerialGraph
+    CoreUnitTest_SerialGraph
     SOURCES
     UnitTestMainInit.cpp
     serial/TestSerial_Graph.cpp
@@ -584,7 +668,7 @@ endif()
 
 if(Kokkos_ENABLE_THREADS)
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_Threads
+    CoreUnitTest_Threads
     SOURCES ${Threads_SOURCES}
     UnitTestMainInit.cpp
   )
@@ -596,20 +680,20 @@ if (Kokkos_ENABLE_OPENMP)
     openmp/TestOpenMP_PartitionMaster.cpp
   )
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_OpenMP
+    CoreUnitTest_OpenMP
     SOURCES
     UnitTestMainInit.cpp
     ${OpenMP_SOURCES}
     ${OpenMP_EXTRA_SOURCES}
   )
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_OpenMPInterOp
+    CoreUnitTest_OpenMPInterOp
     SOURCES
       UnitTestMain.cpp
       openmp/TestOpenMP_InterOp.cpp
   )
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_OpenMPGraph
+    CoreUnitTest_OpenMPGraph
     SOURCES
       UnitTestMainInit.cpp
       openmp/TestOpenMP_Graph.cpp
@@ -618,32 +702,39 @@ endif()
 
 if(Kokkos_ENABLE_HPX)
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_HPX
+    CoreUnitTest_HPX
     SOURCES
       UnitTestMainInit.cpp
       ${HPX_SOURCES}
       hpx/TestHPX_Task.cpp
   )
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_HPXInterOp
+    CoreUnitTest_HPXInterOp
     SOURCES
       UnitTestMain.cpp
       hpx/TestHPX_InterOp.cpp
   )
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_HPX_IndependentInstances
+    CoreUnitTest_HPX_IndependentInstances
     SOURCES
       UnitTestMainInit.cpp
       hpx/TestHPX_IndependentInstances.cpp
       hpx/TestHPX_IndependentInstancesDelayedExecution.cpp
       hpx/TestHPX_IndependentInstancesInstanceIds.cpp
       hpx/TestHPX_IndependentInstancesRefCounting.cpp
+      hpx/TestHPX_IndependentInstancesSynchronization.cpp
+  )
+  KOKKOS_ADD_EXECUTABLE_AND_TEST(
+    CoreUnitTest_HPX_InParallel
+    SOURCES
+      UnitTestMainInit.cpp
+      hpx/TestHPX_InParallel.cpp
   )
 endif()
 
 if(Kokkos_ENABLE_OPENMPTARGET)
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_OpenMPTarget
+    CoreUnitTest_OpenMPTarget
     SOURCES
     UnitTestMainInit.cpp
     ${OpenMPTarget_SOURCES}
@@ -652,7 +743,7 @@ endif()
 
 if(Kokkos_ENABLE_OPENACC)
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_OpenACC
+    CoreUnitTest_OpenACC
     SOURCES
     UnitTestMainInit.cpp
     ${OpenACC_SOURCES}
@@ -661,7 +752,7 @@ endif()
 
 if(Kokkos_ENABLE_CUDA)
     KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_Cuda1
+    CoreUnitTest_Cuda1
     SOURCES
       UnitTestMainInit.cpp
       ${Cuda_SOURCES1}
@@ -669,14 +760,14 @@ if(Kokkos_ENABLE_CUDA)
     )
 
     KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_Cuda2
+    CoreUnitTest_Cuda2
     SOURCES
       UnitTestMainInit.cpp
       ${Cuda_SOURCES2}
     )
 
     KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_Cuda3
+    CoreUnitTest_Cuda3
     SOURCES
       UnitTestMainInit.cpp
       cuda/TestCuda_Task.cpp
@@ -687,7 +778,7 @@ if(Kokkos_ENABLE_CUDA)
     )
 
     KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_CudaTimingBased
+    CoreUnitTest_CudaTimingBased
     SOURCES
       UnitTestMainInit.cpp
       cuda/TestCuda_DebugSerialExecution.cpp
@@ -695,19 +786,19 @@ if(Kokkos_ENABLE_CUDA)
   )
 
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_CudaInterOpInit
+    CoreUnitTest_CudaInterOpInit
     SOURCES
       UnitTestMain.cpp
       cuda/TestCuda_InterOp_Init.cpp
   )
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_CudaInterOpStreams
+    CoreUnitTest_CudaInterOpStreams
     SOURCES
       UnitTestMain.cpp
       cuda/TestCuda_InterOp_Streams.cpp
   )
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_CudaGraph
+    CoreUnitTest_CudaGraph
     SOURCES
       UnitTestMainInit.cpp
       cuda/TestCuda_Graph.cpp
@@ -716,7 +807,7 @@ endif()
 
 if(Kokkos_ENABLE_HIP)
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_HIP
+    CoreUnitTest_HIP
     SOURCES
       UnitTestMainInit.cpp
       ${HIP_SOURCES}
@@ -728,73 +819,74 @@ if(Kokkos_ENABLE_HIP)
       hip/TestHIP_BlocksizeDeduction.cpp
   )
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_HIPInterOpInit
+    CoreUnitTest_HIPInterOpInit
     SOURCES
       UnitTestMain.cpp
       hip/TestHIP_InterOp_Init.cpp
   )
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_HIPInterOpStreams
+    CoreUnitTest_HIPInterOpStreams
     SOURCES
       UnitTestMain.cpp
       hip/TestHIP_InterOp_Streams.cpp
   )
+  KOKKOS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_HIPGraph
+    SOURCES
+      UnitTestMainInit.cpp
+      hip/TestHIP_Graph.cpp
+  )
 endif()
 
 if(Kokkos_ENABLE_SYCL)
-  list(REMOVE_ITEM SYCL_SOURCES1A
-       # FIXME_SYCL atomic_fetch_oper for large types to be implemented
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_AtomicOperations_complexdouble.cpp
-  )
-
   list(REMOVE_ITEM SYCL_SOURCES2A
        ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_WorkGraph.cpp
   )
 
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_SYCL1A
+    CoreUnitTest_SYCL1A
     SOURCES
       UnitTestMainInit.cpp
       ${SYCL_SOURCES1A}
   )
 
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_SYCL1B
+    CoreUnitTest_SYCL1B
     SOURCES
       UnitTestMainInit.cpp
       ${SYCL_SOURCES1B}
   )
 
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_SYCL2A
+    CoreUnitTest_SYCL2A
     SOURCES
       UnitTestMainInit.cpp
       ${SYCL_SOURCES2A}
   )
 
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_SYCL2B
+    CoreUnitTest_SYCL2B
     SOURCES
       UnitTestMainInit.cpp
       ${SYCL_SOURCES2B}
   )
 
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_SYCL2C
+    CoreUnitTest_SYCL2C
     SOURCES
       UnitTestMainInit.cpp
       ${SYCL_SOURCES2C}
   )
 
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_SYCL2D
+    CoreUnitTest_SYCL2D
     SOURCES
       UnitTestMainInit.cpp
       ${SYCL_SOURCES2D}
   )
 
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_SYCL3
+    CoreUnitTest_SYCL3
     SOURCES
       UnitTestMainInit.cpp
       # FIXME_SYCL
@@ -805,19 +897,19 @@ if(Kokkos_ENABLE_SYCL)
   )
 
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_SYCLInterOpInit
+    CoreUnitTest_SYCLInterOpInit
     SOURCES
       UnitTestMain.cpp
       sycl/TestSYCL_InterOp_Init.cpp
   )
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_SYCLInterOpInit_Context
+    CoreUnitTest_SYCLInterOpInit_Context
     SOURCES
     UnitTestMainInit.cpp
       sycl/TestSYCL_InterOp_Init_Context.cpp
   )
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_SYCLInterOpStreams
+    CoreUnitTest_SYCLInterOpStreams
     SOURCES
       UnitTestMain.cpp
      sycl/TestSYCL_InterOp_Streams.cpp
@@ -830,6 +922,7 @@ SET(DEFAULT_DEVICE_SOURCES
   TestParseCmdLineArgsAndEnvVars.cpp
   TestSharedSpace.cpp
   TestSharedHostPinnedSpace.cpp
+  TestCompilerMacros.cpp
   default/TestDefaultDeviceType.cpp
   default/TestDefaultDeviceType_a1.cpp
   default/TestDefaultDeviceType_b1.cpp
@@ -856,28 +949,49 @@ endif()
 
 # FIXME_OPENMPTARGET, FIXME_OPENACC - Comment non-passing tests with the NVIDIA HPC compiler nvc++
 if ((KOKKOS_ENABLE_OPENMPTARGET OR KOKKOS_ENABLE_OPENACC) AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
+  LIST(REMOVE_ITEM DEFAULT_DEVICE_SOURCES
+    default/TestDefaultDeviceType_a1.cpp
+    default/TestDefaultDeviceType_b1.cpp
+    default/TestDefaultDeviceType_c1.cpp
+    default/TestDefaultDeviceType_a2.cpp
+    default/TestDefaultDeviceType_b2.cpp
+    default/TestDefaultDeviceType_c2.cpp
+    default/TestDefaultDeviceType_a3.cpp
+    default/TestDefaultDeviceType_b3.cpp
+    default/TestDefaultDeviceType_c3.cpp
+    default/TestDefaultDeviceType_d.cpp
+    default/TestDefaultDeviceTypeResize.cpp
+    default/TestDefaultDeviceTypeViewAPI.cpp
+  )
+endif()
+
+# FIXME_OPENACC - Comment non-passing tests with the Clang compiler
+if (KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
   SET(DEFAULT_DEVICE_SOURCES
+    TestCompilerMacros.cpp
     UnitTestMainInit.cpp
     TestInitializationSettings.cpp
     TestParseCmdLineArgsAndEnvVars.cpp
-    default/TestDefaultDeviceType.cpp
+    default/TestDefaultDeviceType_d.cpp
+    default/TestDefaultDeviceTypeResize.cpp
+    default/TestDefaultDeviceTypeViewAPI.cpp
   )
 endif()
 
 KOKKOS_ADD_EXECUTABLE_AND_TEST(
-  UnitTest_Default
+  CoreUnitTest_Default
   SOURCES ${DEFAULT_DEVICE_SOURCES}
 )
 
 KOKKOS_ADD_EXECUTABLE_AND_TEST(
-  UnitTest_LegionInitialization
+  CoreUnitTest_LegionInitialization
   SOURCES
     UnitTestMain.cpp
     TestLegionInitialization.cpp
 )
 
 KOKKOS_ADD_EXECUTABLE_AND_TEST(
-  UnitTest_PushFinalizeHook
+  CoreUnitTest_PushFinalizeHook
   SOURCES
     UnitTest_PushFinalizeHook.cpp
 )
@@ -885,7 +999,7 @@ KOKKOS_ADD_EXECUTABLE_AND_TEST(
 # This test is intended for development and debugging by putting code
 # into TestDefaultDeviceDevelop.cpp. By default its empty.
 KOKKOS_ADD_EXECUTABLE_AND_TEST(
-  UnitTest_Develop
+  CoreUnitTest_Develop
   SOURCES
     UnitTestMainInit.cpp
     default/TestDefaultDeviceDevelop.cpp
@@ -900,7 +1014,7 @@ KOKKOS_ADD_TEST_EXECUTABLE( push_finalize_hook_terminate
   SOURCES UnitTest_PushFinalizeHook_terminate.cpp
 )
 
-KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate
+KOKKOS_ADD_ADVANCED_TEST( CoreUnitTest_PushFinalizeHook_terminate
   TEST_0
     EXEC push_finalize_hook_terminate
     NUM_MPI_PROCS 1
@@ -910,24 +1024,24 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate
 )
   if(KOKKOS_ENABLE_TUNING)
     KOKKOS_ADD_EXECUTABLE_AND_TEST(
-      UnitTest_TuningBuiltins
+      CoreUnitTest_TuningBuiltins
       SOURCES
       tools/TestBuiltinTuners.cpp
     )
     KOKKOS_ADD_EXECUTABLE_AND_TEST(
-      UnitTest_TuningBasics
+      CoreUnitTest_TuningBasics
       SOURCES
         tools/TestTuning.cpp
     )
     KOKKOS_ADD_EXECUTABLE_AND_TEST(
-      UnitTest_CategoricalTuner
+      CoreUnitTest_CategoricalTuner
       SOURCES
       tools/TestCategoricalTuner.cpp
     )
   endif()
   if((NOT Kokkos_ENABLE_OPENMPTARGET) AND (NOT Kokkos_ENABLE_OPENACC))
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_LogicalSpaces
+    CoreUnitTest_LogicalSpaces
     SOURCES
       tools/TestLogicalSpaces.cpp
   )
@@ -937,6 +1051,7 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate
     tools/TestEventCorrectness.cpp
     tools/TestWithoutInitializing.cpp
     tools/TestProfilingSection.cpp
+    tools/TestScopedRegion.cpp
     )
 
   # FIXME_OPENMPTARGET This test causes internal compiler errors as of 09/01/22
@@ -946,18 +1061,18 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate
   endif()
 
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_KokkosP
+    CoreUnitTest_KokkosP
     SOURCES
     ${KOKKOSP_SOURCES}
   )
   if(KOKKOS_ENABLE_LIBDL)
     KOKKOS_ADD_EXECUTABLE_AND_TEST(
-      UnitTest_ToolIndependence
+      CoreUnitTest_ToolIndependence
       SOURCES
       tools/TestIndependence.cpp
     )
     TARGET_COMPILE_DEFINITIONS(
-      KokkosCore_UnitTest_ToolIndependence PUBLIC
+      Kokkos_CoreUnitTest_ToolIndependence PUBLIC
       KOKKOS_TOOLS_INDEPENDENT_BUILD
     )
     KOKKOS_ADD_TEST_LIBRARY(
@@ -974,6 +1089,12 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate
       tools/TestAllCalls.cpp
     )
 
+    KOKKOS_ADD_TEST_EXECUTABLE(
+      ToolsInitialization
+      UnitTestMain.cpp
+      tools/TestToolsInitialization.cpp
+    )
+
     set(ADDRESS_REGEX "0x[0-9a-f]*")
     set(MEMSPACE_REGEX "[HC][ou][sd][ta][a-zA-Z]*")
     set(SIZE_REGEX "[0-9]*")
@@ -987,7 +1108,7 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate
       TOOL kokkosprinter-tool
       ARGS --kokkos-tools-help
       PASS_REGULAR_EXPRESSION
-        "kokkosp_init_library::kokkosp_print_help:KokkosCore_ProfilingAllCalls::kokkosp_finalize_library::")
+        "kokkosp_init_library::kokkosp_print_help:Kokkos_ProfilingAllCalls::kokkosp_finalize_library::")
 
     # check help works via direct library specification
     KOKKOS_ADD_TEST(
@@ -997,7 +1118,7 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate
       ARGS --kokkos-tools-help
            --kokkos-tools-libs=$<TARGET_FILE:kokkosprinter-tool>
       PASS_REGULAR_EXPRESSION
-        "kokkosp_init_library::kokkosp_print_help:KokkosCore_ProfilingAllCalls::kokkosp_finalize_library::")
+        "kokkosp_init_library::kokkosp_print_help:Kokkos_ProfilingAllCalls::kokkosp_finalize_library::")
 
     KOKKOS_ADD_TEST(
       SKIP_TRIBITS
@@ -1005,7 +1126,7 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate
       EXE  ProfilingAllCalls
       TOOL kokkosprinter-tool
       ARGS --kokkos-tools-args="-c test delimit"
-      PASS_REGULAR_EXPRESSION "kokkosp_init_library::kokkosp_parse_args:4:KokkosCore_ProfilingAllCalls:-c:test:delimit::.*::kokkosp_allocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]source] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_allocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]destination] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_begin_deep_copy:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::.*kokkosp_end_deep_copy::kokkosp_begin_parallel_for:parallel_for:${SIZE_REGEX}:0::kokkosp_end_parallel_for:0::kokkosp_begin_parallel_reduce:parallel_reduce:${SIZE_REGEX}:1${SKIP_SCRATCH_INITIALIZATION_REGEX}::kokkosp_end_parallel_reduce:1::kokkosp_begin_parallel_scan:parallel_scan:${SIZE_REGEX}:2::kokkosp_end_parallel_scan:2::kokkosp_push_profile_region:push_region::kokkosp_pop_profile_region::kokkosp_create_profile_section:created_section:3::kokkosp_start_profile_section:3::kokkosp_stop_profile_section:3::kokkosp_destroy_profile_section:3::kokkosp_profile_event:profiling_event::kokkosp_declare_metadata:dogs:good::kokkosp_deallocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_deallocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_finalize_library::"
+      PASS_REGULAR_EXPRESSION "kokkosp_init_library::kokkosp_parse_args:4:Kokkos_ProfilingAllCalls:-c:test:delimit::.*::kokkosp_allocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]source] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_allocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]destination] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_begin_deep_copy:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::.*kokkosp_end_deep_copy::kokkosp_begin_parallel_for:parallel_for:${SIZE_REGEX}:0::kokkosp_end_parallel_for:0::kokkosp_begin_parallel_reduce:parallel_reduce:${SIZE_REGEX}:1${SKIP_SCRATCH_INITIALIZATION_REGEX}::kokkosp_end_parallel_reduce:1::kokkosp_begin_parallel_scan:parallel_scan:${SIZE_REGEX}:2::kokkosp_end_parallel_scan:2::kokkosp_push_profile_region:push_region::kokkosp_pop_profile_region::kokkosp_create_profile_section:created_section:3::kokkosp_start_profile_section:3::kokkosp_stop_profile_section:3::kokkosp_destroy_profile_section:3::kokkosp_profile_event:profiling_event::kokkosp_declare_metadata:dogs:good::kokkosp_deallocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_deallocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_finalize_library::"
     )
 
     # Above will test that leading/trailing quotes are stripped bc ctest cmd args is:
@@ -1022,7 +1143,7 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate
       EXE  ProfilingAllCalls
       ARGS [=[--kokkos-tools-args=-c test delimit]=]
             --kokkos-tools-libs=$<TARGET_FILE:kokkosprinter-tool>
-      PASS_REGULAR_EXPRESSION "kokkosp_init_library::kokkosp_parse_args:4:KokkosCore_ProfilingAllCalls:-c:test:delimit::.*::kokkosp_allocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]source] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_allocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]destination] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_begin_deep_copy:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::.*kokkosp_end_deep_copy::kokkosp_begin_parallel_for:parallel_for:${SIZE_REGEX}:0::kokkosp_end_parallel_for:0::kokkosp_begin_parallel_reduce:parallel_reduce:${SIZE_REGEX}:1${SKIP_SCRATCH_INITIALIZATION_REGEX}::kokkosp_end_parallel_reduce:1::kokkosp_begin_parallel_scan:parallel_scan:${SIZE_REGEX}:2::kokkosp_end_parallel_scan:2::kokkosp_push_profile_region:push_region::kokkosp_pop_profile_region::kokkosp_create_profile_section:created_section:3::kokkosp_start_profile_section:3::kokkosp_stop_profile_section:3::kokkosp_destroy_profile_section:3::kokkosp_profile_event:profiling_event::kokkosp_declare_metadata:dogs:good::kokkosp_deallocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_deallocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_finalize_library::"
+      PASS_REGULAR_EXPRESSION "kokkosp_init_library::kokkosp_parse_args:4:Kokkos_ProfilingAllCalls:-c:test:delimit::.*::kokkosp_allocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]source] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_allocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]destination] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_begin_deep_copy:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::.*kokkosp_end_deep_copy::kokkosp_begin_parallel_for:parallel_for:${SIZE_REGEX}:0::kokkosp_end_parallel_for:0::kokkosp_begin_parallel_reduce:parallel_reduce:${SIZE_REGEX}:1${SKIP_SCRATCH_INITIALIZATION_REGEX}::kokkosp_end_parallel_reduce:1::kokkosp_begin_parallel_scan:parallel_scan:${SIZE_REGEX}:2::kokkosp_end_parallel_scan:2::kokkosp_push_profile_region:push_region::kokkosp_pop_profile_region::kokkosp_create_profile_section:created_section:3::kokkosp_start_profile_section:3::kokkosp_stop_profile_section:3::kokkosp_destroy_profile_section:3::kokkosp_profile_event:profiling_event::kokkosp_declare_metadata:dogs:good::kokkosp_deallocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_deallocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_finalize_library::"
     )
   endif() #KOKKOS_ENABLE_LIBDL
 if(NOT KOKKOS_HAS_TRILINOS)
@@ -1040,7 +1161,7 @@ KOKKOS_ADD_TEST_EXECUTABLE(
 # to work correctly with shared libraries
 KOKKOS_SET_EXE_PROPERTY(StackTraceTestExec ENABLE_EXPORTS ON)
 
-KOKKOS_ADD_TEST( NAME UnitTest_StackTraceTest
+KOKKOS_ADD_TEST( NAME CoreUnitTest_StackTraceTest
                  EXE  StackTraceTestExec
                  FAIL_REGULAR_EXPRESSION "FAILED"
                )
@@ -1049,7 +1170,7 @@ endif()
 if(Kokkos_ENABLE_DEPRECATED_CODE_3)
   foreach(INITTESTS_NUM RANGE 1 18)
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_DefaultInit_${INITTESTS_NUM}
+    CoreUnitTest_DefaultInit_${INITTESTS_NUM}
     SOURCES UnitTestMain.cpp default/TestDefaultDeviceTypeInit_${INITTESTS_NUM}.cpp
   )
   endforeach(INITTESTS_NUM)
@@ -1057,7 +1178,7 @@ endif()
 
 if (KOKKOS_ENABLE_HWLOC)
 KOKKOS_ADD_EXECUTABLE_AND_TEST(
-  UnitTest_HWLOC
+  CoreUnitTest_HWLOC
   SOURCES UnitTestMain.cpp  TestHWLOC.cpp
 )
 endif()
@@ -1113,34 +1234,35 @@ FOREACH (DEVICE ${KOKKOS_ENABLED_DEVICES})
 ENDFOREACH()
 
 KOKKOS_ADD_EXECUTABLE_AND_TEST(
-  UnitTest_CTestDevice
+  CoreUnitTest_CTestDevice
   SOURCES UnitTestMain.cpp  TestCTestDevice.cpp
 )
 
 KOKKOS_ADD_EXECUTABLE_AND_TEST(
-  UnitTest_CMakePassCmdLineArgs
+  CoreUnitTest_CMakePassCmdLineArgs
   SOURCES UnitTest_CMakePassCmdLineArgs.cpp
   ARGS "one 2 THREE"
 )
 
 # This test is not properly set up to run within Trilinos
 if (NOT KOKKOS_HAS_TRILINOS)
-  add_executable(KokkosCore_UnitTest_DeviceAndThreads UnitTest_DeviceAndThreads.cpp)
-  target_link_libraries(KokkosCore_UnitTest_DeviceAndThreads Kokkos::kokkoscore)
+  SET_SOURCE_FILES_PROPERTIES(UnitTest_DeviceAndThreads.cpp PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE})
+  add_executable(Kokkos_CoreUnitTest_DeviceAndThreads UnitTest_DeviceAndThreads.cpp)
+  target_link_libraries(Kokkos_CoreUnitTest_DeviceAndThreads Kokkos::kokkoscore)
   find_package(Python3 COMPONENTS Interpreter)
   if(Python3_Interpreter_FOUND AND Python3_VERSION VERSION_GREATER_EQUAL 3.7)
     if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.20)
       set(USE_SOURCE_PERMISSIONS_WHEN_SUPPORTED USE_SOURCE_PERMISSIONS)
     endif()
     file(GENERATE
-      OUTPUT $<TARGET_FILE_DIR:KokkosCore_UnitTest_DeviceAndThreads>/TestDeviceAndThreads.py
+      OUTPUT $<TARGET_FILE_DIR:Kokkos_CoreUnitTest_DeviceAndThreads>/TestDeviceAndThreads.py
       INPUT TestDeviceAndThreads.py
       ${USE_SOURCE_PERMISSIONS_WHEN_SUPPORTED}
     )
     if(NOT Kokkos_ENABLE_OPENMPTARGET)  # FIXME_OPENMPTARGET does not select the right device
       add_test(
-        NAME KokkosCore_UnitTest_DeviceAndThreads
-        COMMAND ${Python3_EXECUTABLE} -m unittest -v $<TARGET_FILE_DIR:KokkosCore_UnitTest_DeviceAndThreads>/TestDeviceAndThreads.py
+        NAME Kokkos_CoreUnitTest_DeviceAndThreads
+        COMMAND ${Python3_EXECUTABLE} -m unittest -v $<TARGET_FILE_DIR:Kokkos_CoreUnitTest_DeviceAndThreads>/TestDeviceAndThreads.py
       )
     endif()
   endif()
diff --git a/packages/kokkos/core/unit_test/Makefile b/packages/kokkos/core/unit_test/Makefile
index 05be2252658682cc3bc716b2f292910639cf3c5f..33a84b61f92a82fc4e668fd640149afa113e06d5 100644
--- a/packages/kokkos/core/unit_test/Makefile
+++ b/packages/kokkos/core/unit_test/Makefile
@@ -52,8 +52,8 @@ ifneq ($(KOKKOS_INTERNAL_USE_RDYNAMIC), 1)
    KOKKOS_INTERNAL_USE_RDYNAMIC := $(call kokkos_has_string,$(CXXFLAGS),rdynamic)
 endif
 
-ifeq ($(KOKKOS_INTERNAL_USE_RDYNAMIC),1) 
-   ifneq ($(KOKKOS_INTERNAL_HAS_OPTIMIZATIONS),1) 
+ifeq ($(KOKKOS_INTERNAL_USE_RDYNAMIC),1)
+   ifneq ($(KOKKOS_INTERNAL_HAS_OPTIMIZATIONS),1)
       STACK_TRACE_TERMINATE_FILTER :=_dynamic
    else
       STACK_TRACE_TERMINATE_FILTER :=
@@ -62,7 +62,7 @@ else
    STACK_TRACE_TERMINATE_FILTER :=
 endif
 
-TESTS = AtomicOperations_int AtomicOperations_unsignedint AtomicOperations_longint AtomicOperations_unsignedlongint AtomicOperations_longlongint AtomicOperations_double AtomicOperations_float AtomicOperations_complexdouble AtomicOperations_complexfloat AtomicViews Atomics BlockSizeDeduction Concepts Complex Crs DeepCopyAlignment FunctorAnalysis Init LocalDeepCopy MDRange_a MDRange_b MDRange_c MDRange_d MDRange_e MDRange_f Other RangePolicy RangePolicyRequire Reductions Reducers_a Reducers_b Reducers_c Reducers_d Reducers_e Reductions_DeviceView Scan SharedAlloc TeamBasic TeamReductionScan TeamScratch TeamTeamSize TeamVectorRange UniqueToken ViewAPI_a ViewAPI_b ViewAPI_c ViewAPI_d ViewAPI_e ViewCopy_a ViewCopy_b ViewLayoutStrideAssignment ViewMapping_a ViewMapping_b ViewMapping_subview ViewOfClass WorkGraph View_64bit ViewResize
+TESTS = AtomicOperations_int AtomicOperations_unsignedint AtomicOperations_longint AtomicOperations_unsignedlongint AtomicOperations_longlongint AtomicOperations_double AtomicOperations_float AtomicOperations_complexdouble AtomicOperations_complexfloat AtomicViews Atomics BlockSizeDeduction Concepts Complex Crs DeepCopyAlignment FunctorAnalysis Init LocalDeepCopy MDRange_a MDRange_b MDRange_c MDRange_d MDRange_e MDRange_f Other ParallelScanRangePolicy RangePolicy RangePolicyRequire Reductions Reducers_a Reducers_b Reducers_c Reducers_d Reducers_e Reductions_DeviceView SharedAlloc TeamBasic TeamReductionScan TeamScratch TeamTeamSize TeamVectorRange UniqueToken ViewAPI_a ViewAPI_b ViewAPI_c ViewAPI_d ViewAPI_e ViewCopy_a ViewCopy_b ViewLayoutStrideAssignment ViewMapping_a ViewMapping_b ViewMapping_subview ViewOfClass WorkGraph View_64bit ViewResize
 
 tmp := $(foreach device, $(KOKKOS_DEVICELIST), \
   tmp2 := $(foreach test, $(TESTS), \
@@ -127,7 +127,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
     OBJ_CUDA += TestCuda_SubView_c07.o TestCuda_SubView_c08.o TestCuda_SubView_c09.o
     OBJ_CUDA += TestCuda_SubView_c10.o TestCuda_SubView_c11.o TestCuda_SubView_c12.o
     OBJ_CUDA += TestCuda_SubView_c13.o
-    OBJ_CUDA += TestCuda_Reductions.o TestCuda_Scan.o
+    OBJ_CUDA += TestCuda_Reductions.o TestCuda_ParallelScanRangePolicy.o
     OBJ_CUDA += TestCuda_Reductions_DeviceView.o
     OBJ_CUDA += TestCuda_Reducers_a.o TestCuda_Reducers_b.o TestCuda_Reducers_c.o TestCuda_Reducers_d.o TestCuda_Reducers_e.o
     OBJ_CUDA += TestCuda_Complex.o
@@ -171,7 +171,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1)
     OBJ_THREADS += TestThreads_SubView_c04.o TestThreads_SubView_c05.o TestThreads_SubView_c06.o
     OBJ_THREADS += TestThreads_SubView_c07.o TestThreads_SubView_c08.o TestThreads_SubView_c09.o
     OBJ_THREADS += TestThreads_SubView_c10.o TestThreads_SubView_c11.o TestThreads_SubView_c12.o
-    OBJ_THREADS += TestThreads_Reductions.o TestThreads_Scan.o
+    OBJ_THREADS += TestThreads_Reductions.o TestThreads_ParallelScanRangePolicy.o
     OBJ_THREADS += TestThreads_Reductions_DeviceView.o
     OBJ_THREADS += TestThreads_Reducers_a.o TestThreads_Reducers_b.o TestThreads_Reducers_c.o TestThreads_Reducers_d.o TestThreads_Reducers_e.o
     OBJ_THREADS += TestThreads_Complex.o
@@ -207,7 +207,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
     OBJ_OPENMP += TestOpenMP_SubView_c07.o TestOpenMP_SubView_c08.o TestOpenMP_SubView_c09.o
     OBJ_OPENMP += TestOpenMP_SubView_c10.o TestOpenMP_SubView_c11.o TestOpenMP_SubView_c12.o
     OBJ_OPENMP += TestOpenMP_SubView_c13.o
-    OBJ_OPENMP += TestOpenMP_Reductions.o TestOpenMP_Scan.o
+    OBJ_OPENMP += TestOpenMP_Reductions.o TestOpenMP_ParallelScanRangePolicy.o
     OBJ_OPENMP += TestOpenMP_Reductions_DeviceView.o
     OBJ_OPENMP += TestOpenMP_Reducers_a.o TestOpenMP_Reducers_b.o TestOpenMP_Reducers_c.o TestOpenMP_Reducers_d.o TestOpenMP_Reducers_e.o
     OBJ_OPENMP += TestOpenMP_Complex.o
@@ -251,11 +251,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
     #OBJ_OPENMPTARGET += TestOpenMPTarget_SubView_c10.o TestOpenMPTarget_SubView_c11.o TestOpenMPTarget_SubView_c12.o
     #OBJ_OPENMPTARGET += TestOpenMPTarget_Reductions.o # Need custom reductions
     OBJ_OPENMPTARGET += TestOpenMPTarget_Reducers_a.o TestOpenMPTarget_Reducers_b.o TestOpenMPTarget_Reducers_c.o TestOpenMPTarget_Reducers_d.o TestOpenMPTarget_Reducers_e.o
-    #OBJ_OPENMPTARGET += TestOpenMPTarget_Scan.o
+    OBJ_OPENMPTARGET += TestOpenMPTarget_ParallelScanRangePolicy.o
     OBJ_OPENMPTARGET += TestOpenMPTarget_Complex.o
     OBJ_OPENMPTARGET += TestOpenMPTarget_AtomicOperations_int.o TestOpenMPTarget_AtomicOperations_unsignedint.o TestOpenMPTarget_AtomicOperations_longint.o
     OBJ_OPENMPTARGET += TestOpenMPTarget_AtomicOperations_unsignedlongint.o TestOpenMPTarget_AtomicOperations_longlongint.o TestOpenMPTarget_AtomicOperations_double.o TestOpenMPTarget_AtomicOperations_float.o
-    #OBJ_OPENMPTARGET += TestOpenMPTarget_AtomicOperations_complexfloat.o 
+    #OBJ_OPENMPTARGET += TestOpenMPTarget_AtomicOperations_complexfloat.o
     #OBJ_OPENMPTARGET += TestOpenMPTarget_AtomicOperations_complexdouble.o
     OBJ_OPENMPTARGET += TestOpenMPTarget_AtomicViews.o
     OBJ_OPENMPTARGET += TestOpenMPTarget_Atomics.o # Commented Out Arbitrary Type Atomics
@@ -290,6 +290,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
 	OBJ_HIP += TestHIP_MDRange_a.o TestHIP_MDRange_b.o TestHIP_MDRange_c.o TestHIP_MDRange_d.o TestHIP_MDRange_e.o
 	OBJ_HIP += TestHIP_Spaces.o
 	OBJ_HIP += TestHIP_Memory_Requirements.o
+        OBJ_HIP += TestHIP_ParallelScanRangePolicy.o
 	OBJ_HIP += TestHIPHostPinned_ViewAPI_a.o TestHIPHostPinned_ViewAPI_b.o TestHIPHostPinned_ViewAPI_c.o TestHIPHostPinned_ViewAPI_d.o TestHIPHostPinned_ViewAPI_e.o
 	OBJ_HIP += TestHIPHostPinned_ViewCopy_a.o TestHIPHostPinned_ViewCopy_b.o
 	OBJ_HIP += TestHIPHostPinned_ViewMapping_a.o TestHIPHostPinned_ViewMapping_b.o TestHIPHostPinned_ViewMapping_subview.o
@@ -316,7 +317,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
 	OBJ_HPX += TestHPX_SubView_c10.o TestHPX_SubView_c11.o TestHPX_SubView_c12.o
 	OBJ_HPX += TestHPX_SubView_c13.o
 	OBJ_HPX += TestHPX_Reductions.o
-	OBJ_HPX += TestHPX_Scan.o
+	OBJ_HPX += TestHPX_ParallelScanRangePolicy.o
 	OBJ_HPX += TestHPX_Reducers_a.o TestHPX_Reducers_b.o TestHPX_Reducers_c.o TestHPX_Reducers_d.o TestHPX_Reducers_e.o
 	OBJ_HPX += TestHPX_Complex.o
 	OBJ_HPX += TestHPX_AtomicOperations_int.o TestHPX_AtomicOperations_unsignedint.o TestHPX_AtomicOperations_longint.o
@@ -355,7 +356,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
     OBJ_SERIAL += TestSerial_SubView_c07.o TestSerial_SubView_c08.o TestSerial_SubView_c09.o
     OBJ_SERIAL += TestSerial_SubView_c10.o TestSerial_SubView_c11.o TestSerial_SubView_c12.o
     OBJ_SERIAL += TestSerial_SubView_c13.o
-    OBJ_SERIAL += TestSerial_Reductions.o TestSerial_Scan.o
+    OBJ_SERIAL += TestSerial_Reductions.o TestSerial_ParallelScanRangePolicy.o
     OBJ_SERIAL += TestSerial_Reductions_DeviceView.o
     OBJ_SERIAL += TestSerial_Reducers_a.o TestSerial_Reducers_b.o TestSerial_Reducers_c.o TestSerial_Reducers_d.o TestSerial_Reducers_e.o
     OBJ_SERIAL += TestSerial_Complex.o
diff --git a/packages/kokkos/core/unit_test/TestAbort.hpp b/packages/kokkos/core/unit_test/TestAbort.hpp
index 63e9bdc6e23606398b5a2bc9d32c927cf8d5c950..6e51ef6ee715c5a96c1533386d0f162251bd0aef 100644
--- a/packages/kokkos/core/unit_test/TestAbort.hpp
+++ b/packages/kokkos/core/unit_test/TestAbort.hpp
@@ -19,7 +19,6 @@
 #include <regex>
 #include <Kokkos_Core.hpp>
 
-#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC
 TEST(TEST_CATEGORY_DEATH, abort_from_host) {
   ::testing::FLAGS_gtest_death_test_style = "threadsafe";
 
@@ -81,7 +80,13 @@ void test_abort_from_device() {
   } else {
     TestAbortCausingAbnormalProgramTerminationAndPrinting<ExecutionSpace>();
   }
-#elif defined(KOKKOS_ENABLE_SYCL)  // FIXME_SYCL
+#elif defined(KOKKOS_ENABLE_OPENACC)  // FIXME_OPENACC
+  if (std::is_same<ExecutionSpace, Kokkos::Experimental::OpenACC>::value) {
+    TestAbortPrintingToStdout<ExecutionSpace>();
+  } else {
+    TestAbortCausingAbnormalProgramTerminationAndPrinting<ExecutionSpace>();
+  }
+#elif defined(KOKKOS_ENABLE_SYCL)     // FIXME_SYCL
   if (std::is_same<ExecutionSpace, Kokkos::Experimental::SYCL>::value) {
 #ifdef NDEBUG
     TestAbortPrintingToStdout<ExecutionSpace>();
@@ -100,4 +105,3 @@ TEST(TEST_CATEGORY_DEATH, abort_from_device) {
   ::testing::FLAGS_gtest_death_test_style = "threadsafe";
   test_abort_from_device<TEST_EXECSPACE>();
 }
-#endif
diff --git a/packages/kokkos/core/unit_test/TestAggregate.hpp b/packages/kokkos/core/unit_test/TestAggregate.hpp
index 23cc5860ac906b9777cb7d8a2c9747709c817735..4f67b2eddcebe9c1fbb874e48923fd03995a787b 100644
--- a/packages/kokkos/core/unit_test/TestAggregate.hpp
+++ b/packages/kokkos/core/unit_test/TestAggregate.hpp
@@ -56,8 +56,8 @@ void TestViewAggregate() {
                 "");
   static_assert(std::is_same<typename a32_type::pointer_type, double *>::value,
                 "");
-  static_assert(a32_type::Rank == 2, "");
-  static_assert(a32_flat_type::Rank == 3, "");
+  static_assert(a32_type::rank == 2, "");
+  static_assert(a32_flat_type::rank == 3, "");
 
   a32_type x("test", 4, 5);
   a32_flat_type y(x);
diff --git a/packages/kokkos/core/unit_test/TestArrayOps.hpp b/packages/kokkos/core/unit_test/TestArrayOps.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..065285727147edf338592711427a2f161ca645c1
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestArrayOps.hpp
@@ -0,0 +1,393 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <gtest/gtest.h>
+#include <Kokkos_Core.hpp>
+#include <numeric>
+
+namespace {
+
+TEST(TEST_CATEGORY, array_capacity) {
+  using A = Kokkos::Array<int, 2>;
+  A a{{3, 5}};
+
+  ASSERT_FALSE(a.empty());
+  ASSERT_EQ(a.size(), 2u);
+  ASSERT_EQ(a.max_size(), 2u);
+}
+
+enum Enum { EZero, EOne };
+enum EnumShort : short { ESZero, ESOne };
+
+TEST(TEST_CATEGORY, array_element_access) {
+  using A = Kokkos::Array<int, 2>;
+  A a{{3, 5}};
+  A const& ca = a;
+
+  size_t index = 1;
+  ASSERT_EQ(a[index], 5);
+
+  auto sc = static_cast<signed char>(index);
+  ASSERT_EQ(a[sc], a[index]);
+  ASSERT_EQ(ca[sc], a[index]);
+
+  auto uc = static_cast<unsigned char>(index);
+  ASSERT_EQ(a[uc], a[index]);
+  ASSERT_EQ(ca[uc], a[index]);
+
+  auto s = static_cast<short>(index);
+  ASSERT_EQ(a[s], a[index]);
+  ASSERT_EQ(ca[s], a[index]);
+
+  auto us = static_cast<unsigned short>(index);
+  ASSERT_EQ(a[us], a[index]);
+  ASSERT_EQ(ca[us], a[index]);
+
+  auto i = static_cast<int>(index);
+  ASSERT_EQ(a[i], a[index]);
+  ASSERT_EQ(ca[i], a[index]);
+
+  auto ui = static_cast<unsigned int>(index);
+  ASSERT_EQ(a[ui], a[index]);
+  ASSERT_EQ(ca[ui], a[index]);
+
+  auto l = static_cast<long>(index);
+  ASSERT_EQ(a[l], a[index]);
+  ASSERT_EQ(ca[l], a[index]);
+
+  auto ul = static_cast<unsigned long>(index);
+  ASSERT_EQ(a[ul], a[index]);
+  ASSERT_EQ(ca[ul], a[index]);
+
+  auto ll = static_cast<long long>(index);
+  ASSERT_EQ(a[ll], a[index]);
+  ASSERT_EQ(ca[ll], a[index]);
+
+  auto ull = static_cast<unsigned long long>(index);
+  ASSERT_EQ(a[ull], a[index]);
+  ASSERT_EQ(ca[ull], a[index]);
+
+  auto e = static_cast<Enum>(index);
+  ASSERT_EQ(a[e], a[index]);
+  ASSERT_EQ(ca[e], a[index]);
+
+  auto es = static_cast<EnumShort>(index);
+  ASSERT_EQ(a[es], a[index]);
+  ASSERT_EQ(ca[es], a[index]);
+
+  ASSERT_EQ(a.data()[index], a[index]);
+  ASSERT_EQ(ca.data()[index], a[index]);
+}
+
+TEST(TEST_CATEGORY, array_zero_capacity) {
+  using A = Kokkos::Array<int, 0>;
+  A e;
+
+  ASSERT_TRUE(e.empty());
+  ASSERT_EQ(e.size(), 0u);
+  ASSERT_EQ(e.max_size(), 0u);
+}
+
+TEST(TEST_CATEGORY, array_zero_data_nullptr) {
+  using A = Kokkos::Array<int, 0>;
+
+  A e;
+  ASSERT_EQ(e.data(), nullptr);
+
+  const A& ce = e;
+  ASSERT_EQ(ce.data(), nullptr);
+}
+
+TEST(TEST_CATEGORY, array_contiguous_capacity) {
+  using A =
+      Kokkos::Array<int, KOKKOS_INVALID_INDEX, Kokkos::Array<>::contiguous>;
+
+  A e(nullptr, 0);
+
+  ASSERT_TRUE(e.empty());
+  ASSERT_EQ(e.size(), 0u);
+  ASSERT_EQ(e.max_size(), 0u);
+
+  int aa[] = {3, 5};
+  A a(aa, std::size(aa));
+
+  ASSERT_EQ(a.empty(), 0 == std::size(aa));
+  ASSERT_EQ(a.size(), std::size(aa));
+  ASSERT_EQ(a.max_size(), std::size(aa));
+}
+
+TEST(TEST_CATEGORY, array_contiguous_element_access) {
+  int aa[] = {3, 5};
+  using A =
+      Kokkos::Array<int, KOKKOS_INVALID_INDEX, Kokkos::Array<>::contiguous>;
+  A a(aa, std::size(aa));
+  A const& ca = a;
+
+  size_t index = 1;
+  ASSERT_EQ(std::addressof(a[index]), std::addressof(aa[index]));
+
+  auto sc = static_cast<signed char>(index);
+  ASSERT_EQ(a[sc], aa[index]);
+  ASSERT_EQ(ca[sc], aa[index]);
+
+  auto uc = static_cast<unsigned char>(index);
+  ASSERT_EQ(a[uc], aa[index]);
+  ASSERT_EQ(ca[uc], aa[index]);
+
+  auto s = static_cast<short>(index);
+  ASSERT_EQ(a[s], aa[index]);
+  ASSERT_EQ(ca[s], aa[index]);
+
+  auto us = static_cast<unsigned short>(index);
+  ASSERT_EQ(a[us], aa[index]);
+  ASSERT_EQ(ca[us], aa[index]);
+
+  auto i = static_cast<int>(index);
+  ASSERT_EQ(a[i], aa[index]);
+  ASSERT_EQ(ca[i], aa[index]);
+
+  auto ui = static_cast<unsigned int>(index);
+  ASSERT_EQ(a[ui], aa[index]);
+  ASSERT_EQ(ca[ui], aa[index]);
+
+  auto l = static_cast<long>(index);
+  ASSERT_EQ(a[l], aa[index]);
+  ASSERT_EQ(ca[l], aa[index]);
+
+  auto ul = static_cast<unsigned long>(index);
+  ASSERT_EQ(a[ul], aa[index]);
+  ASSERT_EQ(ca[ul], aa[index]);
+
+  auto ll = static_cast<long long>(index);
+  ASSERT_EQ(a[ll], aa[index]);
+  ASSERT_EQ(ca[ll], aa[index]);
+
+  auto ull = static_cast<unsigned long long>(index);
+  ASSERT_EQ(a[ull], aa[index]);
+  ASSERT_EQ(ca[ull], aa[index]);
+
+  auto e = static_cast<Enum>(index);
+  ASSERT_EQ(a[e], aa[index]);
+  ASSERT_EQ(ca[e], aa[index]);
+
+  auto es = static_cast<EnumShort>(index);
+  ASSERT_EQ(a[es], aa[index]);
+  ASSERT_EQ(ca[es], aa[index]);
+
+  ASSERT_EQ(a.data(), aa);
+  ASSERT_EQ(ca.data(), aa);
+}
+
+TEST(TEST_CATEGORY, array_contiguous_assignment) {
+  using A =
+      Kokkos::Array<int, KOKKOS_INVALID_INDEX, Kokkos::Array<>::contiguous>;
+
+  int aa[] = {3, 5};
+  A a(aa, std::size(aa));
+
+  // operator=(Array<T, N, P> const&) semantics when lhs size a > rhs size b
+  using B = Kokkos::Array<int, 1>;
+  static_assert(std::size(aa) > B::size());
+  B b{{7}};
+
+  ASSERT_GT(std::size(a), std::size(b));
+  a = b;
+  ASSERT_GT(std::size(a), std::size(b));
+
+  ASSERT_EQ(a.size(), std::size(aa));
+  ASSERT_EQ(a.max_size(), std::size(aa));
+  ASSERT_EQ(a[0], 7);
+  ASSERT_EQ(a[1], 5);
+
+  // operator=(Array<T, N, P> const&) semantics when lhs size a < rhs size d
+  using D = Kokkos::Array<int, 4>;
+  static_assert(std::size(aa) < D::size());
+  D d{{11, 13, 17, 19}};
+
+  ASSERT_LT(std::size(a), std::size(d));
+  a = d;
+  ASSERT_LT(std::size(a), std::size(d));
+
+  ASSERT_EQ(a.size(), std::size(aa));
+  ASSERT_EQ(a.max_size(), std::size(aa));
+  ASSERT_EQ(a[0], 11);
+  ASSERT_EQ(a[1], 13);
+
+  // Copy assignment operator semantics when lhs size a > rhs size e
+  int ee[] = {23};
+  A e(ee, std::size(ee));
+
+  ASSERT_GT(a.size(), e.size());
+  a = e;
+  ASSERT_GT(a.size(), e.size());
+
+  ASSERT_EQ(a.size(), std::size(aa));
+  ASSERT_EQ(a.max_size(), std::size(aa));
+  ASSERT_EQ(a[0], 23);
+  ASSERT_EQ(a[1], 13);
+
+  // Copy assignment operator semantics when lhs size e < rhs size a
+  ASSERT_LT(e.size(), a.size());
+  e[0] = 29;  // To check that e[0] is overwritten by e = a
+  e    = a;
+  ASSERT_LT(e.size(), a.size());
+
+  ASSERT_EQ(e.size(), std::size(ee));
+  ASSERT_EQ(e.max_size(), std::size(ee));
+  ASSERT_EQ(e[0], 23);
+}
+
+TEST(TEST_CATEGORY, array_strided_capacity) {
+  using A = Kokkos::Array<int, KOKKOS_INVALID_INDEX, Kokkos::Array<>::strided>;
+
+  A e(nullptr, 0, 0);
+
+  ASSERT_TRUE(e.empty());
+  ASSERT_EQ(e.size(), 0u);
+  ASSERT_EQ(e.max_size(), 0u);
+
+  int aa[]                 = {5, 7, 11, 13, 17, 19};
+  constexpr size_t aStride = 2;
+  A a(aa, std::size(aa) / aStride, aStride);
+
+  ASSERT_EQ(a.empty(), 0 == std::size(aa) / aStride);
+  ASSERT_EQ(a.size(), std::size(aa) / aStride);
+  ASSERT_EQ(a.max_size(), std::size(aa) / aStride);
+}
+
+TEST(TEST_CATEGORY, array_strided_element_access) {
+  using A = Kokkos::Array<int, KOKKOS_INVALID_INDEX, Kokkos::Array<>::strided>;
+
+  int aa[]                 = {5, 7, 11, 13, 17, 19};
+  constexpr size_t aStride = 2;
+
+  A a(aa, std::size(aa) / aStride, aStride);
+  A const& ca = a;
+
+  size_t index = 1;
+  ASSERT_EQ(std::addressof(a[index]), std::addressof(aa[index * aStride]));
+
+  auto sc = static_cast<signed char>(index);
+  ASSERT_EQ(a[sc], aa[index * aStride]);
+  ASSERT_EQ(ca[sc], aa[index * aStride]);
+
+  auto uc = static_cast<unsigned char>(index);
+  ASSERT_EQ(a[uc], aa[index * aStride]);
+  ASSERT_EQ(ca[uc], aa[index * aStride]);
+
+  auto s = static_cast<short>(index);
+  ASSERT_EQ(a[s], aa[index * aStride]);
+  ASSERT_EQ(ca[s], aa[index * aStride]);
+
+  auto us = static_cast<unsigned short>(index);
+  ASSERT_EQ(a[us], aa[index * aStride]);
+  ASSERT_EQ(ca[us], aa[index * aStride]);
+
+  auto i = static_cast<int>(index);
+  ASSERT_EQ(a[i], aa[index * aStride]);
+  ASSERT_EQ(ca[i], aa[index * aStride]);
+
+  auto ui = static_cast<unsigned int>(index);
+  ASSERT_EQ(a[ui], aa[index * aStride]);
+  ASSERT_EQ(ca[ui], aa[index * aStride]);
+
+  auto l = static_cast<long>(index);
+  ASSERT_EQ(a[l], aa[index * aStride]);
+  ASSERT_EQ(ca[l], aa[index * aStride]);
+
+  auto ul = static_cast<unsigned long>(index);
+  ASSERT_EQ(a[ul], aa[index * aStride]);
+  ASSERT_EQ(ca[ul], aa[index * aStride]);
+
+  auto ll = static_cast<long long>(index);
+  ASSERT_EQ(a[ll], aa[index * aStride]);
+  ASSERT_EQ(ca[ll], aa[index * aStride]);
+
+  auto ull = static_cast<unsigned long long>(index);
+  ASSERT_EQ(a[ull], aa[index * aStride]);
+  ASSERT_EQ(ca[ull], aa[index * aStride]);
+
+  auto e = static_cast<Enum>(index);
+  ASSERT_EQ(a[e], aa[index * aStride]);
+  ASSERT_EQ(ca[e], aa[index * aStride]);
+
+  auto es = static_cast<EnumShort>(index);
+  ASSERT_EQ(a[es], aa[index * aStride]);
+  ASSERT_EQ(ca[es], aa[index * aStride]);
+
+  ASSERT_EQ(a.data(), aa);
+  ASSERT_EQ(ca.data(), aa);
+}
+
+TEST(TEST_CATEGORY, array_strided_assignment) {
+  using A  = Kokkos::Array<int, KOKKOS_INVALID_INDEX, Kokkos::Array<>::strided>;
+  int aa[] = {5, 7, 11, 13, 17, 19};
+  constexpr size_t aStride = 2;
+  A a(aa, std::size(aa) / aStride, aStride);
+
+  // operator=(Array<T, N, P> const&) semantics when lhs size a > rhs size b
+  using B = Kokkos::Array<int, 1>;
+  static_assert(std::size(aa) / aStride > B::size());
+  B b{{23}};
+
+  ASSERT_GT(std::size(a), std::size(b));
+  a = b;
+  ASSERT_GT(std::size(a), std::size(b));
+
+  ASSERT_EQ(a.size(), std::size(aa) / aStride);
+  ASSERT_EQ(a.max_size(), std::size(aa) / aStride);
+  ASSERT_EQ(a[0], b[0]);
+  ASSERT_EQ(a[1], aa[1 * aStride]);
+
+  // operator=(Array<T, N, P> const&) semantics when lhs size a < rhs size d
+  using D = Kokkos::Array<int, 7>;
+  static_assert(std::size(aa) / aStride < D::size());
+  D d{{29, 31, 37, 41, 43, 47, 53}};
+
+  ASSERT_LT(std::size(a), std::size(d));
+  a = d;
+  ASSERT_LT(std::size(a), std::size(d));
+
+  ASSERT_EQ(a.size(), std::size(aa) / aStride);
+  ASSERT_EQ(a.max_size(), std::size(aa) / aStride);
+  ASSERT_EQ(a[0], d[0]);
+  ASSERT_EQ(a[1], d[1]);
+
+  // Copy assignment operator semantics when lhs size a > rhs size e
+  int ee[]                 = {59, 61, 67, 71, 73, 79};
+  constexpr size_t eStride = 3;
+  A e(ee, std::size(ee) / eStride, eStride);
+
+  ASSERT_GT(a.size(), e.size());
+  a = e;
+  ASSERT_GT(a.size(), e.size());
+
+  ASSERT_EQ(a.size(), std::size(aa) / aStride);
+  ASSERT_EQ(a.max_size(), std::size(aa) / aStride);
+  ASSERT_EQ(a[0], ee[0 * eStride]);
+  ASSERT_EQ(a[1], ee[1 * eStride]);
+
+  // Copy assignment operator semantics when lhs size e < rhs size a
+  e[0] = 83;  // To check that e[0] is overwritten by e = a
+  ASSERT_LT(e.size(), a.size());
+  e = a;
+  ASSERT_LT(e.size(), a.size());
+  ASSERT_EQ(e.size(), std::size(ee) / eStride);
+  ASSERT_EQ(e.max_size(), std::size(ee) / eStride);
+  ASSERT_EQ(e[0], ee[0]);
+}
+
+}  // namespace
diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations.hpp
index b8940378bdf769dd251855087ac629c54f54c867..a5aebed41380f7b01e250e87698faad96cf231cc 100644
--- a/packages/kokkos/core/unit_test/TestAtomicOperations.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomicOperations.hpp
@@ -15,1156 +15,439 @@
 //@HEADER
 
 #include <Kokkos_Core.hpp>
+#include <Kokkos_Pair.hpp>
 
 namespace TestAtomicOperations {
 
-//-----------------------------------------------
-//--------------zero_functor---------------------
-//-----------------------------------------------
-
-template <class T, class DEVICE_TYPE>
-struct ZeroFunctor {
-  using execution_space = DEVICE_TYPE;
-  using type            = typename Kokkos::View<T, execution_space>;
-  using h_type          = typename Kokkos::View<T, execution_space>::HostMirror;
-
-  type data;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(int) const { data() = 0; }
+struct AddAtomicTest {
+  template <class T>
+  KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op,
+                                        T* ptr_op_fetch, T update) {
+    Kokkos::atomic_add(ptr_op, update);
+    T old_val = Kokkos::atomic_fetch_add(ptr_fetch_op, update);
+    T new_val = Kokkos::atomic_add_fetch(ptr_op_fetch, update);
+    return Kokkos::pair<T, T>(old_val, new_val);
+  }
+  template <class T>
+  KOKKOS_FUNCTION static T op(T old, T update) {
+    return old + update;
+  }
+  static const char* name() { return "add"; }
 };
 
-//-----------------------------------------------
-//--------------init_functor---------------------
-//-----------------------------------------------
-
-template <class T, class DEVICE_TYPE>
-struct InitFunctor {
-  using execution_space = DEVICE_TYPE;
-  using type            = typename Kokkos::View<T, execution_space>;
-  using h_type          = typename Kokkos::View<T, execution_space>::HostMirror;
-
-  type data;
-  T init_value;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(int) const { data() = init_value; }
-
-  InitFunctor(T _init_value) : init_value(_init_value) {}
+struct SubAtomicTest {
+  template <class T>
+  KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op,
+                                        T* ptr_op_fetch, T update) {
+    Kokkos::atomic_sub(ptr_op, update);
+    T old_val = Kokkos::atomic_fetch_sub(ptr_fetch_op, update);
+    T new_val = Kokkos::atomic_sub_fetch(ptr_op_fetch, update);
+    return Kokkos::pair<T, T>(old_val, new_val);
+  }
+  template <class T>
+  KOKKOS_FUNCTION static T op(T old, T update) {
+    return old - update;
+  }
+  static const char* name() { return "sub"; }
 };
 
-//---------------------------------------------------
-//--------------atomic_load/store/assign---------------------
-//---------------------------------------------------
-#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
-template <class T, class DEVICE_TYPE>
-struct LoadStoreFunctor {
-  using execution_space = DEVICE_TYPE;
-  using type            = Kokkos::View<T, execution_space>;
-
-  type data;
-  T i0;
-  T i1;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-    T old = Kokkos::atomic_load(&data());
-    if (old != i0)
-      Kokkos::abort("Kokkos Atomic Load didn't get the right value");
-    Kokkos::atomic_store(&data(), i1);
-    Kokkos::atomic_assign(&data(), old);
+struct IncAtomicTest {
+  template <class T>
+  KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op,
+                                        T* ptr_op_fetch, T) {
+    Kokkos::atomic_inc(ptr_op);
+    T old_val = Kokkos::atomic_fetch_inc(ptr_fetch_op);
+    T new_val = Kokkos::atomic_inc_fetch(ptr_op_fetch);
+    return Kokkos::pair<T, T>(old_val, new_val);
   }
-  LoadStoreFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {}
+  template <class T>
+  KOKKOS_FUNCTION static T op(T old, T) {
+    return old + 1;
+  }
+  static const char* name() { return "inc"; }
 };
-#endif
-
-template <class T, class DeviceType>
-bool LoadStoreAtomicTest(T i0, T i1) {
-  using execution_space = typename DeviceType::execution_space;
-  struct InitFunctor<T, execution_space> f_init(i0);
-  typename InitFunctor<T, execution_space>::type data("Data");
-  typename InitFunctor<T, execution_space>::h_type h_data("HData");
-
-  f_init.data = data;
-  Kokkos::parallel_for(1, f_init);
-  execution_space().fence();
-
-#ifdef KOKKOS_ENABLE_DESUL_ATOMICS
-  struct LoadStoreFunctor<T, execution_space> f(i0, i1);
-
-  f.data = data;
-  Kokkos::parallel_for(1, f);
-#else
-  h_data() = i1;
-#endif
-
-  Kokkos::deep_copy(h_data, data);
-
-  return h_data() == i0;
-}
 
-//---------------------------------------------------
-//--------------atomic_fetch_max---------------------
-//---------------------------------------------------
-
-template <class T, class DEVICE_TYPE>
-struct MaxFunctor {
-  using execution_space = DEVICE_TYPE;
-  using type            = Kokkos::View<T, execution_space>;
-
-  type data;
-  T i0;
-  T i1;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-    // Kokkos::atomic_fetch_max( &data(), (T) 1 );
-    Kokkos::atomic_fetch_max(&data(), (T)i1);
+struct DecAtomicTest {
+  template <class T>
+  KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op,
+                                        T* ptr_op_fetch, T) {
+    Kokkos::atomic_dec(ptr_op);
+    T old_val = Kokkos::atomic_fetch_dec(ptr_fetch_op);
+    T new_val = Kokkos::atomic_dec_fetch(ptr_op_fetch);
+    return Kokkos::pair<T, T>(old_val, new_val);
+  }
+  template <class T>
+  KOKKOS_FUNCTION static T op(T old, T) {
+    return old - 1;
   }
-  MaxFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {}
+  static const char* name() { return "dec"; }
 };
 
-template <class T, class execution_space>
-T MaxAtomic(T i0, T i1) {
-  struct InitFunctor<T, execution_space> f_init(i0);
-  typename InitFunctor<T, execution_space>::type data("Data");
-  typename InitFunctor<T, execution_space>::h_type h_data("HData");
-
-  f_init.data = data;
-  Kokkos::parallel_for(1, f_init);
-  execution_space().fence();
-
-  struct MaxFunctor<T, execution_space> f(i0, i1);
-
-  f.data = data;
-  Kokkos::parallel_for(1, f);
-  execution_space().fence();
-
-  Kokkos::deep_copy(h_data, data);
-  T val = h_data();
-
-  return val;
-}
-
-template <class T>
-T MaxAtomicCheck(T i0, T i1) {
-  T* data = new T[1];
-  data[0] = 0;
-
-  *data = (i0 > i1 ? i0 : i1);
-
-  T val = *data;
-  delete[] data;
-
-  return val;
-}
-
-template <class T, class DeviceType>
-bool MaxAtomicTest(T i0, T i1) {
-  T res       = MaxAtomic<T, DeviceType>(i0, i1);
-  T resSerial = MaxAtomicCheck<T>(i0, i1);
-
-  bool passed = true;
-
-  if (resSerial != res) {
-    passed = false;
-
-    std::cout << "Loop<" << typeid(T).name() << ">( test = MaxAtomicTest"
-              << " FAILED : " << resSerial << " != " << res << std::endl;
+struct MaxAtomicTest {
+  template <class T>
+  KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op,
+                                        T* ptr_op_fetch, T update) {
+    Kokkos::atomic_max(ptr_op, update);
+    T old_val = Kokkos::atomic_fetch_max(ptr_fetch_op, update);
+    T new_val = Kokkos::atomic_max_fetch(ptr_op_fetch, update);
+    return Kokkos::pair<T, T>(old_val, new_val);
   }
-
-  return passed;
-}
-
-//---------------------------------------------------
-//--------------atomic_fetch_min---------------------
-//---------------------------------------------------
-
-template <class T, class DEVICE_TYPE>
-struct MinFunctor {
-  using execution_space = DEVICE_TYPE;
-  using type            = Kokkos::View<T, execution_space>;
-
-  type data;
-  T i0;
-  T i1;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(int) const { Kokkos::atomic_fetch_min(&data(), (T)i1); }
-
-  MinFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {}
+  template <class T>
+  KOKKOS_FUNCTION static T op(T old, T update) {
+    return update > old ? update : old;
+  }
+  static const char* name() { return "max"; }
 };
 
-template <class T, class execution_space>
-T MinAtomic(T i0, T i1) {
-  struct InitFunctor<T, execution_space> f_init(i0);
-  typename InitFunctor<T, execution_space>::type data("Data");
-  typename InitFunctor<T, execution_space>::h_type h_data("HData");
-
-  f_init.data = data;
-  Kokkos::parallel_for(1, f_init);
-  execution_space().fence();
-
-  struct MinFunctor<T, execution_space> f(i0, i1);
-
-  f.data = data;
-  Kokkos::parallel_for(1, f);
-  execution_space().fence();
-
-  Kokkos::deep_copy(h_data, data);
-  T val = h_data();
-
-  return val;
-}
-
-template <class T>
-T MinAtomicCheck(T i0, T i1) {
-  T* data = new T[1];
-  data[0] = 0;
-
-  *data = (i0 < i1 ? i0 : i1);
-
-  T val = *data;
-  delete[] data;
-
-  return val;
-}
-
-template <class T, class DeviceType>
-bool MinAtomicTest(T i0, T i1) {
-  T res       = MinAtomic<T, DeviceType>(i0, i1);
-  T resSerial = MinAtomicCheck<T>(i0, i1);
-
-  bool passed = true;
-
-  if (resSerial != res) {
-    passed = false;
-
-    std::cout << "Loop<" << typeid(T).name() << ">( test = MinAtomicTest"
-              << " FAILED : " << resSerial << " != " << res << std::endl;
+struct MinAtomicTest {
+  template <class T>
+  KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op,
+                                        T* ptr_op_fetch, T update) {
+    Kokkos::atomic_min(ptr_op, update);
+    T old_val = Kokkos::atomic_fetch_min(ptr_fetch_op, update);
+    T new_val = Kokkos::atomic_min_fetch(ptr_op_fetch, update);
+    return Kokkos::pair<T, T>(old_val, new_val);
   }
-
-  return passed;
-}
-
-//---------------------------------------------------
-//--------------atomic_increment---------------------
-//---------------------------------------------------
-
-template <class T, class DEVICE_TYPE>
-struct IncFunctor {
-  using execution_space = DEVICE_TYPE;
-  using type            = Kokkos::View<T, execution_space>;
-
-  type data;
-  T i0;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(int) const { Kokkos::atomic_increment(&data()); }
-
-  IncFunctor(T _i0) : i0(_i0) {}
+  template <class T>
+  KOKKOS_FUNCTION static T op(T old, T update) {
+    return update < old ? update : old;
+  }
+  static const char* name() { return "min"; }
 };
 
-template <class T, class execution_space>
-T IncAtomic(T i0) {
-  struct InitFunctor<T, execution_space> f_init(i0);
-  typename InitFunctor<T, execution_space>::type data("Data");
-  typename InitFunctor<T, execution_space>::h_type h_data("HData");
-
-  f_init.data = data;
-  Kokkos::parallel_for(1, f_init);
-  execution_space().fence();
-
-  struct IncFunctor<T, execution_space> f(i0);
-
-  f.data = data;
-  Kokkos::parallel_for(1, f);
-  execution_space().fence();
-
-  Kokkos::deep_copy(h_data, data);
-  T val = h_data();
-
-  return val;
-}
-
-template <class T>
-T IncAtomicCheck(T i0) {
-  T* data = new T[1];
-  data[0] = 0;
-
-  *data = i0 + 1;
-
-  T val = *data;
-  delete[] data;
-
-  return val;
-}
-
-template <class T, class DeviceType>
-bool IncAtomicTest(T i0) {
-  T res       = IncAtomic<T, DeviceType>(i0);
-  T resSerial = IncAtomicCheck<T>(i0);
-
-  bool passed = true;
-
-  if (resSerial != res) {
-    passed = false;
-
-    std::cout << "Loop<" << typeid(T).name() << ">( test = IncAtomicTest"
-              << " FAILED : " << resSerial << " != " << res << std::endl;
+struct MulAtomicTest {
+  template <class T>
+  KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op,
+                                        T* ptr_op_fetch, T update) {
+    Kokkos::atomic_mul(ptr_op, update);
+    T old_val = Kokkos::atomic_fetch_mul(ptr_fetch_op, update);
+    T new_val = Kokkos::atomic_mul_fetch(ptr_op_fetch, update);
+    return Kokkos::pair<T, T>(old_val, new_val);
   }
-
-  return passed;
-}
-
-//---------------------------------------------------
-//-------------atomic_wrapping_increment-------------
-//---------------------------------------------------
-
-template <class T, class DEVICE_TYPE>
-struct WrappingIncFunctor {
-  using execution_space = DEVICE_TYPE;
-  using type            = Kokkos::View<T, execution_space>;
-
-  type data;
-  T i0;
-  T i1;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
-    desul::atomic_fetch_inc_mod(&data(), (T)i1, desul::MemoryOrderRelaxed(),
-                                desul::MemoryScopeDevice());
-#endif
+  template <class T>
+  KOKKOS_FUNCTION static T op(T old, T update) {
+    return old * update;
   }
-
-  WrappingIncFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {}
+  static const char* name() { return "mul"; }
 };
 
-template <class T, class execution_space>
-T WrappingIncAtomic(T i0, T i1) {
-  struct InitFunctor<T, execution_space> f_init(i0);
-  typename InitFunctor<T, execution_space>::type data("Data");
-  typename InitFunctor<T, execution_space>::h_type h_data("HData");
-
-  f_init.data = data;
-  Kokkos::parallel_for(1, f_init);
-  execution_space().fence();
-
-  struct WrappingIncFunctor<T, execution_space> f(i0, i1);
-
-  f.data = data;
-  Kokkos::parallel_for(1, f);
-  execution_space().fence();
-
-  Kokkos::deep_copy(h_data, data);
-  T val = h_data();
-
-  return val;
-}
-
-template <class T>
-T WrappingIncAtomicCheck(T i0, T i1) {
-  T* data = new T[1];
-  data[0] = 0;
-
-  // Wraps to 0 when i0 >= i1
-  *data = ((i0 >= i1) ? (T)0 : i0 + (T)1);
-
-  T val = *data;
-  delete[] data;
-
-  return val;
-}
-
-template <class T, class DeviceType>
-bool WrappingIncAtomicTest(T i0, T i1) {
-  T res       = WrappingIncAtomic<T, DeviceType>(i0, i1);
-  T resSerial = WrappingIncAtomicCheck<T>(i0, i1);
-
-  bool passed = true;
-
-  if (resSerial != res) {
-    passed = false;
-
-    std::cout << "Loop<" << typeid(T).name()
-              << ">( test = WrappingIncAtomicTest"
-              << " FAILED : " << resSerial << " != " << res << std::endl;
+struct DivAtomicTest {
+  template <class T>
+  KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op,
+                                        T* ptr_op_fetch, T update) {
+    Kokkos::atomic_div(ptr_op, update);
+    T old_val = Kokkos::atomic_fetch_div(ptr_fetch_op, update);
+    T new_val = Kokkos::atomic_div_fetch(ptr_op_fetch, update);
+    return Kokkos::pair<T, T>(old_val, new_val);
   }
-
-  return passed;
-}
-
-//---------------------------------------------------
-//--------------atomic_decrement---------------------
-//---------------------------------------------------
-
-template <class T, class DEVICE_TYPE>
-struct DecFunctor {
-  using execution_space = DEVICE_TYPE;
-  using type            = Kokkos::View<T, execution_space>;
-
-  type data;
-  T i0;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(int) const { Kokkos::atomic_decrement(&data()); }
-
-  DecFunctor(T _i0) : i0(_i0) {}
+  template <class T>
+  KOKKOS_FUNCTION static T op(T old, T update) {
+    return old / update;
+  }
+  static const char* name() { return "div"; }
 };
 
-template <class T, class execution_space>
-T DecAtomic(T i0) {
-  struct InitFunctor<T, execution_space> f_init(i0);
-  typename InitFunctor<T, execution_space>::type data("Data");
-  typename InitFunctor<T, execution_space>::h_type h_data("HData");
-
-  f_init.data = data;
-  Kokkos::parallel_for(1, f_init);
-  execution_space().fence();
-
-  struct DecFunctor<T, execution_space> f(i0);
-
-  f.data = data;
-  Kokkos::parallel_for(1, f);
-  execution_space().fence();
-
-  Kokkos::deep_copy(h_data, data);
-  T val = h_data();
-
-  return val;
-}
-
-template <class T>
-T DecAtomicCheck(T i0) {
-  T* data = new T[1];
-  data[0] = 0;
-
-  *data = i0 - 1;
-
-  T val = *data;
-  delete[] data;
-
-  return val;
-}
-
-template <class T, class DeviceType>
-bool DecAtomicTest(T i0) {
-  T res       = DecAtomic<T, DeviceType>(i0);
-  T resSerial = DecAtomicCheck<T>(i0);
-
-  bool passed = true;
-
-  if (resSerial != res) {
-    passed = false;
-
-    std::cout << "Loop<" << typeid(T).name() << ">( test = DecAtomicTest"
-              << " FAILED : " << resSerial << " != " << res << std::endl;
+struct ModAtomicTest {
+  template <class T>
+  KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op,
+                                        T* ptr_op_fetch, T update) {
+    // Kokkos::atomic_mod(ptr_op, update);
+    (void)Kokkos::atomic_fetch_mod(ptr_op, update);
+    T old_val = Kokkos::atomic_fetch_mod(ptr_fetch_op, update);
+    T new_val = Kokkos::atomic_mod_fetch(ptr_op_fetch, update);
+    return Kokkos::pair<T, T>(old_val, new_val);
   }
-
-  return passed;
-}
-
-//---------------------------------------------------
-//-------------atomic_wrapping_decrement-------------
-//---------------------------------------------------
-
-template <class T, class DEVICE_TYPE>
-struct WrappingDecFunctor {
-  using execution_space = DEVICE_TYPE;
-  using type            = Kokkos::View<T, execution_space>;
-
-  type data;
-  T i0;
-  T i1;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
-    desul::atomic_fetch_dec_mod(&data(), (T)i1, desul::MemoryOrderRelaxed(),
-                                desul::MemoryScopeDevice());
-#endif
+  template <class T>
+  KOKKOS_FUNCTION static T op(T old, T update) {
+    return old % update;
   }
-
-  WrappingDecFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {}
+  static const char* name() { return "mod"; }
 };
 
-template <class T, class execution_space>
-T WrappingDecAtomic(T i0, T i1) {
-  struct InitFunctor<T, execution_space> f_init(i0);
-  typename InitFunctor<T, execution_space>::type data("Data");
-  typename InitFunctor<T, execution_space>::h_type h_data("HData");
-
-  f_init.data = data;
-  Kokkos::parallel_for(1, f_init);
-  execution_space().fence();
-
-  struct WrappingDecFunctor<T, execution_space> f(i0, i1);
-
-  f.data = data;
-  Kokkos::parallel_for(1, f);
-  execution_space().fence();
-
-  Kokkos::deep_copy(h_data, data);
-  T val = h_data();
-
-  return val;
-}
-
-template <class T>
-T WrappingDecAtomicCheck(T i0, T i1) {
-  T* data = new T[1];
-  data[0] = 0;
-
-  // Wraps to i1 when i0 <= 0
-  // i0 should never be negative
-  *data = ((i0 <= (T)0) ? i1 : i0 - (T)1);
-
-  T val = *data;
-  delete[] data;
-
-  return val;
-}
-
-template <class T, class DeviceType>
-bool WrappingDecAtomicTest(T i0, T i1) {
-  T res       = WrappingDecAtomic<T, DeviceType>(i0, i1);
-  T resSerial = WrappingDecAtomicCheck<T>(i0, i1);
-
-  bool passed = true;
-
-  if (resSerial != res) {
-    passed = false;
-
-    std::cout << "Loop<" << typeid(T).name()
-              << ">( test = WrappingDecAtomicTest"
-              << " FAILED : " << resSerial << " != " << res << std::endl;
+struct AndAtomicTest {
+  template <class T>
+  KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op,
+                                        T* ptr_op_fetch, T update) {
+    Kokkos::atomic_and(ptr_op, update);
+    T old_val = Kokkos::atomic_fetch_and(ptr_fetch_op, update);
+    T new_val = Kokkos::atomic_and_fetch(ptr_op_fetch, update);
+    return Kokkos::pair<T, T>(old_val, new_val);
   }
-
-  return passed;
-}
-
-//---------------------------------------------------
-//--------------atomic_fetch_mul---------------------
-//---------------------------------------------------
-
-template <class T, class DEVICE_TYPE>
-struct MulFunctor {
-  using execution_space = DEVICE_TYPE;
-  using type            = Kokkos::View<T, execution_space>;
-
-  type data;
-  T i0;
-  T i1;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(int) const { Kokkos::atomic_fetch_mul(&data(), (T)i1); }
-
-  MulFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {}
+  template <class T>
+  KOKKOS_FUNCTION static T op(T old, T update) {
+    return old & update;
+  }
+  static const char* name() { return "and"; }
 };
 
-template <class T, class execution_space>
-T MulAtomic(T i0, T i1) {
-  struct InitFunctor<T, execution_space> f_init(i0);
-  typename InitFunctor<T, execution_space>::type data("Data");
-  typename InitFunctor<T, execution_space>::h_type h_data("HData");
-
-  f_init.data = data;
-  Kokkos::parallel_for(1, f_init);
-  execution_space().fence();
-
-  struct MulFunctor<T, execution_space> f(i0, i1);
-
-  f.data = data;
-  Kokkos::parallel_for(1, f);
-  execution_space().fence();
-
-  Kokkos::deep_copy(h_data, data);
-  T val = h_data();
-
-  return val;
-}
-
-template <class T>
-T MulAtomicCheck(T i0, T i1) {
-  T* data = new T[1];
-  data[0] = 0;
-
-  *data = i0 * i1;
-
-  T val = *data;
-  delete[] data;
-
-  return val;
-}
-
-template <class T, class DeviceType>
-bool MulAtomicTest(T i0, T i1) {
-  T res       = MulAtomic<T, DeviceType>(i0, i1);
-  T resSerial = MulAtomicCheck<T>(i0, i1);
-
-  bool passed = true;
-
-  if (resSerial != res) {
-    passed = false;
-
-    std::cout << "Loop<" << typeid(T).name() << ">( test = MulAtomicTest"
-              << " FAILED : " << resSerial << " != " << res << std::endl;
+struct OrAtomicTest {
+  template <class T>
+  KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op,
+                                        T* ptr_op_fetch, T update) {
+    Kokkos::atomic_or(ptr_op, update);
+    T old_val = Kokkos::atomic_fetch_or(ptr_fetch_op, update);
+    T new_val = Kokkos::atomic_or_fetch(ptr_op_fetch, update);
+    return Kokkos::pair<T, T>(old_val, new_val);
   }
-
-  return passed;
-}
-
-//---------------------------------------------------
-//--------------atomic_fetch_div---------------------
-//---------------------------------------------------
-
-template <class T, class DEVICE_TYPE>
-struct DivFunctor {
-  using execution_space = DEVICE_TYPE;
-  using type            = Kokkos::View<T, execution_space>;
-
-  type data;
-  T i0;
-  T i1;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(int) const { Kokkos::atomic_fetch_div(&data(), (T)i1); }
-
-  DivFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {}
+  template <class T>
+  KOKKOS_FUNCTION static T op(T old, T update) {
+    return old | update;
+  }
+  static const char* name() { return "or"; }
 };
 
-template <class T, class execution_space>
-T DivAtomic(T i0, T i1) {
-  struct InitFunctor<T, execution_space> f_init(i0);
-  typename InitFunctor<T, execution_space>::type data("Data");
-  typename InitFunctor<T, execution_space>::h_type h_data("HData");
-
-  f_init.data = data;
-  Kokkos::parallel_for(1, f_init);
-  execution_space().fence();
-
-  struct DivFunctor<T, execution_space> f(i0, i1);
-
-  f.data = data;
-  Kokkos::parallel_for(1, f);
-  execution_space().fence();
-
-  Kokkos::deep_copy(h_data, data);
-  T val = h_data();
-
-  return val;
-}
-
-template <class T>
-T DivAtomicCheck(T i0, T i1) {
-  T* data = new T[1];
-  data[0] = 0;
-
-  *data = i0 / i1;
-
-  T val = *data;
-  delete[] data;
-
-  return val;
-}
-
-template <class T, class DeviceType>
-bool DivAtomicTest(T i0, T i1) {
-  T res       = DivAtomic<T, DeviceType>(i0, i1);
-  T resSerial = DivAtomicCheck<T>(i0, i1);
-
-  bool passed = true;
-
-  using Kokkos::abs;
-  if (abs((resSerial - res) * 1.) > 1e-5) {
-    passed = false;
-
-    std::cout << "Loop<" << typeid(T).name() << ">( test = DivAtomicTest"
-              << " FAILED : " << resSerial << " != " << res << std::endl;
+struct XorAtomicTest {
+  template <class T>
+  KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op,
+                                        T* ptr_op_fetch, T update) {
+    // Kokkos::atomic_xor(ptr_op, update);
+    (void)Kokkos::atomic_fetch_xor(ptr_op, update);
+    T old_val = Kokkos::atomic_fetch_xor(ptr_fetch_op, update);
+    T new_val = Kokkos::atomic_xor_fetch(ptr_op_fetch, update);
+    return Kokkos::pair<T, T>(old_val, new_val);
   }
-
-  return passed;
-}
-
-//---------------------------------------------------
-//--------------atomic_fetch_mod---------------------
-//---------------------------------------------------
-
-template <class T, class DEVICE_TYPE>
-struct ModFunctor {
-  using execution_space = DEVICE_TYPE;
-  using type            = Kokkos::View<T, execution_space>;
-
-  type data;
-  T i0;
-  T i1;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(int) const { Kokkos::atomic_fetch_mod(&data(), (T)i1); }
-
-  ModFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {}
+  template <class T>
+  KOKKOS_FUNCTION static T op(T old, T update) {
+    return old ^ update;
+  }
+  static const char* name() { return "xor"; }
 };
 
-template <class T, class execution_space>
-T ModAtomic(T i0, T i1) {
-  struct InitFunctor<T, execution_space> f_init(i0);
-  typename InitFunctor<T, execution_space>::type data("Data");
-  typename InitFunctor<T, execution_space>::h_type h_data("HData");
-
-  f_init.data = data;
-  Kokkos::parallel_for(1, f_init);
-  execution_space().fence();
-
-  struct ModFunctor<T, execution_space> f(i0, i1);
-
-  f.data = data;
-  Kokkos::parallel_for(1, f);
-  execution_space().fence();
-
-  Kokkos::deep_copy(h_data, data);
-  T val = h_data();
-
-  return val;
-}
-
-template <class T>
-T ModAtomicCheck(T i0, T i1) {
-  T* data = new T[1];
-  data[0] = 0;
-
-  *data = i0 % i1;
-
-  T val = *data;
-  delete[] data;
-
-  return val;
-}
-
-template <class T, class DeviceType>
-bool ModAtomicTest(T i0, T i1) {
-  T res       = ModAtomic<T, DeviceType>(i0, i1);
-  T resSerial = ModAtomicCheck<T>(i0, i1);
-
-  bool passed = true;
-
-  if (resSerial != res) {
-    passed = false;
-
-    std::cout << "Loop<" << typeid(T).name() << ">( test = ModAtomicTest"
-              << " FAILED : " << resSerial << " != " << res << std::endl;
+struct NandAtomicTest {
+  template <class T>
+  KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op,
+                                        T* ptr_op_fetch, T update) {
+    // Kokkos::atomic_nand(ptr_op, update);
+    (void)Kokkos::atomic_fetch_nand(ptr_op, update);
+    T old_val = Kokkos::atomic_fetch_nand(ptr_fetch_op, update);
+    T new_val = Kokkos::atomic_nand_fetch(ptr_op_fetch, update);
+    return Kokkos::pair<T, T>(old_val, new_val);
   }
-
-  return passed;
-}
-
-//---------------------------------------------------
-//--------------atomic_fetch_and---------------------
-//---------------------------------------------------
-
-template <class T, class DEVICE_TYPE>
-struct AndFunctor {
-  using execution_space = DEVICE_TYPE;
-  using type            = Kokkos::View<T, execution_space>;
-
-  type data;
-  T i0;
-  T i1;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-    T result = Kokkos::atomic_fetch_and(&data(), (T)i1);
-    Kokkos::atomic_and(&data(), result);
+  template <class T>
+  KOKKOS_FUNCTION static T op(T old, T update) {
+    return ~(old & update);
   }
-
-  AndFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {}
+  static const char* name() { return "nand"; }
 };
 
-template <class T, class execution_space>
-T AndAtomic(T i0, T i1) {
-  struct InitFunctor<T, execution_space> f_init(i0);
-  typename InitFunctor<T, execution_space>::type data("Data");
-  typename InitFunctor<T, execution_space>::h_type h_data("HData");
-
-  f_init.data = data;
-  Kokkos::parallel_for(1, f_init);
-  execution_space().fence();
-
-  struct AndFunctor<T, execution_space> f(i0, i1);
-
-  f.data = data;
-  Kokkos::parallel_for(1, f);
-  execution_space().fence();
-
-  Kokkos::deep_copy(h_data, data);
-  T val = h_data();
-
-  return val;
-}
-
-template <class T>
-T AndAtomicCheck(T i0, T i1) {
-  T* data = new T[1];
-  data[0] = 0;
-
-  *data = i0 & i1;
-
-  T val = *data;
-  delete[] data;
-
-  return val;
-}
-
-template <class T, class DeviceType>
-bool AndAtomicTest(T i0, T i1) {
-  T res       = AndAtomic<T, DeviceType>(i0, i1);
-  T resSerial = AndAtomicCheck<T>(i0, i1);
-
-  bool passed = true;
-
-  if (resSerial != res) {
-    passed = false;
-
-    std::cout << "Loop<" << typeid(T).name() << ">( test = AndAtomicTest"
-              << " FAILED : " << resSerial << " != " << res << std::endl;
+struct LShiftAtomicTest {
+  template <class T>
+  KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op,
+                                        T* ptr_op_fetch, T update) {
+    // Kokkos::atomic_lshift(ptr_op, update);
+    (void)Kokkos::atomic_fetch_lshift(ptr_op, update);
+    T old_val = Kokkos::atomic_fetch_lshift(ptr_fetch_op, update);
+    T new_val = Kokkos::atomic_lshift_fetch(ptr_op_fetch, update);
+    return Kokkos::pair<T, T>(old_val, new_val);
   }
-
-  return passed;
-}
-
-//---------------------------------------------------
-//--------------atomic_fetch_or----------------------
-//---------------------------------------------------
-
-template <class T, class DEVICE_TYPE>
-struct OrFunctor {
-  using execution_space = DEVICE_TYPE;
-  using type            = Kokkos::View<T, execution_space>;
-
-  type data;
-  T i0;
-  T i1;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-    T result = Kokkos::atomic_fetch_or(&data(), (T)i1);
-    Kokkos::atomic_or(&data(), result);
+  template <class T>
+  KOKKOS_FUNCTION static T op(T old, T update) {
+    return old << update;
   }
-
-  OrFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {}
+  static const char* name() { return "lshift"; }
 };
 
-template <class T, class execution_space>
-T OrAtomic(T i0, T i1) {
-  struct InitFunctor<T, execution_space> f_init(i0);
-  typename InitFunctor<T, execution_space>::type data("Data");
-  typename InitFunctor<T, execution_space>::h_type h_data("HData");
-
-  f_init.data = data;
-  Kokkos::parallel_for(1, f_init);
-  execution_space().fence();
-
-  struct OrFunctor<T, execution_space> f(i0, i1);
-
-  f.data = data;
-  Kokkos::parallel_for(1, f);
-  execution_space().fence();
-
-  Kokkos::deep_copy(h_data, data);
-  T val = h_data();
-
-  return val;
-}
-
-template <class T>
-T OrAtomicCheck(T i0, T i1) {
-  T* data = new T[1];
-  data[0] = 0;
-
-  *data = i0 | i1;
-
-  T val = *data;
-  delete[] data;
-
-  return val;
-}
-
-template <class T, class DeviceType>
-bool OrAtomicTest(T i0, T i1) {
-  T res       = OrAtomic<T, DeviceType>(i0, i1);
-  T resSerial = OrAtomicCheck<T>(i0, i1);
-
-  bool passed = true;
-
-  if (resSerial != res) {
-    passed = false;
-
-    std::cout << "Loop<" << typeid(T).name() << ">( test = OrAtomicTest"
-              << " FAILED : " << resSerial << " != " << res << std::endl;
+struct RShiftAtomicTest {
+  template <class T>
+  KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op,
+                                        T* ptr_op_fetch, T update) {
+    // Kokkos::atomic_rshift(ptr_op, update); not implemented
+    (void)Kokkos::atomic_fetch_rshift(ptr_op, update);
+    T old_val = Kokkos::atomic_fetch_rshift(ptr_fetch_op, update);
+    T new_val = Kokkos::atomic_rshift_fetch(ptr_op_fetch, update);
+    return Kokkos::pair<T, T>(old_val, new_val);
   }
-
-  return passed;
-}
-
-//---------------------------------------------------
-//--------------atomic_fetch_xor---------------------
-//---------------------------------------------------
-
-template <class T, class DEVICE_TYPE>
-struct XorFunctor {
-  using execution_space = DEVICE_TYPE;
-  using type            = Kokkos::View<T, execution_space>;
-
-  type data;
-  T i0;
-  T i1;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(int) const { Kokkos::atomic_fetch_xor(&data(), (T)i1); }
-
-  XorFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {}
+  template <class T>
+  KOKKOS_FUNCTION static T op(T old, T update) {
+    return old >> update;
+  }
+  static const char* name() { return "rshift"; }
 };
 
-template <class T, class execution_space>
-T XorAtomic(T i0, T i1) {
-  struct InitFunctor<T, execution_space> f_init(i0);
-  typename InitFunctor<T, execution_space>::type data("Data");
-  typename InitFunctor<T, execution_space>::h_type h_data("HData");
-
-  f_init.data = data;
-  Kokkos::parallel_for(1, f_init);
-  execution_space().fence();
-
-  struct XorFunctor<T, execution_space> f(i0, i1);
-
-  f.data = data;
-  Kokkos::parallel_for(1, f);
-  execution_space().fence();
-
-  Kokkos::deep_copy(h_data, data);
-  T val = h_data();
-
-  return val;
-}
-
-template <class T>
-T XorAtomicCheck(T i0, T i1) {
-  T* data = new T[1];
-  data[0] = 0;
-
-  *data = i0 ^ i1;
-
-  T val = *data;
-  delete[] data;
-
-  return val;
-}
-
-template <class T, class DeviceType>
-bool XorAtomicTest(T i0, T i1) {
-  T res       = XorAtomic<T, DeviceType>(i0, i1);
-  T resSerial = XorAtomicCheck<T>(i0, i1);
-
-  bool passed = true;
-
-  if (resSerial != res) {
-    passed = false;
-
-    std::cout << "Loop<" << typeid(T).name() << ">( test = XorAtomicTest"
-              << " FAILED : " << resSerial << " != " << res << std::endl;
+struct LoadStoreAtomicTest {
+  template <class T>
+  KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op,
+                                        T* ptr_op_fetch, T update) {
+    T old_val = Kokkos::atomic_load(ptr_op);
+    Kokkos::atomic_store(ptr_op, update);
+    Kokkos::atomic_store(ptr_op_fetch, update);
+    Kokkos::atomic_store(ptr_fetch_op, update);
+    return Kokkos::pair<T, T>(old_val, update);
   }
-
-  return passed;
-}
-
-//---------------------------------------------------
-//--------------atomic_fetch_lshift---------------------
-//---------------------------------------------------
-
-template <class T, class DEVICE_TYPE>
-struct LShiftFunctor {
-  using execution_space = DEVICE_TYPE;
-  using type            = Kokkos::View<T, execution_space>;
-
-  type data;
-  T i0;
-  T i1;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(int) const { Kokkos::atomic_fetch_lshift(&data(), (T)i1); }
-
-  LShiftFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {}
+  template <class T>
+  KOKKOS_FUNCTION static T op(T, T update) {
+    return update;
+  }
+  static const char* name() { return "load/store"; }
 };
 
-template <class T, class execution_space>
-T LShiftAtomic(T i0, T i1) {
-  struct InitFunctor<T, execution_space> f_init(i0);
-  typename InitFunctor<T, execution_space>::type data("Data");
-  typename InitFunctor<T, execution_space>::h_type h_data("HData");
-
-  f_init.data = data;
-  Kokkos::parallel_for(1, f_init);
-  execution_space().fence();
-
-  struct LShiftFunctor<T, execution_space> f(i0, i1);
-
-  f.data = data;
-  Kokkos::parallel_for(1, f);
-  execution_space().fence();
-
-  Kokkos::deep_copy(h_data, data);
-  T val = h_data();
-
-  return val;
-}
-
-template <class T>
-T LShiftAtomicCheck(T i0, T i1) {
-  T* data = new T[1];
-  data[0] = 0;
-
-  *data = i0 << i1;
-
-  T val = *data;
-  delete[] data;
-
-  return val;
-}
-
-template <class T, class DeviceType>
-bool LShiftAtomicTest(T i0, T i1) {
-  T res       = LShiftAtomic<T, DeviceType>(i0, i1);
-  T resSerial = LShiftAtomicCheck<T>(i0, i1);
-
-  bool passed = true;
-
-  if (resSerial != res) {
-    passed = false;
-
-    std::cout << "Loop<" << typeid(T).name() << ">( test = LShiftAtomicTest"
-              << " FAILED : " << resSerial << " != " << res << std::endl;
+struct IncModAtomicTest {
+  template <class T>
+  KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op,
+                                        T* ptr_op_fetch, T wrap_value) {
+    // no atomic_inc_mod in desul
+    (void)desul::atomic_fetch_inc_mod(ptr_op, wrap_value,
+                                      desul::MemoryOrderRelaxed(),
+                                      desul::MemoryScopeDevice());
+    T old_val = desul::atomic_fetch_inc_mod(ptr_fetch_op, wrap_value,
+                                            desul::MemoryOrderRelaxed(),
+                                            desul::MemoryScopeDevice());
+    // no atomic_inc_mod_fetch in desul
+    (void)desul::atomic_fetch_inc_mod(ptr_op_fetch, wrap_value,
+                                      desul::MemoryOrderRelaxed(),
+                                      desul::MemoryScopeDevice());
+    T new_val = op(old_val, wrap_value);
+    return Kokkos::pair<T, T>(old_val, new_val);
   }
-
-  return passed;
-}
-
-//---------------------------------------------------
-//--------------atomic_fetch_rshift---------------------
-//---------------------------------------------------
-
-template <class T, class DEVICE_TYPE>
-struct RShiftFunctor {
-  using execution_space = DEVICE_TYPE;
-  using type            = Kokkos::View<T, execution_space>;
-
-  type data;
-  T i0;
-  T i1;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(int) const { Kokkos::atomic_fetch_rshift(&data(), (T)i1); }
-
-  RShiftFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {}
+  template <class T>
+  KOKKOS_FUNCTION static T op(T old, T wrap_value) {
+    return old + 1 > wrap_value ? 0 : old + 1;
+  }
+  static const char* name() { return "inc_mod"; }
 };
 
-template <class T, class execution_space>
-T RShiftAtomic(T i0, T i1) {
-  struct InitFunctor<T, execution_space> f_init(i0);
-  typename InitFunctor<T, execution_space>::type data("Data");
-  typename InitFunctor<T, execution_space>::h_type h_data("HData");
-
-  f_init.data = data;
-  Kokkos::parallel_for(1, f_init);
-  execution_space().fence();
-
-  struct RShiftFunctor<T, execution_space> f(i0, i1);
-
-  f.data = data;
-  Kokkos::parallel_for(1, f);
-  execution_space().fence();
-
-  Kokkos::deep_copy(h_data, data);
-  T val = h_data();
-
-  return val;
-}
-
-template <class T>
-T RShiftAtomicCheck(T i0, T i1) {
-  T* data = new T[1];
-  data[0] = 0;
-
-  *data = i0 >> i1;
-
-  T val = *data;
-  delete[] data;
-
-  return val;
-}
-
-template <class T, class DeviceType>
-bool RShiftAtomicTest(T i0, T i1) {
-  T res       = RShiftAtomic<T, DeviceType>(i0, i1);
-  T resSerial = RShiftAtomicCheck<T>(i0, i1);
-
-  bool passed = true;
-
-  if (resSerial != res) {
-    passed = false;
-
-    std::cout << "Loop<" << typeid(T).name() << ">( test = RShiftAtomicTest"
-              << " FAILED : " << resSerial << " != " << res << std::endl;
+struct DecModAtomicTest {
+  template <class T>
+  KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op,
+                                        T* ptr_op_fetch, T wrap_value) {
+    // no atomic_dec_mod in desul
+    (void)desul::atomic_fetch_dec_mod(ptr_op, wrap_value,
+                                      desul::MemoryOrderRelaxed(),
+                                      desul::MemoryScopeDevice());
+    T old_val = desul::atomic_fetch_dec_mod(ptr_fetch_op, wrap_value,
+                                            desul::MemoryOrderRelaxed(),
+                                            desul::MemoryScopeDevice());
+    // no atomic_dec_mod_fetch in desul
+    (void)desul::atomic_fetch_dec_mod(ptr_op_fetch, wrap_value,
+                                      desul::MemoryOrderRelaxed(),
+                                      desul::MemoryScopeDevice());
+    T new_val = op(old_val, wrap_value);
+    return Kokkos::pair<T, T>(old_val, new_val);
+  }
+  template <class T>
+  KOKKOS_FUNCTION static T op(T old, T wrap_value) {
+    return ((old == 0) || (old > wrap_value)) ? wrap_value : old - 1;
   }
+  static const char* name() { return "dec_mod"; }
+};
 
-  return passed;
+template <class Op, class T, class ExecSpace>
+bool atomic_op_test(T old_val, T update) {
+  Kokkos::View<T[3], ExecSpace> op_data("op_data");
+  Kokkos::deep_copy(op_data, old_val);
+  int result = 0;
+  Kokkos::parallel_reduce(
+      Kokkos::RangePolicy<ExecSpace>(0, 1),
+      KOKKOS_LAMBDA(int, int& local_result) {
+        auto fetch_result =
+            Op::atomic_op(&op_data(0), &op_data(1), &op_data(2), update);
+        T expected_val = Op::op(old_val, update);
+        Kokkos::memory_fence();
+        if (op_data(0) != expected_val) local_result += 1;
+        if (op_data(1) != expected_val) local_result += 2;
+        if (op_data(2) != expected_val) local_result += 4;
+        if (fetch_result.first != old_val) local_result += 8;
+        if (fetch_result.second != expected_val) local_result += 16;
+      },
+      result);
+  if ((result & 1) != 0)
+    printf("atomic_%s failed with type %s\n", Op::name(), typeid(T).name());
+  if ((result & 2) != 0)
+    printf("atomic_fetch_%s failed with type %s\n", Op::name(),
+           typeid(T).name());
+  if ((result & 4) != 0)
+    printf("atomic_%s_fetch failed with type %s\n", Op::name(),
+           typeid(T).name());
+  if ((result & 8) != 0)
+    printf("atomic_fetch_%s did not return old value with type %s\n",
+           Op::name(), typeid(T).name());
+  if ((result & 16) != 0)
+    printf("atomic_%s_fetch did not return updated value with type %s\n",
+           Op::name(), typeid(T).name());
+
+  return result == 0;
 }
 
 //---------------------------------------------------
 //--------------atomic_test_control------------------
 //---------------------------------------------------
 
-template <class T, class DeviceType>
-bool AtomicOperationsTestIntegralType(int i0, int i1, int test) {
+template <class T, class ExecSpace>
+bool AtomicOperationsTestIntegralType(int old_val_in, int update_in, int test) {
+  T old_val = static_cast<T>(old_val_in);
+  T update  = static_cast<T>(update_in);
   switch (test) {
-    case 1: return MaxAtomicTest<T, DeviceType>((T)i0, (T)i1);
-    case 2: return MinAtomicTest<T, DeviceType>((T)i0, (T)i1);
-    case 3: return MulAtomicTest<T, DeviceType>((T)i0, (T)i1);
-    case 4: return DivAtomicTest<T, DeviceType>((T)i0, (T)i1);
-    case 5: return ModAtomicTest<T, DeviceType>((T)i0, (T)i1);
-    case 6: return AndAtomicTest<T, DeviceType>((T)i0, (T)i1);
-    case 7: return OrAtomicTest<T, DeviceType>((T)i0, (T)i1);
-    case 8: return XorAtomicTest<T, DeviceType>((T)i0, (T)i1);
-    case 9: return LShiftAtomicTest<T, DeviceType>((T)i0, (T)i1);
-    case 10: return RShiftAtomicTest<T, DeviceType>((T)i0, (T)i1);
-    case 11: return IncAtomicTest<T, DeviceType>((T)i0);
-    case 12: return DecAtomicTest<T, DeviceType>((T)i0);
-    case 13: return LoadStoreAtomicTest<T, DeviceType>((T)i0, (T)i1);
+    case 0: return atomic_op_test<AddAtomicTest, T, ExecSpace>(old_val, update);
+    case 1: return atomic_op_test<SubAtomicTest, T, ExecSpace>(old_val, update);
+    case 2: return atomic_op_test<MaxAtomicTest, T, ExecSpace>(old_val, update);
+    case 3: return atomic_op_test<MinAtomicTest, T, ExecSpace>(old_val, update);
+    case 4: return atomic_op_test<MulAtomicTest, T, ExecSpace>(old_val, update);
+    case 5:
+      return update != 0
+                 ? atomic_op_test<DivAtomicTest, T, ExecSpace>(old_val, update)
+                 : true;
+    case 6:
+      return update != 0
+                 ? atomic_op_test<ModAtomicTest, T, ExecSpace>(old_val, update)
+                 : true;
+    case 7: return atomic_op_test<AndAtomicTest, T, ExecSpace>(old_val, update);
+    case 8: return atomic_op_test<OrAtomicTest, T, ExecSpace>(old_val, update);
+    case 9: return atomic_op_test<XorAtomicTest, T, ExecSpace>(old_val, update);
+    case 10:
+      return atomic_op_test<NandAtomicTest, T, ExecSpace>(old_val, update);
+    case 11:
+      return update_in >= 0 ? atomic_op_test<LShiftAtomicTest, T, ExecSpace>(
+                                  old_val, update)
+                            : true;
+    case 12:
+      return update_in >= 0 ? atomic_op_test<RShiftAtomicTest, T, ExecSpace>(
+                                  old_val, update)
+                            : true;
+    case 13:
+      return atomic_op_test<IncAtomicTest, T, ExecSpace>(old_val, update);
+    case 14:
+      return atomic_op_test<DecAtomicTest, T, ExecSpace>(old_val, update);
+    case 15:
+      return atomic_op_test<LoadStoreAtomicTest, T, ExecSpace>(old_val, update);
   }
 
-  return 0;
+  return true;
 }
 
-template <class T, class DeviceType>
-bool AtomicOperationsTestUnsignedIntegralType(int i0, int i1, int test) {
+template <class T, class ExecSpace>
+bool AtomicOperationsTestUnsignedIntegralType(int old_val_in, int update_in,
+                                              int test) {
+  T old_val = static_cast<T>(old_val_in);
+  T update  = static_cast<T>(update_in);
   switch (test) {
-    case 1: return WrappingIncAtomicTest<T, DeviceType>((T)i0, (T)i1);
-    case 2: return WrappingDecAtomicTest<T, DeviceType>((T)i0, (T)i1);
+    case 1:
+      return atomic_op_test<IncModAtomicTest, T, ExecSpace>(old_val, update);
+    case 2:
+      return atomic_op_test<DecModAtomicTest, T, ExecSpace>(old_val, update);
   }
 
-  return 0;
+  return true;
 }
 
-template <class T, class DeviceType>
-bool AtomicOperationsTestNonIntegralType(int i0, int i1, int test) {
+template <class T, class ExecSpace>
+bool AtomicOperationsTestNonIntegralType(int old_val_in, int update_in,
+                                         int test) {
+  T old_val = static_cast<T>(old_val_in);
+  T update  = static_cast<T>(update_in);
   switch (test) {
-    case 1: return MaxAtomicTest<T, DeviceType>((T)i0, (T)i1);
-    case 2: return MinAtomicTest<T, DeviceType>((T)i0, (T)i1);
-    case 3: return MulAtomicTest<T, DeviceType>((T)i0, (T)i1);
-    case 4: return DivAtomicTest<T, DeviceType>((T)i0, (T)i1);
-    case 5: return LoadStoreAtomicTest<T, DeviceType>((T)i0, (T)i1);
+    case 0: return atomic_op_test<AddAtomicTest, T, ExecSpace>(old_val, update);
+    case 1: return atomic_op_test<SubAtomicTest, T, ExecSpace>(old_val, update);
+    case 2: return atomic_op_test<MaxAtomicTest, T, ExecSpace>(old_val, update);
+    case 3: return atomic_op_test<MinAtomicTest, T, ExecSpace>(old_val, update);
+    case 4: return atomic_op_test<MulAtomicTest, T, ExecSpace>(old_val, update);
+    case 5:
+      return update != 0
+                 ? atomic_op_test<DivAtomicTest, T, ExecSpace>(old_val, update)
+                 : true;
+    case 6:
+      return atomic_op_test<LoadStoreAtomicTest, T, ExecSpace>(old_val, update);
   }
 
-  return 0;
+  return true;
 }
-
 }  // namespace TestAtomicOperations
diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_complexdouble.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_complexdouble.hpp
index 9f55aa947b3d5ce006c859965eabf172bfb529c1..5708fd2ebf732a4129683489714499f189c0bb6e 100644
--- a/packages/kokkos/core/unit_test/TestAtomicOperations_complexdouble.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomicOperations_complexdouble.hpp
@@ -16,17 +16,38 @@
 
 #include <TestAtomicOperations.hpp>
 
+using namespace TestAtomicOperations;
+
 namespace Test {
 TEST(TEST_CATEGORY, atomic_operations_complexdouble) {
-  const int start = 1;  // Avoid zero for division.
+#if defined(KOKKOS_ENABLE_SYCL) && \
+    !defined(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED)
+  if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::SYCL>)
+    GTEST_SKIP() << "skipping since device_global variables are not available";
+#endif
+  const int start = -5;
   const int end   = 11;
   for (int i = start; i < end; ++i) {
+    using T   = Kokkos::complex<double>;
+    T old_val = static_cast<T>(i);
+    T update  = static_cast<T>(end - i - start);
+    ASSERT_TRUE(
+        (atomic_op_test<AddAtomicTest, T, TEST_EXECSPACE>(old_val, update)));
     ASSERT_TRUE(
-        (TestAtomicOperations::MulAtomicTest<Kokkos::complex<double>,
-                                             TEST_EXECSPACE>(start, end - i)));
+        (atomic_op_test<SubAtomicTest, T, TEST_EXECSPACE>(old_val, update)));
     ASSERT_TRUE(
-        (TestAtomicOperations::DivAtomicTest<Kokkos::complex<double>,
-                                             TEST_EXECSPACE>(start, end - i)));
+        (atomic_op_test<MulAtomicTest, T, TEST_EXECSPACE>(old_val, update)));
+
+    // FIXME_32BIT disable division test for 32bit where we have accuracy issues
+    // with division atomics still compile it though
+    if (sizeof(void*) == 8) {
+      ASSERT_TRUE((update != 0
+                       ? atomic_op_test<DivAtomicTest, T, TEST_EXECSPACE>(
+                             old_val, update)
+                       : true));
+    }
+    ASSERT_TRUE((atomic_op_test<LoadStoreAtomicTest, T, TEST_EXECSPACE>(
+        old_val, update)));
   }
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_complexfloat.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_complexfloat.hpp
index b23e55a339ee45de4353683b8cd3e4194966a86b..97bfeea6ad94c18a278bacae5a7a995d917c1798 100644
--- a/packages/kokkos/core/unit_test/TestAtomicOperations_complexfloat.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomicOperations_complexfloat.hpp
@@ -16,17 +16,33 @@
 
 #include <TestAtomicOperations.hpp>
 
+using namespace TestAtomicOperations;
+
 namespace Test {
 TEST(TEST_CATEGORY, atomic_operations_complexfloat) {
-  const int start = 1;  // Avoid zero for division.
+  const int start = -5;
   const int end   = 11;
   for (int i = start; i < end; ++i) {
+    using T   = Kokkos::complex<float>;
+    T old_val = static_cast<T>(i);
+    T update  = static_cast<T>(end - i - start);
+    ASSERT_TRUE(
+        (atomic_op_test<AddAtomicTest, T, TEST_EXECSPACE>(old_val, update)));
     ASSERT_TRUE(
-        (TestAtomicOperations::MulAtomicTest<Kokkos::complex<float>,
-                                             TEST_EXECSPACE>(start, end - i)));
+        (atomic_op_test<SubAtomicTest, T, TEST_EXECSPACE>(old_val, update)));
     ASSERT_TRUE(
-        (TestAtomicOperations::DivAtomicTest<Kokkos::complex<float>,
-                                             TEST_EXECSPACE>(start, end - i)));
+        (atomic_op_test<MulAtomicTest, T, TEST_EXECSPACE>(old_val, update)));
+
+    // FIXME_32BIT disable division test for 32bit where we have accuracy issues
+    // with division atomics still compile it though
+    if (sizeof(void*) == 8) {
+      ASSERT_TRUE((update != 0
+                       ? atomic_op_test<DivAtomicTest, T, TEST_EXECSPACE>(
+                             old_val, update)
+                       : true));
+    }
+    ASSERT_TRUE((atomic_op_test<LoadStoreAtomicTest, T, TEST_EXECSPACE>(
+        old_val, update)));
   }
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_double.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_double.hpp
index 0dea91f4165a400e9b80a7bac14684ca365b4e1d..30f7e5e3bdedcb98b7975a13e7826b4e056eb7a7 100644
--- a/packages/kokkos/core/unit_test/TestAtomicOperations_double.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomicOperations_double.hpp
@@ -18,19 +18,16 @@
 
 namespace Test {
 TEST(TEST_CATEGORY, atomic_operations_double) {
-  const int start = 1;  // Avoid zero for division.
+  const int start = -5;
   const int end   = 11;
   for (int i = start; i < end; ++i) {
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType<
-                 double, TEST_EXECSPACE>(start, end - i, 1)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType<
-                 double, TEST_EXECSPACE>(start, end - i, 2)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType<
-                 double, TEST_EXECSPACE>(start, end - i, 3)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType<
-                 double, TEST_EXECSPACE>(start, end - i, 4)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType<
-                 double, TEST_EXECSPACE>(start, end - i, 5)));
+    for (int t = 0; t < 8; t++)
+      // FIXME_32BIT disable division test for 32bit where we have accuracy
+      // issues with division atomics still compile it though
+      if (t != 5 || sizeof(void*) == 8) {
+        ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType<
+                     double, TEST_EXECSPACE>(i, end - i + start, t)));
+      }
   }
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_float.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_float.hpp
index 23348f20ec9fcdd7be42977080fe4b25c2a9cb4a..73ea439808926100f8ab39490a986323fdf31524 100644
--- a/packages/kokkos/core/unit_test/TestAtomicOperations_float.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomicOperations_float.hpp
@@ -18,19 +18,16 @@
 
 namespace Test {
 TEST(TEST_CATEGORY, atomic_operations_float) {
-  const int start = 1;  // Avoid zero for division.
+  const int start = -5;
   const int end   = 11;
   for (int i = start; i < end; ++i) {
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType<
-                 float, TEST_EXECSPACE>(start, end - i, 1)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType<
-                 float, TEST_EXECSPACE>(start, end - i, 2)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType<
-                 float, TEST_EXECSPACE>(start, end - i, 3)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType<
-                 float, TEST_EXECSPACE>(start, end - i, 4)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType<
-                 float, TEST_EXECSPACE>(start, end - i, 5)));
+    for (int t = 0; t < 8; t++)
+      // FIXME_32BIT disable division test for 32bit where we have accuracy
+      // issues with division atomics still compile it though
+      if (t != 5 || sizeof(void*) == 8) {
+        ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType<
+                     double, TEST_EXECSPACE>(i, end - i + start, t)));
+      }
   }
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_int.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_int.hpp
index 31cddf20baa438fcd5330b7e5b3ffa39db951a80..5aeaecd7af4ce8ad63986dabd46d9604a4419523 100644
--- a/packages/kokkos/core/unit_test/TestAtomicOperations_int.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomicOperations_int.hpp
@@ -18,33 +18,12 @@
 
 namespace Test {
 TEST(TEST_CATEGORY, atomic_operations_int) {
-  const int start = 1;  // Avoid zero for division.
+  const int start = -5;
   const int end   = 11;
   for (int i = start; i < end; ++i) {
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 int, TEST_EXECSPACE>(start, end - i, 1)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 int, TEST_EXECSPACE>(start, end - i, 2)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 int, TEST_EXECSPACE>(start, end - i, 3)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 int, TEST_EXECSPACE>(start, end - i, 4)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 int, TEST_EXECSPACE>(start, end - i, 5)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 int, TEST_EXECSPACE>(start, end - i, 6)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 int, TEST_EXECSPACE>(start, end - i, 7)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 int, TEST_EXECSPACE>(start, end - i, 8)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 int, TEST_EXECSPACE>(start, end - i, 9)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 int, TEST_EXECSPACE>(start, end - i, 11)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 int, TEST_EXECSPACE>(start, end - i, 12)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 int, TEST_EXECSPACE>(start, end - i, 13)));
+    for (int t = 0; t < 16; t++)
+      ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
+                   int, TEST_EXECSPACE>(i, end - i + start, t)));
   }
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_longint.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_longint.hpp
index 3171e61018f337185c7335f076b7eca8b2d576e1..b181171dd580ea2165cd9d62c020a7627bd3c44c 100644
--- a/packages/kokkos/core/unit_test/TestAtomicOperations_longint.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomicOperations_longint.hpp
@@ -18,33 +18,12 @@
 
 namespace Test {
 TEST(TEST_CATEGORY, atomic_operations_long) {
-  const int start = 1;  // Avoid zero for division.
+  const int start = -5;
   const int end   = 11;
   for (int i = start; i < end; ++i) {
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 long int, TEST_EXECSPACE>(start, end - i, 1)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 long int, TEST_EXECSPACE>(start, end - i, 2)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 long int, TEST_EXECSPACE>(start, end - i, 3)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 long int, TEST_EXECSPACE>(start, end - i, 4)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 long int, TEST_EXECSPACE>(start, end - i, 5)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 long int, TEST_EXECSPACE>(start, end - i, 6)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 long int, TEST_EXECSPACE>(start, end - i, 7)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 long int, TEST_EXECSPACE>(start, end - i, 8)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 long int, TEST_EXECSPACE>(start, end - i, 9)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 long int, TEST_EXECSPACE>(start, end - i, 11)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 long int, TEST_EXECSPACE>(start, end - i, 12)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 long int, TEST_EXECSPACE>(start, end - i, 13)));
+    for (int t = 0; t < 16; t++)
+      ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
+                   long int, TEST_EXECSPACE>(i, end - i + start, t)));
   }
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_longlongint.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_longlongint.hpp
index f7bef416d6344ef2f0df6158b99b28d5331f5a85..aa21722f474e724867c864013d93f162ec1ad185 100644
--- a/packages/kokkos/core/unit_test/TestAtomicOperations_longlongint.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomicOperations_longlongint.hpp
@@ -18,33 +18,12 @@
 
 namespace Test {
 TEST(TEST_CATEGORY, atomic_operations_longlong) {
-  const int start = 1;  // Avoid zero for division.
+  const int start = -5;
   const int end   = 11;
   for (int i = start; i < end; ++i) {
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 long long int, TEST_EXECSPACE>(start, end - i, 1)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 long long int, TEST_EXECSPACE>(start, end - i, 2)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 long long int, TEST_EXECSPACE>(start, end - i, 3)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 long long int, TEST_EXECSPACE>(start, end - i, 4)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 long long int, TEST_EXECSPACE>(start, end - i, 5)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 long long int, TEST_EXECSPACE>(start, end - i, 6)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 long long int, TEST_EXECSPACE>(start, end - i, 7)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 long long int, TEST_EXECSPACE>(start, end - i, 8)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 long long int, TEST_EXECSPACE>(start, end - i, 9)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 long long int, TEST_EXECSPACE>(start, end - i, 11)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 long long int, TEST_EXECSPACE>(start, end - i, 12)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 long long int, TEST_EXECSPACE>(start, end - i, 13)));
+    for (int t = 0; t < 16; t++)
+      ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
+                   long long int, TEST_EXECSPACE>(i, end - i + start, t)));
   }
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedint.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedint.hpp
index 31b2693d64871c90d29a5f13fac940836cb1732b..96acb94bb16649b1084f28d63fcf42c42b47a35d 100644
--- a/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedint.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedint.hpp
@@ -18,41 +18,18 @@
 
 namespace Test {
 TEST(TEST_CATEGORY, atomic_operations_unsigned) {
-  const int start = 1;  // Avoid zero for division.
+  const int start = 0;
   const int end   = 11;
   for (int i = start; i < end; ++i) {
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 unsigned int, TEST_EXECSPACE>(start, end - i, 1)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 unsigned int, TEST_EXECSPACE>(start, end - i, 2)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 unsigned int, TEST_EXECSPACE>(start, end - i, 3)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 unsigned int, TEST_EXECSPACE>(start, end - i, 4)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 unsigned int, TEST_EXECSPACE>(start, end - i, 5)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 unsigned int, TEST_EXECSPACE>(start, end - i, 6)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 unsigned int, TEST_EXECSPACE>(start, end - i, 7)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 unsigned int, TEST_EXECSPACE>(start, end - i, 8)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 unsigned int, TEST_EXECSPACE>(start, end - i, 9)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 unsigned int, TEST_EXECSPACE>(start, end - i, 11)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 unsigned int, TEST_EXECSPACE>(start, end - i, 12)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 unsigned int, TEST_EXECSPACE>(start, end - i, 13)));
-#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+    for (int t = 0; t < 16; t++)
+      ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
+                   unsigned int, TEST_EXECSPACE>(i, end - i + start, t)));
     ASSERT_TRUE(
         (TestAtomicOperations::AtomicOperationsTestUnsignedIntegralType<
-            unsigned int, TEST_EXECSPACE>(start, end - i, 1)));  // Wrapping Inc
+            unsigned int, TEST_EXECSPACE>(i, end - i, 1)));  // Wrapping Inc
     ASSERT_TRUE(
         (TestAtomicOperations::AtomicOperationsTestUnsignedIntegralType<
-            unsigned int, TEST_EXECSPACE>(start, end - i, 2)));  // Wrapping Dec
-#endif
+            unsigned int, TEST_EXECSPACE>(i, end - i, 2)));  // Wrapping Dec
   }
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedlongint.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedlongint.hpp
index 98c2d28b20d7c3a5c207e3789eb8c9322406115a..3482f6fe1ed4f25172e865f2fe2d4fe41f3e8f74 100644
--- a/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedlongint.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedlongint.hpp
@@ -18,41 +18,18 @@
 
 namespace Test {
 TEST(TEST_CATEGORY, atomic_operations_unsignedlong) {
-  const int start = 1;  // Avoid zero for division.
+  const int start = 0;
   const int end   = 11;
   for (int i = start; i < end; ++i) {
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 unsigned long int, TEST_EXECSPACE>(start, end - i, 1)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 unsigned long int, TEST_EXECSPACE>(start, end - i, 2)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 unsigned long int, TEST_EXECSPACE>(start, end - i, 3)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 unsigned long int, TEST_EXECSPACE>(start, end - i, 4)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 unsigned long int, TEST_EXECSPACE>(start, end - i, 5)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 unsigned long int, TEST_EXECSPACE>(start, end - i, 6)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 unsigned long int, TEST_EXECSPACE>(start, end - i, 7)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 unsigned long int, TEST_EXECSPACE>(start, end - i, 8)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 unsigned long int, TEST_EXECSPACE>(start, end - i, 9)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 unsigned long int, TEST_EXECSPACE>(start, end - i, 11)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 unsigned long int, TEST_EXECSPACE>(start, end - i, 12)));
-    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
-                 unsigned long int, TEST_EXECSPACE>(start, end - i, 13)));
-#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+    for (int t = 0; t < 16; t++)
+      ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
+                   unsigned long int, TEST_EXECSPACE>(i, end - i + start, t)));
     ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestUnsignedIntegralType<
-                 unsigned long int, TEST_EXECSPACE>(start, end - i,
+                 unsigned long int, TEST_EXECSPACE>(i, end - i,
                                                     1)));  // Wrapping Inc
     ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestUnsignedIntegralType<
-                 unsigned long int, TEST_EXECSPACE>(start, end - i,
+                 unsigned long int, TEST_EXECSPACE>(i, end - i,
                                                     2)));  // Wrapping Dec
-#endif
   }
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedlonglongint.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedlonglongint.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..cf41dedccb7b0b34c87946ae11a7a1dff5b4fa3d
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedlonglongint.hpp
@@ -0,0 +1,36 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <TestAtomicOperations.hpp>
+
+namespace Test {
+TEST(TEST_CATEGORY, atomic_operations_unsignedlonglong) {
+  const int start = 0;
+  const int end   = 11;
+  for (int i = start; i < end; ++i) {
+    for (int t = 0; t < 16; t++)
+      ASSERT_TRUE(
+          (TestAtomicOperations::AtomicOperationsTestIntegralType<
+              unsigned long long int, TEST_EXECSPACE>(i, end - i + start, t)));
+    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestUnsignedIntegralType<
+                 unsigned long long int, TEST_EXECSPACE>(i, end - i,
+                                                         1)));  // Wrapping Inc
+    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestUnsignedIntegralType<
+                 unsigned long long int, TEST_EXECSPACE>(i, end - i,
+                                                         2)));  // Wrapping Dec
+  }
+}
+}  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestAtomics.hpp b/packages/kokkos/core/unit_test/TestAtomics.hpp
index e5866bb89baf229d5347c9e9903382571ac0574d..2b40f12d0a4deaf3f970697c6e688208b20f9570 100644
--- a/packages/kokkos/core/unit_test/TestAtomics.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomics.hpp
@@ -510,8 +510,12 @@ TEST(TEST_CATEGORY, atomics) {
   ASSERT_TRUE(
       (TestAtomic::Loop<Kokkos::complex<float>, TEST_EXECSPACE>(100, 3)));
 
-// FIXME_SYCL atomics for large types to be implemented
-#ifndef KOKKOS_ENABLE_SYCL
+// FIXME_SYCL Replace macro by SYCL_EXT_ONEAPI_DEVICE_GLOBAL or remove
+// condition alltogether when possible.
+#if defined(KOKKOS_ENABLE_SYCL) && \
+    !defined(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED)
+  if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::SYCL>) return;
+#endif
   ASSERT_TRUE(
       (TestAtomic::Loop<Kokkos::complex<double>, TEST_EXECSPACE>(1, 1)));
   ASSERT_TRUE(
@@ -536,7 +540,49 @@ TEST(TEST_CATEGORY, atomics) {
       (TestAtomic::Loop<TestAtomic::SuperScalar<4>, TEST_EXECSPACE>(100, 3)));
 #endif
 #endif
-#endif
 }
 
+// see https://github.com/trilinos/Trilinos/pull/11506
+struct TpetraUseCase {
+  template <class Scalar>
+  struct AbsMaxHelper {
+    Scalar value;
+
+    KOKKOS_FUNCTION AbsMaxHelper& operator+=(AbsMaxHelper const& rhs) {
+      Scalar lhs_abs_value = Kokkos::abs(value);
+      Scalar rhs_abs_value = Kokkos::abs(rhs.value);
+      value = lhs_abs_value > rhs_abs_value ? lhs_abs_value : rhs_abs_value;
+      return *this;
+    }
+
+    KOKKOS_FUNCTION AbsMaxHelper operator+(AbsMaxHelper const& rhs) const {
+      AbsMaxHelper ret = *this;
+      ret += rhs;
+      return ret;
+    }
+  };
+
+  using T = int;
+  Kokkos::View<T, TEST_EXECSPACE> d_{"lbl"};
+  KOKKOS_FUNCTION void operator()(int i) const {
+    // 0, -1, 2, -3, ...
+    auto v_i = static_cast<T>(i);
+    if (i % 2 == 1) v_i = -v_i;
+    Kokkos::atomic_add(reinterpret_cast<AbsMaxHelper<T>*>(&d_()),
+                       AbsMaxHelper<T>{v_i});
+  }
+
+  TpetraUseCase() {
+    Kokkos::parallel_for(Kokkos::RangePolicy<TEST_EXECSPACE>(0, 10), *this);
+  }
+
+  void check() {
+    T v;
+    Kokkos::deep_copy(v, d_);
+    ASSERT_EQ(v, 9);
+  }
+};
+
+TEST(TEST_CATEGORY, atomics_tpetra_max_abs) { TpetraUseCase().check(); }
+
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestBitManipulation.cpp b/packages/kokkos/core/unit_test/TestBitManipulation.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..50d2b3067ffb359672c6ebbbc646fc19b9e9f451
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestBitManipulation.cpp
@@ -0,0 +1,547 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <Kokkos_BitManipulation.hpp>
+
+struct X {
+  constexpr bool did_not_match() { return true; }
+};
+
+#define TEST_BIT_MANIPULATION(FUNC)                     \
+  constexpr X test_##FUNC(...) { return {}; }           \
+  static_assert(test_##FUNC((unsigned char)0));         \
+  static_assert(test_##FUNC((unsigned short)0));        \
+  static_assert(test_##FUNC((unsigned int)0));          \
+  static_assert(test_##FUNC((unsigned long)0));         \
+  static_assert(test_##FUNC((unsigned long long)0));    \
+  static_assert(test_##FUNC((bool)0).did_not_match());  \
+  static_assert(test_##FUNC((int)0).did_not_match());   \
+  static_assert(test_##FUNC((float)0).did_not_match()); \
+  static_assert(test_##FUNC((void *)0).did_not_match())
+
+//<editor-fold desc="[bit.rotate]">
+template <class UInt>
+constexpr auto test_rotl(UInt x) -> decltype(Kokkos::rotl(x, 0)) {
+  using Kokkos::rotl;
+
+  static_assert(noexcept(rotl(x, 0)));
+  static_assert(std::is_same_v<decltype(rotl(x, 0)), UInt>);
+
+  constexpr auto dig = Kokkos::Experimental::digits_v<UInt>;
+  constexpr auto max = Kokkos::Experimental::finite_max_v<UInt>;
+
+  static_assert(rotl(UInt(0), 0) == 0);
+  static_assert(rotl(UInt(0), 1) == 0);
+  static_assert(rotl(UInt(0), 4) == 0);
+  static_assert(rotl(UInt(0), 8) == 0);
+  static_assert(rotl(max, 0) == max);
+  static_assert(rotl(max, 1) == max);
+  static_assert(rotl(max, 4) == max);
+  static_assert(rotl(max, 8) == max);
+  static_assert(rotl(UInt(1), 0) == UInt(1) << 0);
+  static_assert(rotl(UInt(1), 1) == UInt(1) << 1);
+  static_assert(rotl(UInt(1), 4) == UInt(1) << 4);
+  static_assert(rotl(UInt(1), dig) == UInt(1));
+  static_assert(rotl(UInt(7), dig) == UInt(7));
+  static_assert(rotl(UInt(6), dig - 1) == UInt(3));
+  static_assert(rotl(UInt(3), 6) == UInt(3) << 6);
+
+  static_assert(rotl(UInt(max - 1), 0) == UInt(max - 1));
+  static_assert(rotl(UInt(max - 1), 1) == UInt(max - 2));
+  static_assert(rotl(UInt(max - 1), 2) == UInt(max - 4));
+  static_assert(rotl(UInt(max - 1), 3) == UInt(max - 8));
+  static_assert(rotl(UInt(max - 1), 4) == UInt(max - 16));
+  static_assert(rotl(UInt(max - 1), 5) == UInt(max - 32));
+  static_assert(rotl(UInt(max - 1), 6) == UInt(max - 64));
+  static_assert(rotl(UInt(max - 1), 7) == UInt(max - 128));
+  static_assert(rotl(UInt(1), 0) == UInt(1));
+  static_assert(rotl(UInt(1), 1) == UInt(2));
+  static_assert(rotl(UInt(1), 2) == UInt(4));
+  static_assert(rotl(UInt(1), 3) == UInt(8));
+  static_assert(rotl(UInt(1), 4) == UInt(16));
+  static_assert(rotl(UInt(1), 5) == UInt(32));
+  static_assert(rotl(UInt(1), 6) == UInt(64));
+  static_assert(rotl(UInt(1), 7) == UInt(128));
+
+  return true;
+}
+
+TEST_BIT_MANIPULATION(rotl);
+
+template <class UInt>
+constexpr auto test_rotr(UInt x) -> decltype(Kokkos::rotr(x, 0)) {
+  using Kokkos::rotr;
+
+  static_assert(noexcept(rotr(x, 0)));
+  static_assert(std::is_same_v<decltype(rotr(x, 0)), UInt>);
+
+  constexpr auto dig     = Kokkos::Experimental::digits_v<UInt>;
+  constexpr auto max     = Kokkos::Experimental::finite_max_v<UInt>;
+  constexpr auto highbit = rotr(UInt(1), 1);
+
+  static_assert(rotr(UInt(0), 0) == 0);
+  static_assert(rotr(UInt(0), 1) == 0);
+  static_assert(rotr(UInt(0), 4) == 0);
+  static_assert(rotr(UInt(0), 8) == 0);
+  static_assert(rotr(max, 0) == max);
+  static_assert(rotr(max, 1) == max);
+  static_assert(rotr(max, 4) == max);
+  static_assert(rotr(max, 8) == max);
+  static_assert(rotr(UInt(128), 0) == UInt(128) >> 0);
+  static_assert(rotr(UInt(128), 1) == UInt(128) >> 1);
+  static_assert(rotr(UInt(128), 4) == UInt(128) >> 4);
+  static_assert(rotr(UInt(1), dig) == UInt(1));
+  static_assert(rotr(UInt(7), dig) == UInt(7));
+  static_assert(rotr(UInt(6), dig - 1) == UInt(12));
+  static_assert(rotr(UInt(36), dig - 2) == UInt(144));
+
+  static_assert(rotr(UInt(max - 1), 0) == UInt(max - 1));
+  static_assert(rotr(UInt(max - 1), 1) == UInt(max - highbit));
+  static_assert(rotr(UInt(max - 1), 2) == UInt(max - (highbit >> 1)));
+  static_assert(rotr(UInt(max - 1), 3) == UInt(max - (highbit >> 2)));
+  static_assert(rotr(UInt(max - 1), 4) == UInt(max - (highbit >> 3)));
+  static_assert(rotr(UInt(max - 1), 5) == UInt(max - (highbit >> 4)));
+  static_assert(rotr(UInt(max - 1), 6) == UInt(max - (highbit >> 5)));
+  static_assert(rotr(UInt(max - 1), 7) == UInt(max - (highbit >> 6)));
+  static_assert(rotr(UInt(128), 0) == UInt(128));
+  static_assert(rotr(UInt(128), 1) == UInt(64));
+  static_assert(rotr(UInt(128), 2) == UInt(32));
+  static_assert(rotr(UInt(128), 3) == UInt(16));
+  static_assert(rotr(UInt(128), 4) == UInt(8));
+  static_assert(rotr(UInt(128), 5) == UInt(4));
+  static_assert(rotr(UInt(128), 6) == UInt(2));
+  static_assert(rotr(UInt(128), 7) == UInt(1));
+
+  return true;
+}
+
+TEST_BIT_MANIPULATION(rotr);
+//</editor-fold>
+
+//<editor-fold desc="[bit.count]">
+template <class UInt>
+constexpr auto test_countl_zero(UInt x) -> decltype(Kokkos::countl_zero(x)) {
+  using Kokkos::countl_zero;
+
+  static_assert(noexcept(countl_zero(x)));
+  static_assert(std::is_same_v<decltype(countl_zero(x)), int>);
+
+  constexpr auto dig = Kokkos::Experimental::digits_v<UInt>;
+  constexpr auto max = Kokkos::Experimental::finite_max_v<UInt>;
+
+  static_assert(countl_zero(UInt(0)) == dig);
+  static_assert(countl_zero(UInt(1)) == dig - 1);
+  static_assert(countl_zero(UInt(2)) == dig - 2);
+  static_assert(countl_zero(UInt(3)) == dig - 2);
+  static_assert(countl_zero(UInt(4)) == dig - 3);
+  static_assert(countl_zero(UInt(5)) == dig - 3);
+  static_assert(countl_zero(UInt(6)) == dig - 3);
+  static_assert(countl_zero(UInt(7)) == dig - 3);
+  static_assert(countl_zero(UInt(8)) == dig - 4);
+  static_assert(countl_zero(UInt(9)) == dig - 4);
+  static_assert(countl_zero(UInt(127)) == dig - 7);
+  static_assert(countl_zero(UInt(128)) == dig - 8);
+  static_assert(countl_zero(max) == 0);
+
+  return true;
+}
+
+TEST_BIT_MANIPULATION(countl_zero);
+
+template <class UInt>
+constexpr auto test_countl_one(UInt x) -> decltype(Kokkos::countl_one(x)) {
+  using Kokkos::countl_one;
+
+  static_assert(noexcept(countl_one(x)));
+  static_assert(std::is_same_v<decltype(countl_one(x)), int>);
+
+  constexpr auto dig = Kokkos::Experimental::digits_v<UInt>;
+  constexpr auto max = Kokkos::Experimental::finite_max_v<UInt>;
+
+  static_assert(countl_one(UInt(0)) == 0);
+  static_assert(countl_one(UInt(1)) == 0);
+  static_assert(countl_one(UInt(2)) == 0);
+  static_assert(countl_one(UInt(3)) == 0);
+  static_assert(countl_one(UInt(4)) == 0);
+  static_assert(countl_one(UInt(5)) == 0);
+  static_assert(countl_one(UInt(6)) == 0);
+  static_assert(countl_one(UInt(7)) == 0);
+  static_assert(countl_one(UInt(8)) == 0);
+  static_assert(countl_one(UInt(9)) == 0);
+  static_assert(countl_one(UInt(100)) == 0);
+  static_assert(countl_one(max) == dig);
+  static_assert(countl_one(UInt(max - 1)) == dig - 1);
+  static_assert(countl_one(UInt(max - 2)) == dig - 2);
+  static_assert(countl_one(UInt(max - 3)) == dig - 2);
+  static_assert(countl_one(UInt(max - 4)) == dig - 3);
+  static_assert(countl_one(UInt(max - 5)) == dig - 3);
+  static_assert(countl_one(UInt(max - 6)) == dig - 3);
+  static_assert(countl_one(UInt(max - 7)) == dig - 3);
+  static_assert(countl_one(UInt(max - 8)) == dig - 4);
+  static_assert(countl_one(UInt(max - 9)) == dig - 4);
+  static_assert(countl_one(UInt(max - 126)) == dig - 7);
+  static_assert(countl_one(UInt(max - 127)) == dig - 7);
+  static_assert(countl_one(UInt(max - 128)) == dig - 8);
+  static_assert(countl_one(UInt(UInt(1) << (dig - 1))) == 1);
+  static_assert(countl_one(UInt(UInt(3) << (dig - 2))) == 2);
+  static_assert(countl_one(UInt(UInt(7) << (dig - 3))) == 3);
+  static_assert(countl_one(UInt(UInt(255) << (dig - 8))) == 8);
+
+  return true;
+}
+
+TEST_BIT_MANIPULATION(countl_one);
+
+template <class UInt>
+constexpr auto test_countr_zero(UInt x) -> decltype(Kokkos::countr_zero(x)) {
+  using Kokkos::countr_zero;
+
+  static_assert(noexcept(countr_zero(x)));
+  static_assert(std::is_same_v<decltype(countr_zero(x)), int>);
+
+  constexpr auto dig = Kokkos::Experimental::digits_v<UInt>;
+  constexpr auto max = Kokkos::Experimental::finite_max_v<UInt>;
+
+  static_assert(countr_zero(UInt(0)) == dig);
+  static_assert(countr_zero(UInt(1)) == 0);
+  static_assert(countr_zero(UInt(2)) == 1);
+  static_assert(countr_zero(UInt(3)) == 0);
+  static_assert(countr_zero(UInt(4)) == 2);
+  static_assert(countr_zero(UInt(5)) == 0);
+  static_assert(countr_zero(UInt(6)) == 1);
+  static_assert(countr_zero(UInt(7)) == 0);
+  static_assert(countr_zero(UInt(8)) == 3);
+  static_assert(countr_zero(UInt(9)) == 0);
+  static_assert(countr_zero(UInt(126)) == 1);
+  static_assert(countr_zero(UInt(127)) == 0);
+  static_assert(countr_zero(UInt(128)) == 7);
+  static_assert(countr_zero(UInt(129)) == 0);
+  static_assert(countr_zero(UInt(130)) == 1);
+  static_assert(countr_zero(max) == 0);
+
+  return true;
+}
+
+TEST_BIT_MANIPULATION(countr_zero);
+
+template <class UInt>
+constexpr auto test_countr_one(UInt x) -> decltype(Kokkos::countr_one(x)) {
+  using Kokkos::countr_one;
+
+  static_assert(noexcept(countr_one(x)));
+  static_assert(std::is_same_v<decltype(countr_one(x)), int>);
+
+  constexpr auto dig = Kokkos::Experimental::digits_v<UInt>;
+  constexpr auto max = Kokkos::Experimental::finite_max_v<UInt>;
+
+  static_assert(countr_one(UInt(0)) == 0);
+  static_assert(countr_one(UInt(1)) == 1);
+  static_assert(countr_one(UInt(2)) == 0);
+  static_assert(countr_one(UInt(3)) == 2);
+  static_assert(countr_one(UInt(4)) == 0);
+  static_assert(countr_one(UInt(5)) == 1);
+  static_assert(countr_one(UInt(6)) == 0);
+  static_assert(countr_one(UInt(7)) == 3);
+  static_assert(countr_one(UInt(8)) == 0);
+  static_assert(countr_one(UInt(9)) == 1);
+  static_assert(countr_one(UInt(126)) == 0);
+  static_assert(countr_one(UInt(127)) == 7);
+  static_assert(countr_one(UInt(128)) == 0);
+  static_assert(countr_one(UInt(max - 1)) == 0);
+  static_assert(countr_one(max) == dig);
+
+  return true;
+}
+
+TEST_BIT_MANIPULATION(countr_one);
+
+template <class UInt>
+constexpr auto test_popcount(UInt x) -> decltype(Kokkos::popcount(x)) {
+  using Kokkos::popcount;
+
+  static_assert(noexcept(popcount(x)));
+  static_assert(std::is_same_v<decltype(popcount(x)), int>);
+
+  constexpr auto dig = Kokkos::Experimental::digits_v<UInt>;
+  constexpr auto max = Kokkos::Experimental::finite_max_v<UInt>;
+
+  static_assert(popcount(UInt(0)) == 0);
+  static_assert(popcount(UInt(1)) == 1);
+  static_assert(popcount(UInt(2)) == 1);
+  static_assert(popcount(UInt(3)) == 2);
+  static_assert(popcount(UInt(4)) == 1);
+  static_assert(popcount(UInt(5)) == 2);
+  static_assert(popcount(UInt(6)) == 2);
+  static_assert(popcount(UInt(7)) == 3);
+  static_assert(popcount(UInt(8)) == 1);
+  static_assert(popcount(UInt(9)) == 2);
+  static_assert(popcount(UInt(127)) == 7);
+  static_assert(popcount(max) == dig);
+  static_assert(popcount(UInt(max - 1)) == dig - 1);
+
+  return true;
+}
+
+TEST_BIT_MANIPULATION(popcount);
+//</editor-fold>
+
+//<editor-fold desc="[bit.pow.two]">
+template <class UInt>
+constexpr auto test_has_single_bit(UInt x)
+    -> decltype(Kokkos::has_single_bit(x)) {
+  using Kokkos::has_single_bit;
+
+  static_assert(noexcept(has_single_bit(x)));
+  static_assert(std::is_same_v<decltype(has_single_bit(x)), bool>);
+
+  static_assert(!has_single_bit(UInt(0)));
+  static_assert(has_single_bit(UInt(1)));
+  static_assert(has_single_bit(UInt(2)));
+  static_assert(!has_single_bit(UInt(3)));
+  static_assert(has_single_bit(UInt(4)));
+  static_assert(!has_single_bit(UInt(5)));
+  static_assert(!has_single_bit(UInt(6)));
+  static_assert(!has_single_bit(UInt(7)));
+  static_assert(has_single_bit(UInt(8)));
+  static_assert(!has_single_bit(UInt(9)));
+
+  constexpr auto max = Kokkos::Experimental::finite_max_v<UInt>;
+  static_assert(!has_single_bit(max));
+  constexpr UInt one = 1;
+  static_assert(has_single_bit(UInt(one << 0)));
+  static_assert(has_single_bit(UInt(one << 1)));
+  static_assert(has_single_bit(UInt(one << 2)));
+  static_assert(has_single_bit(UInt(one << 3)));
+  static_assert(has_single_bit(UInt(one << 4)));
+  static_assert(has_single_bit(UInt(one << 5)));
+  static_assert(has_single_bit(UInt(one << 6)));
+  static_assert(has_single_bit(UInt(one << 7)));
+
+  return true;
+}
+
+TEST_BIT_MANIPULATION(has_single_bit);
+
+template <class UInt>
+constexpr auto test_bit_floor(UInt x) -> decltype(Kokkos::bit_floor(x)) {
+  using Kokkos::bit_floor;
+
+  static_assert(noexcept(bit_floor(x)));
+  static_assert(std::is_same_v<decltype(bit_floor(x)), UInt>);
+
+  constexpr auto max = Kokkos::Experimental::finite_max_v<UInt>;
+
+  static_assert(bit_floor(UInt(0)) == 0);
+  static_assert(bit_floor(UInt(1)) == 1);
+  static_assert(bit_floor(UInt(2)) == 2);
+  static_assert(bit_floor(UInt(3)) == 2);
+  static_assert(bit_floor(UInt(4)) == 4);
+  static_assert(bit_floor(UInt(5)) == 4);
+  static_assert(bit_floor(UInt(6)) == 4);
+  static_assert(bit_floor(UInt(7)) == 4);
+  static_assert(bit_floor(UInt(8)) == 8);
+  static_assert(bit_floor(UInt(9)) == 8);
+  static_assert(bit_floor(UInt(125)) == 64);
+  static_assert(bit_floor(UInt(126)) == 64);
+  static_assert(bit_floor(UInt(127)) == 64);
+  static_assert(bit_floor(UInt(128)) == 128);
+  static_assert(bit_floor(UInt(129)) == 128);
+  static_assert(bit_floor(max) == UInt(max - (max >> 1)));
+
+  return true;
+}
+
+TEST_BIT_MANIPULATION(bit_floor);
+
+template <class UInt>
+constexpr auto test_bit_ceil(UInt x) -> decltype(Kokkos::bit_ceil(x)) {
+  using Kokkos::bit_ceil;
+
+  static_assert(noexcept(bit_ceil(x)));
+  static_assert(std::is_same_v<decltype(bit_ceil(x)), UInt>);
+
+  static_assert(bit_ceil(UInt(0)) == 1);
+  static_assert(bit_ceil(UInt(1)) == 1);
+  static_assert(bit_ceil(UInt(2)) == 2);
+  static_assert(bit_ceil(UInt(3)) == 4);
+  static_assert(bit_ceil(UInt(4)) == 4);
+  static_assert(bit_ceil(UInt(5)) == 8);
+  static_assert(bit_ceil(UInt(6)) == 8);
+  static_assert(bit_ceil(UInt(7)) == 8);
+  static_assert(bit_ceil(UInt(8)) == 8);
+  static_assert(bit_ceil(UInt(9)) == 16);
+  static_assert(bit_ceil(UInt(60)) == 64);
+  static_assert(bit_ceil(UInt(61)) == 64);
+  static_assert(bit_ceil(UInt(62)) == 64);
+  static_assert(bit_ceil(UInt(63)) == 64);
+  static_assert(bit_ceil(UInt(64)) == 64);
+  static_assert(bit_ceil(UInt(65)) == 128);
+  static_assert(bit_ceil(UInt(66)) == 128);
+  static_assert(bit_ceil(UInt(67)) == 128);
+  static_assert(bit_ceil(UInt(68)) == 128);
+  static_assert(bit_ceil(UInt(69)) == 128);
+
+  return true;
+}
+
+TEST_BIT_MANIPULATION(bit_ceil);
+
+template <class UInt>
+constexpr auto test_bit_width(UInt x) -> decltype(Kokkos::bit_width(x)) {
+  using Kokkos::bit_width;
+
+  static_assert(noexcept(bit_width(x)));
+  static_assert(std::is_same_v<decltype(bit_width(x)), UInt>);
+
+  constexpr auto dig = Kokkos::Experimental::digits_v<UInt>;
+  constexpr auto max = Kokkos::Experimental::finite_max_v<UInt>;
+
+  static_assert(bit_width(UInt(0)) == 0);
+  static_assert(bit_width(UInt(1)) == 1);
+  static_assert(bit_width(UInt(2)) == 2);
+  static_assert(bit_width(UInt(3)) == 2);
+  static_assert(bit_width(UInt(4)) == 3);
+  static_assert(bit_width(UInt(5)) == 3);
+  static_assert(bit_width(UInt(6)) == 3);
+  static_assert(bit_width(UInt(7)) == 3);
+  static_assert(bit_width(UInt(8)) == 4);
+  static_assert(bit_width(UInt(9)) == 4);
+
+  static_assert(bit_width(UInt(max - 1)) == dig);
+  static_assert(bit_width(max) == dig);
+
+  return true;
+}
+
+TEST_BIT_MANIPULATION(bit_width);
+//</editor-fold>
+
+//<editor-fold desc="[bit.byteswap]">
+template <class T>
+constexpr auto test_byteswap(T x) -> decltype(Kokkos::byteswap(x)) {
+  using Kokkos::byteswap;
+
+  static_assert(noexcept(byteswap(x)));
+  static_assert(std::is_same_v<decltype(byteswap(x)), T>);
+
+  return true;
+}
+
+constexpr X test_byteswap(...) { return {}; }
+
+static_assert(test_byteswap((void *)0).did_not_match());  // NOLINT
+static_assert(test_byteswap((float)0).did_not_match());
+constexpr char c2[2] = {};
+static_assert(test_byteswap(c2).did_not_match());
+static_assert(test_byteswap((char)0));
+static_assert(test_byteswap((short)0));
+static_assert(test_byteswap((int)0));
+static_assert(test_byteswap((long)0));
+static_assert(test_byteswap((long long)0));
+static_assert(test_byteswap((unsigned char)0));
+static_assert(test_byteswap((unsigned short)0));
+static_assert(test_byteswap((unsigned int)0));
+static_assert(test_byteswap((unsigned long)0));
+static_assert(test_byteswap((unsigned long long)0));
+
+constexpr bool test_byteswap2() {
+  using Kokkos::byteswap;
+
+  static_assert(byteswap<int8_t>(INT8_C(0x12)) == INT8_C(0x12));
+  static_assert(byteswap<int16_t>(INT16_C(0x1234)) == INT16_C(0x3412));
+  static_assert(byteswap<int32_t>(INT32_C(0x12345678)) == INT32_C(0x78563412));
+
+  // These static_casts are a workaround for an nvcc 11.2 compiler bug
+  static_assert(
+      static_cast<uint64_t>(byteswap<int64_t>(INT64_C(0x123456789abcdef0))) ==
+      static_cast<uint64_t>(INT64_C(0xf0debc9a78563412)));
+
+  static_assert(byteswap<uint8_t>(UINT8_C(0x21)) == UINT8_C(0x21));
+  static_assert(byteswap<uint16_t>(UINT16_C(0x4321)) == UINT16_C(0x2143));
+  static_assert(byteswap<uint32_t>(UINT32_C(0x87654321)) ==
+                UINT32_C(0x21436587));
+  static_assert(byteswap<uint64_t>(UINT64_C(0xfedcba9876543210)) ==
+                UINT64_C(0x1032547698badcfe));
+  static_assert(byteswap<const uint32_t>(UINT32_C(0xdeadbeef)) ==
+                UINT32_C(0xefbeadde));
+
+  return true;
+}
+static_assert(test_byteswap2());
+//</editor-fold>
+
+#undef TEST_BIT_MANIPULATION
+
+//<editor-fold desc="[bit.bit_cast]">
+template <class To, class From>
+constexpr auto test_bit_cast() -> typename std::is_same<
+    decltype(Kokkos::bit_cast<To>(std::declval<From const &>())),
+    To>::value_type {
+  static_assert(
+      std::is_same_v<
+          decltype(Kokkos::bit_cast<To>(std::declval<From const &>())), To>);
+  return true;
+}
+template <class To, class From>
+constexpr X test_bit_cast(...) {
+  return {};
+}
+
+#if !defined(KOKKOS_ENABLE_SYCL) || \
+    (defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER >= 20240000)
+namespace TypesNotTheSameSize {
+struct To {
+  char a;
+};
+struct From {
+  char b;
+  char c;
+};
+static_assert(test_bit_cast<To, From>().did_not_match());
+}  // namespace TypesNotTheSameSize
+
+namespace ToNotTriviallyCopyable {
+struct To {
+  char a;
+  To(To const &);
+};
+struct From {
+  char b;
+};
+static_assert(test_bit_cast<To, From>().did_not_match());
+}  // namespace ToNotTriviallyCopyable
+
+namespace FromNotTriviallyCopyable {
+struct To {
+  char a;
+};
+struct From {
+  char b;
+  From(From const &);
+};
+static_assert(test_bit_cast<To, From>().did_not_match());
+}  // namespace FromNotTriviallyCopyable
+#endif
+
+namespace ReturnTypeIllFormed {
+struct From {
+  char a;
+  char b;
+};
+static_assert(test_bit_cast<int(), From>().did_not_match());
+static_assert(test_bit_cast<char[2], From>().did_not_match());
+}  // namespace ReturnTypeIllFormed
+   //</editor-fold>
diff --git a/packages/kokkos/core/unit_test/TestBitManipulationBuiltins.hpp b/packages/kokkos/core/unit_test/TestBitManipulationBuiltins.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..092e7cff6180814f73279e91830e33fa619d4134
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestBitManipulationBuiltins.hpp
@@ -0,0 +1,862 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+// clang-format off
+template <class>
+struct type_helper;
+#define DEFINE_TYPE_NAME(T) \
+template <> struct type_helper<T> { static char const * name() { return #T; } };
+DEFINE_TYPE_NAME(unsigned char)
+DEFINE_TYPE_NAME(unsigned short)
+DEFINE_TYPE_NAME(unsigned int)
+DEFINE_TYPE_NAME(unsigned long)
+DEFINE_TYPE_NAME(unsigned long long)
+DEFINE_TYPE_NAME(char)
+DEFINE_TYPE_NAME(short)
+DEFINE_TYPE_NAME(int)
+DEFINE_TYPE_NAME(long)
+DEFINE_TYPE_NAME(long long)
+#undef DEFINE_TYPE_NAME
+// clang-format on
+
+#define DEFINE_BIT_MANIPULATION_FUNCTION_EVAL(FUNC)   \
+  struct BitManipFunction_##FUNC {                    \
+    template <class T>                                \
+    static KOKKOS_FUNCTION auto eval_constexpr(T x) { \
+      return Kokkos::FUNC(x);                         \
+    }                                                 \
+    template <class T>                                \
+    static KOKKOS_FUNCTION auto eval_builtin(T x) {   \
+      return Kokkos::Experimental::FUNC##_builtin(x); \
+    }                                                 \
+    static char const* name() { return #FUNC; }       \
+  }
+
+DEFINE_BIT_MANIPULATION_FUNCTION_EVAL(countl_zero);
+DEFINE_BIT_MANIPULATION_FUNCTION_EVAL(countl_one);
+DEFINE_BIT_MANIPULATION_FUNCTION_EVAL(countr_zero);
+DEFINE_BIT_MANIPULATION_FUNCTION_EVAL(countr_one);
+DEFINE_BIT_MANIPULATION_FUNCTION_EVAL(popcount);
+DEFINE_BIT_MANIPULATION_FUNCTION_EVAL(has_single_bit);
+DEFINE_BIT_MANIPULATION_FUNCTION_EVAL(bit_ceil);
+DEFINE_BIT_MANIPULATION_FUNCTION_EVAL(bit_floor);
+DEFINE_BIT_MANIPULATION_FUNCTION_EVAL(bit_width);
+
+#undef DEFINE_BIT_MANIPULATION_FUNCTION_EVAL
+
+template <class Space, class Func, class Arg, std::size_t N>
+struct TestBitManipFunction {
+  Arg val_[N];
+  TestBitManipFunction(const Arg (&val)[N]) {
+    std::copy(val, val + N, val_);
+    run();
+  }
+  void run() const {
+    int errors = 0;
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<Space>(0, N), *this, errors);
+    ASSERT_EQ(errors, 0) << "Failed check no error for " << Func::name() << "("
+                         << type_helper<Arg>::name() << ")";
+  }
+  KOKKOS_FUNCTION void operator()(int i, int& e) const {
+    if (Func::eval_builtin(val_[i]) != Func::eval_constexpr(val_[i])) {
+      ++e;
+      Kokkos::printf("value at %x which is %d was expected to be %d\n",
+                     (unsigned)val_[i], (int)Func::eval_builtin(val_[i]),
+                     (int)Func::eval_constexpr(val_[i]));
+    }
+  }
+};
+
+template <class Space, class... Func, class Arg, std::size_t N>
+void do_test_bit_manip_function(const Arg (&x)[N]) {
+  (void)std::initializer_list<int>{
+      (TestBitManipFunction<Space, Func, Arg, N>(x), 0)...};
+}
+
+#define TEST_BIT_MANIP_FUNCTION(FUNC) \
+  do_test_bit_manip_function<TEST_EXECSPACE, BitManipFunction_##FUNC>
+
+template <class UInt>
+void test_bit_manip_countl_zero() {
+  using Kokkos::Experimental::countl_zero_builtin;
+  static_assert(noexcept(countl_zero_builtin(UInt())));
+  static_assert(std::is_same_v<decltype(countl_zero_builtin(UInt())), int>);
+  constexpr auto max = Kokkos::Experimental::finite_max_v<UInt>;
+  TEST_BIT_MANIP_FUNCTION(countl_zero)
+  ({
+      UInt(0),
+      UInt(1),
+      UInt(2),
+      UInt(3),
+      UInt(4),
+      UInt(5),
+      UInt(6),
+      UInt(7),
+      UInt(8),
+      UInt(9),
+      UInt(127),
+      UInt(128),
+      UInt(max),
+  });
+}
+
+TEST(TEST_CATEGORY, bit_manip_countl_zero) {
+// FIXME_NVHPC: NVC++-W-0155-Compiler failed to translate accelerator region
+#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC)
+  if constexpr (!std::is_same_v<TEST_EXECSPACE,
+                                Kokkos::Experimental::OpenACC>) {
+#endif
+    test_bit_manip_countl_zero<unsigned char>();
+    test_bit_manip_countl_zero<unsigned short>();
+#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC)
+  }
+#endif
+  test_bit_manip_countl_zero<unsigned int>();
+  test_bit_manip_countl_zero<unsigned long>();
+  test_bit_manip_countl_zero<unsigned long long>();
+}
+
+template <class UInt>
+void test_bit_manip_countl_one() {
+  using Kokkos::Experimental::countl_one_builtin;
+  static_assert(noexcept(countl_one_builtin(UInt())));
+  static_assert(std::is_same_v<decltype(countl_one_builtin(UInt())), int>);
+  constexpr auto dig = Kokkos::Experimental::digits_v<UInt>;
+  constexpr auto max = Kokkos::Experimental::finite_max_v<UInt>;
+  TEST_BIT_MANIP_FUNCTION(countl_one)
+  ({
+      // clang-format off
+      UInt(0),
+      UInt(1),
+      UInt(2),
+      UInt(3),
+      UInt(4),
+      UInt(5),
+      UInt(6),
+      UInt(7),
+      UInt(8),
+      UInt(9),
+      UInt(100),
+      UInt(127),
+      UInt(128),
+      UInt(max),
+      UInt(max - 1),
+      UInt(max - 2),
+      UInt(max - 3),
+      UInt(max - 4),
+      UInt(max - 5),
+      UInt(max - 6),
+      UInt(max - 7),
+      UInt(max - 8),
+      UInt(max - 9),
+      UInt(max - 126),
+      UInt(max - 127),
+      UInt(max - 128),
+      UInt(UInt(1) << (dig - 1)),
+      UInt(UInt(3) << (dig - 2)),
+      UInt(UInt(7) << (dig - 3)),
+      UInt(UInt(255) << (dig - 8)),
+      // clang-format on
+  });
+}
+
+TEST(TEST_CATEGORY, bit_manip_countl_one) {
+// FIXME_NVHPC: NVC++-W-0155-Compiler failed to translate accelerator region
+#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC)
+  if constexpr (!std::is_same_v<TEST_EXECSPACE,
+                                Kokkos::Experimental::OpenACC>) {
+#endif
+    test_bit_manip_countl_one<unsigned char>();
+    test_bit_manip_countl_one<unsigned short>();
+#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC)
+  }
+#endif
+  test_bit_manip_countl_one<unsigned int>();
+  test_bit_manip_countl_one<unsigned long>();
+  test_bit_manip_countl_one<unsigned long long>();
+}
+
+template <class UInt>
+void test_bit_manip_countr_zero() {
+  using Kokkos::Experimental::countr_zero_builtin;
+  static_assert(noexcept(countr_zero_builtin(UInt())));
+  static_assert(std::is_same_v<decltype(countr_zero_builtin(UInt())), int>);
+  constexpr auto max = Kokkos::Experimental::finite_max_v<UInt>;
+  TEST_BIT_MANIP_FUNCTION(countr_zero)
+  ({
+      UInt(0),
+      UInt(1),
+      UInt(2),
+      UInt(3),
+      UInt(4),
+      UInt(5),
+      UInt(6),
+      UInt(7),
+      UInt(8),
+      UInt(9),
+      UInt(126),
+      UInt(127),
+      UInt(128),
+      UInt(129),
+      UInt(130),
+      UInt(max),
+  });
+}
+
+TEST(TEST_CATEGORY, bit_manip_countr_zero) {
+// FIXME_NVHPC: NVC++-W-0155-Compiler failed to translate accelerator region
+#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC)
+  if constexpr (!std::is_same_v<TEST_EXECSPACE,
+                                Kokkos::Experimental::OpenACC>) {
+#endif
+#if defined(KOKKOS_ENABLE_SYCL) && \
+    !defined(KOKKOS_ARCH_INTEL_GPU)  // FIXME_SYCL returns wrong result
+    if (!std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::SYCL>)
+#endif
+      test_bit_manip_countr_zero<unsigned char>();
+    test_bit_manip_countr_zero<unsigned short>();
+#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC)
+  }
+#endif
+  test_bit_manip_countr_zero<unsigned int>();
+  test_bit_manip_countr_zero<unsigned long>();
+  test_bit_manip_countr_zero<unsigned long long>();
+}
+
+template <class UInt>
+void test_bit_manip_countr_one() {
+  using Kokkos::Experimental::countr_one_builtin;
+  static_assert(noexcept(countr_one_builtin(UInt())));
+  static_assert(std::is_same_v<decltype(countr_one_builtin(UInt())), int>);
+  constexpr auto max = Kokkos::Experimental::finite_max_v<UInt>;
+  TEST_BIT_MANIP_FUNCTION(countr_one)
+  ({
+      UInt(0),
+      UInt(1),
+      UInt(2),
+      UInt(3),
+      UInt(4),
+      UInt(5),
+      UInt(6),
+      UInt(7),
+      UInt(8),
+      UInt(9),
+      UInt(126),
+      UInt(127),
+      UInt(128),
+      UInt(max - 1),
+      UInt(max),
+  });
+}
+
+TEST(TEST_CATEGORY, bit_manip_countr_one) {
+// FIXME_NVHPC: NVC++-W-0155-Compiler failed to translate accelerator region
+#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC)
+  if constexpr (!std::is_same_v<TEST_EXECSPACE,
+                                Kokkos::Experimental::OpenACC>) {
+#endif
+#if defined(KOKKOS_ENABLE_SYCL) && \
+    !defined(KOKKOS_ARCH_INTEL_GPU)  // FIXME_SYCL returns wrong result
+    if (!std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::SYCL>)
+#endif
+      test_bit_manip_countr_one<unsigned char>();
+    test_bit_manip_countr_one<unsigned short>();
+#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC)
+  }
+#endif
+  test_bit_manip_countr_one<unsigned int>();
+  test_bit_manip_countr_one<unsigned long>();
+  test_bit_manip_countr_one<unsigned long long>();
+}
+
+template <class UInt>
+void test_bit_manip_popcount() {
+  using Kokkos::Experimental::popcount_builtin;
+  static_assert(noexcept(popcount_builtin(UInt())));
+  static_assert(std::is_same_v<decltype(popcount_builtin(UInt())), int>);
+  constexpr auto max = Kokkos::Experimental::finite_max_v<UInt>;
+  TEST_BIT_MANIP_FUNCTION(popcount)
+  ({
+      UInt(0),
+      UInt(1),
+      UInt(2),
+      UInt(3),
+      UInt(4),
+      UInt(5),
+      UInt(6),
+      UInt(7),
+      UInt(8),
+      UInt(9),
+      UInt(127),
+      UInt(max),
+      UInt(max - 1),
+  });
+}
+
+TEST(TEST_CATEGORY, bit_manip_popcount) {
+// FIXME_NVHPC: NVC++-W-0155-Compiler failed to translate accelerator region
+#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC)
+  if constexpr (!std::is_same_v<TEST_EXECSPACE,
+                                Kokkos::Experimental::OpenACC>) {
+#endif
+    test_bit_manip_popcount<unsigned char>();
+    test_bit_manip_popcount<unsigned short>();
+#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC)
+  }
+#endif
+  test_bit_manip_popcount<unsigned int>();
+  test_bit_manip_popcount<unsigned long>();
+  test_bit_manip_popcount<unsigned long long>();
+}
+
+template <class UInt>
+void test_bit_manip_has_single_bit() {
+  using Kokkos::Experimental::has_single_bit_builtin;
+  static_assert(noexcept(has_single_bit_builtin(UInt())));
+  static_assert(std::is_same_v<decltype(has_single_bit_builtin(UInt())), bool>);
+  constexpr auto max = Kokkos::Experimental::finite_max_v<UInt>;
+  constexpr UInt one = 1;
+  TEST_BIT_MANIP_FUNCTION(has_single_bit)
+  ({
+      // clang-format off
+      UInt(0),
+      UInt(1),
+      UInt(2),
+      UInt(3),
+      UInt(4),
+      UInt(5),
+      UInt(6),
+      UInt(7),
+      UInt(8),
+      UInt(9),
+      UInt(max),
+      UInt(one << 0),
+      UInt(one << 1),
+      UInt(one << 2),
+      UInt(one << 3),
+      UInt(one << 4),
+      UInt(one << 5),
+      UInt(one << 6),
+      UInt(one << 7),
+      // clang-format on
+  });
+}
+
+TEST(TEST_CATEGORY, bit_manip_has_single_bit) {
+// FIXME_NVHPC: NVC++-W-0155-Compiler failed to translate accelerator region
+#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC)
+  if constexpr (!std::is_same_v<TEST_EXECSPACE,
+                                Kokkos::Experimental::OpenACC>) {
+#endif
+    test_bit_manip_has_single_bit<unsigned char>();
+    test_bit_manip_has_single_bit<unsigned short>();
+#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC)
+  }
+#endif
+  test_bit_manip_has_single_bit<unsigned int>();
+  test_bit_manip_has_single_bit<unsigned long>();
+  test_bit_manip_has_single_bit<unsigned long long>();
+}
+
+template <class UInt>
+void test_bit_manip_bit_floor() {
+  using Kokkos::Experimental::bit_floor_builtin;
+  static_assert(noexcept(bit_floor_builtin(UInt())));
+  static_assert(std::is_same_v<decltype(bit_floor_builtin(UInt())), UInt>);
+  constexpr auto max = Kokkos::Experimental::finite_max_v<UInt>;
+  TEST_BIT_MANIP_FUNCTION(bit_floor)
+  ({
+      UInt(0),
+      UInt(1),
+      UInt(2),
+      UInt(3),
+      UInt(4),
+      UInt(5),
+      UInt(6),
+      UInt(7),
+      UInt(8),
+      UInt(9),
+      UInt(125),
+      UInt(126),
+      UInt(127),
+      UInt(128),
+      UInt(129),
+      UInt(max),
+  });
+}
+
+TEST(TEST_CATEGORY, bit_manip_bit_floor) {
+// FIXME_NVHPC: NVC++-W-0155-Compiler failed to translate accelerator region
+#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC)
+  if constexpr (!std::is_same_v<TEST_EXECSPACE,
+                                Kokkos::Experimental::OpenACC>) {
+#endif
+    test_bit_manip_bit_floor<unsigned char>();
+    test_bit_manip_bit_floor<unsigned short>();
+#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC)
+  }
+#endif
+  test_bit_manip_bit_floor<unsigned int>();
+  test_bit_manip_bit_floor<unsigned long>();
+  test_bit_manip_bit_floor<unsigned long long>();
+}
+
+template <class UInt>
+void test_bit_manip_bit_ceil() {
+  using Kokkos::Experimental::bit_ceil_builtin;
+  static_assert(noexcept(bit_ceil_builtin(UInt())));
+  static_assert(std::is_same_v<decltype(bit_ceil_builtin(UInt())), UInt>);
+  TEST_BIT_MANIP_FUNCTION(bit_ceil)
+  ({
+      // clang-format off
+      UInt(0),
+      UInt(1),
+      UInt(2),
+      UInt(3),
+      UInt(4),
+      UInt(5),
+      UInt(6),
+      UInt(7),
+      UInt(8),
+      UInt(9),
+      UInt(60),
+      UInt(61),
+      UInt(62),
+      UInt(63),
+      UInt(64),
+      UInt(65),
+      UInt(66),
+      UInt(67),
+      UInt(68),
+      UInt(69),
+      // clang-format on
+  });
+}
+
+TEST(TEST_CATEGORY, bit_manip_bit_ceil) {
+// FIXME_NVHPC: NVC++-W-0155-Compiler failed to translate accelerator region
+#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC)
+  if constexpr (!std::is_same_v<TEST_EXECSPACE,
+                                Kokkos::Experimental::OpenACC>) {
+#endif
+    test_bit_manip_bit_ceil<unsigned char>();
+    test_bit_manip_bit_ceil<unsigned short>();
+#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC)
+  }
+#endif
+  test_bit_manip_bit_ceil<unsigned int>();
+  test_bit_manip_bit_ceil<unsigned long>();
+  test_bit_manip_bit_ceil<unsigned long long>();
+}
+
+template <class UInt>
+void test_bit_manip_bit_width() {
+  using Kokkos::Experimental::bit_width_builtin;
+  static_assert(noexcept(bit_width_builtin(UInt())));
+  static_assert(std::is_same_v<decltype(bit_width_builtin(UInt())), UInt>);
+  constexpr auto max = Kokkos::Experimental::finite_max_v<UInt>;
+  TEST_BIT_MANIP_FUNCTION(bit_width)
+  ({
+      UInt(0),
+      UInt(1),
+      UInt(2),
+      UInt(3),
+      UInt(4),
+      UInt(5),
+      UInt(6),
+      UInt(7),
+      UInt(8),
+      UInt(9),
+      UInt(max - 1),
+      UInt(max),
+  });
+}
+
+TEST(TEST_CATEGORY, bit_manip_bit_width) {
+// FIXME_NVHPC: NVC++-W-0155-Compiler failed to translate accelerator region
+#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC)
+  if constexpr (!std::is_same_v<TEST_EXECSPACE,
+                                Kokkos::Experimental::OpenACC>) {
+#endif
+    test_bit_manip_bit_width<unsigned char>();
+    test_bit_manip_bit_width<unsigned short>();
+#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC)
+  }
+#endif
+  test_bit_manip_bit_width<unsigned int>();
+  test_bit_manip_bit_width<unsigned long>();
+  test_bit_manip_bit_width<unsigned long long>();
+}
+
+#undef TEST_BIT_MANIP_FUNCTION
+
+#define DEFINE_BIT_ROTATE_FUNCTION_EVAL(FUNC)                \
+  struct BitRotateFunction_##FUNC {                          \
+    template <class T>                                       \
+    static KOKKOS_FUNCTION auto eval_constexpr(T x, int s) { \
+      return Kokkos::FUNC(x, s);                             \
+    }                                                        \
+    template <class T>                                       \
+    static KOKKOS_FUNCTION auto eval_builtin(T x, int s) {   \
+      return Kokkos::Experimental::FUNC##_builtin(x, s);     \
+    }                                                        \
+    static char const* name() { return #FUNC; }              \
+  }
+
+DEFINE_BIT_ROTATE_FUNCTION_EVAL(rotl);
+DEFINE_BIT_ROTATE_FUNCTION_EVAL(rotr);
+
+#undef DEFINE_BIT_ROTATE_FUNCTION_EVAL
+
+template <class T>
+struct P {
+  using type = T;
+  T x;
+  int s;
+};
+
+template <class Space, class Func, class Arg, std::size_t N>
+struct TestBitRotateFunction {
+  Arg val_[N];
+  TestBitRotateFunction(const Arg (&val)[N]) {
+    std::copy(val, val + N, val_);
+    run();
+  }
+  void run() const {
+    int errors = 0;
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<Space>(0, N), *this, errors);
+    ASSERT_EQ(errors, 0) << "Failed check no error for " << Func::name() << "("
+                         << type_helper<typename Arg::type>::name() << ", int)";
+  }
+  KOKKOS_FUNCTION void operator()(int i, int& e) const {
+    if (Func::eval_builtin(val_[i].x, val_[i].s) !=
+        Func::eval_constexpr(val_[i].x, val_[i].s)) {
+      ++e;
+      Kokkos::printf(
+          "value at %x rotated by %d which is %x was expected to be %x\n",
+          (unsigned)val_[i].x, val_[i].s,
+          (unsigned)Func::eval_builtin(val_[i].x, val_[i].s),
+          (unsigned)Func::eval_constexpr(val_[i].x, val_[i].s));
+    }
+  }
+};
+
+template <class Space, class... Func, class Arg, std::size_t N>
+void do_test_bit_rotate_function(const Arg (&x)[N]) {
+  (void)std::initializer_list<int>{
+      (TestBitRotateFunction<Space, Func, Arg, N>(x), 0)...};
+}
+
+#define TEST_BIT_ROTATE_FUNCTION(FUNC) \
+  do_test_bit_rotate_function<TEST_EXECSPACE, BitRotateFunction_##FUNC>
+
+template <class UInt>
+void test_bit_manip_rotl() {
+  using Kokkos::Experimental::rotl_builtin;
+  static_assert(noexcept(rotl_builtin(UInt(), 0)));
+  static_assert(std::is_same_v<decltype(rotl_builtin(UInt(), 0)), UInt>);
+  constexpr auto dig = Kokkos::Experimental::digits_v<UInt>;
+  constexpr auto max = Kokkos::Experimental::finite_max_v<UInt>;
+  TEST_BIT_ROTATE_FUNCTION(rotl)
+  ({
+      // clang-format off
+      P<UInt>{UInt(0), 0},
+      P<UInt>{UInt(0), 1},
+      P<UInt>{UInt(0), 4},
+      P<UInt>{UInt(0), 8},
+      P<UInt>{max, 0},
+      P<UInt>{max, 1},
+      P<UInt>{max, 4},
+      P<UInt>{max, 8},
+      P<UInt>{UInt(1), 0},
+      P<UInt>{UInt(1), 1},
+      P<UInt>{UInt(1), 4},
+      P<UInt>{UInt(1), dig},
+      P<UInt>{UInt(7), dig},
+      P<UInt>{UInt(6), dig - 1},
+      P<UInt>{UInt(3), 6},
+      P<UInt>{UInt(max - 1), 0},
+      P<UInt>{UInt(max - 1), 1},
+      P<UInt>{UInt(max - 1), 2},
+      P<UInt>{UInt(max - 1), 3},
+      P<UInt>{UInt(max - 1), 4},
+      P<UInt>{UInt(max - 1), 5},
+      P<UInt>{UInt(max - 1), 6},
+      P<UInt>{UInt(max - 1), 7},
+      P<UInt>{UInt(1), 0},
+      P<UInt>{UInt(1), 1},
+      P<UInt>{UInt(1), 2},
+      P<UInt>{UInt(1), 3},
+      P<UInt>{UInt(1), 4},
+      P<UInt>{UInt(1), 5},
+      P<UInt>{UInt(1), 6},
+      P<UInt>{UInt(1), 7},
+      // clang-format on
+  });
+}
+
+TEST(TEST_CATEGORY, bit_manip_rotl) {
+// FIXME_NVHPC: NVC++-W-0155-Compiler failed to translate accelerator region
+#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC)
+  if constexpr (!std::is_same_v<TEST_EXECSPACE,
+                                Kokkos::Experimental::OpenACC>) {
+#endif
+    test_bit_manip_rotl<unsigned char>();
+    test_bit_manip_rotl<unsigned short>();
+#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC)
+  }
+#endif
+  test_bit_manip_rotl<unsigned int>();
+  test_bit_manip_rotl<unsigned long>();
+  test_bit_manip_rotl<unsigned long long>();
+}
+
+template <class UInt>
+void test_bit_manip_rotr() {
+  using Kokkos::rotr;
+  using Kokkos::Experimental::rotr_builtin;
+  static_assert(noexcept(rotr_builtin(UInt(), 0)));
+  static_assert(std::is_same_v<decltype(rotr_builtin(UInt(), 0)), UInt>);
+  constexpr auto dig = Kokkos::Experimental::digits_v<UInt>;
+  constexpr auto max = Kokkos::Experimental::finite_max_v<UInt>;
+  TEST_BIT_ROTATE_FUNCTION(rotr)
+  ({
+      // clang-format off
+      P<UInt>{UInt(0), 0},
+      P<UInt>{UInt(0), 1},
+      P<UInt>{UInt(0), 4},
+      P<UInt>{UInt(0), 8},
+      P<UInt>{max, 0},
+      P<UInt>{max, 1},
+      P<UInt>{max, 4},
+      P<UInt>{max, 8},
+      P<UInt>{UInt(128), 0},
+      P<UInt>{UInt(128), 1},
+      P<UInt>{UInt(128), 4},
+      P<UInt>{UInt(1), dig},
+      P<UInt>{UInt(7), dig},
+      P<UInt>{UInt(6), dig - 1},
+      P<UInt>{UInt(36), dig - 2},
+      P<UInt>{UInt(max - 1), 0},
+      P<UInt>{UInt(max - 1), 1},
+      P<UInt>{UInt(max - 1), 2},
+      P<UInt>{UInt(max - 1), 3},
+      P<UInt>{UInt(max - 1), 4},
+      P<UInt>{UInt(max - 1), 5},
+      P<UInt>{UInt(max - 1), 6},
+      P<UInt>{UInt(max - 1), 7},
+      P<UInt>{UInt(128), 0},
+      P<UInt>{UInt(128), 1},
+      P<UInt>{UInt(128), 2},
+      P<UInt>{UInt(128), 3},
+      P<UInt>{UInt(128), 4},
+      P<UInt>{UInt(128), 5},
+      P<UInt>{UInt(128), 6},
+      P<UInt>{UInt(128), 0},
+      // clang-format on
+  });
+}
+
+TEST(TEST_CATEGORY, bit_manip_rotr) {
+// FIXME_NVHPC: NVC++-W-0155-Compiler failed to translate accelerator region
+#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC)
+  if constexpr (!std::is_same_v<TEST_EXECSPACE,
+                                Kokkos::Experimental::OpenACC>) {
+#endif
+    test_bit_manip_rotr<unsigned char>();
+    test_bit_manip_rotr<unsigned short>();
+#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC)
+  }
+#endif
+  test_bit_manip_rotr<unsigned int>();
+  test_bit_manip_rotr<unsigned long>();
+  test_bit_manip_rotr<unsigned long long>();
+}
+
+#undef TEST_BIT_ROTATE_FUNCTION
+
+template <class Space, class T>
+struct TestByteswapFunction {
+  TestByteswapFunction() { run(); }
+  void run() const {
+    int errors = 0;
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<Space>(0, 1), *this, errors);
+    ASSERT_EQ(errors, 0) << "Failed check no error for byteswap("
+                         << type_helper<T>::name() << ")";
+  }
+  KOKKOS_FUNCTION void operator()(int, int& e) const {
+    T value;
+    T expected;
+    switch (sizeof(T)) {
+      case 1:
+        value    = static_cast<T>(0x12);
+        expected = static_cast<T>(0x12);
+        break;
+      case 2:
+        value    = static_cast<T>(0x1234);
+        expected = static_cast<T>(0x3412);
+        break;
+      case 4:
+        value    = static_cast<T>(0x60AF8503);
+        expected = static_cast<T>(0x0385AF60);
+        break;
+      case 8:
+        value    = static_cast<T>(0xABCDFE9477936406);
+        expected = static_cast<T>(0x0664937794FECDAB);
+        break;
+      default: Kokkos::abort("logic error");
+    }
+    using Kokkos::Experimental::byteswap_builtin;
+    if (byteswap_builtin(value) != expected) {
+      ++e;
+      Kokkos::printf("value at %llx which is %llx was expected to be %llx\n",
+                     (unsigned long long)value,
+                     (unsigned long long)byteswap_builtin(value),
+                     (unsigned long long)expected);
+    }
+  }
+};
+
+template <class Integral>
+void test_bit_manip_byteswap() {
+  using Kokkos::rotr;
+  using Kokkos::Experimental::byteswap_builtin;
+  static_assert(noexcept(byteswap_builtin(Integral())));
+  static_assert(
+      std::is_same_v<decltype(byteswap_builtin(Integral())), Integral>);
+  TestByteswapFunction<TEST_EXECSPACE, Integral>();
+}
+
+TEST(TEST_CATEGORY, bit_manip_byeswap) {
+// FIXME_NVHPC: NVC++-W-0155-Compiler failed to translate accelerator region
+#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC)
+  if constexpr (!std::is_same_v<TEST_EXECSPACE,
+                                Kokkos::Experimental::OpenACC>) {
+#endif
+    test_bit_manip_byteswap<char>();
+    test_bit_manip_byteswap<unsigned char>();
+    test_bit_manip_byteswap<short>();
+    test_bit_manip_byteswap<unsigned short>();
+#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC)
+  }
+#endif
+  test_bit_manip_byteswap<int>();
+  test_bit_manip_byteswap<unsigned int>();
+  test_bit_manip_byteswap<long>();
+  test_bit_manip_byteswap<unsigned long>();
+  test_bit_manip_byteswap<long long>();
+  test_bit_manip_byteswap<unsigned long long>();
+}
+
+// CUDA doesn't provide memcmp
+KOKKOS_FUNCTION int my_memcmp(void const* lhs, void const* rhs, size_t count) {
+  auto u1 = static_cast<unsigned char const*>(lhs);
+  auto u2 = static_cast<unsigned char const*>(rhs);
+  while (count-- != 0) {
+    if (*u1 != *u2) {
+      return (*u1 < *u2) ? -1 : +1;
+    }
+    ++u1;
+    ++u2;
+  }
+  return 0;
+}
+
+template <class Space>
+struct TestBitCastFunction {
+  TestBitCastFunction() { run(); }
+  void run() const {
+    int errors = 0;
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<Space>(0, 1), *this, errors);
+    ASSERT_EQ(errors, 0) << "Failed check no error for bit_cast()";
+  }
+  template <typename To, typename From>
+#if defined(KOKKOS_COMPILER_GNU) && (900 <= KOKKOS_COMPILER_GNU) && \
+    (KOKKOS_COMPILER_GNU < 930)
+  // workaround compiler bug seen in GCC 9.0.1 and GCC 9.2.0
+  KOKKOS_FUNCTION bool check(const From& from) const
+#else
+  static KOKKOS_FUNCTION bool check(const From& from)
+#endif
+  {
+    using Kokkos::Experimental::bit_cast_builtin;
+    return bit_cast_builtin<From>(bit_cast_builtin<To>(from)) == from;
+  }
+
+  KOKKOS_FUNCTION void operator()(int, int& e) const {
+    using Kokkos::bit_cast;
+    if (bit_cast<int>(123) != 123) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #1\n");
+    }
+    if (bit_cast<int>(123u) != 123) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #2\n");
+    }
+    if (bit_cast<int>(~0u) != ~0) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #3\n");
+    }
+    if constexpr (sizeof(int) == sizeof(float)) {
+      if (!check<int>(12.34f)) {
+        ++e;
+        KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #4\n");
+      }
+    }
+    if constexpr (sizeof(unsigned long long) == sizeof(double)) {
+      if (!check<unsigned long long>(123.456)) {
+        ++e;
+        KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #5\n");
+      }
+    }
+
+#if defined(KOKKOS_ENABLE_CUDA) && \
+    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC 23.7
+    if constexpr (std::is_same_v<Space, Kokkos::Cuda>) {
+      return;
+    }
+#endif
+    struct S {
+      int i;
+
+      KOKKOS_FUNCTION bool operator==(const char* s) const {
+        return my_memcmp(&i, s, sizeof(i)) == 0;
+      }
+    };
+    char arr[sizeof(int)];
+    char arr2[sizeof(int)];
+    for (size_t i = 0; i < sizeof(int); ++i) {
+      arr[i]  = i + 1;
+      arr2[i] = (i + 1) * -(i % 2);
+    }
+    if (!(bit_cast<S>(arr) == arr)) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #6\n");
+    }
+    if (!(bit_cast<S>(arr2) == arr2)) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #7\n");
+    }
+  }
+};
+
+TEST(TEST_CATEGORY, bit_manip_bit_cast) {
+  TestBitCastFunction<TEST_EXECSPACE>();
+}
diff --git a/packages/kokkos/core/unit_test/TestCheckedIntegerOps.hpp b/packages/kokkos/core/unit_test/TestCheckedIntegerOps.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2940a14711c516a76d9c2dbec0ef96c792c3e75c
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestCheckedIntegerOps.hpp
@@ -0,0 +1,51 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <gtest/gtest.h>
+#include <impl/Kokkos_CheckedIntegerOps.hpp>
+#include <limits>
+
+namespace {
+
+TEST(TEST_CATEGORY, checked_integer_operations_multiply_overflow) {
+  {
+    auto result      = 1u;
+    auto is_overflow = Kokkos::Impl::multiply_overflow(1u, 2u, result);
+    EXPECT_EQ(result, 2u);
+    EXPECT_FALSE(is_overflow);
+  }
+  {
+    auto result      = 1u;
+    auto is_overflow = Kokkos::Impl::multiply_overflow(
+        std::numeric_limits<unsigned>::max(), 2u, result);
+    EXPECT_TRUE(is_overflow);
+  }
+}
+
+TEST(TEST_CATEGORY_DEATH, checked_integer_operations_multiply_overflow_abort) {
+  ::testing::FLAGS_gtest_death_test_style = "threadsafe";
+  {
+    auto result = Kokkos::Impl::multiply_overflow_abort(1u, 2u);
+    EXPECT_EQ(result, 2u);
+  }
+  {
+    ASSERT_DEATH(Kokkos::Impl::multiply_overflow_abort(
+                     std::numeric_limits<unsigned>::max(), 2u),
+                 "Arithmetic overflow detected.");
+  }
+}
+
+}  // namespace
diff --git a/packages/kokkos/core/unit_test/TestCompilerMacros.hpp b/packages/kokkos/core/unit_test/TestCompilerMacros.cpp
similarity index 71%
rename from packages/kokkos/core/unit_test/TestCompilerMacros.hpp
rename to packages/kokkos/core/unit_test/TestCompilerMacros.cpp
index 9d22c4b0a777038df4bf8662fb0c7ac582f42d58..63b368b23ee16ee6ca3d34a475330f495e823ef4 100644
--- a/packages/kokkos/core/unit_test/TestCompilerMacros.hpp
+++ b/packages/kokkos/core/unit_test/TestCompilerMacros.cpp
@@ -14,8 +14,21 @@
 //
 //@HEADER
 
+#include <gtest/gtest.h>
 #include <Kokkos_Core.hpp>
 
+#if 1 != ((defined(KOKKOS_COMPILER_INTEL) ? 1 : 0) +      \
+          (defined(KOKKOS_COMPILER_INTEL_LLVM) ? 1 : 0) + \
+          (defined(KOKKOS_COMPILER_CRAYC) ? 1 : 0) +      \
+          (defined(KOKKOS_COMPILER_CRAY_LLVM) ? 1 : 0) +  \
+          (defined(KOKKOS_COMPILER_APPLECC) ? 1 : 0) +    \
+          (defined(KOKKOS_COMPILER_CLANG) ? 1 : 0) +      \
+          (defined(KOKKOS_COMPILER_GNU) ? 1 : 0) +        \
+          (defined(KOKKOS_COMPILER_NVHPC) ? 1 : 0) +      \
+          (defined(KOKKOS_COMPILER_MSVC) ? 1 : 0))
+#error "Only one host compiler macro can be defined"
+#endif
+
 #if defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_CUDA_LAMBDA)
 #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
 #error "Macro bug: KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA shouldn't be defined"
@@ -26,8 +39,6 @@
 #endif
 #endif
 
-#define KOKKOS_PRAGMA_UNROLL(a)
-
 namespace TestCompilerMacros {
 
 template <class DEVICE_TYPE>
@@ -51,7 +62,7 @@ struct AddFunctor {
 #pragma vector always
 #endif
 #ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT
-#pragma loop count(128)
+#pragma loop_count(128)
 #endif
     for (int j = 0; j < length; j++) {
       a(i, j) += b(i, j);
@@ -75,7 +86,7 @@ bool Test() {
 }  // namespace TestCompilerMacros
 
 namespace Test {
-TEST(TEST_CATEGORY, compiler_macros) {
-  ASSERT_TRUE((TestCompilerMacros::Test<TEST_EXECSPACE>()));
+TEST(defaultdevicetype, compiler_macros) {
+  ASSERT_TRUE((TestCompilerMacros::Test<Kokkos::DefaultHostExecutionSpace>()));
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestComplex.hpp b/packages/kokkos/core/unit_test/TestComplex.hpp
index 87085f3648e58ca9e4f8d1b086b90e19d8fba797..bcae2e1d81600d929365d4fea34d220d29c893ea 100644
--- a/packages/kokkos/core/unit_test/TestComplex.hpp
+++ b/packages/kokkos/core/unit_test/TestComplex.hpp
@@ -519,9 +519,13 @@ TEST(TEST_CATEGORY, complex_operations_arithmetic_types_overloads) {
   ASSERT_EQ(Kokkos::conj(1), Kokkos::complex<double>(1));
   ASSERT_EQ(Kokkos::conj(2.f), Kokkos::complex<float>(2.f));
   ASSERT_EQ(Kokkos::conj(3.), Kokkos::complex<double>(3.));
+// long double has size 12 but Kokkos::complex requires 2*sizeof(T) to be a
+// power of two.
+#ifndef KOKKOS_IMPL_32BIT
   ASSERT_EQ(Kokkos::conj(4.l), Kokkos::complex<long double>(4.l));
   static_assert((
       std::is_same<decltype(Kokkos::conj(1)), Kokkos::complex<double>>::value));
+#endif
   static_assert((std::is_same<decltype(Kokkos::conj(2.f)),
                               Kokkos::complex<float>>::value));
   static_assert((std::is_same<decltype(Kokkos::conj(3.)),
diff --git a/packages/kokkos/core/unit_test/TestDeepCopyAlignment.hpp b/packages/kokkos/core/unit_test/TestDeepCopyAlignment.hpp
index b7e6a8778643efd00a3383847a0975352e84cdec..8e5ff3b9af6091ca2fecb07c228a6ab58b9ed5df 100644
--- a/packages/kokkos/core/unit_test/TestDeepCopyAlignment.hpp
+++ b/packages/kokkos/core/unit_test/TestDeepCopyAlignment.hpp
@@ -341,6 +341,9 @@ struct TestDeepCopyScalarConversion {
 }  // namespace Impl
 
 TEST(TEST_CATEGORY, deep_copy_conversion) {
+#ifdef KOKKOS_IMPL_32BIT
+  GTEST_SKIP() << "Failing KOKKOS_IMPL_32BIT";  // FIXME_32BIT
+#endif
   int64_t N0 = 19381;
   int64_t N1 = 17;
 
diff --git a/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp b/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
index 7ae73b14d3a085870055147264b4d6147432a9af..929c91db4e00a37e6630e8f6c54a393ab08e6014 100644
--- a/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
+++ b/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
@@ -262,7 +262,7 @@ void check_correct_initialization(const Kokkos::InitArguments& argstruct) {
 #endif
   }
 
-  ASSERT_EQ(Kokkos::HostSpace::execution_space::impl_thread_pool_size(),
+  ASSERT_EQ(Kokkos::HostSpace::execution_space().impl_thread_pool_size(),
             expected_nthreads);
 
 #ifdef KOKKOS_ENABLE_CUDA
diff --git a/packages/kokkos/core/unit_test/TestDeviceAndThreads.py b/packages/kokkos/core/unit_test/TestDeviceAndThreads.py
index fd70e3ff681d74e26473f93e1083807c8ad112e4..1d3ff8eea7e7577a63f2dab37a552195f86d5bc7 100644
--- a/packages/kokkos/core/unit_test/TestDeviceAndThreads.py
+++ b/packages/kokkos/core/unit_test/TestDeviceAndThreads.py
@@ -18,8 +18,8 @@
 import unittest
 import subprocess
 
-PREFIX = "$<TARGET_FILE_DIR:KokkosCore_UnitTest_DeviceAndThreads>"
-EXECUTABLE = "$<TARGET_FILE_NAME:KokkosCore_UnitTest_DeviceAndThreads>"
+PREFIX = "$<TARGET_FILE_DIR:Kokkos_CoreUnitTest_DeviceAndThreads>"
+EXECUTABLE = "$<TARGET_FILE_NAME:Kokkos_CoreUnitTest_DeviceAndThreads>"
 COMMAND = "/".join([PREFIX, EXECUTABLE])
 
 
diff --git a/packages/kokkos/core/unit_test/TestExecSpacePartitioning.hpp b/packages/kokkos/core/unit_test/TestExecSpacePartitioning.hpp
index 0b02a5e36f13119c7e2a3035d2a08dbcd8571f2e..65314d6be7cf77f34542d669fcb7e9e74a75622c 100644
--- a/packages/kokkos/core/unit_test/TestExecSpacePartitioning.hpp
+++ b/packages/kokkos/core/unit_test/TestExecSpacePartitioning.hpp
@@ -29,30 +29,35 @@ struct SumFunctor {
 };
 
 template <class ExecSpace>
-void check_distinctive(ExecSpace, ExecSpace) {}
-
+void check_distinctive([[maybe_unused]] ExecSpace exec1,
+                       [[maybe_unused]] ExecSpace exec2) {
+#ifdef KOKKOS_ENABLE_SERIAL
+  if constexpr (std::is_same_v<ExecSpace, Kokkos::Serial>) {
+    ASSERT_NE(exec1, exec2);
+  }
+#endif
+#ifdef KOKKOS_ENABLE_OPENMP
+  if constexpr (std::is_same_v<ExecSpace, Kokkos::OpenMP>) {
+    ASSERT_NE(exec1, exec2);
+  }
+#endif
 #ifdef KOKKOS_ENABLE_CUDA
-void check_distinctive(Kokkos::Cuda exec1, Kokkos::Cuda exec2) {
-  ASSERT_NE(exec1.cuda_stream(), exec2.cuda_stream());
-}
+  if constexpr (std::is_same_v<ExecSpace, Kokkos::Cuda>) {
+    ASSERT_NE(exec1.cuda_stream(), exec2.cuda_stream());
+  }
 #endif
 #ifdef KOKKOS_ENABLE_HIP
-void check_distinctive(Kokkos::HIP exec1, Kokkos::HIP exec2) {
-  ASSERT_NE(exec1.hip_stream(), exec2.hip_stream());
-}
+  if constexpr (std::is_same_v<ExecSpace, Kokkos::HIP>) {
+    ASSERT_NE(exec1.hip_stream(), exec2.hip_stream());
+  }
 #endif
 #ifdef KOKKOS_ENABLE_SYCL
-void check_distinctive(Kokkos::Experimental::SYCL exec1,
-                       Kokkos::Experimental::SYCL exec2) {
-  ASSERT_NE(*exec1.impl_internal_space_instance()->m_queue,
-            *exec2.impl_internal_space_instance()->m_queue);
-}
+  if constexpr (std::is_same_v<ExecSpace, Kokkos::Experimental::SYCL>) {
+    ASSERT_NE(*exec1.impl_internal_space_instance()->m_queue,
+              *exec2.impl_internal_space_instance()->m_queue);
+  }
 #endif
-#ifdef KOKKOS_ENABLE_OPENMP
-void check_distinctive(Kokkos::OpenMP exec1, Kokkos::OpenMP exec2) {
-  ASSERT_NE(exec1, exec2);
 }
-#endif
 }  // namespace
 
 #ifdef KOKKOS_ENABLE_OPENMP
@@ -99,28 +104,6 @@ void test_partitioning(std::vector<TEST_EXECSPACE>& instances) {
       });
   ASSERT_EQ(sum1, sum2);
   ASSERT_EQ(sum1, N * (N - 1) / 2);
-
-#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \
-    defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMP)
-  // Eliminate unused function warning
-  // (i.e. when compiling for Serial and CUDA, during Serial compilation the
-  // Cuda overload is unused ...)
-  if (sum1 != sum2) {
-#ifdef KOKKOS_ENABLE_CUDA
-    check_distinctive(Kokkos::Cuda(), Kokkos::Cuda());
-#endif
-#ifdef KOKKOS_ENABLE_HIP
-    check_distinctive(Kokkos::HIP(), Kokkos::HIP());
-#endif
-#ifdef KOKKOS_ENABLE_SYCL
-    check_distinctive(Kokkos::Experimental::SYCL(),
-                      Kokkos::Experimental::SYCL());
-#endif
-#ifdef KOKKOS_ENABLE_OPENMP
-    check_distinctive(Kokkos::OpenMP(), Kokkos::OpenMP());
-#endif
-  }
-#endif
 }
 
 TEST(TEST_CATEGORY, partitioning_by_args) {
@@ -131,9 +114,9 @@ TEST(TEST_CATEGORY, partitioning_by_args) {
 }
 
 TEST(TEST_CATEGORY, partitioning_by_vector) {
-  std::vector<int> weights{1, 1};
-  auto instances =
-      Kokkos::Experimental::partition_space(TEST_EXECSPACE(), weights);
+  // Make sure we can use a temporary as argument for weights
+  auto instances = Kokkos::Experimental::partition_space(
+      TEST_EXECSPACE(), std::vector<int> /*weights*/ {1, 1});
   ASSERT_EQ(int(instances.size()), 2);
   test_partitioning(instances);
 }
diff --git a/packages/kokkos/core/unit_test/TestFunctorAnalysis.hpp b/packages/kokkos/core/unit_test/TestFunctorAnalysis.hpp
index 9ebb9c066a2767d2c3593efdb7bd1daa05db5b25..c024526111b12bef05d945fc82c837f2aa5cd782 100644
--- a/packages/kokkos/core/unit_test/TestFunctorAnalysis.hpp
+++ b/packages/kokkos/core/unit_test/TestFunctorAnalysis.hpp
@@ -38,6 +38,16 @@ struct TestFunctorAnalysis_03 {
   KOKKOS_INLINE_FUNCTION static void init(value_type&) {}
 };
 
+struct TestFunctorAnalysis_04 {
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int, float&) const {}
+
+  KOKKOS_INLINE_FUNCTION
+  void join(float&, float const&) const {}
+
+  KOKKOS_INLINE_FUNCTION static void init(float&) {}
+};
+
 template <class ExecSpace>
 void test_functor_analysis() {
   //------------------------------
@@ -45,7 +55,7 @@ void test_functor_analysis() {
   using A01 =
       Kokkos::Impl::FunctorAnalysis<Kokkos::Impl::FunctorPatternInterface::FOR,
                                     Kokkos::RangePolicy<ExecSpace>,
-                                    decltype(c01)>;
+                                    decltype(c01), void>;
 
   using R01 = typename A01::Reducer;
 
@@ -59,13 +69,13 @@ void test_functor_analysis() {
   static_assert(!A01::has_init_member_function, "");
   static_assert(!A01::has_final_member_function, "");
   static_assert(A01::StaticValueSize == 0, "");
-  ASSERT_EQ(R01(&c01).length(), 0);
+  ASSERT_EQ(R01(c01).length(), 0);
 
   //------------------------------
   auto c02  = KOKKOS_LAMBDA(int, double&){};
   using A02 = Kokkos::Impl::FunctorAnalysis<
       Kokkos::Impl::FunctorPatternInterface::REDUCE,
-      Kokkos::RangePolicy<ExecSpace>, decltype(c02)>;
+      Kokkos::RangePolicy<ExecSpace>, decltype(c02), void>;
   using R02 = typename A02::Reducer;
 
   static_assert(std::is_same<typename A02::value_type, double>::value, "");
@@ -78,14 +88,14 @@ void test_functor_analysis() {
   static_assert(!A02::has_init_member_function, "");
   static_assert(!A02::has_final_member_function, "");
   static_assert(A02::StaticValueSize == sizeof(double), "");
-  ASSERT_EQ(R02(&c02).length(), 1);
+  ASSERT_EQ(R02(c02).length(), 1);
 
   //------------------------------
 
   TestFunctorAnalysis_03 c03;
   using A03 = Kokkos::Impl::FunctorAnalysis<
       Kokkos::Impl::FunctorPatternInterface::REDUCE,
-      Kokkos::RangePolicy<ExecSpace>, TestFunctorAnalysis_03>;
+      Kokkos::RangePolicy<ExecSpace>, TestFunctorAnalysis_03, void>;
   using R03 = typename A03::Reducer;
 
   static_assert(std::is_same<typename A03::value_type,
@@ -106,9 +116,29 @@ void test_functor_analysis() {
   static_assert(!A03::has_final_member_function, "");
   static_assert(
       A03::StaticValueSize == sizeof(TestFunctorAnalysis_03::value_type), "");
-  ASSERT_EQ(R03(&c03).length(), 1);
+  ASSERT_EQ(R03(c03).length(), 1);
 
   //------------------------------
+
+  TestFunctorAnalysis_04 c04;
+  using A04 = Kokkos::Impl::FunctorAnalysis<
+      Kokkos::Impl::FunctorPatternInterface::REDUCE,
+      Kokkos::RangePolicy<ExecSpace>, TestFunctorAnalysis_04, float>;
+  using R04 = typename A04::Reducer;
+
+  static_assert(std::is_same_v<typename A04::value_type, float>);
+  static_assert(
+      std::is_same_v<typename A04::pointer_type, typename A04::value_type*>);
+  static_assert(
+      std::is_same_v<typename A04::reference_type, typename A04::value_type&>);
+  static_assert(
+      std::is_same_v<typename R04::functor_type, TestFunctorAnalysis_04>);
+
+  static_assert(A04::has_join_member_function);
+  static_assert(A04::has_init_member_function);
+  static_assert(!A04::has_final_member_function);
+  static_assert(A04::StaticValueSize == sizeof(typename A04::value_type));
+  ASSERT_EQ(R04(c04).length(), 1);
 }
 
 TEST(TEST_CATEGORY, functor_analysis) {
diff --git a/packages/kokkos/core/unit_test/TestHalfOperators.hpp b/packages/kokkos/core/unit_test/TestHalfOperators.hpp
index 6a2bc359e5bf915c033db5c5ab40a98946267de3..752e3b5081612e70c30964ca9d7cec617ceb9161 100644
--- a/packages/kokkos/core/unit_test/TestHalfOperators.hpp
+++ b/packages/kokkos/core/unit_test/TestHalfOperators.hpp
@@ -17,8 +17,6 @@
 #ifndef TESTHALFOPERATOR_HPP_
 #define TESTHALFOPERATOR_HPP_
 namespace Test {
-#define FP16_EPSILON 0.0009765625F  // 1/2^10
-#define BF16_EPSILON 0.0078125F     // 1/2^7
 using namespace Kokkos::Experimental;
 using ExecutionSpace = TEST_EXECSPACE;
 using ScalarType     = double;
@@ -26,9 +24,19 @@ using ViewType       = Kokkos::View<ScalarType*, ExecutionSpace>;
 using ViewTypeHost   = Kokkos::View<ScalarType*, Kokkos::HostSpace>;
 KOKKOS_FUNCTION
 const half_t& accept_ref(const half_t& a) { return a; }
+KOKKOS_FUNCTION
+double accept_ref_expected(const half_t& a) {
+  double tmp = static_cast<double>(a);
+  return tmp;
+}
 #if !KOKKOS_BHALF_T_IS_FLOAT
 KOKKOS_FUNCTION
 const bhalf_t& accept_ref(const bhalf_t& a) { return a; }
+KOKKOS_FUNCTION
+double accept_ref_expected(const bhalf_t& a) {
+  double tmp = static_cast<double>(a);
+  return tmp;
+}
 #endif  // !KOKKOS_BHALF_T_IS_FLOAT
 
 enum OP_TESTS {
@@ -233,10 +241,27 @@ enum OP_TESTS {
   OR,
   EQ,
   NEQ,
-  LT,
-  GT,
-  LE,
-  GE,  // TODO: TW,
+  LT_H_H,
+  LT_H_S,
+  LT_S_H,
+  LT_H_D,
+  LT_D_H,
+  GT_H_H,
+  GT_H_S,
+  GT_S_H,
+  GT_H_D,
+  GT_D_H,
+  LE_H_H,
+  LE_H_S,
+  LE_S_H,
+  LE_H_D,
+  LE_D_H,
+  GE_H_H,
+  GE_H_S,
+  GE_S_H,
+  GE_H_D,
+  GE_D_H,
+  // TODO: TW,
   PASS_BY_REF,
   AO_IMPL_HALF,
   AO_HALF_T,
@@ -284,20 +309,20 @@ struct Functor_TestHalfVolatileOperators {
     actual_lhs(ASSIGN)   = static_cast<double>(nv_tmp);
     expected_lhs(ASSIGN) = d_lhs;
 
-    actual_lhs(LT)   = h_lhs < h_rhs;
-    expected_lhs(LT) = d_lhs < d_rhs;
+    actual_lhs(LT_H_H)   = h_lhs < h_rhs;
+    expected_lhs(LT_H_H) = d_lhs < d_rhs;
 
-    actual_lhs(LE)   = h_lhs <= h_rhs;
-    expected_lhs(LE) = d_lhs <= d_rhs;
+    actual_lhs(LE_H_H)   = h_lhs <= h_rhs;
+    expected_lhs(LE_H_H) = d_lhs <= d_rhs;
 
     actual_lhs(NEQ)   = h_lhs != h_rhs;
     expected_lhs(NEQ) = d_lhs != d_rhs;
 
-    actual_lhs(GT)   = h_lhs > h_rhs;
-    expected_lhs(GT) = d_lhs > d_rhs;
+    actual_lhs(GT_H_H)   = h_lhs > h_rhs;
+    expected_lhs(GT_H_H) = d_lhs > d_rhs;
 
-    actual_lhs(GE)   = h_lhs >= h_rhs;
-    expected_lhs(GE) = d_lhs >= d_rhs;
+    actual_lhs(GE_H_H)   = h_lhs >= h_rhs;
+    expected_lhs(GE_H_H) = d_lhs >= d_rhs;
 
     actual_lhs(EQ)   = h_lhs == h_rhs;
     expected_lhs(EQ) = d_lhs == d_rhs;
@@ -871,23 +896,63 @@ struct Functor_TestHalfOperators {
     actual_lhs(NEQ)   = h_lhs != h_rhs;
     expected_lhs(NEQ) = d_lhs != d_rhs;
 
-    actual_lhs(LT)   = h_lhs < h_rhs;
-    expected_lhs(LT) = d_lhs < d_rhs;
-
-    actual_lhs(GT)   = h_lhs > h_rhs;
-    expected_lhs(GT) = d_lhs > d_rhs;
-
-    actual_lhs(LE)   = h_lhs <= h_rhs;
-    expected_lhs(LE) = d_lhs <= d_rhs;
-
-    actual_lhs(GE)   = h_lhs >= h_rhs;
-    expected_lhs(GE) = d_lhs >= d_rhs;
+    actual_lhs(LT_H_H)   = h_lhs < h_rhs;
+    expected_lhs(LT_H_H) = d_lhs < d_rhs;
+    actual_lhs(LT_H_S)   = h_lhs < static_cast<float>(h_rhs);
+    expected_lhs(LT_H_S) = d_lhs < d_rhs;
+    actual_lhs(LT_S_H)   = static_cast<float>(h_lhs) < h_rhs;
+    expected_lhs(LT_S_H) = d_lhs < d_rhs;
+    actual_lhs(LT_H_D)   = h_lhs < static_cast<double>(h_rhs);
+    expected_lhs(LT_H_D) = d_lhs < d_rhs;
+    actual_lhs(LT_D_H)   = static_cast<double>(h_lhs) < h_rhs;
+    expected_lhs(LT_D_H) = d_lhs < d_rhs;
+
+    actual_lhs(GT_H_H)   = h_lhs > h_rhs;
+    expected_lhs(GT_H_H) = d_lhs > d_rhs;
+    actual_lhs(GT_H_S)   = h_lhs > static_cast<float>(h_rhs);
+    expected_lhs(GT_H_S) = d_lhs > d_rhs;
+    actual_lhs(GT_S_H)   = static_cast<float>(h_lhs) > h_rhs;
+    expected_lhs(GT_S_H) = d_lhs > d_rhs;
+    actual_lhs(GT_H_D)   = h_lhs > static_cast<double>(h_rhs);
+    expected_lhs(GT_H_D) = d_lhs > d_rhs;
+    actual_lhs(GT_D_H)   = static_cast<double>(h_lhs) > h_rhs;
+    expected_lhs(GT_D_H) = d_lhs > d_rhs;
+
+    actual_lhs(LE_H_H)   = h_lhs <= h_rhs;
+    expected_lhs(LE_H_H) = d_lhs <= d_rhs;
+    actual_lhs(LE_H_S)   = h_lhs <= static_cast<float>(h_rhs);
+    expected_lhs(LE_H_S) = d_lhs <= d_rhs;
+    actual_lhs(LE_S_H)   = static_cast<float>(h_lhs) <= h_rhs;
+    expected_lhs(LE_S_H) = d_lhs <= d_rhs;
+    actual_lhs(LE_H_D)   = h_lhs <= static_cast<double>(h_rhs);
+    expected_lhs(LE_H_D) = d_lhs <= d_rhs;
+    actual_lhs(LE_D_H)   = static_cast<double>(h_lhs) <= h_rhs;
+    expected_lhs(LE_D_H) = d_lhs <= d_rhs;
+
+    actual_lhs(GE_H_H)   = h_lhs >= h_rhs;
+    expected_lhs(GE_H_H) = d_lhs >= d_rhs;
+    actual_lhs(GE_H_S)   = h_lhs >= static_cast<float>(h_rhs);
+    expected_lhs(GE_H_S) = d_lhs >= d_rhs;
+    actual_lhs(GE_S_H)   = static_cast<float>(h_lhs) >= h_rhs;
+    expected_lhs(GE_S_H) = d_lhs >= d_rhs;
+    actual_lhs(GE_H_D)   = h_lhs >= static_cast<double>(h_rhs);
+    expected_lhs(GE_H_D) = d_lhs >= d_rhs;
+    actual_lhs(GE_D_H)   = static_cast<double>(h_lhs) >= h_rhs;
+    expected_lhs(GE_D_H) = d_lhs >= d_rhs;
 
     // actual_lhs(TW)   = h_lhs <=> h_rhs;  // Need C++20?
     // expected_lhs(TW) = d_lhs <=> d_rhs;  // Need C++20?
 
-    actual_lhs(PASS_BY_REF)   = static_cast<double>(accept_ref(h_lhs));
-    expected_lhs(PASS_BY_REF) = d_lhs;
+    actual_lhs(PASS_BY_REF) = static_cast<double>(accept_ref(h_lhs));
+
+    // Use accept_ref and accept_ref_expected to ensure the compiler
+    // does not optimize out the casts half_type -> double -> half_type.
+    // Note that these casts are accompanied by rounding. For the bhalf_t
+    // epsilon, these rounding policies used for casting is enough to cause
+    // the unit tests to fail.
+    // In short, one cannot simply assign static_cast<double>(h_lhs) to
+    // expected_lhs(PASS_BY_REF).
+    expected_lhs(PASS_BY_REF) = accept_ref_expected(h_lhs);
 
     half_tmp = static_cast<float>(h_lhs);
     tmp_ptr  = &(tmp_lhs = half_tmp);
@@ -910,12 +975,7 @@ struct Functor_TestHalfOperators {
 
 template <class half_type>
 void __test_half_operators(half_type h_lhs, half_type h_rhs) {
-  double epsilon = FLT_EPSILON;
-
-  if (std::is_same<half_type, Kokkos::Experimental::half_t>::value)
-    epsilon = FP16_EPSILON;
-  if (std::is_same<half_type, Kokkos::Experimental::bhalf_t>::value)
-    epsilon = BF16_EPSILON;
+  half_type epsilon = Kokkos::Experimental::epsilon<half_type>::value;
 
   Functor_TestHalfOperators<ViewType, half_type> f_device(h_lhs, h_rhs);
   Functor_TestHalfOperators<ViewTypeHost, half_type> f_host(h_lhs, h_rhs);
@@ -930,9 +990,9 @@ void __test_half_operators(half_type h_lhs, half_type h_rhs) {
   for (int op_test = 0; op_test < N_OP_TESTS; op_test++) {
     // printf("op_test = %d\n", op_test);
     ASSERT_NEAR(f_device_actual_lhs(op_test), f_device_expected_lhs(op_test),
-                epsilon);
+                static_cast<double>(epsilon));
     ASSERT_NEAR(f_host.actual_lhs(op_test), f_host.expected_lhs(op_test),
-                epsilon);
+                static_cast<double>(epsilon));
   }
 
 // volatile-qualified parameter type 'volatile half_type' is deprecated
@@ -950,14 +1010,14 @@ void __test_half_operators(half_type h_lhs, half_type h_rhs) {
   Kokkos::deep_copy(f_device_expected_lhs, f_device.expected_lhs);
   for (int op_test = 0; op_test < N_OP_TESTS; op_test++) {
     // printf("op_test = %d\n", op_test);
-    if (op_test == ASSIGN || op_test == LT || op_test == LE || op_test == NEQ ||
-        op_test == EQ || op_test == GT || op_test == GE ||
-        op_test == CADD_H_H || op_test == CSUB_H_H || op_test == CMUL_H_H ||
-        op_test == CDIV_H_H) {
+    if (op_test == ASSIGN || op_test == LT_H_H || op_test == LE_H_H ||
+        op_test == NEQ || op_test == EQ || op_test == GT_H_H ||
+        op_test == GE_H_H || op_test == CADD_H_H || op_test == CSUB_H_H ||
+        op_test == CMUL_H_H || op_test == CDIV_H_H) {
       ASSERT_NEAR(f_device_actual_lhs(op_test), f_device_expected_lhs(op_test),
-                  epsilon);
+                  static_cast<double>(epsilon));
       ASSERT_NEAR(f_host.actual_lhs(op_test), f_host.expected_lhs(op_test),
-                  epsilon);
+                  static_cast<double>(epsilon));
     }
   }
 #endif
diff --git a/packages/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp b/packages/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp
index 8cf25cef847f6ae7e5b57a7d421e7ca6238d79fe..3ee2ff52051a4fece698b13b3d2ce2f69ef15a26 100644
--- a/packages/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp
+++ b/packages/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp
@@ -108,12 +108,6 @@ TEST(TEST_CATEGORY, host_shared_ptr_dereference_on_device) {
       static_cast<T*>(Kokkos::kokkos_malloc<MemorySpace>(sizeof(T))),
       [](T* p) { Kokkos::kokkos_free<MemorySpace>(p); });
 
-#if defined(KOKKOS_ENABLE_CUDA) && \
-    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC
-  if constexpr (std::is_same_v<TEST_EXECSPACE, Kokkos::Cuda>) {
-    GTEST_SKIP() << "FIXME wrong result";
-  }
-#endif
   check_access_stored_pointer_and_dereference_on_device(device_ptr);
 }
 
diff --git a/packages/kokkos/core/unit_test/TestInit.hpp b/packages/kokkos/core/unit_test/TestInit.hpp
index 7be9ef5f12a3ef0329c8204b11a4a6e6dbd5a2cf..9a8dba8dc8f6e608e52b37687b050ee354b62755 100644
--- a/packages/kokkos/core/unit_test/TestInit.hpp
+++ b/packages/kokkos/core/unit_test/TestInit.hpp
@@ -40,5 +40,3 @@ TEST(TEST_CATEGORY, dispatch) { test_dispatch<TEST_EXECSPACE>(); }
 #endif
 
 }  // namespace Test
-
-#include <TestCompilerMacros.hpp>
diff --git a/packages/kokkos/core/unit_test/TestJoinBackwardCompatibility.hpp b/packages/kokkos/core/unit_test/TestJoinBackwardCompatibility.hpp
index 60da9e0713f0da4df94aa81e3a1778ba99ae28db..24cf52aa7090eabae6dc4434f392349557f31789 100644
--- a/packages/kokkos/core/unit_test/TestJoinBackwardCompatibility.hpp
+++ b/packages/kokkos/core/unit_test/TestJoinBackwardCompatibility.hpp
@@ -27,7 +27,6 @@ enum MyErrorCode {
   error_operator_plus_equal_volatile = 0b010,
   error_join_volatile                = 0b100,
   expected_join_volatile             = 0b1000
-
 };
 
 KOKKOS_FUNCTION constexpr MyErrorCode operator|(MyErrorCode lhs,
@@ -97,7 +96,7 @@ struct ReducerWithJoinThatTakesVolatileQualifiedArgs {
 
 void test_join_backward_compatibility() {
   MyJoinBackCompatValueType result;
-  Kokkos::RangePolicy<> policy(0, 1);
+  Kokkos::RangePolicy<TEST_EXECSPACE> policy(0, 1);
 
   Kokkos::parallel_reduce(
       policy, ReducerWithJoinThatTakesBothVolatileAndNonVolatileQualifiedArgs{},
@@ -120,20 +119,9 @@ void test_join_backward_compatibility() {
   ReducerWithJoinThatTakesVolatileQualifiedArgs my_red;
   my_red.join(vol_result, result2);
   ASSERT_EQ(vol_result.err, expected_join_volatile);
-
-#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_3)
-  MyJoinBackCompatValueType result3;
-  Kokkos::parallel_reduce(
-      policy, ReducerWithJoinThatTakesVolatileQualifiedArgs{}, result3);
-  ASSERT_EQ(result3.err, expected_join_volatile);
-#endif
 }
 
 TEST(TEST_CATEGORY, join_backward_compatibility) {
-#if defined(KOKKOS_ENABLE_CUDA) && \
-    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC
-  GTEST_SKIP() << "FIXME wrong result";
-#endif
   test_join_backward_compatibility();
 }
 
diff --git a/packages/kokkos/core/unit_test/TestLocalDeepCopy.hpp b/packages/kokkos/core/unit_test/TestLocalDeepCopy.hpp
index 60cb02b7f0577c1a5325210d8f15ed8945568e9a..1ee23a47c45671f85e79364c0d857bea1b2a6cba 100644
--- a/packages/kokkos/core/unit_test/TestLocalDeepCopy.hpp
+++ b/packages/kokkos/core/unit_test/TestLocalDeepCopy.hpp
@@ -907,7 +907,13 @@ void impl_test_local_deepcopy_rangepolicy_rank_7(const int N) {
 #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
 TEST(TEST_CATEGORY, local_deepcopy_teampolicy_layoutleft) {
   using ExecSpace = TEST_EXECSPACE;
-  using ViewType  = Kokkos::View<double********, Kokkos::LayoutLeft, ExecSpace>;
+#if defined(KOKKOS_ENABLE_CUDA) && \
+    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC 23.7
+  if (std::is_same_v<ExecSpace, Kokkos::Cuda>)
+    GTEST_SKIP()
+        << "FIXME_NVHPC : Compiler bug affecting subviews of high rank Views";
+#endif
+  using ViewType = Kokkos::View<double********, Kokkos::LayoutLeft, ExecSpace>;
 
   {  // Rank-1
     impl_test_local_deepcopy_teampolicy_rank_1<ExecSpace, ViewType>(8);
@@ -934,7 +940,13 @@ TEST(TEST_CATEGORY, local_deepcopy_teampolicy_layoutleft) {
 //-------------------------------------------------------------------------------------------------------------
 TEST(TEST_CATEGORY, local_deepcopy_rangepolicy_layoutleft) {
   using ExecSpace = TEST_EXECSPACE;
-  using ViewType  = Kokkos::View<double********, Kokkos::LayoutLeft, ExecSpace>;
+#if defined(KOKKOS_ENABLE_CUDA) && \
+    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC 23.7
+  if (std::is_same_v<ExecSpace, Kokkos::Cuda>)
+    GTEST_SKIP()
+        << "FIXME_NVHPC : Compiler bug affecting subviews of high rank Views";
+#endif
+  using ViewType = Kokkos::View<double********, Kokkos::LayoutLeft, ExecSpace>;
 
   {  // Rank-1
     impl_test_local_deepcopy_rangepolicy_rank_1<ExecSpace, ViewType>(8);
@@ -961,6 +973,12 @@ TEST(TEST_CATEGORY, local_deepcopy_rangepolicy_layoutleft) {
 //-------------------------------------------------------------------------------------------------------------
 TEST(TEST_CATEGORY, local_deepcopy_teampolicy_layoutright) {
   using ExecSpace = TEST_EXECSPACE;
+#if defined(KOKKOS_ENABLE_CUDA) && \
+    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC 23.7
+  if (std::is_same_v<ExecSpace, Kokkos::Cuda>)
+    GTEST_SKIP()
+        << "FIXME_NVHPC : Compiler bug affecting subviews of high rank Views";
+#endif
   using ViewType = Kokkos::View<double********, Kokkos::LayoutRight, ExecSpace>;
 
   {  // Rank-1
@@ -988,6 +1006,13 @@ TEST(TEST_CATEGORY, local_deepcopy_teampolicy_layoutright) {
 //-------------------------------------------------------------------------------------------------------------
 TEST(TEST_CATEGORY, local_deepcopy_rangepolicy_layoutright) {
   using ExecSpace = TEST_EXECSPACE;
+#if defined(KOKKOS_ENABLE_CUDA) && \
+    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC 23.7
+  if (std::is_same_v<ExecSpace, Kokkos::Cuda>)
+    GTEST_SKIP()
+        << "FIXME_NVHPC : Compiler bug affecting subviews of high rank Views";
+#endif
+
   using ViewType = Kokkos::View<double********, Kokkos::LayoutRight, ExecSpace>;
 
   {  // Rank-1
diff --git a/packages/kokkos/core/unit_test/TestMDRangePolicyConstructors.hpp b/packages/kokkos/core/unit_test/TestMDRangePolicyConstructors.hpp
index 9eeaba9fd44609cebc129900d81ffcc67ede34c1..f577f415e7cb9486514de4004b88a66536183cc7 100644
--- a/packages/kokkos/core/unit_test/TestMDRangePolicyConstructors.hpp
+++ b/packages/kokkos/core/unit_test/TestMDRangePolicyConstructors.hpp
@@ -81,7 +81,6 @@ TEST(TEST_CATEGORY, md_range_policy_construction_from_arrays) {
   construct_mdrange_policy_variable_type<std::int64_t>();
 }
 
-#ifndef KOKKOS_COMPILER_NVHPC       // FIXME_NVHPC
 #ifndef KOKKOS_ENABLE_OPENMPTARGET  // FIXME_OPENMPTARGET
 TEST(TEST_CATEGORY_DEATH, policy_bounds_unsafe_narrowing_conversions) {
   using Policy = Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>,
@@ -95,6 +94,5 @@ TEST(TEST_CATEGORY_DEATH, policy_bounds_unsafe_narrowing_conversions) {
       "unsafe narrowing conversion");
 }
 #endif
-#endif
 
 }  // namespace
diff --git a/packages/kokkos/core/unit_test/TestMDRangeReduce.hpp b/packages/kokkos/core/unit_test/TestMDRangeReduce.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..007fa420c3a8fba78ee89dd81cd031583a795f67
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestMDRangeReduce.hpp
@@ -0,0 +1,68 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+namespace {
+
+template <typename T>
+void MDRangeReduceTester([[maybe_unused]] int bound, int k) {
+  const auto policy_MD = Kokkos::MDRangePolicy<Kokkos::Rank<2>, TEST_EXECSPACE>(
+      {0, 0}, {bound, 2});
+
+  // No explicit fence() calls needed because result is in HostSpace
+  {
+    T lor_MD = 0;
+    Kokkos::parallel_reduce(
+        policy_MD,
+        KOKKOS_LAMBDA(const int i, const int, T& res) { res = res || i == k; },
+        Kokkos::LOr<T>(lor_MD));
+    EXPECT_EQ(lor_MD, 1);
+  }
+  {
+    // Stick just a few true values in the Logical-OR reduction space,
+    // to try to make sure every value is being captured
+    T land_MD = 0;
+    Kokkos::parallel_reduce(
+        policy_MD, KOKKOS_LAMBDA(const int, const int, T& res) { res = 1; },
+        Kokkos::LAnd<T>(land_MD));
+    EXPECT_EQ(land_MD, 1);
+  }
+}
+
+TEST(TEST_CATEGORY, mdrange_parallel_reduce_primitive_types) {
+#if defined(KOKKOS_ENABLE_OPENMPTARGET)
+  GTEST_SKIP() << "FIXME OPENMPTARGET Tests of MDRange reduce over values "
+                  "smaller than int would fail";
+#elif defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_CUDA_LAMBDA)
+  GTEST_SKIP() << "Skipped ENABLE_CUDA_LAMBDA";
+#else
+  for (int bound : {0, 1, 7, 32, 65, 7000}) {
+    for (int k = 0; k < bound; ++k) {
+      MDRangeReduceTester<bool>(bound, k);
+      MDRangeReduceTester<signed char>(bound, k);
+      MDRangeReduceTester<int8_t>(bound, k);
+      MDRangeReduceTester<int16_t>(bound, k);
+      MDRangeReduceTester<int32_t>(bound, k);
+      MDRangeReduceTester<int64_t>(bound, k);
+    }
+  }
+#endif
+}
+
+}  // namespace
diff --git a/packages/kokkos/core/unit_test/TestMDSpan.hpp b/packages/kokkos/core/unit_test/TestMDSpan.hpp
index 6359bd447d3a53c682870dbb156fbcec006f335e..ef0bea1394a6bb5084be1879062c4b7479c42baa 100644
--- a/packages/kokkos/core/unit_test/TestMDSpan.hpp
+++ b/packages/kokkos/core/unit_test/TestMDSpan.hpp
@@ -30,12 +30,12 @@ void test_mdspan_minimal_functional() {
       "FillSequence", Kokkos::RangePolicy<TEST_EXECSPACE>(0, N),
       KOKKOS_LAMBDA(int i) { a(i) = i; });
 
-  mdspan_ns::mdspan<int, mdspan_ns::dextents<int, 1>> a_mds(a.data(), N);
+  Kokkos::mdspan<int, Kokkos::dextents<int, 1>> a_mds(a.data(), N);
   int errors;
   Kokkos::parallel_reduce(
       "CheckMinimalMDSpan", Kokkos::RangePolicy<TEST_EXECSPACE>(0, N),
       KOKKOS_LAMBDA(int i, int& err) {
-        mdspan_ns::mdspan<int, mdspan_ns::dextents<int, 1>> b_mds(a.data(), N);
+        Kokkos::mdspan<int, Kokkos::dextents<int, 1>> b_mds(a.data(), N);
 #ifdef KOKKOS_ENABLE_CXX23
         if (a_mds[i] != i) err++;
         if (b_mds[i] != i) err++;
diff --git a/packages/kokkos/core/unit_test/TestMathematicalConstants.hpp b/packages/kokkos/core/unit_test/TestMathematicalConstants.hpp
index f52bfeaff7d97f35f6892d3e96a624a698707763..e446d8132101639cab785e6f078411894c409d84 100644
--- a/packages/kokkos/core/unit_test/TestMathematicalConstants.hpp
+++ b/packages/kokkos/core/unit_test/TestMathematicalConstants.hpp
@@ -63,7 +63,8 @@ struct TestMathematicalConstants {
 
   KOKKOS_FUNCTION void use_on_device() const {
 #if defined(KOKKOS_COMPILER_NVCC) || defined(KOKKOS_ENABLE_OPENMPTARGET) || \
-    defined(KOKKOS_ENABLE_OPENACC)
+    defined(KOKKOS_ENABLE_OPENACC) ||                                       \
+    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC 23.7
     take_by_value(Trait::value);
 #else
     (void)take_address_of(Trait::value);
diff --git a/packages/kokkos/core/unit_test/TestMathematicalFunctions.hpp b/packages/kokkos/core/unit_test/TestMathematicalFunctions.hpp
index b198006cbb4bd2cb53ff7c373f039f64204d9a52..d32ef4ca230d7b32e340aaf1ca66098446509f59 100644
--- a/packages/kokkos/core/unit_test/TestMathematicalFunctions.hpp
+++ b/packages/kokkos/core/unit_test/TestMathematicalFunctions.hpp
@@ -30,16 +30,25 @@
 #define MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
 #endif
 
-// WORKAROUND icpx changing default FP model when optimization level is >= 1
-// using -fp-model=precise works too
-#if defined(__INTEL_LLVM_COMPILER)
-#define KOKKOS_IMPL_WORKAROUND_INTEL_LLVM_DEFAULT_FLOATING_POINT_MODEL
+#if defined KOKKOS_COMPILER_INTEL || \
+    (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130)
+#define MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE __builtin_unreachable();
+#else
+#define MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE
 #endif
 
+namespace KE = Kokkos::Experimental;
+
 // clang-format off
 template <class>
 struct math_unary_function_return_type;
 // Floating-point types
+#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
+template <> struct math_unary_function_return_type<KE::half_t> { using type = KE::half_t; };
+#endif // defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
+#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT
+template <> struct math_unary_function_return_type<KE::bhalf_t> { using type = KE::bhalf_t; };
+#endif // defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT
 template <> struct math_unary_function_return_type<      float> { using type =       float; };
 template <> struct math_unary_function_return_type<     double> { using type =      double; };
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
@@ -59,6 +68,29 @@ template <class T>
 using math_unary_function_return_type_t = typename math_unary_function_return_type<T>::type;
 template <class, class>
 struct math_binary_function_return_type;
+#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
+template <> struct math_binary_function_return_type<KE::half_t, KE::half_t> { using type = KE::half_t; };
+template <> struct math_binary_function_return_type<short, KE::half_t> { using type = double; };
+template <> struct math_binary_function_return_type<unsigned short, KE::half_t> { using type = double; };
+template <> struct math_binary_function_return_type<int, KE::half_t> { using type = double; };
+template <> struct math_binary_function_return_type<unsigned int, KE::half_t> { using type = double; };
+template <> struct math_binary_function_return_type<long, KE::half_t> { using type = double; };
+template <> struct math_binary_function_return_type<unsigned long, KE::half_t> { using type = double; };
+template <> struct math_binary_function_return_type<long long, KE::half_t> { using type = double; };
+template <> struct math_binary_function_return_type<unsigned long long, KE::half_t> { using type = double; };
+#endif // defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
+#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT
+template <> struct math_binary_function_return_type<KE::bhalf_t, KE::bhalf_t> { using type = KE::bhalf_t; };
+template <> struct math_binary_function_return_type<KE::half_t, KE::bhalf_t> { using type = KE::half_t; };
+template <> struct math_binary_function_return_type<short, KE::bhalf_t> { using type = double; };
+template <> struct math_binary_function_return_type<unsigned short, KE::bhalf_t> { using type = double; };
+template <> struct math_binary_function_return_type<int, KE::bhalf_t> { using type = double; };
+template <> struct math_binary_function_return_type<unsigned int, KE::bhalf_t> { using type = double; };
+template <> struct math_binary_function_return_type<long, KE::bhalf_t> { using type = double; };
+template <> struct math_binary_function_return_type<unsigned long, KE::bhalf_t> { using type = double; };
+template <> struct math_binary_function_return_type<long long, KE::bhalf_t> { using type = double; };
+template <> struct math_binary_function_return_type<unsigned long long, KE::bhalf_t> { using type = double; };
+#endif // defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT
 template <> struct math_binary_function_return_type<             float,              float> { using type =       float; };
 template <> struct math_binary_function_return_type<             float,             double> { using type =      double; };
 template <> struct math_binary_function_return_type<             float,               bool> { using type =      double; };
@@ -206,11 +238,39 @@ struct FloatingPointComparison {
   KOKKOS_FUNCTION double eps(T) const {
     return DBL_EPSILON;
   }
+#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
+  KOKKOS_FUNCTION
+  KE::half_t eps(KE::half_t) const {
+// FIXME_NVHPC compile-time error
+#ifdef KOKKOS_COMPILER_NVHPC
+    return 0.0009765625F;
+#else
+    return KE::epsilon<KE::half_t>::value;
+#endif
+  }
+#endif
+#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT
+  KOKKOS_FUNCTION
+  KE::bhalf_t eps(KE::bhalf_t) const {
+// FIXME_NVHPC compile-time error
+#ifdef KOKKOS_COMPILER_NVHPC
+    return 0.0078125;
+#else
+    return KE::epsilon<KE::bhalf_t>::value;
+#endif
+  }
+#endif
   KOKKOS_FUNCTION
   double eps(float) const { return FLT_EPSILON; }
+// POWER9 gives unexpected values with LDBL_EPSILON issues
+// https://stackoverflow.com/questions/68960416/ppc64-long-doubles-machine-epsilon-calculation
+#if defined(KOKKOS_ARCH_POWER9) || defined(KOKKOS_ARCH_POWER8)
+  KOKKOS_FUNCTION
+  double eps(long double) const { return DBL_EPSILON; }
+#else
   KOKKOS_FUNCTION
   double eps(long double) const { return LDBL_EPSILON; }
-
+#endif
   // Using absolute here instead of abs, since we actually test abs ...
   template <class T>
   KOKKOS_FUNCTION std::enable_if_t<std::is_signed<T>::value, T> absolute(
@@ -231,9 +291,8 @@ struct FloatingPointComparison {
 
     bool ar = absolute(fpv) < abs_tol;
     if (!ar) {
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-          "absolute value exceeds tolerance [|%e| > %e]\n", (double)fpv,
-          abs_tol);
+      Kokkos::printf("absolute value exceeds tolerance [|%e| > %e]\n",
+                     (double)fpv, abs_tol);
     }
 
     return ar;
@@ -254,9 +313,8 @@ struct FloatingPointComparison {
       double rel_diff = abs_diff / min_denom;
       bool ar         = abs_diff == 0 || rel_diff < rel_tol;
       if (!ar) {
-        KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-            "relative difference exceeds tolerance [%e > %e]\n",
-            (double)rel_diff, rel_tol);
+        Kokkos::printf("relative difference exceeds tolerance [%e > %e]\n",
+                       (double)rel_diff, rel_tol);
       }
 
       return ar;
@@ -267,7 +325,38 @@ struct FloatingPointComparison {
 template <class>
 struct math_function_name;
 
-#define DEFINE_UNARY_FUNCTION_EVAL(FUNC, ULP_FACTOR)                  \
+#define DEFINE_UNARY_FUNCTION_EVAL(FUNC, ULP_FACTOR)                    \
+  struct MathUnaryFunction_##FUNC {                                     \
+    template <typename T>                                               \
+    static KOKKOS_FUNCTION auto eval(T x) {                             \
+      static_assert(                                                    \
+          std::is_same<decltype(Kokkos::FUNC((T)0)),                    \
+                       math_unary_function_return_type_t<T>>::value);   \
+      return Kokkos::FUNC(x);                                           \
+    }                                                                   \
+    template <typename T>                                               \
+    static auto eval_std(T x) {                                         \
+      if constexpr (std::is_same<T, KE::half_t>::value ||               \
+                    std::is_same<T, KE::bhalf_t>::value) {              \
+        return std::FUNC(static_cast<float>(x));                        \
+      } else {                                                          \
+        static_assert(                                                  \
+            std::is_same<decltype(std::FUNC((T)0)),                     \
+                         math_unary_function_return_type_t<T>>::value); \
+        return std::FUNC(x);                                            \
+      }                                                                 \
+      MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE                           \
+    }                                                                   \
+    static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; }   \
+  };                                                                    \
+  using kk_##FUNC = MathUnaryFunction_##FUNC;                           \
+  template <>                                                           \
+  struct math_function_name<MathUnaryFunction_##FUNC> {                 \
+    static constexpr char name[] = #FUNC;                               \
+  };                                                                    \
+  constexpr char math_function_name<MathUnaryFunction_##FUNC>::name[]
+
+#define DEFINE_UNARY_FUNCTION_EVAL_CUSTOM(FUNC, ULP_FACTOR, REF_FUNC) \
   struct MathUnaryFunction_##FUNC {                                   \
     template <typename T>                                             \
     static KOKKOS_FUNCTION auto eval(T x) {                           \
@@ -279,9 +368,9 @@ struct math_function_name;
     template <typename T>                                             \
     static auto eval_std(T x) {                                       \
       static_assert(                                                  \
-          std::is_same<decltype(std::FUNC((T)0)),                     \
+          std::is_same<decltype(REF_FUNC),                            \
                        math_unary_function_return_type_t<T>>::value); \
-      return std::FUNC(x);                                            \
+      return REF_FUNC;                                                \
     }                                                                 \
     static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; } \
   };                                                                  \
@@ -292,23 +381,20 @@ struct math_function_name;
   };                                                                  \
   constexpr char math_function_name<MathUnaryFunction_##FUNC>::name[]
 
-#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1
+#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_3
 // Generally the expected ULP error should come from here:
 // https://www.gnu.org/software/libc/manual/html_node/Errors-in-Math-Functions.html
 // For now 1s largely seem to work ...
 DEFINE_UNARY_FUNCTION_EVAL(exp, 2);
-#ifdef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC exp2 not device callable,
-                              // workaround computes it via exp
-DEFINE_UNARY_FUNCTION_EVAL(exp2, 30);
-#else
 DEFINE_UNARY_FUNCTION_EVAL(exp2, 2);
-#endif
 DEFINE_UNARY_FUNCTION_EVAL(expm1, 2);
 DEFINE_UNARY_FUNCTION_EVAL(log, 2);
 DEFINE_UNARY_FUNCTION_EVAL(log10, 2);
 DEFINE_UNARY_FUNCTION_EVAL(log2, 2);
 DEFINE_UNARY_FUNCTION_EVAL(log1p, 2);
+#endif
 
+#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1
 DEFINE_UNARY_FUNCTION_EVAL(sqrt, 2);
 DEFINE_UNARY_FUNCTION_EVAL(cbrt, 2);
 
@@ -325,6 +411,10 @@ DEFINE_UNARY_FUNCTION_EVAL(tanh, 2);
 DEFINE_UNARY_FUNCTION_EVAL(asinh, 4);
 DEFINE_UNARY_FUNCTION_EVAL(acosh, 2);
 DEFINE_UNARY_FUNCTION_EVAL(atanh, 2);
+
+// non-standard math functions
+DEFINE_UNARY_FUNCTION_EVAL_CUSTOM(rsqrt, 2,
+                                  decltype(std::sqrt(x))(1) / std::sqrt(x));
 #endif
 
 #ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2
@@ -355,29 +445,42 @@ DEFINE_UNARY_FUNCTION_EVAL(logb, 2);
 
 #undef DEFINE_UNARY_FUNCTION_EVAL
 
-#define DEFINE_BINARY_FUNCTION_EVAL(FUNC, ULP_FACTOR)                     \
-  struct MathBinaryFunction_##FUNC {                                      \
-    template <typename T, typename U>                                     \
-    static KOKKOS_FUNCTION auto eval(T x, U y) {                          \
-      static_assert(                                                      \
-          std::is_same<decltype(Kokkos::FUNC((T)0, (U)0)),                \
-                       math_binary_function_return_type_t<T, U>>::value); \
-      return Kokkos::FUNC(x, y);                                          \
-    }                                                                     \
-    template <typename T, typename U>                                     \
-    static auto eval_std(T x, U y) {                                      \
-      static_assert(                                                      \
-          std::is_same<decltype(std::FUNC((T)0, (U)0)),                   \
-                       math_binary_function_return_type_t<T, U>>::value); \
-      return std::FUNC(x, y);                                             \
-    }                                                                     \
-    static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; }     \
-  };                                                                      \
-  using kk_##FUNC = MathBinaryFunction_##FUNC;                            \
-  template <>                                                             \
-  struct math_function_name<MathBinaryFunction_##FUNC> {                  \
-    static constexpr char name[] = #FUNC;                                 \
-  };                                                                      \
+#define DEFINE_BINARY_FUNCTION_EVAL(FUNC, ULP_FACTOR)                          \
+  struct MathBinaryFunction_##FUNC {                                           \
+    template <typename T, typename U>                                          \
+    static KOKKOS_FUNCTION auto eval(T x, U y) {                               \
+      static_assert(                                                           \
+          std::is_same<decltype(Kokkos::FUNC((T)0, (U)0)),                     \
+                       math_binary_function_return_type_t<T, U>>::value);      \
+      return Kokkos::FUNC(x, y);                                               \
+    }                                                                          \
+    template <typename T, typename U>                                          \
+    static auto eval_std(T x, U y) {                                           \
+      constexpr bool const x_is_half =                                         \
+          (KE::Impl::is_float16<T>::value || KE::Impl::is_bfloat16<T>::value); \
+      constexpr bool const y_is_half =                                         \
+          (KE::Impl::is_float16<U>::value || KE::Impl::is_bfloat16<U>::value); \
+      if constexpr (x_is_half && y_is_half)                                    \
+        return std::FUNC(static_cast<float>(x), static_cast<float>(y));        \
+      else if constexpr (x_is_half)                                            \
+        return std::FUNC(static_cast<float>(x), y);                            \
+      else if constexpr (y_is_half)                                            \
+        return std::FUNC(x, static_cast<float>(y));                            \
+      else {                                                                   \
+        static_assert(                                                         \
+            std::is_same<decltype(std::FUNC((T)0, (U)0)),                      \
+                         math_binary_function_return_type_t<T, U>>::value);    \
+        return std::FUNC(x, y);                                                \
+      }                                                                        \
+      MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE                                  \
+    }                                                                          \
+    static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; }          \
+  };                                                                           \
+  using kk_##FUNC = MathBinaryFunction_##FUNC;                                 \
+  template <>                                                                  \
+  struct math_function_name<MathBinaryFunction_##FUNC> {                       \
+    static constexpr char name[] = #FUNC;                                      \
+  };                                                                           \
   constexpr char math_function_name<MathBinaryFunction_##FUNC>::name[]
 
 #ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1
@@ -435,6 +538,12 @@ DEFINE_TYPE_NAME(long long)
 DEFINE_TYPE_NAME(unsigned int)
 DEFINE_TYPE_NAME(unsigned long)
 DEFINE_TYPE_NAME(unsigned long long)
+#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
+DEFINE_TYPE_NAME(KE::half_t)
+#endif
+#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT
+DEFINE_TYPE_NAME(KE::bhalf_t)
+#endif
 DEFINE_TYPE_NAME(float)
 DEFINE_TYPE_NAME(double)
 DEFINE_TYPE_NAME(long double)
@@ -463,9 +572,9 @@ struct TestMathUnaryFunction : FloatingPointComparison {
     bool ar = compare(Func::eval(val_[i]), res_[i], Func::ulp_factor());
     if (!ar) {
       ++e;
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-          "value at %f which is %f was expected to be %f\n", (double)val_[i],
-          (double)Func::eval(val_[i]), (double)res_[i]);
+      Kokkos::printf("value at %f which is %f was expected to be %f\n",
+                     (double)val_[i], (double)Func::eval(val_[i]),
+                     (double)res_[i]);
     }
   }
 };
@@ -474,11 +583,36 @@ template <class Space, class... Func, class Arg, std::size_t N>
 void do_test_math_unary_function(const Arg (&x)[N]) {
   (void)std::initializer_list<int>{
       (TestMathUnaryFunction<Space, Func, Arg, N>(x), 0)...};
+
+  // test if potentially device specific math functions also work on host
+  if constexpr (!std::is_same_v<Space, Kokkos::DefaultHostExecutionSpace>)
+    (void)std::initializer_list<int>{
+        (TestMathUnaryFunction<Kokkos::DefaultHostExecutionSpace, Func, Arg, N>(
+             x),
+         0)...};
 }
 
 #define TEST_MATH_FUNCTION(FUNC) \
   do_test_math_unary_function<TEST_EXECSPACE, MathUnaryFunction_##FUNC>
 
+template <class Half, class Space, class... Func, class Arg, std::size_t N>
+void do_test_half_math_unary_function(const Arg (&x)[N]) {
+  Half y[N];
+  std::copy(x, x + N, y);  // cast to array of half type
+  (void)std::initializer_list<int>{
+      (TestMathUnaryFunction<Space, Func, Half, N>(y), 0)...};
+
+  // test if potentially device specific math functions also work on host
+  if constexpr (!std::is_same_v<Space, Kokkos::DefaultHostExecutionSpace>)
+    (void)std::initializer_list<int>{(
+        TestMathUnaryFunction<Kokkos::DefaultHostExecutionSpace, Func, Half, N>(
+            y),
+        0)...};
+}
+
+#define TEST_HALF_MATH_FUNCTION(FUNC, T) \
+  do_test_half_math_unary_function<T, TEST_EXECSPACE, MathUnaryFunction_##FUNC>
+
 template <class Space, class Func, class Arg1, class Arg2,
           class Ret = math_binary_function_return_type_t<Arg1, Arg2>>
 struct TestMathBinaryFunction : FloatingPointComparison {
@@ -501,9 +635,9 @@ struct TestMathBinaryFunction : FloatingPointComparison {
     bool ar = compare(Func::eval(val1_, val2_), res_, Func::ulp_factor());
     if (!ar) {
       ++e;
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-          "value at %f, %f which is %f was expected to be %f\n", (double)val1_,
-          (double)val2_, (double)Func::eval(val1_, val2_), (double)res_);
+      Kokkos::printf("value at %f, %f which is %f was expected to be %f\n",
+                     (double)val1_, (double)val2_,
+                     (double)Func::eval(val1_, val2_), (double)res_);
     }
   }
 };
@@ -542,10 +676,9 @@ struct TestMathTernaryFunction : FloatingPointComparison {
         compare(Func::eval(val1_, val2_, val3_), res_, Func::ulp_factor());
     if (!ar) {
       ++e;
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-          "value at %f, %f, %f which is %f was expected to be %f\n",
-          (double)val1_, (double)val2_, (double)val3_,
-          (double)Func::eval(val1_, val2_, val3_), (double)res_);
+      Kokkos::printf("value at %f, %f, %f which is %f was expected to be %f\n",
+                     (double)val1_, (double)val2_, (double)val3_,
+                     (double)Func::eval(val1_, val2_, val3_), (double)res_);
     }
   }
 };
@@ -567,6 +700,8 @@ TEST(TEST_CATEGORY, mathematical_functions_trigonometric_functions) {
   TEST_MATH_FUNCTION(sin)({2u, 3u, 4u, 5u, 6u});
   TEST_MATH_FUNCTION(sin)({2ul, 3ul, 4ul, 5ul, 6ul});
   TEST_MATH_FUNCTION(sin)({2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_HALF_MATH_FUNCTION(sin, KE::half_t)({.1f, .2f, .3f});
+  TEST_HALF_MATH_FUNCTION(sin, KE::bhalf_t)({.1f, .2f, .3f});
   TEST_MATH_FUNCTION(sin)({.1f, .2f, .3f});
   TEST_MATH_FUNCTION(sin)({.4, .5, .6});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
@@ -580,6 +715,8 @@ TEST(TEST_CATEGORY, mathematical_functions_trigonometric_functions) {
   TEST_MATH_FUNCTION(cos)({2u, 3u, 4u, 5u, 6u});
   TEST_MATH_FUNCTION(cos)({2ul, 3ul, 4ul, 5ul, 6ul});
   TEST_MATH_FUNCTION(cos)({2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_HALF_MATH_FUNCTION(cos, KE::half_t)({.1f, .2f, .3f});
+  TEST_HALF_MATH_FUNCTION(cos, KE::bhalf_t)({.1f, .2f, .3f});
   TEST_MATH_FUNCTION(cos)({.1f, .2f, .3f});
   TEST_MATH_FUNCTION(cos)({.4, .5, .6});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
@@ -593,6 +730,8 @@ TEST(TEST_CATEGORY, mathematical_functions_trigonometric_functions) {
   TEST_MATH_FUNCTION(tan)({2u, 3u, 4u, 5u, 6u});
   TEST_MATH_FUNCTION(tan)({2ul, 3ul, 4ul, 5ul, 6ul});
   TEST_MATH_FUNCTION(tan)({2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_HALF_MATH_FUNCTION(tan, KE::half_t)({.1f, .2f, .3f});
+  TEST_HALF_MATH_FUNCTION(tan, KE::bhalf_t)({.1f, .2f, .3f});
   TEST_MATH_FUNCTION(tan)({.1f, .2f, .3f});
   TEST_MATH_FUNCTION(tan)({.4, .5, .6});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
@@ -606,6 +745,8 @@ TEST(TEST_CATEGORY, mathematical_functions_trigonometric_functions) {
   TEST_MATH_FUNCTION(asin)({0u, 1u});
   TEST_MATH_FUNCTION(asin)({0ul, 1ul});
   TEST_MATH_FUNCTION(asin)({0ull, 1ull});
+  TEST_HALF_MATH_FUNCTION(asin, KE::half_t)({-1.f, .9f, -.8f, .7f, -.6f});
+  TEST_HALF_MATH_FUNCTION(asin, KE::bhalf_t)({-1.f, .9f, -.8f, .7f, -.6f});
   TEST_MATH_FUNCTION(asin)({-1.f, .9f, -.8f, .7f, -.6f});
   TEST_MATH_FUNCTION(asin)({-.5, .4, -.3, .2, -.1, 0.});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
@@ -619,6 +760,8 @@ TEST(TEST_CATEGORY, mathematical_functions_trigonometric_functions) {
   TEST_MATH_FUNCTION(acos)({0u, 1u});
   TEST_MATH_FUNCTION(acos)({0ul, 1ul});
   TEST_MATH_FUNCTION(acos)({0ull, 1ull});
+  TEST_HALF_MATH_FUNCTION(acos, KE::half_t)({-1.f, .9f, -.8f, .7f, -.6f});
+  TEST_HALF_MATH_FUNCTION(acos, KE::bhalf_t)({-1.f, .9f, -.8f, .7f, -.6f});
   TEST_MATH_FUNCTION(acos)({-1.f, .9f, -.8f, .7f, -.6f});
   TEST_MATH_FUNCTION(acos)({-.5, .4, -.3, .2, -.1, 0.});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
@@ -632,6 +775,10 @@ TEST(TEST_CATEGORY, mathematical_functions_trigonometric_functions) {
   TEST_MATH_FUNCTION(atan)({0u, 1u});
   TEST_MATH_FUNCTION(atan)({0ul, 1ul});
   TEST_MATH_FUNCTION(atan)({0ull, 1ull});
+  TEST_HALF_MATH_FUNCTION(atan, KE::half_t)
+  ({-1.5f, 1.3f, -1.1f, .9f, -.7f, .5f});
+  TEST_HALF_MATH_FUNCTION(atan, KE::bhalf_t)
+  ({-1.5f, 1.3f, -1.1f, .9f, -.7f, .5f});
   TEST_MATH_FUNCTION(atan)({-1.5f, 1.3f, -1.1f, .9f, -.7f, .5f});
   TEST_MATH_FUNCTION(atan)({1.4, -1.2, 1., -.8, .6, -.4, .2, -0.});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
@@ -648,6 +795,8 @@ TEST(TEST_CATEGORY, mathematical_functions_power_functions) {
   TEST_MATH_FUNCTION(sqrt)({0u, 1u, 2u, 3u, 5u, 7u});
   TEST_MATH_FUNCTION(sqrt)({0ul, 1ul, 2ul, 3ul, 5ul, 7ul});
   TEST_MATH_FUNCTION(sqrt)({0ull, 1ull, 2ull, 3ull, 5ull, 7ull});
+  TEST_HALF_MATH_FUNCTION(sqrt, KE::half_t)({10.f, 20.f, 30.f, 40.f});
+  TEST_HALF_MATH_FUNCTION(sqrt, KE::bhalf_t)({10.f, 20.f, 30.f, 40.f});
   TEST_MATH_FUNCTION(sqrt)({10.f, 20.f, 30.f, 40.f});
   TEST_MATH_FUNCTION(sqrt)({11.1, 22.2, 33.3, 44.4});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
@@ -660,18 +809,29 @@ TEST(TEST_CATEGORY, mathematical_functions_power_functions) {
   TEST_MATH_FUNCTION(cbrt)({0u, 1u, 2u, 3u, 4u, 5u});
   TEST_MATH_FUNCTION(cbrt)({0ul, 1ul, 2ul, 3ul, 4ul, 5ul});
   TEST_MATH_FUNCTION(cbrt)({0ull, 1ull, 2ull, 3ull, 4ull, 5ull});
+  TEST_HALF_MATH_FUNCTION(cbrt, KE::half_t)({-1.f, .2f, -3.f, .4f, -5.f});
+  TEST_HALF_MATH_FUNCTION(cbrt, KE::bhalf_t)({-1.f, .2f, -3.f, .4f, -5.f});
   TEST_MATH_FUNCTION(cbrt)({-1.f, .2f, -3.f, .4f, -5.f});
   TEST_MATH_FUNCTION(cbrt)({11.1, -2.2, 33.3, -4.4, 55.5});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
   TEST_MATH_FUNCTION(cbrt)({-10.l, 20.l, -30.l, 40.l, -50.l});
 #endif
 
+  do_test_math_binary_function<TEST_EXECSPACE, kk_pow>(
+      static_cast<KE::half_t>(2.f), static_cast<KE::half_t>(3.f));
+  do_test_math_binary_function<TEST_EXECSPACE, kk_pow>(
+      static_cast<KE::bhalf_t>(2.f), static_cast<KE::bhalf_t>(3.f));
+  do_test_math_binary_function<TEST_EXECSPACE, kk_pow>(2.f, 3.f);
   do_test_math_binary_function<TEST_EXECSPACE, kk_pow>(2.f, 3.f);
   do_test_math_binary_function<TEST_EXECSPACE, kk_pow>(2., 3.);
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
   do_test_math_binary_function<TEST_EXECSPACE, kk_pow>(2.l, 3.l);
 #endif
 
+  do_test_math_binary_function<TEST_EXECSPACE, kk_hypot>(
+      static_cast<KE::half_t>(2.f), static_cast<KE::half_t>(3.f));
+  do_test_math_binary_function<TEST_EXECSPACE, kk_hypot>(
+      static_cast<KE::bhalf_t>(2.f), static_cast<KE::bhalf_t>(3.f));
   do_test_math_binary_function<TEST_EXECSPACE, kk_hypot>(2.f, 3.f);
   do_test_math_binary_function<TEST_EXECSPACE, kk_hypot>(2., 3.);
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
@@ -699,7 +859,9 @@ TEST(TEST_CATEGORY, mathematical_functions_fma) {
   do_test_math_ternary_function<TEST_EXECSPACE, kk3_fma>(2.l, 3.l, 4.l);
 #endif
 }
+#endif
 
+#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_3
 TEST(TEST_CATEGORY, mathematical_functions_exponential_functions) {
   TEST_MATH_FUNCTION(exp)({-9, -8, -7, -6, -5, 4, 3, 2, 1, 0});
   TEST_MATH_FUNCTION(exp)({-9l, -8l, -7l, -6l, -5l, 4l, 3l, 2l, 1l, 0l});
@@ -707,6 +869,10 @@ TEST(TEST_CATEGORY, mathematical_functions_exponential_functions) {
   TEST_MATH_FUNCTION(exp)({0u, 1u, 2u, 3u, 4u, 5u});
   TEST_MATH_FUNCTION(exp)({0ul, 1ul, 2ul, 3ul, 4ul, 5ul});
   TEST_MATH_FUNCTION(exp)({0ull, 1ull, 2ull, 3ull, 4ull, 5ull});
+  TEST_HALF_MATH_FUNCTION(exp, KE::half_t)
+  ({-98.f, -7.6f, -.54f, 3.2f, 1.f, -0.f});
+  TEST_HALF_MATH_FUNCTION(exp, KE::bhalf_t)
+  ({-98.f, -7.6f, -.54f, 3.2f, 1.f, -0.f});
   TEST_MATH_FUNCTION(exp)({-98.f, -7.6f, -.54f, 3.2f, 1.f, -0.f});
   TEST_MATH_FUNCTION(exp)({-98., -7.6, -.54, 3.2, 1., -0.});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
@@ -719,6 +885,10 @@ TEST(TEST_CATEGORY, mathematical_functions_exponential_functions) {
   TEST_MATH_FUNCTION(exp2)({0u, 1u, 2u, 3u, 4u, 5u});
   TEST_MATH_FUNCTION(exp2)({0ul, 1ul, 2ul, 3ul, 4ul, 5ul});
   TEST_MATH_FUNCTION(exp2)({0ull, 1ull, 2ull, 3ull, 4ull, 5ull});
+  TEST_HALF_MATH_FUNCTION(exp2, KE::half_t)
+  ({-98.f, -7.6f, -.54f, 3.2f, 1.f, -0.f});
+  TEST_HALF_MATH_FUNCTION(exp2, KE::bhalf_t)
+  ({-98.f, -7.6f, -.54f, 3.2f, 1.f, -0.f});
   TEST_MATH_FUNCTION(exp2)({-98.f, -7.6f, -.54f, 3.2f, 1.f, -0.f});
   TEST_MATH_FUNCTION(exp2)({-98., -7.6, -.54, 3.2, 1., -0.});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
@@ -731,6 +901,10 @@ TEST(TEST_CATEGORY, mathematical_functions_exponential_functions) {
   TEST_MATH_FUNCTION(expm1)({0u, 1u, 2u, 3u, 4u, 5u});
   TEST_MATH_FUNCTION(expm1)({0ul, 1ul, 2ul, 3ul, 4ul, 5ul});
   TEST_MATH_FUNCTION(expm1)({0ull, 1ull, 2ull, 3ull, 4ull, 5ull});
+  TEST_HALF_MATH_FUNCTION(expm1, KE::half_t)
+  ({-98.f, -7.6f, -.54f, 3.2f, 1.f, -0.f});
+  TEST_HALF_MATH_FUNCTION(expm1, KE::bhalf_t)
+  ({-98.f, -7.6f, -.54f, 3.2f, 1.f, -0.f});
   TEST_MATH_FUNCTION(expm1)({-98.f, -7.6f, -.54f, 3.2f, 1.f, -0.f});
   TEST_MATH_FUNCTION(expm1)({-98., -7.6, -.54, 3.2, 1., -0.});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
@@ -743,6 +917,8 @@ TEST(TEST_CATEGORY, mathematical_functions_exponential_functions) {
   TEST_MATH_FUNCTION(log)({1u, 23u, 456u, 7890u});
   TEST_MATH_FUNCTION(log)({1ul, 23ul, 456ul, 7890ul});
   TEST_MATH_FUNCTION(log)({1ull, 23ull, 456ull, 7890ull});
+  TEST_HALF_MATH_FUNCTION(log, KE::half_t)({1234.f, 567.f, 89.f, .1f});
+  TEST_HALF_MATH_FUNCTION(log, KE::bhalf_t)({1234.f, 567.f, 89.f, .1f});
   TEST_MATH_FUNCTION(log)({1234.f, 567.f, 89.f, .1f});
   TEST_MATH_FUNCTION(log)({1234., 567., 89., .02});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
@@ -755,6 +931,8 @@ TEST(TEST_CATEGORY, mathematical_functions_exponential_functions) {
   TEST_MATH_FUNCTION(log10)({1u, 23u, 456u, 7890u});
   TEST_MATH_FUNCTION(log10)({1ul, 23ul, 456ul, 7890ul});
   TEST_MATH_FUNCTION(log10)({1ull, 23ull, 456ull, 7890ull});
+  TEST_HALF_MATH_FUNCTION(log10, KE::half_t)({1234.f, 567.f, 89.f, .1f});
+  TEST_HALF_MATH_FUNCTION(log10, KE::bhalf_t)({1234.f, 567.f, 89.f, .1f});
   TEST_MATH_FUNCTION(log10)({1234.f, 567.f, 89.f, .1f});
   TEST_MATH_FUNCTION(log10)({1234., 567., 89., .02});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
@@ -762,9 +940,9 @@ TEST(TEST_CATEGORY, mathematical_functions_exponential_functions) {
 #endif
 
 // FIXME_OPENMPTARGET FIXME_AMD
-#if defined(KOKKOS_ENABLE_OPENMPTARGET) &&                           \
-    (defined(KOKKOS_ARCH_VEGA906) || defined(KOKKOS_ARCH_VEGA908) || \
-     defined(KOKKOS_ARCH_VEGA90A))
+#if defined(KOKKOS_ENABLE_OPENMPTARGET) &&                                 \
+    (defined(KOKKOS_ARCH_AMD_GFX906) || defined(KOKKOS_ARCH_AMD_GFX908) || \
+     defined(KOKKOS_ARCH_AMD_GFX90A) || defined(KOKKOS_ARCH_AMD_GFX942))
 
   TEST_MATH_FUNCTION(log2)({1, 23, 456, 7890});
 #endif
@@ -773,6 +951,8 @@ TEST(TEST_CATEGORY, mathematical_functions_exponential_functions) {
   TEST_MATH_FUNCTION(log2)({1u, 23u, 456u, 7890u});
   TEST_MATH_FUNCTION(log2)({1ul, 23ul, 456ul, 7890ul});
   TEST_MATH_FUNCTION(log2)({1ull, 23ull, 456ull, 7890ull});
+  TEST_HALF_MATH_FUNCTION(log2, KE::half_t)({1234.f, 567.f, 89.f, .1f});
+  TEST_HALF_MATH_FUNCTION(log2, KE::bhalf_t)({1234.f, 567.f, 89.f, .1f});
   TEST_MATH_FUNCTION(log2)({1234.f, 567.f, 89.f, .1f});
   TEST_MATH_FUNCTION(log2)({1234., 567., 89., .02});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
@@ -785,13 +965,17 @@ TEST(TEST_CATEGORY, mathematical_functions_exponential_functions) {
   TEST_MATH_FUNCTION(log1p)({1u, 23u, 456u, 7890u, 0u});
   TEST_MATH_FUNCTION(log1p)({1ul, 23ul, 456ul, 7890ul, 0ul});
   TEST_MATH_FUNCTION(log1p)({1ull, 23ull, 456ull, 7890ull, 0ull});
+  TEST_HALF_MATH_FUNCTION(log1p, KE::half_t)({1234.f, 567.f, 89.f, -.9f});
+  TEST_HALF_MATH_FUNCTION(log1p, KE::bhalf_t)({1234.f, 567.f, 89.f, -.9f});
   TEST_MATH_FUNCTION(log1p)({1234.f, 567.f, 89.f, -.9f});
   TEST_MATH_FUNCTION(log1p)({1234., 567., 89., -.08});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
   TEST_MATH_FUNCTION(log1p)({1234.l, 567.l, 89.l, -.007l});
 #endif
 }
+#endif
 
+#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1
 TEST(TEST_CATEGORY, mathematical_functions_hyperbolic_functions) {
   TEST_MATH_FUNCTION(sinh)({-3, -2, -1, 0, 1});
   TEST_MATH_FUNCTION(sinh)({-3l, -2l, -1l, 0l, 1l});
@@ -799,6 +983,8 @@ TEST(TEST_CATEGORY, mathematical_functions_hyperbolic_functions) {
   TEST_MATH_FUNCTION(sinh)({2u, 3u, 4u, 5u, 6u});
   TEST_MATH_FUNCTION(sinh)({2ul, 3ul, 4ul, 5ul, 6ul});
   TEST_MATH_FUNCTION(sinh)({2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_HALF_MATH_FUNCTION(sinh, KE::half_t)({.1f, -2.f, 3.f});
+  TEST_HALF_MATH_FUNCTION(sinh, KE::bhalf_t)({.1f, -2.f, 3.f});
   TEST_MATH_FUNCTION(sinh)({.1f, -2.f, 3.f});
   TEST_MATH_FUNCTION(sinh)({-4., .5, -.6});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
@@ -811,6 +997,8 @@ TEST(TEST_CATEGORY, mathematical_functions_hyperbolic_functions) {
   TEST_MATH_FUNCTION(cosh)({2u, 3u, 4u, 5u, 6u});
   TEST_MATH_FUNCTION(cosh)({2ul, 3ul, 4ul, 5ul, 6ul});
   TEST_MATH_FUNCTION(cosh)({2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_HALF_MATH_FUNCTION(cosh, KE::half_t)({.1f, -2.f, 3.f});
+  TEST_HALF_MATH_FUNCTION(cosh, KE::bhalf_t)({.1f, -2.f, 3.f});
   TEST_MATH_FUNCTION(cosh)({.1f, -2.f, 3.f});
   TEST_MATH_FUNCTION(cosh)({-4., .5, -.6});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
@@ -823,6 +1011,8 @@ TEST(TEST_CATEGORY, mathematical_functions_hyperbolic_functions) {
   TEST_MATH_FUNCTION(tanh)({2u, 3u, 4u, 5u, 6u});
   TEST_MATH_FUNCTION(tanh)({2ul, 3ul, 4ul, 5ul, 6ul});
   TEST_MATH_FUNCTION(tanh)({2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_HALF_MATH_FUNCTION(tanh, KE::half_t)({.1f, -2.f, 3.f});
+  TEST_HALF_MATH_FUNCTION(tanh, KE::bhalf_t)({.1f, -2.f, 3.f});
   TEST_MATH_FUNCTION(tanh)({.1f, -2.f, 3.f});
   TEST_MATH_FUNCTION(tanh)({-4., .5, -.6});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
@@ -835,6 +1025,8 @@ TEST(TEST_CATEGORY, mathematical_functions_hyperbolic_functions) {
   TEST_MATH_FUNCTION(asinh)({2u, 3u, 4u, 5u, 6u});
   TEST_MATH_FUNCTION(asinh)({2ul, 3ul, 4ul, 5ul, 6ul});
   TEST_MATH_FUNCTION(asinh)({2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_HALF_MATH_FUNCTION(asinh, KE::half_t)({.1f, -2.f, 3.f});
+  TEST_HALF_MATH_FUNCTION(asinh, KE::bhalf_t)({.1f, -2.f, 3.f});
   TEST_MATH_FUNCTION(asinh)({.1f, -2.f, 3.f});
   TEST_MATH_FUNCTION(asinh)({-4., .5, -.6});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
@@ -847,6 +1039,8 @@ TEST(TEST_CATEGORY, mathematical_functions_hyperbolic_functions) {
   TEST_MATH_FUNCTION(acosh)({1u, 2u, 3u, 4u, 5u, 6u});
   TEST_MATH_FUNCTION(acosh)({1ul, 2ul, 3ul, 4ul, 5ul, 6ul});
   TEST_MATH_FUNCTION(acosh)({1ull, 2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_HALF_MATH_FUNCTION(acosh, KE::half_t)({1.2f, 34.f, 56.f, 789.f});
+  TEST_HALF_MATH_FUNCTION(acosh, KE::bhalf_t)({1.2f, 34.f, 56.f, 789.f});
   TEST_MATH_FUNCTION(acosh)({1.2f, 34.f, 56.f, 789.f});
   TEST_MATH_FUNCTION(acosh)({1.2, 34., 56., 789.});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
@@ -859,12 +1053,30 @@ TEST(TEST_CATEGORY, mathematical_functions_hyperbolic_functions) {
   TEST_MATH_FUNCTION(atanh)({0u});
   TEST_MATH_FUNCTION(atanh)({0ul});
   TEST_MATH_FUNCTION(atanh)({0ull});
+  TEST_HALF_MATH_FUNCTION(atanh, KE::half_t)
+  ({-.97f, .86f, -.53f, .42f, -.1f, 0.f});
+  TEST_HALF_MATH_FUNCTION(atanh, KE::bhalf_t)
+  ({-.97f, .86f, -.53f, .42f, -.1f, 0.f});
   TEST_MATH_FUNCTION(atanh)({-.97f, .86f, -.53f, .42f, -.1f, 0.f});
   TEST_MATH_FUNCTION(atanh)({-.97, .86, -.53, .42, -.1, 0.});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
   TEST_MATH_FUNCTION(atanh)({-.97l, .86l, -.53l, .42l, -.1l, 0.l});
 #endif
 }
+
+TEST(TEST_CATEGORY, mathematical_functions_non_standard) {
+  TEST_MATH_FUNCTION(rsqrt)({1, 2, 3, 5, 7, 11});
+  TEST_MATH_FUNCTION(rsqrt)({1l, 2l, 3l, 5l, 7l, 11l});
+  TEST_MATH_FUNCTION(rsqrt)({1ll, 2ll, 3ll, 5ll, 7ll, 11ll});
+  TEST_MATH_FUNCTION(rsqrt)({1u, 2u, 3u, 5u, 7u});
+  TEST_MATH_FUNCTION(rsqrt)({1ul, 2ul, 3ul, 5ul, 7ul});
+  TEST_MATH_FUNCTION(rsqrt)({1ull, 2ull, 3ull, 5ull, 7ull});
+  TEST_MATH_FUNCTION(rsqrt)({10.f, 20.f, 30.f, 40.f});
+  TEST_MATH_FUNCTION(rsqrt)({11.1, 22.2, 33.3, 44.4});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(rsqrt)({10.l, 20.l, 30.l, 40.l});
+#endif
+}
 #endif
 
 #ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2
@@ -876,6 +1088,8 @@ TEST(TEST_CATEGORY, mathematical_functions_error_and_gamma_functions) {
   TEST_MATH_FUNCTION(erf)({2u, 3u, 4u, 5u, 6u});
   TEST_MATH_FUNCTION(erf)({2ul, 3ul, 4ul, 5ul, 6ul});
   TEST_MATH_FUNCTION(erf)({2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_HALF_MATH_FUNCTION(erf, KE::half_t)({.1f, -2.f, 3.f});
+  TEST_HALF_MATH_FUNCTION(erf, KE::bhalf_t)({.1f, -2.f, 3.f});
   TEST_MATH_FUNCTION(erf)({.1f, -2.f, 3.f});
   TEST_MATH_FUNCTION(erf)({-4., .5, -.6});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
@@ -888,6 +1102,8 @@ TEST(TEST_CATEGORY, mathematical_functions_error_and_gamma_functions) {
   TEST_MATH_FUNCTION(erfc)({2u, 3u, 4u, 5u, 6u});
   TEST_MATH_FUNCTION(erfc)({2ul, 3ul, 4ul, 5ul, 6ul});
   TEST_MATH_FUNCTION(erfc)({2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_HALF_MATH_FUNCTION(erfc, KE::half_t)({.1f, -2.f, 3.f});
+  TEST_HALF_MATH_FUNCTION(erfc, KE::bhalf_t)({.1f, -2.f, 3.f});
   TEST_MATH_FUNCTION(erfc)({.1f, -2.f, 3.f});
   TEST_MATH_FUNCTION(erfc)({-4., .5, -.6});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
@@ -900,6 +1116,8 @@ TEST(TEST_CATEGORY, mathematical_functions_error_and_gamma_functions) {
   TEST_MATH_FUNCTION(tgamma)({1u, 2u, 3u, 4u, 56u, 78u});
   TEST_MATH_FUNCTION(tgamma)({1ul, 2ul, 3ul, 4ul, 56ul, 78ul});
   TEST_MATH_FUNCTION(tgamma)({1ull, 2ull, 3ull, 4ull, 56ull, 78ull});
+  TEST_HALF_MATH_FUNCTION(tgamma, KE::half_t)({.1f, -2.2f, 3.f});
+  TEST_HALF_MATH_FUNCTION(tgamma, KE::bhalf_t)({.1f, -2.2f, 3.f});
   TEST_MATH_FUNCTION(tgamma)({.1f, -2.2f, 3.f});
   TEST_MATH_FUNCTION(tgamma)({-4.4, .5, -.6});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
@@ -912,6 +1130,8 @@ TEST(TEST_CATEGORY, mathematical_functions_error_and_gamma_functions) {
   TEST_MATH_FUNCTION(lgamma)({1u, 2u, 3u, 4u, 56u, 78u});
   TEST_MATH_FUNCTION(lgamma)({1ul, 2ul, 3ul, 4ul, 56ul, 78ul});
   TEST_MATH_FUNCTION(lgamma)({1ull, 2ull, 3ull, 4ull, 56ull, 78ull});
+  TEST_HALF_MATH_FUNCTION(lgamma, KE::half_t)({.1f, -2.2f, 3.f});
+  TEST_HALF_MATH_FUNCTION(lgamma, KE::bhalf_t)({.1f, -2.2f, 3.f});
   TEST_MATH_FUNCTION(lgamma)({.1f, -2.2f, 3.f});
   TEST_MATH_FUNCTION(lgamma)({-4.4, .5, -.6});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
@@ -927,6 +1147,8 @@ TEST(TEST_CATEGORY,
   TEST_MATH_FUNCTION(ceil)({2u, 3u, 4u, 5u, 6u});
   TEST_MATH_FUNCTION(ceil)({2ul, 3ul, 4ul, 5ul, 6ul});
   TEST_MATH_FUNCTION(ceil)({2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_HALF_MATH_FUNCTION(ceil, KE::half_t)({-1.1f, 2.2f, -3.3f, 4.4f, -5.5f});
+  TEST_HALF_MATH_FUNCTION(ceil, KE::bhalf_t)({-1.1f, 2.2f, -3.3f, 4.4f, -5.5f});
   TEST_MATH_FUNCTION(ceil)({-1.1f, 2.2f, -3.3f, 4.4f, -5.5f});
   TEST_MATH_FUNCTION(ceil)({-6.6, 7.7, -8.8, 9.9});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
@@ -939,6 +1161,9 @@ TEST(TEST_CATEGORY,
   TEST_MATH_FUNCTION(floor)({2u, 3u, 4u, 5u, 6u});
   TEST_MATH_FUNCTION(floor)({2ul, 3ul, 4ul, 5ul, 6ul});
   TEST_MATH_FUNCTION(floor)({2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_HALF_MATH_FUNCTION(floor, KE::half_t)({-1.1f, 2.2f, -3.3f, 4.4f, -5.5f});
+  TEST_HALF_MATH_FUNCTION(floor, KE::bhalf_t)
+  ({-1.1f, 2.2f, -3.3f, 4.4f, -5.5f});
   TEST_MATH_FUNCTION(floor)({-1.1f, 2.2f, -3.3f, 4.4f, -5.5f});
   TEST_MATH_FUNCTION(floor)({-6.6, 7.7, -8.8, 9.9});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
@@ -951,6 +1176,9 @@ TEST(TEST_CATEGORY,
   TEST_MATH_FUNCTION(trunc)({2u, 3u, 4u, 5u, 6u});
   TEST_MATH_FUNCTION(trunc)({2ul, 3ul, 4ul, 5ul, 6ul});
   TEST_MATH_FUNCTION(trunc)({2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_HALF_MATH_FUNCTION(trunc, KE::half_t)({-1.1f, 2.2f, -3.3f, 4.4f, -5.5f});
+  TEST_HALF_MATH_FUNCTION(trunc, KE::bhalf_t)
+  ({-1.1f, 2.2f, -3.3f, 4.4f, -5.5f});
   TEST_MATH_FUNCTION(trunc)({-1.1f, 2.2f, -3.3f, 4.4f, -5.5f});
   TEST_MATH_FUNCTION(trunc)({-6.6, 7.7, -8.8, 9.9});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
@@ -963,6 +1191,10 @@ TEST(TEST_CATEGORY,
   TEST_MATH_FUNCTION(round)({2u, 3u, 4u, 5u, 6u});
   TEST_MATH_FUNCTION(round)({2ul, 3ul, 4ul, 5ul, 6ul});
   TEST_MATH_FUNCTION(round)({2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_HALF_MATH_FUNCTION(round, KE::half_t)
+  ({2.3f, 2.5f, 2.7f, -2.3f, -2.5f, -2.7f, -0.0f});
+  TEST_HALF_MATH_FUNCTION(round, KE::bhalf_t)
+  ({2.3f, 2.5f, 2.7f, -2.3f, -2.5f, -2.7f, -0.0f});
   TEST_MATH_FUNCTION(round)({2.3f, 2.5f, 2.7f, -2.3f, -2.5f, -2.7f, -0.0f});
   TEST_MATH_FUNCTION(round)({2.3, 2.5, 2.7, -2.3, -2.5, -2.7, -0.0});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
@@ -976,6 +1208,10 @@ TEST(TEST_CATEGORY,
   TEST_MATH_FUNCTION(nearbyint)({2u, 3u, 4u, 5u, 6u});
   TEST_MATH_FUNCTION(nearbyint)({2ul, 3ul, 4ul, 5ul, 6ul});
   TEST_MATH_FUNCTION(nearbyint)({2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_HALF_MATH_FUNCTION(nearbyint, KE::half_t)
+  ({-1.1f, 2.2f, -3.3f, 4.4f, -5.5f});
+  TEST_HALF_MATH_FUNCTION(nearbyint, KE::bhalf_t)
+  ({-1.1f, 2.2f, -3.3f, 4.4f, -5.5f});
   TEST_MATH_FUNCTION(nearbyint)({-1.1f, 2.2f, -3.3f, 4.4f, -5.5f});
   TEST_MATH_FUNCTION(nearbyint)({-6.6, 7.7, -8.8, 9.9});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
@@ -992,12 +1228,22 @@ TEST(TEST_CATEGORY,
   TEST_MATH_FUNCTION(logb)({2u, 3u, 4u, 5u, 6u});
   TEST_MATH_FUNCTION(logb)({2ul, 3ul, 4ul, 5ul, 6ul});
   TEST_MATH_FUNCTION(logb)({2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_HALF_MATH_FUNCTION(logb, KE::half_t)({123.45f, 6789.0f});
+  TEST_HALF_MATH_FUNCTION(logb, KE::bhalf_t)({123.45f, 6789.0f});
   TEST_MATH_FUNCTION(logb)({123.45f, 6789.0f});
   TEST_MATH_FUNCTION(logb)({123.45, 6789.0});
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
   TEST_MATH_FUNCTION(logb)({123.45l, 6789.0l});
 #endif
 
+  do_test_math_binary_function<TEST_EXECSPACE, kk_nextafter>(
+      0, static_cast<KE::half_t>(1.f));
+  do_test_math_binary_function<TEST_EXECSPACE, kk_nextafter>(
+      1, static_cast<KE::half_t>(2.f));
+  do_test_math_binary_function<TEST_EXECSPACE, kk_nextafter>(
+      0, static_cast<KE::bhalf_t>(1.f));
+  do_test_math_binary_function<TEST_EXECSPACE, kk_nextafter>(
+      1, static_cast<KE::bhalf_t>(2.f));
   do_test_math_binary_function<TEST_EXECSPACE, kk_nextafter>(0, 1.f);
   do_test_math_binary_function<TEST_EXECSPACE, kk_nextafter>(1, 2.f);
   do_test_math_binary_function<TEST_EXECSPACE, kk_nextafter>(0.1, 0);
@@ -1006,6 +1252,14 @@ TEST(TEST_CATEGORY,
   do_test_math_binary_function<TEST_EXECSPACE, kk_nextafter>(1.l, 2.l);
 #endif
 
+  do_test_math_binary_function<TEST_EXECSPACE, kk_copysign>(
+      0, static_cast<KE::half_t>(1.f));
+  do_test_math_binary_function<TEST_EXECSPACE, kk_copysign>(
+      1, static_cast<KE::half_t>(2.f));
+  do_test_math_binary_function<TEST_EXECSPACE, kk_copysign>(
+      0, static_cast<KE::bhalf_t>(1.f));
+  do_test_math_binary_function<TEST_EXECSPACE, kk_copysign>(
+      1, static_cast<KE::bhalf_t>(2.f));
   do_test_math_binary_function<TEST_EXECSPACE, kk_copysign>(0, 1.f);
   do_test_math_binary_function<TEST_EXECSPACE, kk_copysign>(1, 2.f);
   do_test_math_binary_function<TEST_EXECSPACE, kk_copysign>(0.1, 0);
@@ -1033,46 +1287,57 @@ struct TestAbsoluteValueFunction {
     using Kokkos::abs;
     if (abs(1) != 1 || abs(-1) != 1) {
       ++e;
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(int)\n");
+      Kokkos::printf("failed abs(int)\n");
     }
     if (abs(2l) != 2l || abs(-2l) != 2l) {
       ++e;
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(long int)\n");
+      Kokkos::printf("failed abs(long int)\n");
     }
     if (abs(3ll) != 3ll || abs(-3ll) != 3ll) {
       ++e;
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(long long int)\n");
+      Kokkos::printf("failed abs(long long int)\n");
     }
     if (abs(4.f) != 4.f || abs(-4.f) != 4.f) {
       ++e;
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(float)\n");
+      Kokkos::printf("failed abs(float)\n");
+    }
+    if (abs(static_cast<KE::half_t>(4.f)) != static_cast<KE::half_t>(4.f) ||
+        abs(static_cast<KE::half_t>(-4.f)) != static_cast<KE::half_t>(4.f)) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(KE::half_t)\n");
+    }
+    if (abs(static_cast<KE::bhalf_t>(4.f)) != static_cast<KE::bhalf_t>(4.f) ||
+        abs(static_cast<KE::bhalf_t>(-4.f)) != static_cast<KE::bhalf_t>(4.f)) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(KE::bhalf_t)\n");
     }
     if (abs(5.) != 5. || abs(-5.) != 5.) {
       ++e;
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(double)\n");
+      Kokkos::printf("failed abs(double)\n");
     }
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
     if (abs(6.l) != 6.l || abs(-6.l) != 6.l) {
       ++e;
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(long double)\n");
+      Kokkos::printf("failed abs(long double)\n");
     }
 #endif
     // special values
     using Kokkos::isinf;
     using Kokkos::isnan;
-    if (abs(-0.) != 0.
-#ifndef KOKKOS_IMPL_WORKAROUND_INTEL_LLVM_DEFAULT_FLOATING_POINT_MODEL
-        || !isinf(abs(-INFINITY)) || !isnan(abs(-NAN))
-#endif
-    ) {
+    if (abs(-0.) != 0. || !isinf(abs(-INFINITY)) || !isnan(abs(-NAN))) {
       ++e;
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-          "failed abs(floating_point) special values\n");
+      Kokkos::printf("failed abs(floating_point) special values\n");
     }
 
     static_assert(std::is_same<decltype(abs(1)), int>::value, "");
     static_assert(std::is_same<decltype(abs(2l)), long>::value, "");
     static_assert(std::is_same<decltype(abs(3ll)), long long>::value, "");
+    static_assert(std::is_same<decltype(abs(static_cast<KE::half_t>(4.f))),
+                               KE::half_t>::value,
+                  "");
+    static_assert(std::is_same<decltype(abs(static_cast<KE::bhalf_t>(4.f))),
+                               KE::bhalf_t>::value,
+                  "");
     static_assert(std::is_same<decltype(abs(4.f)), float>::value, "");
     static_assert(std::is_same<decltype(abs(5.)), double>::value, "");
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
@@ -1086,62 +1351,444 @@ TEST(TEST_CATEGORY, mathematical_functions_absolute_value) {
 }
 
 template <class Space>
-struct TestIsNaN {
-  TestIsNaN() { run(); }
+struct TestFloatingPointAbsoluteValueFunction {
+  TestFloatingPointAbsoluteValueFunction() { run(); }
+  void run() const {
+    int errors = 0;
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<Space>(0, 1), *this, errors);
+    ASSERT_EQ(errors, 0);
+  }
+  KOKKOS_FUNCTION void operator()(int, int& e) const {
+    using Kokkos::fabs;
+    if (fabs(4.f) != 4.f || fabs(-4.f) != 4.f) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(float)\n");
+    }
+    if (fabs(static_cast<KE::half_t>(4.f)) != static_cast<KE::half_t>(4.f) ||
+        fabs(static_cast<KE::half_t>(-4.f)) != static_cast<KE::half_t>(4.f)) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(KE::half_t)\n");
+    }
+    if (fabs(static_cast<KE::bhalf_t>(4.f)) != static_cast<KE::bhalf_t>(4.f) ||
+        fabs(static_cast<KE::bhalf_t>(-4.f)) != static_cast<KE::bhalf_t>(4.f)) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(KE::bhalf_t)\n");
+    }
+    if (fabs(5.) != 5. || fabs(-5.) != 5.) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(double)\n");
+    }
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+    if (fabs(6.l) != 6.l || fabs(-6.l) != 6.l) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(long double)\n");
+    }
+#endif
+    // special values
+    using Kokkos::isinf;
+    using Kokkos::isnan;
+    if (fabs(-0.) != 0. || !isinf(fabs(-INFINITY)) || !isnan(fabs(-NAN))) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "failed fabs(floating_point) special values\n");
+    }
+
+    static_assert(std::is_same<decltype(fabs(static_cast<KE::half_t>(4.f))),
+                               KE::half_t>::value);
+    static_assert(std::is_same<decltype(fabs(static_cast<KE::bhalf_t>(4.f))),
+                               KE::bhalf_t>::value);
+    static_assert(std::is_same<decltype(fabs(4.f)), float>::value);
+    static_assert(std::is_same<decltype(fabs(5.)), double>::value);
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+    static_assert(std::is_same<decltype(fabs(6.l)), long double>::value);
+#endif
+  }
+};
+
+TEST(TEST_CATEGORY, mathematical_functions_floating_point_absolute_value) {
+  TestFloatingPointAbsoluteValueFunction<TEST_EXECSPACE>();
+}
+
+template <class Space>
+struct TestFloatingPointRemainderFunction : FloatingPointComparison {
+  TestFloatingPointRemainderFunction() { run(); }
   void run() const {
     int errors = 0;
     Kokkos::parallel_reduce(Kokkos::RangePolicy<Space>(0, 1), *this, errors);
     ASSERT_EQ(errors, 0);
   }
   KOKKOS_FUNCTION void operator()(int, int& e) const {
+    using Kokkos::fmod;
+    if (!compare(fmod(6.2f, 4.f), 2.2f, 1) &&
+        !compare(fmod(-6.2f, 4.f), -2.2f, 1)) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(float)\n");
+    }
+    if (!compare(
+            fmod(static_cast<KE::half_t>(6.2f), static_cast<KE::half_t>(4.f)),
+            static_cast<KE::half_t>(2.2f), 1) &&
+        !compare(
+            fmod(static_cast<KE::half_t>(-6.2f), static_cast<KE::half_t>(4.f)),
+            -static_cast<KE::half_t>(2.2f), 1)) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(KE::half_t)\n");
+    }
+    if (!compare(
+            fmod(static_cast<KE::bhalf_t>(6.2f), static_cast<KE::bhalf_t>(4.f)),
+            static_cast<KE::bhalf_t>(2.2f), 1) &&
+        !compare(fmod(static_cast<KE::bhalf_t>(-6.2f),
+                      static_cast<KE::bhalf_t>(4.f)),
+                 -static_cast<KE::bhalf_t>(2.2f), 1)) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(KE::bhalf_t)\n");
+    }
+    if (!compare(fmod(6.2, 4.), 2.2, 1) && !compare(fmod(-6.2, 4.), -2.2, 1)) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(double)\n");
+    }
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+    if (!compare(fmod(6.2l, 4.l), 2.2l, 1) &&
+        !compare(fmod(-6.2l, 4.l), -2.2l, 1)) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(long double)\n");
+    }
+#endif
+    // special values
+    using Kokkos::isinf;
     using Kokkos::isnan;
-    using Kokkos::Experimental::quiet_NaN;
-    using Kokkos::Experimental::signaling_NaN;
-    if (isnan(1) || isnan(INT_MAX)) {
+    if (!isinf(fmod(-KE::infinity<float>::value, 1.f)) &&
+        !isnan(fmod(-KE::quiet_NaN<float>::value, 1.f))) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "failed fmod(floating_point) special values\n");
+    }
+
+    static_assert(std::is_same<decltype(fmod(static_cast<KE::half_t>(4.f),
+                                             static_cast<KE::half_t>(4.f))),
+                               KE::half_t>::value,
+                  "");
+    static_assert(std::is_same<decltype(fmod(static_cast<KE::bhalf_t>(4.f),
+                                             static_cast<KE::bhalf_t>(4.f))),
+                               KE::bhalf_t>::value,
+                  "");
+    static_assert(std::is_same<decltype(fmod(4.f, 4.f)), float>::value, "");
+    static_assert(std::is_same<decltype(fmod(5., 5.)), double>::value, "");
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+    static_assert(std::is_same<decltype(fmod(6.l, 6.l)), long double>::value,
+                  "");
+#endif
+  }
+};
+
+TEST(TEST_CATEGORY, mathematical_functions_remainder_function) {
+  TestFloatingPointRemainderFunction<TEST_EXECSPACE>();
+}
+
+#if 0
+// TODO: Adjust expected values, see https://github.com/kokkos/kokkos/issues/6275
+template <class Space>
+struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison {
+  TestIEEEFloatingPointRemainderFunction() { run(); }
+  void run() const {
+    int errors = 0;
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<Space>(0, 1), *this, errors);
+    ASSERT_EQ(errors, 0);
+  }
+  KOKKOS_FUNCTION void operator()(int, int& e) const {
+    using Kokkos::remainder;
+    if (!compare(remainder(6.2f, 4.f), 2.2f, 2) &&
+        !compare(remainder(-6.2f, 4.f), 2.2f, 1)) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(float)\n");
+    }
+    if (!compare(remainder(static_cast<KE::half_t>(6.2f),
+                           static_cast<KE::half_t>(4.f)),
+                 static_cast<KE::half_t>(2.2f), 1) &&
+        !compare(remainder(static_cast<KE::half_t>(-6.2f),
+                           static_cast<KE::half_t>(4.f)),
+                 -static_cast<KE::half_t>(2.2f), 1)) {
       ++e;
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed isnan(integral)\n");
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(KE::half_t)\n");
     }
-    if (isnan(2.f)
-#ifndef KOKKOS_IMPL_WORKAROUND_INTEL_LLVM_DEFAULT_FLOATING_POINT_MODEL
-        || !isnan(quiet_NaN<float>::value) ||
-        !isnan(signaling_NaN<float>::value)
+    if (!compare(remainder(static_cast<KE::bhalf_t>(6.2f),
+                           static_cast<KE::bhalf_t>(4.f)),
+                 static_cast<KE::bhalf_t>(2.2f), 1) &&
+        !compare(remainder(static_cast<KE::bhalf_t>(-6.2f),
+                           static_cast<KE::bhalf_t>(4.f)),
+                 -static_cast<KE::bhalf_t>(2.2f), 1)) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(KE::bhalf_t)\n");
+    }
+    if (!compare(remainder(6.2, 4.), 2.2, 2) &&
+        !compare(remainder(-6.2, 4.), 2.2, 1)) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(double)\n");
+    }
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+    if (!compare(remainder(6.2l, 4.l), 2.2l, 1) &&
+        !compare(remainder(-6.2l, 4.l), -2.2l, 1)) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(long double)\n");
+    }
+#endif
+    // special values
+    using Kokkos::isinf;
+    using Kokkos::isnan;
+    if (!isinf(remainder(-KE::infinity<float>::value, 1.f)) &&
+        !isnan(remainder(-KE::quiet_NaN<float>::value, 1.f))) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "failed remainder(floating_point) special values\n");
+    }
+
+    static_assert(
+        std::is_same<decltype(remainder(static_cast<KE::half_t>(4.f),
+                                        static_cast<KE::half_t>(4.f))),
+                     KE::half_t>::value,
+        "");
+    static_assert(
+        std::is_same<decltype(remainder(static_cast<KE::bhalf_t>(4.f),
+                                        static_cast<KE::bhalf_t>(4.f))),
+                     KE::bhalf_t>::value,
+        "");
+    static_assert(std::is_same<decltype(remainder(4.f, 4.f)), float>::value,
+                  "");
+    static_assert(std::is_same<decltype(remainder(5., 5.)), double>::value, "");
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+    static_assert(
+        std::is_same<decltype(remainder(6.l, 6.l)), long double>::value, "");
+#endif
+  }
+};
+
+TEST(TEST_CATEGORY, mathematical_functions_ieee_remainder_function) {
+  TestIEEEFloatingPointRemainderFunction<TEST_EXECSPACE>();
+}
 #endif
 
+// TODO: TestFpClassify, see https://github.com/kokkos/kokkos/issues/6279
+
+template <class Space>
+struct TestIsFinite {
+  TestIsFinite() { run(); }
+  void run() const {
+    int errors = 0;
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<Space>(0, 1), *this, errors);
+    ASSERT_EQ(errors, 0);
+  }
+  KOKKOS_FUNCTION void operator()(int, int& e) const {
+    using KE::infinity;
+    using KE::quiet_NaN;
+    using KE::signaling_NaN;
+    using Kokkos::isfinite;
+    if (!isfinite(1) || !isfinite(INT_MAX)) {
+      ++e;
+      Kokkos::printf("failed isfinite(integral)\n");
+    }
+    if (!isfinite(2.f) || isfinite(quiet_NaN<float>::value) ||
+        isfinite(signaling_NaN<float>::value) ||
+        isfinite(infinity<float>::value)) {
+      ++e;
+      Kokkos::printf("failed isfinite(float)\n");
+    }
+    if (!isfinite(static_cast<KE::half_t>(2.f))
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC 23.7
+        || isfinite(quiet_NaN<KE::half_t>::value) ||
+        isfinite(signaling_NaN<KE::half_t>::value) ||
+        isfinite(infinity<KE::half_t>::value)
+#endif
     ) {
       ++e;
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed isnan(float)\n");
+      Kokkos::printf("failed isfinite(KE::half_t)\n");
     }
-    if (isnan(3.)
-#ifndef KOKKOS_IMPL_WORKAROUND_INTEL_LLVM_DEFAULT_FLOATING_POINT_MODEL
-#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC
-        || !isnan(quiet_NaN<double>::value) ||
-        !isnan(signaling_NaN<double>::value)
+    if (!isfinite(static_cast<KE::bhalf_t>(2.f))
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC 23.7
+        || isfinite(quiet_NaN<KE::bhalf_t>::value) ||
+        isfinite(signaling_NaN<KE::bhalf_t>::value) ||
+        isfinite(infinity<KE::bhalf_t>::value)
 #endif
+    ) {
+      ++e;
+      Kokkos::printf("failed isfinite(KE::bhalf_t)\n");
+    }
+    if (!isfinite(3.)
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC 23.7
+        || isfinite(quiet_NaN<double>::value) ||
+        isfinite(signaling_NaN<double>::value) ||
+        isfinite(infinity<double>::value)
 #endif
     ) {
       ++e;
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed isnan(double)\n");
+      Kokkos::printf("failed isfinite(double)\n");
+    }
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+    if (!isfinite(4.l) || isfinite(quiet_NaN<long double>::value) ||
+        isfinite(signaling_NaN<long double>::value) ||
+        isfinite(infinity<long double>::value)) {
+      ++e;
+      Kokkos::printf("failed isfinite(long double)\n");
+    }
+#endif
+    // special values
+    if (isfinite(INFINITY) || isfinite(NAN)) {
+      ++e;
+      Kokkos::printf("failed isfinite(floating_point) special values\n");
     }
+
+    static_assert(std::is_same<decltype(isfinite(1)), bool>::value);
+    static_assert(std::is_same<decltype(isfinite(2.f)), bool>::value);
+    static_assert(std::is_same<decltype(isfinite(3.)), bool>::value);
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
-    if (isnan(4.l)
-#ifndef KOKKOS_IMPL_WORKAROUND_INTEL_LLVM_DEFAULT_FLOATING_POINT_MODEL
-        || !isnan(quiet_NaN<long double>::value) ||
-        !isnan(signaling_NaN<long double>::value)
+    static_assert(std::is_same<decltype(isfinite(4.l)), bool>::value);
+#endif
+  }
+};
+
+TEST(TEST_CATEGORY, mathematical_functions_isfinite) {
+  TestIsFinite<TEST_EXECSPACE>();
+}
+
+template <class Space>
+struct TestIsInf {
+  TestIsInf() { run(); }
+  void run() const {
+    int errors = 0;
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<Space>(0, 1), *this, errors);
+    ASSERT_EQ(errors, 0);
+  }
+  KOKKOS_FUNCTION void operator()(int, int& e) const {
+    using KE::infinity;
+    using KE::quiet_NaN;
+    using KE::signaling_NaN;
+    using Kokkos::isinf;
+    if (isinf(1) || isinf(INT_MAX)) {
+      ++e;
+      Kokkos::printf("failed isinf(integral)\n");
+    }
+    if (isinf(2.f) || isinf(quiet_NaN<float>::value) ||
+        isinf(signaling_NaN<float>::value) || !isinf(infinity<float>::value)) {
+      ++e;
+      Kokkos::printf("failed isinf(float)\n");
+    }
+    if (isinf(static_cast<KE::half_t>(2.f))
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC 23.7
+        || isinf(quiet_NaN<KE::half_t>::value) ||
+        isinf(signaling_NaN<KE::half_t>::value) ||
+        !isinf(infinity<KE::half_t>::value)
 #endif
     ) {
       ++e;
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed isnan(long double)\n");
+      Kokkos::printf("failed isinf(KE::half_t)\n");
+    }
+    if (isinf(static_cast<KE::bhalf_t>(2.f))
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC 23.7
+        || isinf(quiet_NaN<KE::bhalf_t>::value) ||
+        isinf(signaling_NaN<KE::bhalf_t>::value) ||
+        !isinf(infinity<KE::bhalf_t>::value)
+#endif
+    ) {
+      ++e;
+      Kokkos::printf("failed isinf(KE::bhalf_t)\n");
+    }
+    if (isinf(3.)
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC 23.7
+        || isinf(quiet_NaN<double>::value) ||
+        isinf(signaling_NaN<double>::value) || !isinf(infinity<double>::value)
+#endif
+    ) {
+      ++e;
+      Kokkos::printf("failed isinf(double)\n");
+    }
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+    if (isinf(4.l) || isinf(quiet_NaN<long double>::value) ||
+        isinf(signaling_NaN<long double>::value) ||
+        !isinf(infinity<long double>::value)) {
+      ++e;
+      Kokkos::printf("failed isinf(long double)\n");
     }
 #endif
     // special values
-    if (isnan(INFINITY)
-#ifndef KOKKOS_IMPL_WORKAROUND_INTEL_LLVM_DEFAULT_FLOATING_POINT_MODEL
-        || !isnan(NAN)
+    if (!isinf(INFINITY) || isinf(NAN)) {
+      ++e;
+      Kokkos::printf("failed isinf(floating_point) special values\n");
+    }
+
+    static_assert(std::is_same<decltype(isinf(1)), bool>::value);
+    static_assert(std::is_same<decltype(isinf(2.f)), bool>::value);
+    static_assert(std::is_same<decltype(isinf(3.)), bool>::value);
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+    static_assert(std::is_same<decltype(isinf(4.l)), bool>::value);
+#endif
+  }
+};
+
+TEST(TEST_CATEGORY, mathematical_functions_isinf) {
+  TestIsInf<TEST_EXECSPACE>();
+}
+
+template <class Space>
+struct TestIsNaN {
+  TestIsNaN() { run(); }
+  void run() const {
+    int errors = 0;
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<Space>(0, 1), *this, errors);
+    ASSERT_EQ(errors, 0);
+  }
+  KOKKOS_FUNCTION void operator()(int, int& e) const {
+    using KE::infinity;
+    using KE::quiet_NaN;
+    using KE::signaling_NaN;
+    using Kokkos::isnan;
+    if (isnan(1) || isnan(INT_MAX)) {
+      ++e;
+      Kokkos::printf("failed isnan(integral)\n");
+    }
+    if (isnan(2.f) || !isnan(quiet_NaN<float>::value) ||
+        !isnan(signaling_NaN<float>::value) || isnan(infinity<float>::value)) {
+      ++e;
+      Kokkos::printf("failed isnan(float)\n");
+    }
+    if (isnan(static_cast<KE::half_t>(2.f))
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC 23.7
+        || !isnan(quiet_NaN<KE::half_t>::value) ||
+        !isnan(signaling_NaN<KE::half_t>::value) ||
+        isnan(infinity<KE::half_t>::value)
 #endif
     ) {
       ++e;
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-          "failed isnan(floating_point) special values\n");
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed isnan(KE::half_t)\n");
+    }
+    if (isnan(static_cast<KE::bhalf_t>(2.f))
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC 23.7
+        || !isnan(quiet_NaN<KE::bhalf_t>::value) ||
+        !isnan(signaling_NaN<KE::bhalf_t>::value) ||
+        isnan(infinity<KE::bhalf_t>::value)
+#endif
+    ) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed isnan(KE::bhalf_t)\n");
+    }
+    if (isnan(3.)
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC 23.7
+        || !isnan(quiet_NaN<double>::value) ||
+        !isnan(signaling_NaN<double>::value) || isnan(infinity<double>::value)
+#endif
+    ) {
+      ++e;
+      Kokkos::printf("failed isnan(double)\n");
+    }
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+    if (isnan(4.l) || !isnan(quiet_NaN<long double>::value) ||
+        !isnan(signaling_NaN<long double>::value) ||
+        isnan(infinity<long double>::value)) {
+      ++e;
+      Kokkos::printf("failed isnan(long double)\n");
+    }
+#endif
+    // special values
+    if (isnan(INFINITY) || !isnan(NAN)) {
+      ++e;
+      Kokkos::printf("failed isnan(floating_point) special values\n");
     }
 
     static_assert(std::is_same<decltype(isnan(1)), bool>::value, "");
@@ -1156,4 +1803,6 @@ struct TestIsNaN {
 TEST(TEST_CATEGORY, mathematical_functions_isnan) {
   TestIsNaN<TEST_EXECSPACE>();
 }
+
+// TODO: TestSignBit, see https://github.com/kokkos/kokkos/issues/6279
 #endif
diff --git a/packages/kokkos/core/unit_test/TestMathematicalFunctions1.hpp b/packages/kokkos/core/unit_test/TestMathematicalFunctions1.hpp
index d902a04422d5b1c19d401476de6d3e6faada1fca..7452d45e42acb9315e52f907ebd815c9d35d60f5 100644
--- a/packages/kokkos/core/unit_test/TestMathematicalFunctions1.hpp
+++ b/packages/kokkos/core/unit_test/TestMathematicalFunctions1.hpp
@@ -15,5 +15,7 @@
 //@HEADER
 
 #define KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2
+#define KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_3
 #include "TestMathematicalFunctions.hpp"
 #undef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2
+#undef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_3
diff --git a/packages/kokkos/core/unit_test/TestMathematicalFunctions2.hpp b/packages/kokkos/core/unit_test/TestMathematicalFunctions2.hpp
index 58572ebe6fe1dc73cbce9aabb6f1263cf5d4a6b9..72f792b08970007fced72d081fa0189def20ca6c 100644
--- a/packages/kokkos/core/unit_test/TestMathematicalFunctions2.hpp
+++ b/packages/kokkos/core/unit_test/TestMathematicalFunctions2.hpp
@@ -15,5 +15,7 @@
 //@HEADER
 
 #define KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1
+#define KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_3
 #include "TestMathematicalFunctions.hpp"
 #undef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1
+#undef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_3
diff --git a/packages/kokkos/core/unit_test/TestMathematicalFunctions3.hpp b/packages/kokkos/core/unit_test/TestMathematicalFunctions3.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3d7b356367dd5f17de3a3981c1e941dc10c6654d
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestMathematicalFunctions3.hpp
@@ -0,0 +1,21 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#define KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1
+#define KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2
+#include "TestMathematicalFunctions.hpp"
+#undef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1
+#undef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2
diff --git a/packages/kokkos/core/unit_test/TestMathematicalSpecialFunctions.hpp b/packages/kokkos/core/unit_test/TestMathematicalSpecialFunctions.hpp
index 7ae202770583a45a82925c50f1a43232b104274b..06c84c751370d7c4b6a703047b2e1b834fa1934a 100644
--- a/packages/kokkos/core/unit_test/TestMathematicalSpecialFunctions.hpp
+++ b/packages/kokkos/core/unit_test/TestMathematicalSpecialFunctions.hpp
@@ -911,11 +911,14 @@ struct TestComplexBesselJ1Y1Function {
                 Kokkos::abs(h_ref_cbj1(i)) * 1e-13);
     }
 
+// FIXME_SYCL Failing for Intel GPUs
+#if !(defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU))
     EXPECT_EQ(h_ref_cby1(0), h_cby1(0));
     for (int i = 1; i < N; i++) {
       EXPECT_LE(Kokkos::abs(h_cby1(i) - h_ref_cby1(i)),
                 Kokkos::abs(h_ref_cby1(i)) * 1e-13);
     }
+#endif
 
     ////Test large arguments
     d_z_large        = ViewType("d_z_large", 6);
@@ -1055,7 +1058,7 @@ struct TestComplexBesselI0K0Function {
   void testit() {
     using Kokkos::Experimental::infinity;
 
-    int N      = 25;
+    int N      = 26;
     d_z        = ViewType("d_z", N);
     d_cbi0     = ViewType("d_cbi0", N);
     d_cbk0     = ViewType("d_cbk0", N);
@@ -1091,6 +1094,7 @@ struct TestComplexBesselI0K0Function {
     h_z(22) = Kokkos::complex<double>(-28.0, 0.0);
     h_z(23) = Kokkos::complex<double>(60.0, 0.0);
     h_z(24) = Kokkos::complex<double>(-60.0, 0.0);
+    h_z(25) = Kokkos::complex<double>(7.998015e-5, 0.0);
 
     Kokkos::deep_copy(d_z, h_z);
 
@@ -1149,6 +1153,7 @@ struct TestComplexBesselI0K0Function {
     h_ref_cbi0(22) = Kokkos::complex<double>(1.095346047317573e+11, 0);
     h_ref_cbi0(23) = Kokkos::complex<double>(5.894077055609803e+24, 0);
     h_ref_cbi0(24) = Kokkos::complex<double>(5.894077055609803e+24, 0);
+    h_ref_cbi0(25) = Kokkos::complex<double>(1.0000000015992061009, 0);
 
     h_ref_cbk0(0) = Kokkos::complex<double>(infinity<double>::value, 0);
     h_ref_cbk0(1) =
@@ -1195,17 +1200,31 @@ struct TestComplexBesselI0K0Function {
     h_ref_cbk0(23) = Kokkos::complex<double>(1.413897840559108e-27, 0);
     h_ref_cbk0(24) =
         Kokkos::complex<double>(1.413897840559108e-27, -1.851678917759592e+25);
+    h_ref_cbk0(25) = Kokkos::complex<double>(9.5496636116079915979, 0.);
 
+    // FIXME_HIP Disable the test when using ROCm 5.5 and 5.6 due to a known
+    // compiler bug
+#if !defined(KOKKOS_ENABLE_HIP) || (HIP_VERSION_MAJOR != 5) || \
+    ((HIP_VERSION_MAJOR == 5) &&                               \
+     !((HIP_VERSION_MINOR == 5) || (HIP_VERSION_MINOR == 6)))
     for (int i = 0; i < N; i++) {
       EXPECT_LE(Kokkos::abs(h_cbi0(i) - h_ref_cbi0(i)),
                 Kokkos::abs(h_ref_cbi0(i)) * 1e-13);
     }
 
     EXPECT_EQ(h_ref_cbk0(0), h_cbk0(0));
-    for (int i = 1; i < N; i++) {
+    int upper_limit = N;
+    // FIXME_SYCL Failing for Intel GPUs, 19 is the first failing test case
+#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU)
+    if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::SYCL>)
+      upper_limit = 19;
+#endif
+    for (int i = 1; i < upper_limit; i++) {
       EXPECT_LE(Kokkos::abs(h_cbk0(i) - h_ref_cbk0(i)),
-                Kokkos::abs(h_ref_cbk0(i)) * 1e-13);
+                Kokkos::abs(h_ref_cbk0(i)) * 1e-13)
+          << "at index " << i;
     }
+#endif
 
     ////Test large arguments
     d_z_large        = ViewType("d_z_large", 6);
@@ -1443,9 +1462,16 @@ struct TestComplexBesselI1K1Function {
     }
 
     EXPECT_EQ(h_ref_cbk1(0), h_cbk1(0));
-    for (int i = 1; i < N; i++) {
+    int upper_limit = N;
+    // FIXME_SYCL Failing for Intel GPUs, 8 is the first failing test case
+#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU)
+    if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::SYCL>)
+      upper_limit = 8;
+#endif
+    for (int i = 1; i < upper_limit; i++) {
       EXPECT_LE(Kokkos::abs(h_cbk1(i) - h_ref_cbk1(i)),
-                Kokkos::abs(h_ref_cbk1(i)) * 1e-13);
+                Kokkos::abs(h_ref_cbk1(i)) * 1e-13)
+          << "at index " << i;
     }
 
     ////Test large arguments
@@ -1686,6 +1712,11 @@ struct TestComplexBesselH1Function {
     h_ref_ch11(24) =
         Kokkos::complex<double>(-5.430453818237824e-02, -1.530182458039000e-02);
 
+    // FIXME_HIP Disable the test when using ROCm 5.5 and 5.6 due to a known
+    // compiler bug
+#if !defined(KOKKOS_ENABLE_HIP) || (HIP_VERSION_MAJOR != 5) || \
+    ((HIP_VERSION_MAJOR == 5) &&                               \
+     !((HIP_VERSION_MINOR == 5) || (HIP_VERSION_MINOR == 6)))
     EXPECT_EQ(h_ref_ch10(0), h_ch10(0));
     for (int i = 1; i < N; i++) {
       EXPECT_LE(Kokkos::abs(h_ch10(i) - h_ref_ch10(i)),
@@ -1694,11 +1725,18 @@ struct TestComplexBesselH1Function {
     }
 
     EXPECT_EQ(h_ref_ch11(0), h_ch11(0));
-    for (int i = 1; i < N; i++) {
+    int upper_limit = N;
+    // FIXME_SYCL Failing for Intel GPUs, 16 is the first failing test case
+#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU)
+    if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::SYCL>)
+      upper_limit = 16;
+#endif
+    for (int i = 1; i < upper_limit; i++) {
       EXPECT_LE(Kokkos::abs(h_ch11(i) - h_ref_ch11(i)),
                 Kokkos::abs(h_ref_ch11(i)) * 1e-13)
           << "at index " << i;
     }
+#endif
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -1868,6 +1906,11 @@ struct TestComplexBesselH2Function {
     h_ref_ch21(24) =
         Kokkos::complex<double>(1.629136145471347e-01, +1.530182458039000e-02);
 
+    // FIXME_HIP Disable the test when using ROCm 5.5 and 5.6 due to a known
+    // compiler bug
+#if !defined(KOKKOS_ENABLE_HIP) || (HIP_VERSION_MAJOR != 5) || \
+    ((HIP_VERSION_MAJOR == 5) &&                               \
+     !((HIP_VERSION_MINOR == 5) || (HIP_VERSION_MINOR == 6)))
     EXPECT_EQ(h_ref_ch20(0), h_ch20(0));
     for (int i = 1; i < N; i++) {
       EXPECT_LE(Kokkos::abs(h_ch20(i) - h_ref_ch20(i)),
@@ -1875,10 +1918,18 @@ struct TestComplexBesselH2Function {
     }
 
     EXPECT_EQ(h_ref_ch21(0), h_ch21(0));
-    for (int i = 1; i < N; i++) {
+    int upper_limit = N;
+    // FIXME_SYCL Failing for Intel GPUs, 17 is the first failing test case
+#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU)
+    if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::SYCL>)
+      upper_limit = 17;
+#endif
+    for (int i = 1; i < upper_limit; i++) {
       EXPECT_LE(Kokkos::abs(h_ch21(i) - h_ref_ch21(i)),
-                Kokkos::abs(h_ref_ch21(i)) * 1e-13);
+                Kokkos::abs(h_ref_ch21(i)) * 1e-13)
+          << "at index " << i;
     }
+#endif
   }
 
   KOKKOS_INLINE_FUNCTION
diff --git a/packages/kokkos/core/unit_test/TestMinMaxClamp.hpp b/packages/kokkos/core/unit_test/TestMinMaxClamp.hpp
index 7ed2c6d3bacececde0c538163dc45abf473ad40c..6466b3b1727933d641d6f0374b0c6ebd8af33f33 100644
--- a/packages/kokkos/core/unit_test/TestMinMaxClamp.hpp
+++ b/packages/kokkos/core/unit_test/TestMinMaxClamp.hpp
@@ -174,10 +174,6 @@ TEST(TEST_CATEGORY, minmax) {
   EXPECT_EQ(r2.first, 2);
   EXPECT_EQ(r2.second, 3);
 
-#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC nvhpc can't deal with device side
-                               // constexpr constructors so I removed the
-                               // constexpr in pair, which makes static_assert
-                               // here fail
   static_assert((Kokkos::pair<float, float>(Kokkos::minmax(3.f, 2.f)) ==
                  Kokkos::make_pair(2.f, 3.f)));
   static_assert(
@@ -208,7 +204,6 @@ TEST(TEST_CATEGORY, minmax) {
                                    ::Test::PairIntCompareFirst{0, 5},
                                })
                     .second.second == 4);  // rightmost
-#endif
 }
 
 template <class ViewType>
diff --git a/packages/kokkos/core/unit_test/TestNumericTraits.hpp b/packages/kokkos/core/unit_test/TestNumericTraits.hpp
index 9146297cd8a2f0438649820ce512b9573f4b16df..2b5531f29a6a8d7c3fc10db59967fb9d1479832c 100644
--- a/packages/kokkos/core/unit_test/TestNumericTraits.hpp
+++ b/packages/kokkos/core/unit_test/TestNumericTraits.hpp
@@ -40,7 +40,14 @@ struct extrema {
 
   DEFINE_EXTREMA(float, -FLT_MAX, FLT_MAX);
   DEFINE_EXTREMA(double, -DBL_MAX, DBL_MAX);
+
+#if !defined(KOKKOS_ENABLE_CUDA) || \
+    !defined(KOKKOS_COMPILER_NVHPC)  // 23.7 long double
   DEFINE_EXTREMA(long double, -LDBL_MAX, LDBL_MAX);
+#else
+  static long double min(long double) { return -LDBL_MAX; }
+  static long double max(long double) { return LDBL_MAX; }
+#endif
 
 #undef DEFINE_EXTREMA
 };
@@ -94,8 +101,8 @@ struct TestNumericTraits {
 
   KOKKOS_FUNCTION void operator()(Infinity, int, int& e) const {
     using Kokkos::Experimental::infinity;
-    auto const inf  = infinity<T>::value;
-    auto const zero = T(0);
+    constexpr auto inf = infinity<T>::value;
+    auto const zero    = T(0);
     e += (int)!(inf + inf == inf);
     e += (int)!(inf != zero);
     use_on_device();
@@ -138,10 +145,10 @@ struct TestNumericTraits {
   KOKKOS_FUNCTION void operator()(MaxExponent10, int, int&) const { use_on_device(); }
   // clang-format on
   KOKKOS_FUNCTION void operator()(QuietNaN, int, int& e) const {
-#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC 23.7 nan
     using Kokkos::Experimental::quiet_NaN;
-    constexpr auto nan  = quiet_NaN<T>::value;
-    constexpr auto zero = T(0);
+    constexpr auto nan = quiet_NaN<T>::value;
+    auto const zero    = T(0);
     e += (int)!(nan != nan);
     e += (int)!(nan != zero);
 #else
@@ -150,10 +157,10 @@ struct TestNumericTraits {
     use_on_device();
   }
   KOKKOS_FUNCTION void operator()(SignalingNaN, int, int& e) const {
-#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC 23.7 nan
     using Kokkos::Experimental::signaling_NaN;
-    constexpr auto nan  = signaling_NaN<T>::value;
-    constexpr auto zero = T(0);
+    constexpr auto nan = signaling_NaN<T>::value;
+    auto const zero    = T(0);
     e += (int)!(nan != nan);
     e += (int)!(nan != zero);
 #else
@@ -163,7 +170,8 @@ struct TestNumericTraits {
   }
 
   KOKKOS_FUNCTION void use_on_device() const {
-#if defined(KOKKOS_COMPILER_NVCC) || defined(KOKKOS_ENABLE_OPENMPTARGET)
+#if defined(KOKKOS_COMPILER_NVCC) || defined(KOKKOS_COMPILER_NVHPC) || \
+    defined(KOKKOS_ENABLE_OPENMPTARGET) || defined(KOKKOS_ENABLE_OPENACC)
     take_by_value(trait<T>::value);
 #else
     (void)take_address_of(trait<T>::value);
@@ -196,39 +204,58 @@ struct TestNumericTraits<
 #endif
 
 TEST(TEST_CATEGORY, numeric_traits_infinity) {
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC 23.7
+  TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::half_t, Infinity>();
+  TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::bhalf_t, Infinity>();
+#endif
   TestNumericTraits<TEST_EXECSPACE, float, Infinity>();
   TestNumericTraits<TEST_EXECSPACE, double, Infinity>();
   // FIXME_NVHPC long double not supported
-#if !defined(KOKKOS_COMPILER_NVHPC)
+#if !defined(KOKKOS_ENABLE_CUDA) || \
+    !defined(KOKKOS_COMPILER_NVHPC)  // 23.7 long double
   TestNumericTraits<TEST_EXECSPACE, long double, Infinity>();
 #endif
 }
 
 TEST(TEST_CATEGORY, numeric_traits_epsilon) {
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC 23.7 bit_comparison_type
+  TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::half_t, Epsilon>();
+  TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::bhalf_t, Epsilon>();
+#endif
   TestNumericTraits<TEST_EXECSPACE, float, Epsilon>();
   TestNumericTraits<TEST_EXECSPACE, double, Epsilon>();
   // FIXME_NVHPC long double not supported
-#if !defined(KOKKOS_COMPILER_NVHPC)
+#if !defined(KOKKOS_ENABLE_CUDA) || \
+    !defined(KOKKOS_COMPILER_NVHPC)  // 23.7 long double:
   TestNumericTraits<TEST_EXECSPACE, long double, Epsilon>();
 #endif
 }
 
 TEST(TEST_CATEGORY, numeric_traits_round_error) {
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC 23.7 bit_comparison_type
+  TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::half_t, RoundError>();
+  TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::bhalf_t,
+                    RoundError>();
+#endif
   TestNumericTraits<TEST_EXECSPACE, float, RoundError>();
   TestNumericTraits<TEST_EXECSPACE, double, RoundError>();
-#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC:
-  // nvc++-Fatal-/home/projects/x86-64/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin/tools/cpp2
-  // TERMINATED by signal 11
+  // FIXME_NVHPC long double not supported
+#if !defined(KOKKOS_ENABLE_CUDA) || \
+    !defined(KOKKOS_COMPILER_NVHPC)  // 23.7 long double:
   TestNumericTraits<TEST_EXECSPACE, long double, RoundError>();
 #endif
 }
 
 TEST(TEST_CATEGORY, numeric_traits_norm_min) {
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC 23.7 bit_comparison_type
+  TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::half_t, NormMin>();
+  TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::bhalf_t, NormMin>();
+#endif
   TestNumericTraits<TEST_EXECSPACE, float, NormMin>();
   TestNumericTraits<TEST_EXECSPACE, double, NormMin>();
-#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC:
-  // nvc++-Fatal-/home/projects/x86-64/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin/tools/cpp2
-  // TERMINATED by signal 11
+  // FIXME_NVHPC long double not supported
+#if !defined(KOKKOS_ENABLE_CUDA) || \
+    !defined(KOKKOS_COMPILER_NVHPC)  // 23.7 long double:
   TestNumericTraits<TEST_EXECSPACE, long double, NormMin>();
 #endif
 }
@@ -236,9 +263,9 @@ TEST(TEST_CATEGORY, numeric_traits_norm_min) {
 TEST(TEST_CATEGORY, numeric_traits_denorm_min) {
   TestNumericTraits<TEST_EXECSPACE, float, DenormMin>();
   TestNumericTraits<TEST_EXECSPACE, double, DenormMin>();
-#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC:
-  // nvc++-Fatal-/home/projects/x86-64/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin/tools/cpp2
-  // TERMINATED by signal 11
+  // FIXME_NVHPC long double not supported
+#if !defined(KOKKOS_ENABLE_CUDA) || \
+    !defined(KOKKOS_COMPILER_NVHPC)  // 23.7 long double:
   TestNumericTraits<TEST_EXECSPACE, long double, DenormMin>();
 #endif
 }
@@ -275,9 +302,8 @@ TEST(TEST_CATEGORY, numeric_traits_finite_min_max) {
   TestNumericTraits<TEST_EXECSPACE, float, FiniteMax>();
   TestNumericTraits<TEST_EXECSPACE, double, FiniteMin>();
   TestNumericTraits<TEST_EXECSPACE, double, FiniteMax>();
-#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC:
-  // nvc++-Fatal-/home/projects/x86-64/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin/tools/cpp2
-  // TERMINATED by signal 11
+#if !defined(KOKKOS_ENABLE_CUDA) || \
+    !defined(KOKKOS_COMPILER_NVHPC)  // 23.7 long double:
   TestNumericTraits<TEST_EXECSPACE, long double, FiniteMin>();
   TestNumericTraits<TEST_EXECSPACE, long double, FiniteMax>();
 #endif
@@ -296,11 +322,12 @@ TEST(TEST_CATEGORY, numeric_traits_digits) {
   TestNumericTraits<TEST_EXECSPACE, unsigned long int, Digits>();
   TestNumericTraits<TEST_EXECSPACE, long long int, Digits>();
   TestNumericTraits<TEST_EXECSPACE, unsigned long long int, Digits>();
+  TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::half_t, Digits>();
+  TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::bhalf_t, Digits>();
   TestNumericTraits<TEST_EXECSPACE, float, Digits>();
   TestNumericTraits<TEST_EXECSPACE, double, Digits>();
-#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC:
-  // nvc++-Fatal-/home/projects/x86-64/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin/tools/cpp2
-  // TERMINATED by signal 11
+#if !defined(KOKKOS_ENABLE_CUDA) || \
+    !defined(KOKKOS_COMPILER_NVHPC)  // 23.7 long double:
   TestNumericTraits<TEST_EXECSPACE, long double, Digits>();
 #endif
 }
@@ -318,11 +345,12 @@ TEST(TEST_CATEGORY, numeric_traits_digits10) {
   TestNumericTraits<TEST_EXECSPACE, unsigned long int, Digits10>();
   TestNumericTraits<TEST_EXECSPACE, long long int, Digits10>();
   TestNumericTraits<TEST_EXECSPACE, unsigned long long int, Digits10>();
+  TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::half_t, Digits10>();
+  TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::bhalf_t, Digits10>();
   TestNumericTraits<TEST_EXECSPACE, float, Digits10>();
   TestNumericTraits<TEST_EXECSPACE, double, Digits10>();
-#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC:
-  // nvc++-Fatal-/home/projects/x86-64/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin/tools/cpp2
-  // TERMINATED by signal 11
+#if !defined(KOKKOS_ENABLE_CUDA) || \
+    !defined(KOKKOS_COMPILER_NVHPC)  // 23.7 long double:
   TestNumericTraits<TEST_EXECSPACE, long double, Digits10>();
 #endif
 }
@@ -330,9 +358,8 @@ TEST(TEST_CATEGORY, numeric_traits_digits10) {
 TEST(TEST_CATEGORY, numeric_traits_max_digits10) {
   TestNumericTraits<TEST_EXECSPACE, float, MaxDigits10>();
   TestNumericTraits<TEST_EXECSPACE, double, MaxDigits10>();
-#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC:
-  // nvc++-Fatal-/home/projects/x86-64/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin/tools/cpp2
-  // TERMINATED by signal 11
+#if !defined(KOKKOS_ENABLE_CUDA) || \
+    !defined(KOKKOS_COMPILER_NVHPC)  // 23.7 long double:
   TestNumericTraits<TEST_EXECSPACE, long double, MaxDigits10>();
 #endif
 }
@@ -349,23 +376,27 @@ TEST(TEST_CATEGORY, numeric_traits_radix) {
   TestNumericTraits<TEST_EXECSPACE, unsigned long int, Radix>();
   TestNumericTraits<TEST_EXECSPACE, long long int, Radix>();
   TestNumericTraits<TEST_EXECSPACE, unsigned long long int, Radix>();
+  TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::half_t, Radix>();
+  TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::bhalf_t, Radix>();
   TestNumericTraits<TEST_EXECSPACE, float, Radix>();
   TestNumericTraits<TEST_EXECSPACE, double, Radix>();
-#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC:
-  // nvc++-Fatal-/home/projects/x86-64/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin/tools/cpp2
-  // TERMINATED by signal 11
+#if !defined(KOKKOS_ENABLE_CUDA) || \
+    !defined(KOKKOS_COMPILER_NVHPC)  // 23.7 long double:
   TestNumericTraits<TEST_EXECSPACE, long double, Radix>();
 #endif
 }
 
 TEST(TEST_CATEGORY, numeric_traits_min_max_exponent) {
+  TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::half_t,
+                    MinExponent>();
+  TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::bhalf_t,
+                    MaxExponent>();
   TestNumericTraits<TEST_EXECSPACE, float, MinExponent>();
   TestNumericTraits<TEST_EXECSPACE, float, MaxExponent>();
   TestNumericTraits<TEST_EXECSPACE, double, MinExponent>();
   TestNumericTraits<TEST_EXECSPACE, double, MaxExponent>();
-#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC:
-  // nvc++-Fatal-/home/projects/x86-64/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin/tools/cpp2
-  // TERMINATED by signal 11
+#if !defined(KOKKOS_ENABLE_CUDA) || \
+    !defined(KOKKOS_COMPILER_NVHPC)  // 23.7 long double:
   TestNumericTraits<TEST_EXECSPACE, long double, MinExponent>();
   TestNumericTraits<TEST_EXECSPACE, long double, MaxExponent>();
 #endif
@@ -376,24 +407,27 @@ TEST(TEST_CATEGORY, numeric_traits_min_max_exponent10) {
   TestNumericTraits<TEST_EXECSPACE, float, MaxExponent10>();
   TestNumericTraits<TEST_EXECSPACE, double, MinExponent10>();
   TestNumericTraits<TEST_EXECSPACE, double, MaxExponent10>();
-#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC:
-  // nvc++-Fatal-/home/projects/x86-64/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin/tools/cpp2
-  // TERMINATED by signal 11
+#if !defined(KOKKOS_ENABLE_CUDA) || \
+    !defined(KOKKOS_COMPILER_NVHPC)  // 23.7 long double:
   TestNumericTraits<TEST_EXECSPACE, long double, MinExponent10>();
   TestNumericTraits<TEST_EXECSPACE, long double, MaxExponent10>();
 #endif
 }
 TEST(TEST_CATEGORY, numeric_traits_quiet_and_signaling_nan) {
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC 23.7
+  TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::half_t, QuietNaN>();
+  TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::half_t,
+                    SignalingNaN>();
+  TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::bhalf_t, QuietNaN>();
+  TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::bhalf_t,
+                    SignalingNaN>();
+#endif
   TestNumericTraits<TEST_EXECSPACE, float, QuietNaN>();
   TestNumericTraits<TEST_EXECSPACE, float, SignalingNaN>();
   TestNumericTraits<TEST_EXECSPACE, double, QuietNaN>();
   TestNumericTraits<TEST_EXECSPACE, double, SignalingNaN>();
-#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC:
-  // Unsupported unknown data type 38.
-  // Unsupported unknown data type 38.
-  // Unsupported unknown data type 38.
-  // nvc++-Fatal-/home/projects/x86-64/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin/tools/cpp2
-  // TERMINATED by signal 11
+#if !defined(KOKKOS_ENABLE_CUDA) || \
+    !defined(KOKKOS_COMPILER_NVHPC)  // 23.7 long double:
   TestNumericTraits<TEST_EXECSPACE, long double, QuietNaN>();
   TestNumericTraits<TEST_EXECSPACE, long double, SignalingNaN>();
 #endif
@@ -481,7 +515,14 @@ CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(double, round_error);
 CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, round_error);
 CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(float, denorm_min);
 CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(double, denorm_min);
+
+// FIXME_OPENMPTARGET - The static_assert causes issues on Intel GPUs with the
+// OpenMPTarget backend.
+#if !(defined(KOKKOS_ENABLE_OPENMPTARGET) && \
+      defined(KOKKOS_COMPILER_INTEL_LLVM))
 CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, denorm_min);
+#endif
+
 // clang-format off
 static_assert(Kokkos::Experimental::norm_min<float      >::value == std::numeric_limits<      float>::min(), "");
 static_assert(Kokkos::Experimental::norm_min<double     >::value == std::numeric_limits<     double>::min(), "");
diff --git a/packages/kokkos/core/unit_test/TestOther.hpp b/packages/kokkos/core/unit_test/TestOther.hpp
index 5d118ce9b0225f3a35ee4d3877247bffed1e9a75..fcf0353a88ca803c77cd992e4e6c70f44c177ac9 100644
--- a/packages/kokkos/core/unit_test/TestOther.hpp
+++ b/packages/kokkos/core/unit_test/TestOther.hpp
@@ -17,12 +17,7 @@
 #ifndef KOKKOS_TEST_OTHER_HPP
 #define KOKKOS_TEST_OTHER_HPP
 #include <TestAggregate.hpp>
-#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC:
-// NVC++-F-0000-Internal compiler error. Basic LLVM base data type required 23
-// (/ascldap/users/crtrott/Kokkos/kokkos/build/core/unit_test/cuda/TestCuda_Other.cpp:
-// 204) NVC++/x86-64 Linux 22.3-0: compilation aborted
 #include <TestMemoryPool.hpp>
-#endif
 #include <TestCXX11.hpp>
 
 #include <TestViewCtorPropEmbeddedDim.hpp>
diff --git a/packages/kokkos/core/unit_test/TestParallelScanRangePolicy.hpp b/packages/kokkos/core/unit_test/TestParallelScanRangePolicy.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..6335b4a06f13542b07c7dec8da7f5e21b498c2ac
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestParallelScanRangePolicy.hpp
@@ -0,0 +1,253 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+
+// This test checks parallel_scan() calls which use RangePolicy.
+
+namespace {
+
+template <typename ValueType>
+struct TestParallelScanRangePolicy {
+  // This typedef is needed for parallel_scan() where a
+  // work count is given (instead of a RangePolicy) so
+  // that the execution space can be deduced internally.
+  using execution_space = TEST_EXECSPACE;
+
+  using ViewType = Kokkos::View<ValueType*, execution_space>;
+
+  ViewType prefix_results;
+  ViewType postfix_results;
+
+  // Operator defining work done in parallel_scan.
+  // Simple scan over [0,1,...,N-1].
+  // Compute both prefix and postfix scans.
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const size_t i, ValueType& update, bool final_pass) const {
+    if (final_pass) {
+      prefix_results(i) = update;
+    }
+    update += i;
+    if (final_pass) {
+      postfix_results(i) = update;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init(ValueType& update) const { update = 0; }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(ValueType& update, const ValueType& input) const {
+    update += input;
+  }
+
+  template <typename... Args>
+  void test_scan(const size_t work_size) {
+    // Reset member data based on work_size
+    prefix_results  = ViewType("prefix_results", work_size);
+    postfix_results = ViewType("postfix_results", work_size);
+
+    // Lambda for checking errors from stored value at each index.
+    auto check_scan_results = [&]() {
+      auto const prefix_h = Kokkos::create_mirror_view_and_copy(
+          Kokkos::HostSpace(), prefix_results);
+      auto const postfix_h = Kokkos::create_mirror_view_and_copy(
+          Kokkos::HostSpace(), postfix_results);
+
+      for (size_t i = 0; i < work_size; ++i) {
+        // Check prefix sum
+        ASSERT_EQ(ValueType((i * (i - 1)) / 2), prefix_h(i));
+
+        // Check postfix sum
+        ASSERT_EQ(ValueType(((i + 1) * i) / 2), postfix_h(i));
+      }
+
+      // Reset results
+      Kokkos::deep_copy(prefix_results, 0);
+      Kokkos::deep_copy(postfix_results, 0);
+    };
+
+    // If policy template args are not given, call parallel_scan()
+    // with work_size input, if args are given, call
+    // parallel_scan() with RangePolicy<Args...>(0, work_size).
+    // For each case, call parallel_scan() with all possible
+    // function signatures.
+    if (sizeof...(Args) == 0) {
+      // Input: label, work_count, functor
+      Kokkos::parallel_scan("TestWithStrArg1", work_size, *this);
+      check_scan_results();
+
+      // Input: work_count, functor
+      Kokkos::parallel_scan(work_size, *this);
+      check_scan_results();
+
+      // Input: label, work_count, functor
+      // Input/Output: return_value
+      {
+        ValueType return_val = 0;
+        Kokkos::parallel_scan("TestWithStrArg2", work_size, *this, return_val);
+        check_scan_results();
+        ASSERT_EQ(ValueType(work_size * (work_size - 1) / 2),
+                  return_val);  // sum( 0 .. N-1 )
+      }
+
+      // Input: work_count, functor
+      // Input/Output: return_value
+      {
+        ValueType return_val = 0;
+        Kokkos::parallel_scan(work_size, *this, return_val);
+        check_scan_results();
+        ASSERT_EQ(ValueType(work_size * (work_size - 1) / 2),
+                  return_val);  // sum( 0 .. N-1 )
+      }
+
+      // Input: work_count, functor
+      // Input/Output: return_view (host space)
+      {
+        Kokkos::View<ValueType, Kokkos::HostSpace> return_view("return_view");
+        Kokkos::parallel_scan(work_size, *this, return_view);
+        check_scan_results();
+        ASSERT_EQ(ValueType(work_size * (work_size - 1) / 2),
+                  return_view());  // sum( 0 .. N-1 )
+      }
+    } else {
+      // Construct RangePolicy for parallel_scan
+      // based on template Args and work_size.
+      Kokkos::RangePolicy<execution_space, Args...> policy(0, work_size);
+
+      // Input: label, work_count, functor
+      Kokkos::parallel_scan("TestWithStrArg3", policy, *this);
+      check_scan_results();
+
+      // Input: work_count, functor
+      Kokkos::parallel_scan(policy, *this);
+      check_scan_results();
+
+      {
+        // Input: label, work_count, functor
+        // Input/Output: return_value
+        ValueType return_val = 0;
+        Kokkos::parallel_scan("TestWithStrArg4", policy, *this, return_val);
+        check_scan_results();
+        ASSERT_EQ(ValueType(work_size * (work_size - 1) / 2),
+                  return_val);  // sum( 0 .. N-1 )
+      }
+
+      // Input: work_count, functor
+      // Input/Output: return_value
+      {
+        ValueType return_val = 0;
+        Kokkos::parallel_scan(policy, *this, return_val);
+        check_scan_results();
+        ASSERT_EQ(ValueType(work_size * (work_size - 1) / 2),
+                  return_val);  // sum( 0 .. N-1 )
+      }
+
+      // Input: work_count, functor
+      // Input/Output: return_view (Device)
+      {
+        Kokkos::View<ValueType, execution_space> return_view("return_view");
+        Kokkos::parallel_scan(policy, *this, return_view);
+        check_scan_results();
+
+        ValueType total;
+        Kokkos::deep_copy(total, return_view);
+        ASSERT_EQ(ValueType(work_size * (work_size - 1) / 2),
+                  total);  // sum( 0 .. N-1 )
+      }
+
+      // Check Kokkos::Experimental::require()
+      // for one of the signatures.
+      {
+        using Property =
+            Kokkos::Experimental::WorkItemProperty::HintLightWeight_t;
+        const auto policy_with_require =
+            Kokkos::Experimental::require(policy, Property());
+
+        // Input: work_count, functor
+        // Input/Output: return_value
+        ValueType return_val = 0;
+        Kokkos::parallel_scan(policy_with_require, *this, return_val);
+        check_scan_results();
+        ASSERT_EQ(ValueType(work_size * (work_size - 1) / 2),
+                  return_val);  // sum( 0 .. N-1 )
+      }
+    }
+  }
+
+  // Run test_scan() for a collection of work size
+  template <typename... Args>
+  void test_scan(const std::vector<size_t> work_sizes) {
+    for (size_t i = 0; i < work_sizes.size(); ++i) {
+      test_scan<Args...>(work_sizes[i]);
+    }
+  }
+};  // struct TestParallelScanRangePolicy
+
+TEST(TEST_CATEGORY, parallel_scan_range_policy) {
+  {
+    TestParallelScanRangePolicy<char> f;
+
+    std::vector<size_t> work_sizes{5, 10};
+    f.test_scan<>(work_sizes);
+    f.test_scan<Kokkos::Schedule<Kokkos::Static>>(work_sizes);
+    f.test_scan<Kokkos::Schedule<Kokkos::Dynamic>>(work_sizes);
+  }
+  {
+    TestParallelScanRangePolicy<short int> f;
+
+    std::vector<size_t> work_sizes{50, 100};
+    f.test_scan<>(work_sizes);
+    f.test_scan<Kokkos::Schedule<Kokkos::Static>>(work_sizes);
+    f.test_scan<Kokkos::Schedule<Kokkos::Dynamic>>(work_sizes);
+  }
+  {
+    TestParallelScanRangePolicy<int> f;
+
+    std::vector<size_t> work_sizes{0, 1, 2, 1000, 1001};
+    f.test_scan<>(work_sizes);
+    f.test_scan<Kokkos::Schedule<Kokkos::Static>>(work_sizes);
+    f.test_scan<Kokkos::Schedule<Kokkos::Dynamic>>(work_sizes);
+  }
+  {
+    TestParallelScanRangePolicy<long int> f;
+
+    std::vector<size_t> work_sizes{1000, 10000};
+    f.test_scan<>(work_sizes);
+    f.test_scan<Kokkos::Schedule<Kokkos::Static>>(work_sizes);
+    f.test_scan<Kokkos::Schedule<Kokkos::Dynamic>>(work_sizes);
+  }
+  {
+    TestParallelScanRangePolicy<float> f;
+
+    std::vector<size_t> work_sizes{13, 34};
+    f.test_scan<>(work_sizes);
+    f.test_scan<Kokkos::Schedule<Kokkos::Static>>(work_sizes);
+    f.test_scan<Kokkos::Schedule<Kokkos::Dynamic>>(work_sizes);
+  }
+  {
+    TestParallelScanRangePolicy<double> f;
+
+    std::vector<size_t> work_sizes{17, 59};
+    f.test_scan<>(work_sizes);
+    f.test_scan<Kokkos::Schedule<Kokkos::Static>>(work_sizes);
+    f.test_scan<Kokkos::Schedule<Kokkos::Dynamic>>(work_sizes);
+  }
+}
+}  // namespace
diff --git a/packages/kokkos/core/unit_test/TestPrintf.hpp b/packages/kokkos/core/unit_test/TestPrintf.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8a190aea604f06ab2892e7f581ac9a8c32ec92f1
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestPrintf.hpp
@@ -0,0 +1,37 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+template <class ExecutionSpace>
+void test_kokkos_printf() {
+  ::testing::internal::CaptureStdout();
+  Kokkos::parallel_for(
+      Kokkos::RangePolicy<ExecutionSpace>(0, 1),
+      KOKKOS_LAMBDA(int) { Kokkos::printf("Print an integer: %d", 2); });
+  Kokkos::fence();
+  auto const captured = ::testing::internal::GetCapturedStdout();
+  std::string expected_string("Print an integer: 2");
+  ASSERT_EQ(captured, expected_string);
+}
+
+// FIXME_OPENMPTARGET non-string-literal argument used in printf is not
+// supported for spir64
+#if !(defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU))
+TEST(TEST_CATEGORY, kokkos_printf) { test_kokkos_printf<TEST_EXECSPACE>(); }
+#endif
diff --git a/packages/kokkos/core/unit_test/TestQuadPrecisionMath.hpp b/packages/kokkos/core/unit_test/TestQuadPrecisionMath.hpp
index 3801acd4a2a315e27dfbe9e51f87009d73589d50..a7a0fbf140259fa7d2202ab1b5a56476028ef49c 100644
--- a/packages/kokkos/core/unit_test/TestQuadPrecisionMath.hpp
+++ b/packages/kokkos/core/unit_test/TestQuadPrecisionMath.hpp
@@ -98,7 +98,7 @@ constexpr bool test_quad_precision_math_constants() {
   static_assert(Kokkos::numbers::log10e_v<__float128> == M_LOG10Eq);
   static_assert(Kokkos::numbers::pi_v    <__float128> == M_PIq);
 #if defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU >= 930)
-  static_assert(Kokkos::::inv_pi_v<__float128> == M_1_PIq);
+  static_assert(Kokkos::numbers::inv_pi_v<__float128> == M_1_PIq);
 #endif
   // inv_sqrtpi_v
   static_assert(Kokkos::numbers::ln2_v   <__float128> == M_LN2q);
diff --git a/packages/kokkos/core/unit_test/TestRange.hpp b/packages/kokkos/core/unit_test/TestRange.hpp
index 0c465bdc1cfee80f2904a7083f1fcab67e138d7e..8cd95a24bff079a3b93e7b264195c97409b10206 100644
--- a/packages/kokkos/core/unit_test/TestRange.hpp
+++ b/packages/kokkos/core/unit_test/TestRange.hpp
@@ -134,8 +134,7 @@ struct TestRange {
   KOKKOS_INLINE_FUNCTION
   void operator()(const VerifyInitTag &, const int i) const {
     if (i != m_flags(i)) {
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF("TestRange::test_for_error at %d != %d\n",
-                                    i, m_flags(i));
+      Kokkos::printf("TestRange::test_for_error at %d != %d\n", i, m_flags(i));
     }
   }
 
@@ -147,8 +146,7 @@ struct TestRange {
   KOKKOS_INLINE_FUNCTION
   void operator()(const VerifyResetTag &, const int i) const {
     if (2 * i != m_flags(i)) {
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF("TestRange::test_for_error at %d != %d\n",
-                                    i, m_flags(i));
+      Kokkos::printf("TestRange::test_for_error at %d != %d\n", i, m_flags(i));
     }
   }
 
@@ -160,8 +158,8 @@ struct TestRange {
   KOKKOS_INLINE_FUNCTION
   void operator()(const VerifyOffsetTag &, const int i) const {
     if (i + offset != m_flags(i)) {
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF("TestRange::test_for_error at %d != %d\n",
-                                    i + offset, m_flags(i));
+      Kokkos::printf("TestRange::test_for_error at %d != %d\n", i + offset,
+                     m_flags(i));
     }
   }
 
@@ -203,55 +201,6 @@ struct TestRange {
     update += 1 + m_flags(i - offset);
   }
 
-  //----------------------------------------
-
-  void test_scan() {
-    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace, ScheduleType>(0, N),
-                         *this);
-
-    auto check_scan_results = [&]() {
-      auto const host_mirror =
-          Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), result_view);
-      for (int i = 0; i < N; ++i) {
-        if (((i + 1) * i) / 2 != host_mirror(i)) {
-          std::cout << "Error at " << i << std::endl;
-          EXPECT_EQ(size_t(((i + 1) * i) / 2), size_t(host_mirror(i)));
-        }
-      }
-    };
-
-    Kokkos::parallel_scan(
-        "TestKernelScan",
-        Kokkos::RangePolicy<ExecSpace, ScheduleType, OffsetTag>(0, N), *this);
-
-    check_scan_results();
-
-    value_type total = 0;
-    Kokkos::parallel_scan(
-        "TestKernelScanWithTotal",
-        Kokkos::RangePolicy<ExecSpace, ScheduleType, OffsetTag>(0, N), *this,
-        total);
-
-    check_scan_results();
-
-    ASSERT_EQ(size_t((N - 1) * (N) / 2), size_t(total));  // sum( 0 .. N-1 )
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(const OffsetTag &, const int i, value_type &update,
-                  bool final) const {
-    update += m_flags(i);
-
-    if (final) {
-      if (update != (i * (i + 1)) / 2) {
-        KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-            "TestRange::test_scan error (%d,%d) : %d != %d\n", i, m_flags(i),
-            (i * (i + 1)) / 2, update);
-      }
-      result_view(i) = update;
-    }
-  }
-
   void test_dynamic_policy() {
 #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
     auto const N_no_implicit_capture = N;
@@ -404,49 +353,17 @@ TEST(TEST_CATEGORY, range_reduce) {
 }
 
 #ifndef KOKKOS_ENABLE_OPENMPTARGET
-TEST(TEST_CATEGORY, range_scan) {
-  {
-    TestRange<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> > f(0);
-    f.test_scan();
-  }
-  {
-    TestRange<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> > f(0);
-    f.test_scan();
-  }
+TEST(TEST_CATEGORY, range_dynamic_policy) {
 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \
     !defined(KOKKOS_ENABLE_SYCL)
   {
     TestRange<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> > f(0);
     f.test_dynamic_policy();
   }
-#endif
-
-  {
-    TestRange<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> > f(2);
-    f.test_scan();
-  }
-  {
-    TestRange<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> > f(3);
-    f.test_scan();
-  }
-#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \
-    !defined(KOKKOS_ENABLE_SYCL)
   {
     TestRange<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> > f(3);
     f.test_dynamic_policy();
   }
-#endif
-
-  {
-    TestRange<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> > f(1000);
-    f.test_scan();
-  }
-  {
-    TestRange<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> > f(1001);
-    f.test_scan();
-  }
-#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \
-    !defined(KOKKOS_ENABLE_SYCL)
   {
     TestRange<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> > f(1001);
     f.test_dynamic_policy();
@@ -454,4 +371,5 @@ TEST(TEST_CATEGORY, range_scan) {
 #endif
 }
 #endif
+
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestRangePolicyRequire.hpp b/packages/kokkos/core/unit_test/TestRangePolicyRequire.hpp
index 749f8b97d2b59e53c486d220c013f7ed2537adea..975ac8bd7e3c78b19fb5af6138a260024e37bce4 100644
--- a/packages/kokkos/core/unit_test/TestRangePolicyRequire.hpp
+++ b/packages/kokkos/core/unit_test/TestRangePolicyRequire.hpp
@@ -142,8 +142,8 @@ struct TestRangeRequire {
   KOKKOS_INLINE_FUNCTION
   void operator()(const VerifyInitTag &, const int i) const {
     if (i != m_flags(i)) {
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-          "TestRangeRequire::test_for error at %d != %d\n", i, m_flags(i));
+      Kokkos::printf("TestRangeRequire::test_for error at %d != %d\n", i,
+                     m_flags(i));
     }
   }
 
@@ -155,8 +155,8 @@ struct TestRangeRequire {
   KOKKOS_INLINE_FUNCTION
   void operator()(const VerifyResetTag &, const int i) const {
     if (2 * i != m_flags(i)) {
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-          "TestRangeRequire::test_for error at %d != %d\n", i, m_flags(i));
+      Kokkos::printf("TestRangeRequire::test_for error at %d != %d\n", i,
+                     m_flags(i));
     }
   }
 
@@ -168,9 +168,8 @@ struct TestRangeRequire {
   KOKKOS_INLINE_FUNCTION
   void operator()(const VerifyOffsetTag &, const int i) const {
     if (i + offset != m_flags(i)) {
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-          "TestRangeRequire::test_for error at %d != %d\n", i + offset,
-          m_flags(i));
+      Kokkos::printf("TestRangeRequire::test_for error at %d != %d\n",
+                     i + offset, m_flags(i));
     }
   }
 
@@ -214,36 +213,6 @@ struct TestRangeRequire {
 
   //----------------------------------------
 
-  void test_scan() {
-    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace, ScheduleType>(0, N),
-                         *this);
-
-    Kokkos::parallel_scan(
-        "TestKernelScan",
-        Kokkos::RangePolicy<ExecSpace, ScheduleType, OffsetTag>(0, N), *this);
-
-    int total = 0;
-    Kokkos::parallel_scan(
-        "TestKernelScanWithTotal",
-        Kokkos::RangePolicy<ExecSpace, ScheduleType, OffsetTag>(0, N), *this,
-        total);
-    ASSERT_EQ(size_t((N - 1) * (N) / 2), size_t(total));  // sum( 0 .. N-1 )
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(const OffsetTag &, const int i, value_type &update,
-                  bool final) const {
-    update += m_flags(i);
-
-    if (final) {
-      if (update != (i * (i + 1)) / 2) {
-        KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-            "TestRangeRequire::test_scan error %d : %d != %d\n", i,
-            (i * (i + 1)) / 2, m_flags(i));
-      }
-    }
-  }
-
   void test_dynamic_policy() {
 #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
     auto const N_no_implicit_capture = N;
@@ -416,63 +385,22 @@ TEST(TEST_CATEGORY, range_reduce_require) {
 }
 
 #ifndef KOKKOS_ENABLE_OPENMPTARGET
-TEST(TEST_CATEGORY, range_scan_require) {
-  using Property = Kokkos::Experimental::WorkItemProperty::HintLightWeight_t;
-  {
-    TestRangeRequire<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>, Property>
-        f(0);
-    f.test_scan();
-  }
-  {
-    TestRangeRequire<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>,
-                     Property>
-        f(0);
-    f.test_scan();
-  }
+TEST(TEST_CATEGORY, range_dynamic_policy_require) {
 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \
     !defined(KOKKOS_ENABLE_SYCL)
+  using Property = Kokkos::Experimental::WorkItemProperty::HintLightWeight_t;
   {
     TestRangeRequire<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>,
                      Property>
         f(0);
     f.test_dynamic_policy();
   }
-#endif
-
-  {
-    TestRangeRequire<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>, Property>
-        f(2);
-    f.test_scan();
-  }
-  {
-    TestRangeRequire<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>,
-                     Property>
-        f(3);
-    f.test_scan();
-  }
-#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \
-    !defined(KOKKOS_ENABLE_SYCL)
   {
     TestRangeRequire<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>,
                      Property>
         f(3);
     f.test_dynamic_policy();
   }
-#endif
-
-  {
-    TestRangeRequire<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>, Property>
-        f(1000);
-    f.test_scan();
-  }
-  {
-    TestRangeRequire<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>,
-                     Property>
-        f(1001);
-    f.test_scan();
-  }
-#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \
-    !defined(KOKKOS_ENABLE_SYCL)
   {
     TestRangeRequire<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>,
                      Property>
@@ -482,4 +410,5 @@ TEST(TEST_CATEGORY, range_scan_require) {
 #endif
 }
 #endif
+
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestReduce.hpp b/packages/kokkos/core/unit_test/TestReduce.hpp
index 4cf30f6fbed47fb05a4f4fed5b83eff542ba04d7..e1aa851f10203d5a5d2c29a79fd83ed6ab52f4bb 100644
--- a/packages/kokkos/core/unit_test/TestReduce.hpp
+++ b/packages/kokkos/core/unit_test/TestReduce.hpp
@@ -369,7 +369,10 @@ class TestReduceDynamic {
 
   TestReduceDynamic(const size_type nwork) {
     run_test_dynamic(nwork);
+#ifndef KOKKOS_ENABLE_OPENACC
+    // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions.
     run_test_dynamic_minmax(nwork);
+#endif
     run_test_dynamic_final(nwork);
   }
 
@@ -542,6 +545,8 @@ TEST(TEST_CATEGORY, int64_t_reduce_dynamic_view) {
 
 // FIXME_OPENMPTARGET: Not yet implemented.
 #ifndef KOKKOS_ENABLE_OPENMPTARGET
+// FIXME_OPENACC: Not yet implemented.
+#ifndef KOKKOS_ENABLE_OPENACC
 TEST(TEST_CATEGORY, int_combined_reduce) {
   using functor_type = CombinedReduceFunctorSameType<int64_t, TEST_EXECSPACE>;
   constexpr uint64_t nw = 1000;
@@ -619,4 +624,5 @@ TEST(TEST_CATEGORY, int_combined_reduce_mixed) {
   }
 }
 #endif
+#endif
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestReducers.hpp b/packages/kokkos/core/unit_test/TestReducers.hpp
index 633b203afe73138b8e7b5857b8e3430b34bbc534..957b9a0ca1a7e432ed4f579ec9e7320776e3981e 100644
--- a/packages/kokkos/core/unit_test/TestReducers.hpp
+++ b/packages/kokkos/core/unit_test/TestReducers.hpp
@@ -22,6 +22,17 @@
 
 //--------------------------------------------------------------------------
 
+namespace Test {
+struct MyPair : Kokkos::pair<int, int> {};
+}  // namespace Test
+
+template <>
+struct Kokkos::reduction_identity<Test::MyPair> {
+  KOKKOS_FUNCTION static Test::MyPair min() {
+    return Test::MyPair{{INT_MAX, INT_MAX}};
+  }
+};
+
 namespace Test {
 
 struct ReducerTag {};
@@ -74,6 +85,20 @@ struct TestReducers {
     }
   };
 
+  struct MinLocFunctor2D {
+    Kokkos::View<const Scalar**, ExecSpace> values;
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(
+        const int& i, const int& j,
+        typename Kokkos::MinLoc<Scalar, MyPair>::value_type& value) const {
+      if (values(i, j) < value.val) {
+        value.val = values(i, j);
+        value.loc = {{i, j}};
+      }
+    }
+  };
+
   struct MaxLocFunctor {
     Kokkos::View<const Scalar*, ExecSpace> values;
 
@@ -88,6 +113,20 @@ struct TestReducers {
     }
   };
 
+  struct MaxLocFunctor2D {
+    Kokkos::View<const Scalar**, ExecSpace> values;
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(
+        const int& i, const int& j,
+        typename Kokkos::MaxLoc<Scalar, MyPair>::value_type& value) const {
+      if (values(i, j) > value.val) {
+        value.val = values(i, j);
+        value.loc = {{i, j}};
+      }
+    }
+  };
+
   struct MinMaxLocFunctor {
     Kokkos::View<const Scalar*, ExecSpace> values;
 
@@ -107,6 +146,25 @@ struct TestReducers {
     }
   };
 
+  struct MinMaxLocFunctor2D {
+    Kokkos::View<const Scalar**, ExecSpace> values;
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(
+        const int& i, const int& j,
+        typename Kokkos::MinMaxLoc<Scalar, MyPair>::value_type& value) const {
+      if (values(i, j) > value.max_val) {
+        value.max_val = values(i, j);
+        value.max_loc = {{i, j}};
+      }
+
+      if (values(i, j) < value.min_val) {
+        value.min_val = values(i, j);
+        value.min_loc = {{i, j}};
+      }
+    }
+  };
+
   struct BAndFunctor {
     Kokkos::View<const Scalar*, ExecSpace> values;
 
@@ -598,6 +656,44 @@ struct TestReducers {
     }
   }
 
+  static void test_minloc_2d(int N) {
+    using reducer_type = Kokkos::MinLoc<Scalar, MyPair>;
+    using value_type   = typename reducer_type::value_type;
+
+    Kokkos::View<Scalar**, ExecSpace> values("Values", N, N);
+    auto h_values        = Kokkos::create_mirror_view(values);
+    Scalar reference_min = std::numeric_limits<Scalar>::max();
+    MyPair reference_loc = {{-1, -1}};
+
+    for (int i = 0; i < N; i++)
+      for (int j = 0; j < N; j++) {
+        h_values(i, j) = (Scalar)(rand() % 100000 + 2);
+
+        if (h_values(i, j) < reference_min) {
+          reference_min = h_values(i, j);
+          reference_loc = {{i, j}};
+        } else if (h_values(i, j) == reference_min) {
+          // Make min unique.
+          h_values(i, j) += Scalar(1);
+        }
+      }
+    Kokkos::deep_copy(values, h_values);
+
+    MinLocFunctor2D f;
+    f.values = values;
+
+    {
+      value_type min_scalar;
+      reducer_type reducer_scalar(min_scalar);
+
+      Kokkos::parallel_reduce(
+          Kokkos::MDRangePolicy<Kokkos::Rank<2>, ExecSpace>({0, 0}, {N, N}), f,
+          reducer_scalar);
+      ASSERT_EQ(min_scalar.val, reference_min);
+      ASSERT_EQ(min_scalar.loc, reference_loc);
+    }
+  }
+
   static void test_maxloc(int N) {
     using value_type = typename Kokkos::MaxLoc<Scalar, int>::value_type;
 
@@ -661,6 +757,44 @@ struct TestReducers {
     }
   }
 
+  static void test_maxloc_2d(int N) {
+    using reducer_type = Kokkos::MaxLoc<Scalar, MyPair>;
+    using value_type   = typename reducer_type::value_type;
+
+    Kokkos::View<Scalar**, ExecSpace> values("Values", N, N);
+    auto h_values        = Kokkos::create_mirror_view(values);
+    Scalar reference_max = std::numeric_limits<Scalar>::min();
+    MyPair reference_loc = {{-1, -1}};
+
+    for (int i = 0; i < N; ++i)
+      for (int j = 0; j < N; ++j) {
+        h_values(i, j) = (Scalar)(rand() % 100000 + 2);
+
+        if (h_values(i, j) > reference_max) {
+          reference_max = h_values(i, j);
+          reference_loc = {{i, j}};
+        } else if (h_values(i, j) == reference_max) {
+          // Make max unique.
+          h_values(i, j) -= Scalar(1);
+        }
+      }
+    Kokkos::deep_copy(values, h_values);
+
+    MaxLocFunctor2D f;
+    f.values = values;
+
+    {
+      value_type max_scalar;
+      reducer_type reducer_scalar(max_scalar);
+
+      Kokkos::parallel_reduce(
+          Kokkos::MDRangePolicy<Kokkos::Rank<2>, ExecSpace>({0, 0}, {N, N}), f,
+          reducer_scalar);
+      ASSERT_EQ(max_scalar.val, reference_max);
+      ASSERT_EQ(max_scalar.loc, reference_loc);
+    }
+  }
+
   static void test_minmaxloc(int N) {
     using value_type = typename Kokkos::MinMaxLoc<Scalar, int>::value_type;
 
@@ -777,6 +911,78 @@ struct TestReducers {
     }
   }
 
+  static void test_minmaxloc_2d(int N) {
+    using reducer_type = Kokkos::MinMaxLoc<Scalar, MyPair>;
+    using value_type   = typename reducer_type::value_type;
+
+    Kokkos::View<Scalar**, ExecSpace> values("Values", N, N);
+    auto h_values           = Kokkos::create_mirror_view(values);
+    Scalar reference_max    = std::numeric_limits<Scalar>::min();
+    Scalar reference_min    = std::numeric_limits<Scalar>::max();
+    MyPair reference_minloc = {{-1, -1}};
+    MyPair reference_maxloc = {{-1, -1}};
+
+    for (int i = 0; i < N; i++)
+      for (int j = 0; j < N; j++) {
+        h_values(i, j) = (Scalar)(rand() % 100000 + 2);
+      }
+
+    for (int i = 0; i < N; i++)
+      for (int j = 0; j < N; j++) {
+        if (h_values(i, j) > reference_max) {
+          reference_max    = h_values(i, j);
+          reference_maxloc = {{i, j}};
+        } else if (h_values(i, j) == reference_max) {
+          // Make max unique.
+          h_values(i, j) -= Scalar(1);
+        }
+      }
+
+    for (int i = 0; i < N; i++)
+      for (int j = 0; j < N; j++) {
+        if (h_values(i, j) < reference_min) {
+          reference_min    = h_values(i, j);
+          reference_minloc = {{i, j}};
+        } else if (h_values(i, j) == reference_min) {
+          // Make min unique.
+          h_values(i, j) += Scalar(1);
+        }
+      }
+
+    Kokkos::deep_copy(values, h_values);
+
+    MinMaxLocFunctor2D f;
+    f.values = values;
+    {
+      value_type minmax_scalar;
+      reducer_type reducer_scalar(minmax_scalar);
+
+      Kokkos::parallel_reduce(
+          Kokkos::MDRangePolicy<Kokkos::Rank<2>, ExecSpace>({0, 0}, {N, N}), f,
+          reducer_scalar);
+
+      ASSERT_EQ(minmax_scalar.min_val, reference_min);
+      for (int i = 0; i < N; i++)
+        for (int j = 0; j < N; j++) {
+          if ((minmax_scalar.min_loc == MyPair{{i, j}}) &&
+              (h_values(i, j) == reference_min)) {
+            reference_minloc = {{i, j}};
+          }
+        }
+      ASSERT_EQ(minmax_scalar.min_loc, reference_minloc);
+
+      ASSERT_EQ(minmax_scalar.max_val, reference_max);
+      for (int i = 0; i < N; i++)
+        for (int j = 0; j < N; j++) {
+          if ((minmax_scalar.max_loc == MyPair{{i, j}}) &&
+              (h_values(i, j) == reference_max)) {
+            reference_maxloc = {{i, j}};
+          }
+        }
+      ASSERT_EQ(minmax_scalar.max_loc, reference_maxloc);
+    }
+  }
+
   static void test_BAnd(int N) {
     Kokkos::View<Scalar*, ExecSpace> values("Values", N);
     auto h_values         = Kokkos::create_mirror_view(values);
@@ -982,14 +1188,39 @@ struct TestReducers {
     test_sum(10001);
     test_prod(35);
     test_min(10003);
+#if !defined(KOKKOS_ENABLE_OPENACC)
+    // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions.
     test_minloc(10003);
+// FIXME_OPENMPTARGET requires custom reductions.
+#if !defined(KOKKOS_ENABLE_OPENMPTARGET)
+    test_minloc_2d(100);
+#endif
+#endif
     test_max(10007);
+#if !defined(KOKKOS_ENABLE_OPENACC)
+    // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions.
     test_maxloc(10007);
-#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_CLANG) && \
-    (KOKKOS_COMPILER_CLANG < 1300)
-    // FIXME_OPENMPTARGET - The minmaxloc test fails llvm <= 13 version.
+// FIXME_OPENMPTARGET requires custom reductions.
+#if !defined(KOKKOS_ENABLE_OPENMPTARGET)
+    test_maxloc_2d(100);
+#endif
+#endif
+// FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions.
+#if !defined(KOKKOS_ENABLE_OPENACC)
+// FIXME_OPENMPTARGET - The minmaxloc test fails llvm < 13 version,
+// test_minmaxloc_2d requires custom reductions
+#if defined(KOKKOS_ENABLE_OPENMPTARGET)
+#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300) && \
+    (KOKKOS_COMPILER_CLANG <= 1700)
+    test_minmaxloc(10007);
+#else
+    if (!std::is_same_v<ExecSpace, Kokkos::Experimental::OpenMPTarget>)
+      test_minmaxloc(10007);
+#endif
 #else
     test_minmaxloc(10007);
+    test_minmaxloc_2d(100);
+#endif
 #endif
   }
 
@@ -1000,14 +1231,44 @@ struct TestReducers {
     test_sum(10001);
     test_prod(sizeof(Scalar) > 4 ? 35 : 19);  // avoid int overflow (see above)
     test_min(10003);
+#if !defined(KOKKOS_ENABLE_OPENACC)
+    // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions.
     test_minloc(10003);
+#if defined(KOKKOS_ENABLE_CUDA)
+    if (!std::is_same_v<ExecSpace, Kokkos::Cuda>)
+#endif
+    // FIXME_OPENMPTARGET requires custom reductions.
+#if !defined(KOKKOS_ENABLE_OPENMPTARGET)
+      test_minloc_2d(100);
+#endif
+#endif
     test_max(10007);
+#if !defined(KOKKOS_ENABLE_OPENACC)
+    // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions.
     test_maxloc(10007);
-#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_CLANG) && \
-    (KOKKOS_COMPILER_CLANG < 1300)
-    // FIXME_OPENMPTARGET - The minmaxloc test fails llvm <= 13 version.
+#if defined(KOKKOS_ENABLE_CUDA)
+    if (!std::is_same_v<ExecSpace, Kokkos::Cuda>)
+#endif
+// FIXME_OPENMPTARGET requires custom reductions.
+#if !defined(KOKKOS_ENABLE_OPENMPTARGET)
+      test_maxloc_2d(100);
+#endif
+#endif
+// FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions.
+#if !defined(KOKKOS_ENABLE_OPENACC)
+// FIXME_OPENMPTARGET - The minmaxloc test fails llvm < 13 version,
+// the minmaxloc_2d test requires custom reductions.
+#if defined(KOKKOS_ENABLE_OPENMPTARGET)
+#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300)
+    test_minmaxloc(10007);
+#else
+    if (!std::is_same_v<ExecSpace, Kokkos::Experimental::OpenMPTarget>)
+      test_minmaxloc(10007);
+#endif
 #else
     test_minmaxloc(10007);
+    test_minmaxloc_2d(100);
+#endif
 #endif
     test_BAnd(35);
     test_BOr(35);
diff --git a/packages/kokkos/core/unit_test/TestReducers_b.hpp b/packages/kokkos/core/unit_test/TestReducers_b.hpp
index a55870776e4f731a45dc94d9350c3707f2ce9bbe..dedd161f6a15417e5e5f7ab931d0a1ecfbabd691 100644
--- a/packages/kokkos/core/unit_test/TestReducers_b.hpp
+++ b/packages/kokkos/core/unit_test/TestReducers_b.hpp
@@ -16,8 +16,12 @@
 
 #include <TestReducers.hpp>
 
+// FIXME_OPENMPTARGET - Fails at runtime post clang/16
+#if defined(KOKKOS_ENABLE_OPENMPTARGER) && defined(KOKKOS_COMPILER_CLANG) && \
+    (KOKKOS_COMPILER_CLANG >= 1600)
 namespace Test {
 TEST(TEST_CATEGORY, reducers_size_t) {
   TestReducers<size_t, TEST_EXECSPACE>::execute_integer();
 }
 }  // namespace Test
+#endif
diff --git a/packages/kokkos/core/unit_test/TestScan.hpp b/packages/kokkos/core/unit_test/TestScan.hpp
deleted file mode 100644
index 8c6a02f31fe3b5ff0dae001a50391993259cdaab..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/TestScan.hpp
+++ /dev/null
@@ -1,172 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#include <Kokkos_Core.hpp>
-#include <cstdio>
-
-namespace {
-
-template <class Device, class T, T ImbalanceSz>
-struct TestScan {
-  using execution_space = Device;
-  using value_type      = T;
-
-  Kokkos::View<int, Device, Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(const int iwork, value_type& update,
-                  const bool final_pass) const {
-    const value_type n = iwork + 1;
-    const value_type imbalance =
-        ((ImbalanceSz <= n) && (value_type(0) == n % ImbalanceSz))
-            ? ImbalanceSz
-            : value_type(0);
-
-    // Insert an artificial load imbalance
-
-    for (value_type i = 0; i < imbalance; ++i) {
-      ++update;
-    }
-
-    update += n - imbalance;
-
-    if (final_pass) {
-      const value_type answer =
-          n & 1 ? (n * ((n + 1) / 2)) : ((n / 2) * (n + 1));
-
-      if (answer != update) {
-        int fail = errors()++;
-
-        if (fail < 20) {
-          KOKKOS_IMPL_DO_NOT_USE_PRINTF("TestScan(%d,%ld) != %ld\n", iwork,
-                                        static_cast<long>(update),
-                                        static_cast<long>(answer));
-        }
-      }
-    }
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  void init(value_type& update) const { update = 0; }
-
-  KOKKOS_INLINE_FUNCTION
-  void join(value_type& update, const value_type& input) const {
-    update += input;
-  }
-
-  TestScan(const size_t N) {
-    Kokkos::View<int, Device> errors_a("Errors");
-    Kokkos::deep_copy(errors_a, 0);
-    errors = errors_a;
-
-    {
-      Kokkos::parallel_scan(N, *this);
-      check_error();
-    }
-
-    {
-      Kokkos::deep_copy(errors_a, 0);
-      value_type total = 0;
-      Kokkos::parallel_scan(N, *this, total);
-
-      // We can't return a value in a constructor so use a lambda as wrapper to
-      // ignore it.
-      [&] { ASSERT_EQ(size_t((N + 1) * N / 2), size_t(total)); }();
-      check_error();
-    }
-
-    {
-      Kokkos::deep_copy(errors_a, 0);
-      Kokkos::View<value_type, Kokkos::HostSpace> total_view("total");
-      Kokkos::parallel_scan(N, *this, total_view);
-      Kokkos::fence();
-
-      // We can't return a value in a constructor so use a lambda as wrapper to
-      // ignore it.
-      [&] { ASSERT_EQ(size_t((N + 1) * N / 2), size_t(total_view())); }();
-      check_error();
-    }
-
-    {
-      Kokkos::deep_copy(errors_a, 0);
-      Kokkos::View<value_type, typename Device::memory_space> total_view(
-          "total");
-      typename Device::execution_space exec;
-      Kokkos::parallel_scan(
-          Kokkos::RangePolicy<typename Device::execution_space>(exec, 0, N),
-          *this, total_view);
-      value_type total;
-      Kokkos::deep_copy(exec, total, total_view);
-      exec.fence();
-
-      // We can't return a value in a constructor so use a lambda as wrapper to
-      // ignore it.
-      [&] { ASSERT_EQ(size_t((N + 1) * N / 2), size_t(total)); }();
-      check_error();
-    }
-  }
-
-  TestScan(const size_t Start, const size_t N) {
-    using exec_policy = Kokkos::RangePolicy<execution_space>;
-
-    Kokkos::View<int, Device> errors_a("Errors");
-    Kokkos::deep_copy(errors_a, 0);
-    errors = errors_a;
-
-    Kokkos::parallel_scan(exec_policy(Start, N), *this);
-    Kokkos::fence();
-
-    check_error();
-  }
-
-  void check_error() {
-    int total_errors;
-    Kokkos::deep_copy(total_errors, errors);
-    ASSERT_EQ(total_errors, 0);
-  }
-
-  static void test_range(const size_t begin, const size_t end) {
-    for (auto i = begin; i < end; ++i) {
-      (void)TestScan(i);
-    }
-  }
-};
-}  // namespace
-
-TEST(TEST_CATEGORY, scan) {
-  constexpr auto imbalance_size = 1000;
-  TestScan<TEST_EXECSPACE, int64_t, imbalance_size>::test_range(1, 1000);
-  TestScan<TEST_EXECSPACE, int64_t, imbalance_size>(0);
-  TestScan<TEST_EXECSPACE, int64_t, imbalance_size>(100000);
-  TestScan<TEST_EXECSPACE, int64_t, imbalance_size>(10000000);
-}
-
-TEST(TEST_CATEGORY, small_size_scan) {
-  constexpr auto imbalance_size = 10;  // Pick to not overflow...
-  TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(0);
-  TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(5);
-  TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(10);
-  TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(
-      static_cast<std::size_t>(
-          std::sqrt(std::numeric_limits<std::int8_t>::max())));
-  constexpr auto short_imbalance_size = 100;  // Pick to not overflow...
-  TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(0);
-  TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(5);
-  TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(100);
-  TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(
-      static_cast<std::size_t>(
-          std::sqrt(std::numeric_limits<std::int16_t>::max())));
-}
diff --git a/packages/kokkos/core/unit_test/TestSharedSpace.cpp b/packages/kokkos/core/unit_test/TestSharedSpace.cpp
index 8112c956db42ffb95cf791e722136333dedb15a4..3e59b796137efd9341a450f36be6bea0f56dbcd0 100644
--- a/packages/kokkos/core/unit_test/TestSharedSpace.cpp
+++ b/packages/kokkos/core/unit_test/TestSharedSpace.cpp
@@ -104,10 +104,10 @@ TEST(defaultdevicetype, shared_space) {
                                Kokkos::DefaultHostExecutionSpace>)
     GTEST_SKIP() << "Skipping as host and device are the same space";
 
-#if defined(KOKKOS_ARCH_VEGA906) || defined(KOKKOS_ARCH_VEGA908) || \
-    defined(KOKKOS_ARCH_NAVI)
-  GTEST_SKIP()
-      << "skipping because specified arch does not support page migration";
+#if defined(KOKKOS_ARCH_AMD_GPU) && defined(KOKKOS_ENABLE_HIP)
+  if (!Kokkos::SharedSpace().impl_hip_driver_check_page_migration())
+    GTEST_SKIP()
+        << "skipping because specified arch does not support page migration";
 #endif
 #if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU)
   GTEST_SKIP()
diff --git a/packages/kokkos/core/unit_test/TestStackTrace.hpp b/packages/kokkos/core/unit_test/TestStackTrace.hpp
index f5a0b95a02d33082a38846d3eb684c739593da8b..4dbe436e93016a8faf37fb785c3dfdc16a452ad5 100644
--- a/packages/kokkos/core/unit_test/TestStackTrace.hpp
+++ b/packages/kokkos/core/unit_test/TestStackTrace.hpp
@@ -33,6 +33,12 @@ void stacktrace_test_f4();
 
 void my_fancy_handler();
 
+size_t find_first_non_whitespace(const std::string& s, const size_t start_pos) {
+  constexpr size_t num_ws_chars = 3;
+  const char ws_chars[]         = "\n\t ";
+  return s.find_first_not_of(ws_chars, start_pos, num_ws_chars);
+}
+
 void test_stacktrace(bool bTerminate, bool bCustom = true) {
   stacktrace_test_f1(std::cout);
   bool bDynamic = false;
@@ -44,7 +50,7 @@ void test_stacktrace(bool bTerminate, bool bCustom = true) {
     bDynamic = std::string::npos != foutput.find("stacktrace");
 
     if (bDynamic) {
-      printf("test_f1: %s \n", foutput.c_str());
+      printf("test_f1:\n%s \n", foutput.c_str());
       ASSERT_NE(std::string::npos, foutput.find("stacktrace_test_f1"));
       for (auto x : {"stacktrace_test_f0", "stacktrace_test_f2",
                      "stacktrace_test_f3", "stacktrace_test_f4"}) {
@@ -59,13 +65,13 @@ void test_stacktrace(bool bTerminate, bool bCustom = true) {
 
     if (bDynamic) {
       std::string foutput = sstream.str();
-      printf("demangled test_f1: %s \n", foutput.c_str());
-      ASSERT_TRUE(std::string::npos !=
-                  foutput.find("Test::stacktrace_test_f1"));
+      printf("demangled test_f1:\n%s \n", foutput.c_str());
+      ASSERT_NE(std::string::npos, foutput.find("Test::stacktrace_test_f1"));
       for (auto x : {"stacktrace_test_f0", "stacktrace_test_f2",
                      "stacktrace_test_f3", "stacktrace_test_f4"}) {
         ASSERT_EQ(std::string::npos, foutput.find(x));
       }
+      EXPECT_EQ(0u, find_first_non_whitespace(foutput, 0));
     }
   }
 
@@ -84,7 +90,7 @@ void test_stacktrace(bool bTerminate, bool bCustom = true) {
 
     if (bDynamic) {
       std::string foutput = sstream.str();
-      printf("test_f3: %s \n", foutput.c_str());
+      printf("test_f3:\n%s \n", foutput.c_str());
       for (auto x : {"stacktrace_test_f1", "stacktrace_test_f3"}) {
         ASSERT_NE(std::string::npos, foutput.find(x));
       }
@@ -99,10 +105,11 @@ void test_stacktrace(bool bTerminate, bool bCustom = true) {
 
     if (bDynamic) {
       std::string foutput = sstream.str();
-      printf("demangled test_f3: %s \n", foutput.c_str());
+      printf("demangled test_f3:\n%s \n", foutput.c_str());
       for (auto x : {"stacktrace_test_f1", "stacktrace_test_f3"}) {
         ASSERT_NE(std::string::npos, foutput.find(x));
       }
+      EXPECT_EQ(0u, find_first_non_whitespace(foutput, 0));
     }
 
     // TODO make sure stacktrace_test_f2/4 don't show up
diff --git a/packages/kokkos/core/unit_test/TestTaskScheduler.hpp b/packages/kokkos/core/unit_test/TestTaskScheduler.hpp
index 5a0394f6c180ea6b3176bf6e6fa15ca66b768b30..e9f3a655686e566204342fdb7c18ac70878dfe5e 100644
--- a/packages/kokkos/core/unit_test/TestTaskScheduler.hpp
+++ b/packages/kokkos/core/unit_test/TestTaskScheduler.hpp
@@ -170,9 +170,9 @@ struct TestTaskDependence {
 
   KOKKOS_INLINE_FUNCTION
   void operator()(typename sched_type::member_type& member) {
-    auto& sched = member.scheduler();
-    enum { CHUNK = 8 };
-    const int n = CHUNK < m_count ? CHUNK : m_count;
+    auto& sched                = member.scheduler();
+    static constexpr int CHUNK = 8;
+    const int n                = CHUNK < m_count ? CHUNK : m_count;
 
     if (1 < m_count) {
       const int increment = (m_count + n - 1) / n;
@@ -711,7 +711,7 @@ struct TestMultipleDependence {
     using value_type = int;
     KOKKOS_INLINE_FUNCTION
     void operator()(typename Scheduler::member_type&, int& result) {
-      double value = 0;
+      double value = 1;
       // keep this one busy for a while
       for (int i = 0; i < 10000; ++i) {
         value += i * i / 7.138 / value;
diff --git a/packages/kokkos/core/unit_test/TestTeam.hpp b/packages/kokkos/core/unit_test/TestTeam.hpp
index 0f86f9f3697b77187f173fb5b5564b2ab2d0ab49..0a40856f937749a5e7cfbf3912d3d9f659280936 100644
--- a/packages/kokkos/core/unit_test/TestTeam.hpp
+++ b/packages/kokkos/core/unit_test/TestTeam.hpp
@@ -69,10 +69,9 @@ struct TestTeamPolicy {
         member.team_rank() + member.team_size() * member.league_rank();
 
     if (tid != m_flags(member.team_rank(), member.league_rank())) {
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-          "TestTeamPolicy member(%d,%d) error %d != %d\n", member.league_rank(),
-          member.team_rank(), tid,
-          m_flags(member.team_rank(), member.league_rank()));
+      Kokkos::printf("TestTeamPolicy member(%d,%d) error %d != %d\n",
+                     member.league_rank(), member.team_rank(), tid,
+                     m_flags(member.team_rank(), member.league_rank()));
     }
   }
 
@@ -391,7 +390,7 @@ class ScanTeamFunctor {
     ind.team_reduce(Kokkos::Max<int64_t>(m));
 
     if (m != ind.league_rank() + (ind.team_size() - 1)) {
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+      Kokkos::printf(
           "ScanTeamFunctor[%i.%i of %i.%i] reduce_max_answer(%li) != "
           "reduce_max(%li)\n",
           static_cast<int>(ind.league_rank()),
@@ -413,7 +412,7 @@ class ScanTeamFunctor {
         ind.team_scan(ind.league_rank() + 1 + ind.team_rank() + 1);
 
     if (answer != result || answer != result2) {
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+      Kokkos::printf(
           "ScanTeamFunctor[%i.%i of %i.%i] answer(%li) != scan_first(%li) or "
           "scan_second(%li)\n",
           static_cast<int>(ind.league_rank()),
@@ -515,7 +514,7 @@ struct SharedTeamFunctor {
 
     if ((shared_A.data() == nullptr && SHARED_COUNT > 0) ||
         (shared_B.data() == nullptr && SHARED_COUNT > 0)) {
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+      Kokkos::printf(
           "member( %i/%i , %i/%i ) Failed to allocate shared memory of size "
           "%lu\n",
           static_cast<int>(ind.league_rank()),
@@ -644,9 +643,8 @@ struct TestLambdaSharedTeam {
 
           if ((shared_A.data() == nullptr && SHARED_COUNT > 0) ||
               (shared_B.data() == nullptr && SHARED_COUNT > 0)) {
-            KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-                "Failed to allocate shared memory of size %lu\n",
-                static_cast<unsigned long>(SHARED_COUNT));
+            Kokkos::printf("Failed to allocate shared memory of size %lu\n",
+                           static_cast<unsigned long>(SHARED_COUNT));
 
             ++update;  // Failure to allocate is an error.
           } else {
@@ -712,9 +710,8 @@ struct ScratchTeamFunctor {
     if ((scratch_ptr.data() == nullptr) ||
         (scratch_A.data() == nullptr && SHARED_TEAM_COUNT > 0) ||
         (scratch_B.data() == nullptr && SHARED_THREAD_COUNT > 0)) {
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-          "Failed to allocate shared memory of size %lu\n",
-          static_cast<unsigned long>(SHARED_TEAM_COUNT));
+      Kokkos::printf("Failed to allocate shared memory of size %lu\n",
+                     static_cast<unsigned long>(SHARED_TEAM_COUNT));
 
       ++update;  // Failure to allocate is an error.
     } else {
@@ -1582,7 +1579,7 @@ struct TestScratchAlignment {
     Kokkos::fence();
     int minimal_scratch_allocation_failed = 0;
     Kokkos::deep_copy(minimal_scratch_allocation_failed, flag);
-    ASSERT_TRUE(minimal_scratch_allocation_failed == 0);
+    ASSERT_EQ(minimal_scratch_allocation_failed, 0);
   }
 
   // test alignment of successive allocations
@@ -1615,10 +1612,9 @@ struct TestScratchAlignment {
             flag() = 1;
 
           // Now request aligned memory such that the allocation after
-          // for scratch_ptr2 would be unaligned if it doesn't pad
-          // correct.
-          // Depending on whether scratch_ptr3 is 4 or 8 byte aligned
-          // we need to request different amount of memory.
+          // scratch_ptr2 would be unaligned if it doesn't pad correctly.
+          // Depending on scratch_ptr3 being 4 or 8 byte aligned
+          // we need to request a different amount of memory.
           if ((scratch_ptr3 + 12) % 8 == 4)
             scratch_ptr1 = reinterpret_cast<intptr_t>(
                 team.team_shmem().get_shmem_aligned(24, 4));
@@ -1631,26 +1627,27 @@ struct TestScratchAlignment {
           scratch_ptr3 = reinterpret_cast<intptr_t>(
               team.team_shmem().get_shmem_aligned(8, 4));
 
-          // note the difference between scratch_ptr2 and scratch_ptr1
-          // is 4 bytes larger than what we requested in either of the
-          // two cases.
+          // The difference between scratch_ptr2 and scratch_ptr1 should be 4
+          // bytes larger than what we requested in either case.
           if (((scratch_ptr2 - scratch_ptr1) != 28) &&
               ((scratch_ptr2 - scratch_ptr1) != 16))
             flag() = 1;
-          // check that there wasn't unnneccessary padding happening
-          // i.e. scratch_ptr2 was allocated with a 32 byte request
-          // and since scratch_ptr3 is then already aligned it difference
-          // should match that
+          // Check that there wasn't unneccessary padding happening. Since
+          // scratch_ptr2 was allocated with a 32 byte request and scratch_ptr3
+          // is then already aligned, its difference should match 32 bytes.
           if ((scratch_ptr3 - scratch_ptr2) != 32) flag() = 1;
+
           // check actually alignment of ptrs is as requested
-          if (((scratch_ptr1 % 4) != 0) || ((scratch_ptr2 % 8) != 0) ||
-              ((scratch_ptr3 % 4) != 0))
+          // cast to int here to avoid failure with icpx in mixed integer type
+          // comparison
+          if ((int(scratch_ptr1 % 4) != 0) || (int(scratch_ptr2 % 8) != 0) ||
+              (int(scratch_ptr3 % 4) != 0))
             flag() = 1;
         });
     Kokkos::fence();
     int raw_get_shmem_alignment_failed = 0;
     Kokkos::deep_copy(raw_get_shmem_alignment_failed, flag);
-    ASSERT_TRUE(raw_get_shmem_alignment_failed == 0);
+    ASSERT_EQ(raw_get_shmem_alignment_failed, 0);
   }
 };
 
@@ -1724,7 +1721,7 @@ struct TestRepeatedTeamReduce {
   KOKKOS_FUNCTION void operator()(const int i, int &bad) const {
     if (v(i) != v(0) + i) {
       ++bad;
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF("Failing at %d!\n", i);
+      Kokkos::printf("Failing at %d!\n", i);
     }
   }
 
diff --git a/packages/kokkos/core/unit_test/TestTeamBasic.hpp b/packages/kokkos/core/unit_test/TestTeamBasic.hpp
index 5ee8629656fc06c17e95d3ab4936285bc15f4ba2..c395bc0837ce8ba4637174c244b07bae55885e13 100644
--- a/packages/kokkos/core/unit_test/TestTeamBasic.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamBasic.hpp
@@ -182,6 +182,9 @@ struct LargeTeamScratchFunctor {
 };
 
 TEST(TEST_CATEGORY, large_team_scratch_size) {
+#ifdef KOKKOS_IMPL_32BIT
+  GTEST_SKIP() << "Fails on 32-bit";  // FIXME_32BIT
+#endif
   const int level   = 1;
   const int n_teams = 1;
 
@@ -247,6 +250,9 @@ struct long_wrapper {
   KOKKOS_FUNCTION
   long_wrapper(long val) : value(val) {}
 
+  KOKKOS_FUNCTION
+  long_wrapper(const long_wrapper& val) : value(val.value) {}
+
   KOKKOS_FUNCTION
   friend void operator+=(long_wrapper& lhs, const long_wrapper& rhs) {
     lhs.value += rhs.value;
diff --git a/packages/kokkos/core/unit_test/TestTeamCombinedReducers.hpp b/packages/kokkos/core/unit_test/TestTeamCombinedReducers.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..47c2f666c95cbcfc7d2af18044844034190f4eaf
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestTeamCombinedReducers.hpp
@@ -0,0 +1,515 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <Kokkos_Core.hpp>
+#include <gtest/gtest.h>
+
+namespace {
+
+// Extended lambdas in parallel_for and parallel_reduce will not compile if
+// KOKKOS_ENABLE_CUDA_LAMBDA is off
+#if !defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_LAMBDA)
+
+struct TeamTeamCombinedReducer {
+ public:
+  void test_team_thread_range_only_scalars(const int n) {
+    auto policy = Kokkos::TeamPolicy<TEST_EXECSPACE>(1, Kokkos::AUTO);
+    using team_member_type = decltype(policy)::member_type;
+
+    auto teamView = Kokkos::View<int[4], TEST_EXECSPACE::memory_space>("view");
+
+    Kokkos::parallel_for(
+        policy, KOKKOS_LAMBDA(team_member_type const& team) {
+          auto teamThreadRange = Kokkos::TeamThreadRange(team, n);
+          int teamResult0, teamResult1, teamResult2, teamResult3;
+
+          Kokkos::parallel_reduce(
+              teamThreadRange,
+              [=](int const& i, int& localVal0, int& localVal1, int& localVal2,
+                  int& localVal3) {
+                localVal0 += 1;
+                localVal1 += i + 1;
+                localVal2 += (i + 1) * n;
+                localVal3 += n;
+              },
+              teamResult0, teamResult1, teamResult2, teamResult3);
+
+          Kokkos::single(Kokkos::PerTeam(team), [=]() {
+            teamView(0) = teamResult0;
+            teamView(1) = teamResult1;
+            teamView(2) = teamResult2;
+            teamView(3) = teamResult3;
+          });
+        });
+
+    auto hostView = Kokkos::create_mirror_view_and_copy(
+        Kokkos::DefaultHostExecutionSpace(), teamView);
+
+    if (n == 0) {
+      EXPECT_EQ(Kokkos::reduction_identity<int>::sum(), hostView(0));
+      EXPECT_EQ(Kokkos::reduction_identity<int>::sum(), hostView(1));
+      EXPECT_EQ(Kokkos::reduction_identity<int>::sum(), hostView(2));
+      EXPECT_EQ(Kokkos::reduction_identity<int>::sum(), hostView(3));
+    } else {
+      EXPECT_EQ(n, hostView(0));
+      EXPECT_EQ((n * (n + 1) / 2), hostView(1));
+      EXPECT_EQ(n * n * (n + 1) / 2, hostView(2));
+      EXPECT_EQ(n * n, hostView(3));
+    }
+  }
+
+  void test_team_thread_range_only_builtin(const int n) {
+    auto policy = Kokkos::TeamPolicy<TEST_EXECSPACE>(1, Kokkos::AUTO);
+    using team_member_type = decltype(policy)::member_type;
+
+    auto teamView = Kokkos::View<int[4], TEST_EXECSPACE::memory_space>("view");
+
+    Kokkos::parallel_for(
+        policy, KOKKOS_LAMBDA(team_member_type const& team) {
+          auto teamThreadRange = Kokkos::TeamThreadRange(team, n);
+          int teamResult0, teamResult1, teamResult2, teamResult3;
+
+          Kokkos::parallel_reduce(
+              teamThreadRange,
+              [=](int const& i, int& localVal0, int& localVal1, int& localVal2,
+                  int& localVal3) {
+                localVal0 += i + 1;
+                localVal1 *= n;
+                localVal2 = (localVal2 > (i + 1)) ? (i + 1) : localVal2;
+                localVal3 = (localVal3 < (i + 1)) ? (i + 1) : localVal3;
+              },
+              Kokkos::Sum<int>(teamResult0), Kokkos::Prod<int>(teamResult1),
+              Kokkos::Min<int>(teamResult2), Kokkos::Max<int>(teamResult3));
+
+          Kokkos::single(Kokkos::PerTeam(team), [=]() {
+            teamView(0) = teamResult0;
+            teamView(1) = teamResult1;
+            teamView(2) = teamResult2;
+            teamView(3) = teamResult3;
+          });
+        });
+
+    auto hostView = Kokkos::create_mirror_view_and_copy(
+        Kokkos::DefaultHostExecutionSpace(), teamView);
+
+    if (n == 0) {
+      EXPECT_EQ(Kokkos::reduction_identity<int>::sum(), hostView(0));
+      EXPECT_EQ(Kokkos::reduction_identity<int>::prod(), hostView(1));
+      EXPECT_EQ(Kokkos::reduction_identity<int>::min(), hostView(2));
+      EXPECT_EQ(Kokkos::reduction_identity<int>::max(), hostView(3));
+    } else {
+      EXPECT_EQ((n * (n + 1) / 2), hostView(0));
+      EXPECT_EQ(std::pow(n, n), hostView(1));
+      EXPECT_EQ(1, hostView(2));
+      EXPECT_EQ(n, hostView(3));
+    }
+  }
+
+  void test_team_thread_range_combined_reducers(const int n) {
+    auto policy = Kokkos::TeamPolicy<TEST_EXECSPACE>(1, Kokkos::AUTO);
+    using team_member_type = decltype(policy)::member_type;
+
+    auto teamView = Kokkos::View<int*, TEST_EXECSPACE::memory_space>("view", 4);
+
+    Kokkos::parallel_for(
+        policy, KOKKOS_LAMBDA(team_member_type const& team) {
+          auto teamThreadRange = Kokkos::TeamThreadRange(team, n);
+          int teamResult0, teamResult1, teamResult2, teamResult3;
+
+          Kokkos::parallel_reduce(
+              teamThreadRange,
+              [=](int const& i, int& localVal0, int& localVal1, int& localVal2,
+                  int& localVal3) {
+                localVal0 += i + 1;
+                localVal1 += i + 1;
+                localVal2 = (localVal2 < (i + 1)) ? (i + 1) : localVal2;
+                localVal3 += n;
+              },
+              teamResult0, Kokkos::Sum<int>(teamResult1),
+              Kokkos::Max<int>(teamResult2), teamResult3);
+
+          Kokkos::single(Kokkos::PerTeam(team), [=]() {
+            teamView(0) = teamResult0;
+            teamView(1) = teamResult1;
+            teamView(2) = teamResult2;
+            teamView(3) = teamResult3;
+          });
+        });
+
+    auto hostView = Kokkos::create_mirror_view_and_copy(
+        Kokkos::DefaultHostExecutionSpace(), teamView);
+
+    if (n == 0) {
+      EXPECT_EQ(Kokkos::reduction_identity<int>::sum(), hostView(0));
+      EXPECT_EQ(Kokkos::reduction_identity<int>::sum(), hostView(1));
+      EXPECT_EQ(Kokkos::reduction_identity<int>::max(), hostView(2));
+      EXPECT_EQ(Kokkos::reduction_identity<int>::sum(), hostView(3));
+    } else {
+      EXPECT_EQ((n * (n + 1) / 2), hostView(0));
+      EXPECT_EQ((n * (n + 1) / 2), hostView(1));
+      EXPECT_EQ(n, hostView(2));
+      EXPECT_EQ(n * n, hostView(3));
+    }
+  }
+
+  void test_thread_vector_range_only_scalars(const int n) {
+    auto policy = Kokkos::TeamPolicy<TEST_EXECSPACE>(1, Kokkos::AUTO);
+    using team_member_type = decltype(policy)::member_type;
+
+    auto teamView = Kokkos::View<int[4], TEST_EXECSPACE::memory_space>("view");
+
+    Kokkos::parallel_for(
+        policy, KOKKOS_LAMBDA(team_member_type const& team) {
+          auto teamThreadRange   = Kokkos::TeamThreadRange(team, 1);
+          auto threadVectorRange = Kokkos::ThreadVectorRange(team, n);
+          int teamResult0, teamResult1, teamResult2, teamResult3;
+
+          Kokkos::parallel_for(teamThreadRange, [&](int const&) {
+            Kokkos::parallel_reduce(
+                threadVectorRange,
+                [=](int const& i, int& localVal0, int& localVal1,
+                    int& localVal2, int& localVal3) {
+                  localVal0 += 1;
+                  localVal1 += i + 1;
+                  localVal2 += (i + 1) * n;
+                  localVal3 += n;
+                },
+                teamResult0, teamResult1, teamResult2, teamResult3);
+
+            teamView(0) = teamResult0;
+            teamView(1) = teamResult1;
+            teamView(2) = teamResult2;
+            teamView(3) = teamResult3;
+          });
+        });
+
+    auto hostView = Kokkos::create_mirror_view_and_copy(
+        Kokkos::DefaultHostExecutionSpace(), teamView);
+
+    if (n == 0) {
+      EXPECT_EQ(Kokkos::reduction_identity<int>::sum(), hostView(0));
+      EXPECT_EQ(Kokkos::reduction_identity<int>::sum(), hostView(1));
+      EXPECT_EQ(Kokkos::reduction_identity<int>::sum(), hostView(2));
+      EXPECT_EQ(Kokkos::reduction_identity<int>::sum(), hostView(3));
+    } else {
+      EXPECT_EQ(n, hostView(0));
+      EXPECT_EQ((n * (n + 1) / 2), hostView(1));
+      EXPECT_EQ(n * n * (n + 1) / 2, hostView(2));
+      EXPECT_EQ(n * n, hostView(3));
+    }
+  }
+
+  void test_thread_vector_range_only_builtin(const int n) {
+    auto policy = Kokkos::TeamPolicy<TEST_EXECSPACE>(1, Kokkos::AUTO);
+    using team_member_type = decltype(policy)::member_type;
+
+    auto teamView = Kokkos::View<int[4], TEST_EXECSPACE::memory_space>("view");
+
+    Kokkos::parallel_for(
+        policy, KOKKOS_LAMBDA(team_member_type const& team) {
+          auto teamThreadRange   = Kokkos::TeamThreadRange(team, 1);
+          auto threadVectorRange = Kokkos::ThreadVectorRange(team, n);
+          int teamResult0, teamResult1, teamResult2, teamResult3;
+
+          Kokkos::parallel_for(teamThreadRange, [&](int const&) {
+            Kokkos::parallel_reduce(
+                threadVectorRange,
+                [=](int const& i, int& localVal0, int& localVal1,
+                    int& localVal2, int& localVal3) {
+                  localVal0 += i + 1;
+                  localVal1 *= n;
+                  localVal2 = (localVal2 > (i + 1)) ? (i + 1) : localVal2;
+                  localVal3 = (localVal3 < (i + 1)) ? (i + 1) : localVal3;
+                },
+                Kokkos::Sum<int>(teamResult0), Kokkos::Prod<int>(teamResult1),
+                Kokkos::Min<int>(teamResult2), Kokkos::Max<int>(teamResult3));
+
+            teamView(0) = teamResult0;
+            teamView(1) = teamResult1;
+            teamView(2) = teamResult2;
+            teamView(3) = teamResult3;
+          });
+        });
+
+    auto hostView = Kokkos::create_mirror_view_and_copy(
+        Kokkos::DefaultHostExecutionSpace(), teamView);
+
+    if (n == 0) {
+      EXPECT_EQ(Kokkos::reduction_identity<int>::sum(), hostView(0));
+      EXPECT_EQ(Kokkos::reduction_identity<int>::prod(), hostView(1));
+      EXPECT_EQ(Kokkos::reduction_identity<int>::min(), hostView(2));
+      EXPECT_EQ(Kokkos::reduction_identity<int>::max(), hostView(3));
+    } else {
+      EXPECT_EQ((n * (n + 1) / 2), hostView(0));
+      EXPECT_EQ(std::pow(n, n), hostView(1));
+      EXPECT_EQ(1, hostView(2));
+      EXPECT_EQ(n, hostView(3));
+    }
+  }
+
+  void test_thread_vector_range_combined_reducers(const int n) {
+    auto policy = Kokkos::TeamPolicy<TEST_EXECSPACE>(1, Kokkos::AUTO);
+    using team_member_type = decltype(policy)::member_type;
+
+    auto teamView = Kokkos::View<int[4], TEST_EXECSPACE::memory_space>("view");
+
+    Kokkos::parallel_for(
+        policy, KOKKOS_LAMBDA(team_member_type const& team) {
+          auto teamThreadRange   = Kokkos::TeamThreadRange(team, 1);
+          auto threadVectorRange = Kokkos::ThreadVectorRange(team, n);
+          int teamResult0, teamResult1, teamResult2, teamResult3;
+
+          Kokkos::parallel_for(teamThreadRange, [&](int const&) {
+            Kokkos::parallel_reduce(
+                threadVectorRange,
+                [=](int const& i, int& localVal0, int& localVal1,
+                    int& localVal2, int& localVal3) {
+                  localVal0 *= n;
+                  localVal1 += i + 1;
+                  localVal2 = (localVal2 > (i + 1)) ? (i + 1) : localVal2;
+                  localVal3 += n;
+                },
+                Kokkos::Prod<int>(teamResult0), teamResult1,
+                Kokkos::Min<int>(teamResult2), teamResult3);
+
+            teamView(0) = teamResult0;
+            teamView(1) = teamResult1;
+            teamView(2) = teamResult2;
+            teamView(3) = teamResult3;
+          });
+        });
+
+    auto hostView = Kokkos::create_mirror_view_and_copy(
+        Kokkos::DefaultHostExecutionSpace(), teamView);
+
+    if (n == 0) {
+      EXPECT_EQ(Kokkos::reduction_identity<int>::prod(), hostView(0));
+      EXPECT_EQ(Kokkos::reduction_identity<int>::sum(), hostView(1));
+      EXPECT_EQ(Kokkos::reduction_identity<int>::min(), hostView(2));
+      EXPECT_EQ(Kokkos::reduction_identity<int>::sum(), hostView(3));
+    } else {
+      EXPECT_EQ(std::pow(n, n), hostView(0));
+      EXPECT_EQ((n * (n + 1) / 2), hostView(1));
+      EXPECT_EQ(1, hostView(2));
+      EXPECT_EQ(n * n, hostView(3));
+    }
+  }
+
+  void test_team_vector_range_only_scalars(const int n) {
+    auto policy = Kokkos::TeamPolicy<TEST_EXECSPACE>(1, Kokkos::AUTO);
+    using team_member_type = decltype(policy)::member_type;
+
+    auto teamView = Kokkos::View<int[4], TEST_EXECSPACE::memory_space>("view");
+
+    Kokkos::parallel_for(
+        policy, KOKKOS_LAMBDA(team_member_type const& team) {
+          auto teamVectorRange = Kokkos::TeamVectorRange(team, n);
+          int teamResult0, teamResult1, teamResult2, teamResult3;
+
+          Kokkos::parallel_reduce(
+              teamVectorRange,
+              [=](int const& i, int& localVal0, int& localVal1, int& localVal2,
+                  int& localVal3) {
+                localVal0 += 1;
+                localVal1 += i + 1;
+                localVal2 += (i + 1) * n;
+                localVal3 += n;
+              },
+              teamResult0, teamResult1, teamResult2, teamResult3);
+
+          Kokkos::single(Kokkos::PerTeam(team), [=]() {
+            teamView(0) = teamResult0;
+            teamView(1) = teamResult1;
+            teamView(2) = teamResult2;
+            teamView(3) = teamResult3;
+          });
+        });
+
+    auto hostView = Kokkos::create_mirror_view_and_copy(
+        Kokkos::DefaultHostExecutionSpace(), teamView);
+
+    if (n == 0) {
+      EXPECT_EQ(Kokkos::reduction_identity<int>::sum(), hostView(0));
+      EXPECT_EQ(Kokkos::reduction_identity<int>::sum(), hostView(1));
+      EXPECT_EQ(Kokkos::reduction_identity<int>::sum(), hostView(2));
+      EXPECT_EQ(Kokkos::reduction_identity<int>::sum(), hostView(3));
+    } else {
+      EXPECT_EQ(n, hostView(0));
+      EXPECT_EQ((n * (n + 1) / 2), hostView(1));
+      EXPECT_EQ(n * n * (n + 1) / 2, hostView(2));
+      EXPECT_EQ(n * n, hostView(3));
+    }
+  }
+
+  void test_team_vector_range_only_builtin(const int n) {
+    auto policy = Kokkos::TeamPolicy<TEST_EXECSPACE>(1, Kokkos::AUTO);
+    using team_member_type = decltype(policy)::member_type;
+
+    auto teamView = Kokkos::View<int[4], TEST_EXECSPACE::memory_space>("view");
+
+    Kokkos::parallel_for(
+        policy, KOKKOS_LAMBDA(team_member_type const& team) {
+          auto teamVectorRange = Kokkos::TeamVectorRange(team, n);
+          int teamResult0, teamResult1, teamResult2, teamResult3;
+
+          Kokkos::parallel_reduce(
+              teamVectorRange,
+              [=](int const& i, int& localVal0, int& localVal1, int& localVal2,
+                  int& localVal3) {
+                localVal0 += i + 1;
+                localVal1 *= n;
+                localVal2 = (localVal2 > (i + 1)) ? (i + 1) : localVal2;
+                localVal3 = (localVal3 < (i + 1)) ? (i + 1) : localVal3;
+              },
+              Kokkos::Sum<int>(teamResult0), Kokkos::Prod<int>(teamResult1),
+              Kokkos::Min<int>(teamResult2), Kokkos::Max<int>(teamResult3));
+
+          Kokkos::single(Kokkos::PerTeam(team), [=]() {
+            teamView(0) = teamResult0;
+            teamView(1) = teamResult1;
+            teamView(2) = teamResult2;
+            teamView(3) = teamResult3;
+          });
+        });
+
+    auto hostView = Kokkos::create_mirror_view_and_copy(
+        Kokkos::DefaultHostExecutionSpace(), teamView);
+
+    if (n == 0) {
+      EXPECT_EQ(Kokkos::reduction_identity<int>::sum(), hostView(0));
+      EXPECT_EQ(Kokkos::reduction_identity<int>::prod(), hostView(1));
+      EXPECT_EQ(Kokkos::reduction_identity<int>::min(), hostView(2));
+      EXPECT_EQ(Kokkos::reduction_identity<int>::max(), hostView(3));
+    } else {
+      EXPECT_EQ((n * (n + 1) / 2), hostView(0));
+      EXPECT_EQ(std::pow(n, n), hostView(1));
+      EXPECT_EQ(1, hostView(2));
+      EXPECT_EQ(n, hostView(3));
+    }
+  }
+
+  void test_team_vector_range_combined_reducers(const int n) {
+    auto policy = Kokkos::TeamPolicy<TEST_EXECSPACE>(1, Kokkos::AUTO);
+    using team_member_type = decltype(policy)::member_type;
+
+    auto teamView = Kokkos::View<int[4], TEST_EXECSPACE::memory_space>("view");
+
+    Kokkos::parallel_for(
+        policy, KOKKOS_LAMBDA(team_member_type const& team) {
+          auto teamVectorRange = Kokkos::TeamVectorRange(team, n);
+          int teamResult0, teamResult1, teamResult2, teamResult3;
+
+          Kokkos::parallel_reduce(
+              teamVectorRange,
+              [=](int const& i, int& localVal0, int& localVal1, int& localVal2,
+                  int& localVal3) {
+                localVal0 += i + 1;
+                localVal1 += i + 1;
+                localVal2 = (localVal2 < (i + 1)) ? (i + 1) : localVal2;
+                localVal3 += n;
+              },
+              teamResult0, Kokkos::Sum<int>(teamResult1),
+              Kokkos::Max<int>(teamResult2), teamResult3);
+
+          Kokkos::single(Kokkos::PerTeam(team), [=]() {
+            teamView(0) = teamResult0;
+            teamView(1) = teamResult1;
+            teamView(2) = teamResult2;
+            teamView(3) = teamResult3;
+          });
+        });
+
+    auto hostView = Kokkos::create_mirror_view_and_copy(
+        Kokkos::DefaultHostExecutionSpace(), teamView);
+
+    if (n == 0) {
+      EXPECT_EQ(Kokkos::reduction_identity<int>::sum(), hostView(0));
+      EXPECT_EQ(Kokkos::reduction_identity<int>::sum(), hostView(1));
+      EXPECT_EQ(Kokkos::reduction_identity<int>::max(), hostView(2));
+      EXPECT_EQ(Kokkos::reduction_identity<int>::sum(), hostView(3));
+    } else {
+      EXPECT_EQ((n * (n + 1) / 2), hostView(0));
+      EXPECT_EQ((n * (n + 1) / 2), hostView(1));
+      EXPECT_EQ(n, hostView(2));
+      EXPECT_EQ(n * n, hostView(3));
+    }
+  }
+};
+
+TEST(TEST_CATEGORY, team_thread_range_combined_reducers) {
+#if defined(KOKKOS_ENABLE_OPENMPTARGET)
+  if constexpr (std::is_same_v<TEST_EXECSPACE,
+                               Kokkos::Experimental::OpenMPTarget>)
+    GTEST_SKIP() << "team_reduce with a generic reducer is not implemented for "
+                 << TEST_EXECSPACE::name();
+
+#elif defined(KOKKOS_ENABLE_OPENACC)
+  if constexpr (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::OpenACC>)
+    GTEST_SKIP() << "team_reduce with a generic reducer is not implemented for "
+                 << TEST_EXECSPACE::name();
+#endif
+
+  TeamTeamCombinedReducer tester;
+  tester.test_team_thread_range_only_scalars(5);
+  tester.test_team_thread_range_only_builtin(7);
+  tester.test_team_thread_range_combined_reducers(0);
+  tester.test_team_thread_range_combined_reducers(9);
+}
+
+TEST(TEST_CATEGORY, thread_vector_range_combined_reducers) {
+#if defined(KOKKOS_ENABLE_OPENMPTARGET)
+  if constexpr (std::is_same_v<TEST_EXECSPACE,
+                               Kokkos::Experimental::OpenMPTarget>)
+    GTEST_SKIP() << "team_reduce with a generic reducer is not implemented for "
+                 << TEST_EXECSPACE::name();
+
+#elif defined(KOKKOS_ENABLE_OPENACC)
+  if constexpr (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::OpenACC>)
+    GTEST_SKIP() << "team_reduce with a generic reducer is not implemented for "
+                 << TEST_EXECSPACE::name();
+#endif
+
+  TeamTeamCombinedReducer tester;
+  tester.test_thread_vector_range_only_scalars(5);
+  tester.test_thread_vector_range_only_builtin(7);
+  tester.test_thread_vector_range_combined_reducers(0);
+  tester.test_thread_vector_range_combined_reducers(9);
+}
+
+TEST(TEST_CATEGORY, team_vector_range_combined_reducers) {
+#ifdef KOKKOS_ENABLE_OPENMPTARGET  // FIXME_OPENMPTARGET
+  if constexpr (std::is_same_v<TEST_EXECSPACE,
+                               Kokkos::Experimental::OpenMPTarget>)
+    GTEST_SKIP() << "team_reduce with a generic reducer is not implemented for "
+                 << TEST_EXECSPACE::name();
+#endif
+
+#ifdef KOKKOS_ENABLE_OPENACC  // FIXME_OPENACC
+  if constexpr (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::OpenACC>)
+    GTEST_SKIP() << "team_reduce with a generic reducer is not implemented for "
+                 << TEST_EXECSPACE::name();
+#endif
+
+  TeamTeamCombinedReducer tester;
+  tester.test_team_vector_range_only_scalars(5);
+  tester.test_team_vector_range_only_builtin(7);
+  tester.test_team_vector_range_combined_reducers(0);
+  tester.test_team_vector_range_combined_reducers(9);
+}
+
+#endif
+
+}  // namespace
diff --git a/packages/kokkos/core/unit_test/TestTeamMDRange.hpp b/packages/kokkos/core/unit_test/TestTeamMDRange.hpp
index 8ac7e8338c749a62913ed87cd7682c35ebcccdc2..6e65cde0cf88da3082bf8f56b24f49f6cf22ed9d 100644
--- a/packages/kokkos/core/unit_test/TestTeamMDRange.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamMDRange.hpp
@@ -1904,6 +1904,13 @@ TEST(TEST_CATEGORY, ThreadVectorMDRangeParallelReduce) {
     GTEST_SKIP() << "skipping because of bug in group_barrier implementation";
 #endif
 
+// FIXME_OPENMPTARGET_CRAY: The unit tests fails correctness.
+#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_CRAYCLANG)
+  if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::OpenMPTarget>)
+    GTEST_SKIP() << "Cray compiler fails correctness at runtime with the "
+                    "OpenMPTarget backend.";
+#endif
+
   TestThreadVectorMDRangeParallelReduce<TEST_EXECSPACE>::
       test_parallel_reduce_for_4D_ThreadVectorMDRange<Left>(dims);
   TestThreadVectorMDRangeParallelReduce<TEST_EXECSPACE>::
@@ -1937,6 +1944,13 @@ TEST(TEST_CATEGORY, TeamVectorMDRangeParallelReduce) {
     GTEST_SKIP() << "skipping because of bug in group_barrier implementation";
 #endif
 
+// FIXME_OPENMPTARGET_CRAY: The unit tests fails correctness.
+#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_CRAYCLANG)
+  if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::OpenMPTarget>)
+    GTEST_SKIP() << "Cray compiler fails correctness at runtime with the "
+                    "OpenMPTarget backend.";
+#endif
+
   TestTeamVectorMDRangeParallelReduce<TEST_EXECSPACE>::
       test_parallel_reduce_for_4D_TeamVectorMDRange<Left>(dims);
   TestTeamVectorMDRangeParallelReduce<TEST_EXECSPACE>::
diff --git a/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp b/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp
index f8f56ad0d85128941dd96fbaa6ef16e6b39f2924..8bdd5e8432201964d93ea4bb89a5ee335724b34f 100644
--- a/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp
@@ -113,6 +113,10 @@ TEST(TEST_CATEGORY, repeated_team_reduce) {
                     "properly implemented";
 #endif
 
+#ifdef KOKKOS_IMPL_32BIT
+  GTEST_SKIP() << "Failing KOKKOS_IMPL_32BIT";  // FIXME_32BIT
+#endif
+
   TestRepeatedTeamReduce<TEST_EXECSPACE>();
 }
 
diff --git a/packages/kokkos/core/unit_test/TestTeamScan.hpp b/packages/kokkos/core/unit_test/TestTeamScan.hpp
index cec36236d848be12644d4c1ccd60d415986c74e1..833683227eb0d4331a907f2476d30892ca514980 100644
--- a/packages/kokkos/core/unit_test/TestTeamScan.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamScan.hpp
@@ -130,4 +130,133 @@ TEST(TEST_CATEGORY, team_scan) {
   TestTeamScan<TEST_EXECSPACE, double>{}(2596, 1311);
 }
 
+// Temporary: This condition will progressively be reduced when parallel_scan
+// with return value will be implemented for more backends.
+#if !defined(KOKKOS_ENABLE_OPENACC)
+template <class ExecutionSpace, class DataType>
+struct TestTeamScanRetVal {
+  using execution_space = ExecutionSpace;
+  using value_type      = DataType;
+  using policy_type     = Kokkos::TeamPolicy<execution_space>;
+  using member_type     = typename policy_type::member_type;
+  using view_1d_type    = Kokkos::View<value_type*, execution_space>;
+  using view_2d_type    = Kokkos::View<value_type**, execution_space>;
+
+  view_2d_type a_d;
+  view_2d_type a_r;
+  view_1d_type a_s;
+  int32_t M = 0;
+  int32_t N = 0;
+
+  KOKKOS_FUNCTION
+  void operator()(const member_type& team) const {
+    auto leagueRank = team.league_rank();
+
+    auto beg = 0;
+    auto end = N;
+
+    Kokkos::parallel_for(
+        Kokkos::TeamThreadRange(team, beg, end),
+        [&](const int i) { a_d(leagueRank, i) = leagueRank * N + i; });
+
+    DataType accum;
+    Kokkos::parallel_scan(
+        Kokkos::TeamThreadRange(team, beg, end),
+        [&](int i, DataType& val, const bool final) {
+          val += a_d(leagueRank, i);
+          if (final) a_r(leagueRank, i) = val;
+        },
+        accum);
+
+    // Save return value from parallel_scan
+    Kokkos::single(Kokkos::PerTeam(team), [&]() { a_s(leagueRank) = accum; });
+  }
+
+  auto operator()(int32_t _M, int32_t _N) {
+    std::stringstream ss;
+    ss << Kokkos::Impl::demangle(typeid(*this).name());
+    ss << "(/*M=*/" << _M << ", /*N=*/" << _N << ")";
+    std::string const test_id = ss.str();
+
+    M   = _M;
+    N   = _N;
+    a_d = view_2d_type("a_d", M, N);
+    a_r = view_2d_type("a_r", M, N);
+    a_s = view_1d_type("a_s", M);
+
+    // Set team size explicitly to check whether non-power-of-two team sizes can
+    // be used.
+    if (ExecutionSpace().concurrency() > 10000)
+      Kokkos::parallel_for(policy_type(M, 127), *this);
+    else if (ExecutionSpace().concurrency() > 2)
+      Kokkos::parallel_for(policy_type(M, 3), *this);
+    else
+      Kokkos::parallel_for(policy_type(M, 1), *this);
+
+    Kokkos::fence();
+    auto a_i  = Kokkos::create_mirror_view(a_d);
+    auto a_o  = Kokkos::create_mirror_view(a_r);
+    auto a_os = Kokkos::create_mirror_view(a_s);
+    Kokkos::deep_copy(a_i, a_d);
+    Kokkos::deep_copy(a_o, a_r);
+    Kokkos::deep_copy(a_os, a_s);
+
+    for (int32_t i = 0; i < M; ++i) {
+      value_type scan_ref = 0;
+      value_type scan_calc;
+      value_type abs_err = 0;
+      // each fp addition is subject to small loses in precision and these
+      // compound as loop so we set the base error to be the machine epsilon and
+      // then add in another epsilon each iteration. For example, with CUDA
+      // backend + 32-bit float + large N values (e.g. 1,000) + high
+      // thread-counts (e.g. 1024), this test will fail w/o epsilon
+      // accommodation
+      constexpr value_type epsilon = std::numeric_limits<value_type>::epsilon();
+      for (int32_t j = 0; j < N; ++j) {
+        scan_ref += a_i(i, j);
+        scan_calc = a_o(i, j);
+        if (std::is_integral<value_type>::value) {
+          ASSERT_EQ(scan_ref, scan_calc)
+              << test_id
+              << " calculated scan output value differs from reference at "
+                 "indices i="
+              << i << " and j=" << j;
+        } else {
+          abs_err += epsilon;
+          ASSERT_NEAR(scan_ref, scan_calc, abs_err)
+              << test_id
+              << " calculated scan output value differs from reference at "
+                 "indices i="
+              << i << " and j=" << j;
+        }
+      }
+      // Validate return value from parallel_scan
+      if (std::is_integral<value_type>::value) {
+        ASSERT_EQ(scan_ref, a_os(i));
+      } else {
+        ASSERT_NEAR(scan_ref, a_os(i), abs_err);
+      }
+    }
+  }
+};
+
+TEST(TEST_CATEGORY, team_scan_ret_val) {
+  TestTeamScanRetVal<TEST_EXECSPACE, int32_t>{}(0, 0);
+  TestTeamScanRetVal<TEST_EXECSPACE, int32_t>{}(0, 1);
+  TestTeamScanRetVal<TEST_EXECSPACE, int32_t>{}(1, 0);
+  TestTeamScanRetVal<TEST_EXECSPACE, uint32_t>{}(99, 32);
+  TestTeamScanRetVal<TEST_EXECSPACE, uint32_t>{}(139, 64);
+  TestTeamScanRetVal<TEST_EXECSPACE, uint32_t>{}(163, 128);
+  TestTeamScanRetVal<TEST_EXECSPACE, int64_t>{}(433, 256);
+  TestTeamScanRetVal<TEST_EXECSPACE, uint64_t>{}(976, 512);
+  TestTeamScanRetVal<TEST_EXECSPACE, uint64_t>{}(1234, 1024);
+  TestTeamScanRetVal<TEST_EXECSPACE, float>{}(2596, 34);
+  TestTeamScanRetVal<TEST_EXECSPACE, double>{}(2596, 59);
+  TestTeamScanRetVal<TEST_EXECSPACE, float>{}(2596, 65);
+  TestTeamScanRetVal<TEST_EXECSPACE, double>{}(2596, 371);
+  TestTeamScanRetVal<TEST_EXECSPACE, int64_t>{}(2596, 987);
+  TestTeamScanRetVal<TEST_EXECSPACE, double>{}(2596, 1311);
+}
+#endif
+
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestTeamScratch.hpp b/packages/kokkos/core/unit_test/TestTeamScratch.hpp
index 3fdf81e04408d4a551f6222d229e7753e8965ae8..c072a87c7b2a4a08466f3e435bfdd229448e533d 100644
--- a/packages/kokkos/core/unit_test/TestTeamScratch.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamScratch.hpp
@@ -54,5 +54,23 @@ TEST(TEST_CATEGORY, multi_level_scratch) {
 #endif
 }
 
+struct DummyTeamParallelForFunctor {
+  KOKKOS_FUNCTION void operator()(
+      Kokkos::TeamPolicy<TEST_EXECSPACE>::member_type) const {}
+};
+
+TEST(TEST_CATEGORY, team_scratch_memory_index_parallel_for) {
+  // Requesting per team scratch memory for a largish number of teams, resulted
+  // in problems computing the correct scratch pointer due to missed
+  // initialization of the maximum number of scratch pad indices in the Cuda
+  // baackend.
+  const int scratch_size = 4896;
+  const int league_size  = 7535;
+
+  Kokkos::TeamPolicy<TEST_EXECSPACE> policy(league_size, Kokkos::AUTO);
+  policy.set_scratch_size(1, Kokkos::PerTeam(scratch_size));
+  Kokkos::parallel_for("kernel", policy, DummyTeamParallelForFunctor());
+}
+
 }  // namespace Test
 #endif
diff --git a/packages/kokkos/core/unit_test/TestTeamTeamSize.hpp b/packages/kokkos/core/unit_test/TestTeamTeamSize.hpp
index 7a6a4dd581929be1a44e655d1084794ac324c83e..b4304fc2eb69482376e257faa972675ccc1e3ab5 100644
--- a/packages/kokkos/core/unit_test/TestTeamTeamSize.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamTeamSize.hpp
@@ -31,10 +31,6 @@ class MyArray {
   void operator+=(const MyArray& src) {
     for (int i = 0; i < N; i++) values[i] += src.values[i];
   }
-  KOKKOS_INLINE_FUNCTION
-  void operator=(const MyArray& src) {
-    for (int i = 0; i < N; i++) values[i] = src.values[i];
-  }
 };
 
 template <class T, int N, class PolicyType, int S>
diff --git a/packages/kokkos/core/unit_test/TestTeamVector.hpp b/packages/kokkos/core/unit_test/TestTeamVector.hpp
index 15410bb63fa115e3e008a9e8641e481996c482d0..39122736ed7e29c9b4809920f0ce1d6d8b6a7d50 100644
--- a/packages/kokkos/core/unit_test/TestTeamVector.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamVector.hpp
@@ -50,9 +50,8 @@ struct functor_team_for {
 
     if (values.data() == nullptr ||
         static_cast<size_type>(values.extent(0)) < shmemSize) {
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-          "FAILED to allocate shared memory of size %u\n",
-          static_cast<unsigned int>(shmemSize));
+      Kokkos::printf("FAILED to allocate shared memory of size %u\n",
+                     static_cast<unsigned int>(shmemSize));
     } else {
       // Initialize shared memory.
       values(team.team_rank()) = 0;
@@ -82,10 +81,9 @@ struct functor_team_for {
         }
 
         if (test != value) {
-          KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-              "FAILED team_parallel_for %i %i %lf %lf\n", team.league_rank(),
-              team.team_rank(), static_cast<double>(test),
-              static_cast<double>(value));
+          Kokkos::printf("FAILED team_parallel_for %i %i %lf %lf\n",
+                         team.league_rank(), team.team_rank(),
+                         static_cast<double>(test), static_cast<double>(value));
           flag() = 1;
         }
       });
@@ -141,18 +139,17 @@ struct functor_team_reduce {
 
       if (test != value) {
         if (team.league_rank() == 0) {
-          KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-              "FAILED team_parallel_reduce %i %i %lf %lf %lu\n",
-              team.league_rank(), team.team_rank(), static_cast<double>(test),
-              static_cast<double>(value),
-              static_cast<unsigned long>(sizeof(Scalar)));
+          Kokkos::printf("FAILED team_parallel_reduce %i %i %lf %lf %lu\n",
+                         team.league_rank(), team.team_rank(),
+                         static_cast<double>(test), static_cast<double>(value),
+                         static_cast<unsigned long>(sizeof(Scalar)));
         }
 
         flag() = 1;
       }
       if (test != shared_value(0)) {
         if (team.league_rank() == 0) {
-          KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          Kokkos::printf(
               "FAILED team_parallel_reduce with shared result %i %i %lf %lf "
               "%lu\n",
               team.league_rank(), team.team_rank(), static_cast<double>(test),
@@ -213,7 +210,7 @@ struct functor_team_reduce_reducer {
       }
 
       if (test != value) {
-        KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+        Kokkos::printf(
             "FAILED team_vector_parallel_reduce_reducer %i %i %lf %lf\n",
             team.league_rank(), team.team_rank(), static_cast<double>(test),
             static_cast<double>(value));
@@ -221,7 +218,7 @@ struct functor_team_reduce_reducer {
         flag() = 1;
       }
       if (test != shared_value(0)) {
-        KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+        Kokkos::printf(
             "FAILED team_vector_parallel_reduce_reducer shared value %i %i %lf "
             "%lf\n",
             team.league_rank(), team.team_rank(), static_cast<double>(test),
@@ -260,9 +257,8 @@ struct functor_team_vector_for {
 
     if (values.data() == nullptr ||
         static_cast<size_type>(values.extent(0)) < shmemSize) {
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-          "FAILED to allocate shared memory of size %u\n",
-          static_cast<unsigned int>(shmemSize));
+      Kokkos::printf("FAILED to allocate shared memory of size %u\n",
+                     static_cast<unsigned int>(shmemSize));
     } else {
       team.team_barrier();
 
@@ -292,10 +288,9 @@ struct functor_team_vector_for {
         }
 
         if (test != value) {
-          KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-              "FAILED team_vector_parallel_for %i %i %lf %lf\n",
-              team.league_rank(), team.team_rank(), static_cast<double>(test),
-              static_cast<double>(value));
+          Kokkos::printf("FAILED team_vector_parallel_for %i %i %lf %lf\n",
+                         team.league_rank(), team.team_rank(),
+                         static_cast<double>(test), static_cast<double>(value));
           flag() = 1;
         }
       });
@@ -342,7 +337,7 @@ struct functor_team_vector_reduce {
 
       if (test != value) {
         if (team.league_rank() == 0) {
-          KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          Kokkos::printf(
               "FAILED team_vector_parallel_reduce %i %i %lf %lf %lu\n",
               team.league_rank(), team.team_rank(), static_cast<double>(test),
               static_cast<double>(value),
@@ -394,7 +389,7 @@ struct functor_team_vector_reduce_reducer {
       }
 
       if (test != value) {
-        KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+        Kokkos::printf(
             "FAILED team_vector_parallel_reduce_reducer %i %i %lf %lf\n",
             team.league_rank(), team.team_rank(), static_cast<double>(test),
             static_cast<double>(value));
@@ -441,10 +436,9 @@ struct functor_vec_single {
         [&](int /*i*/, Scalar &val) { val += value; }, value2);
 
     if (value2 != (value * Scalar(nEnd - nStart))) {
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-          "FAILED vector_single broadcast %i %i %lf %lf\n", team.league_rank(),
-          team.team_rank(), static_cast<double>(value2),
-          static_cast<double>(value));
+      Kokkos::printf("FAILED vector_single broadcast %i %i %lf %lf\n",
+                     team.league_rank(), team.team_rank(),
+                     static_cast<double>(value2), static_cast<double>(value));
 
       flag() = 1;
     }
@@ -474,8 +468,8 @@ struct functor_vec_for {
 
     if (values.data() == nullptr ||
         values.extent(0) < (unsigned)team.team_size() * 13) {
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF("FAILED to allocate memory of size %i\n",
-                                    static_cast<int>(team.team_size() * 13));
+      Kokkos::printf("FAILED to allocate memory of size %i\n",
+                     static_cast<int>(team.team_size() * 13));
       flag() = 1;
     } else {
       Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, 13), [&](int i) {
@@ -495,10 +489,9 @@ struct functor_vec_for {
         }
 
         if (test != value) {
-          KOKKOS_IMPL_DO_NOT_USE_PRINTF("FAILED vector_par_for %i %i %lf %lf\n",
-                                        team.league_rank(), team.team_rank(),
-                                        static_cast<double>(test),
-                                        static_cast<double>(value));
+          Kokkos::printf("FAILED vector_par_for %i %i %lf %lf\n",
+                         team.league_rank(), team.team_rank(),
+                         static_cast<double>(test), static_cast<double>(value));
 
           flag() = 1;
         }
@@ -532,9 +525,9 @@ struct functor_vec_red {
       for (int i = 0; i < 13; i++) test += i;
 
       if (test != value) {
-        KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-            "FAILED vector_par_reduce %i %i %lf %lf\n", team.league_rank(),
-            team.team_rank(), (double)test, (double)value);
+        Kokkos::printf("FAILED vector_par_reduce %i %i %lf %lf\n",
+                       team.league_rank(), team.team_rank(), (double)test,
+                       (double)value);
         flag() = 1;
       }
     });
@@ -570,9 +563,9 @@ struct functor_vec_red_reducer {
       for (int i = 0; i < 13; i++) test *= (i % 5 + 1);
 
       if (test != value) {
-        KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-            "FAILED vector_par_reduce_reducer %i %i %lf %lf\n",
-            team.league_rank(), team.team_rank(), (double)test, (double)value);
+        Kokkos::printf("FAILED vector_par_reduce_reducer %i %i %lf %lf\n",
+                       team.league_rank(), team.team_rank(), (double)test,
+                       (double)value);
 
         flag() = 1;
       }
@@ -591,27 +584,81 @@ struct functor_vec_scan {
 
   KOKKOS_INLINE_FUNCTION
   void operator()(typename policy_type::member_type team) const {
-    Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team, 13),
-                          [&](int i, Scalar &val, bool final) {
-                            val += i;
-
-                            if (final) {
-                              Scalar test = 0;
-                              for (int k = 0; k <= i; k++) test += k;
-
-                              if (test != val) {
-                                KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-                                    "FAILED vector_par_scan %i %i %lf %lf\n",
-                                    team.league_rank(), team.team_rank(),
-                                    static_cast<double>(test),
-                                    static_cast<double>(val));
-
-                                flag() = 1;
-                              }
-                            }
-                          });
+    Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team, 13), [&](int i,
+                                                                   Scalar &val,
+                                                                   bool final) {
+      val += i;
+
+      if (final) {
+        Scalar test = 0;
+        for (int k = 0; k <= i; k++) test += k;
+
+        if (test != val) {
+          Kokkos::printf("FAILED vector_par_scan %i %i %lf %lf\n",
+                         team.league_rank(), team.team_rank(),
+                         static_cast<double>(test), static_cast<double>(val));
+
+          flag() = 1;
+        }
+      }
+    });
+  }
+};
+
+// Temporary: This condition will progressively be reduced when parallel_scan
+// with return value will be implemented for more backends.
+#if !defined(KOKKOS_ENABLE_OPENACC)
+template <typename Scalar, class ExecutionSpace>
+struct functor_vec_scan_ret_val {
+  using policy_type     = Kokkos::TeamPolicy<ExecutionSpace>;
+  using execution_space = ExecutionSpace;
+
+  Kokkos::View<int, ExecutionSpace> flag;
+  int team_size;
+
+  functor_vec_scan_ret_val(Kokkos::View<int, ExecutionSpace> flag_, int tsize)
+      : flag(flag_), team_size(tsize) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(typename policy_type::member_type team) const {
+    Scalar return_val;
+    int upper_bound = 13;
+
+    Kokkos::parallel_scan(
+        Kokkos::ThreadVectorRange(team, upper_bound),
+        [&](int i, Scalar &val, bool final) {
+          val += i;
+
+          if (final) {
+            Scalar test = 0;
+            for (int k = 0; k <= i; k++) test += k;
+
+            if (test != val) {
+              Kokkos::printf("FAILED vector_par_scan %i %i %lf %lf\n",
+                             team.league_rank(), team.team_rank(),
+                             static_cast<double>(test),
+                             static_cast<double>(val));
+
+              flag() = 1;
+            }
+          }
+        },
+        return_val);
+
+    Scalar sum_ref = ((upper_bound - 1) * (upper_bound)) / 2;
+
+    if (flag() == 0 && return_val != sum_ref) {
+      Kokkos::printf(
+          "FAILED vector_scan_ret_val: league_rank %i, team_rank %i, sum_ref "
+          "%lf, return_val %lf\n",
+          team.league_rank(), team.team_rank(), static_cast<double>(sum_ref),
+          static_cast<double>(return_val));
+
+      flag() = 1;
+    }
   }
 };
+#endif
 
 template <typename Scalar, class ExecutionSpace>
 struct functor_reduce {
@@ -684,6 +731,14 @@ bool test_scalar(int nteams, int team_size, int test) {
     Kokkos::parallel_for(
         "B", Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8),
         functor_vec_single<Scalar, ExecutionSpace>(d_flag, 4, 13));
+  } else if (test == 12) {
+// Temporary: This condition will progressively be reduced when parallel_scan
+// with return value will be implemented for more backends.
+#if !defined(KOKKOS_ENABLE_OPENACC)
+    Kokkos::parallel_for(
+        Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8),
+        functor_vec_scan_ret_val<Scalar, ExecutionSpace>(d_flag, team_size));
+#endif
   }
 
   Kokkos::deep_copy(h_flag, d_flag);
@@ -858,9 +913,6 @@ struct checkScan {
   view_type inputs  = view_type{"inputs"};
   view_type outputs = view_type{"outputs"};
 
-  value_type result;
-  Reducer reducer = {result};
-
   struct ThreadVectorFunctor {
     KOKKOS_FUNCTION void operator()(const size_type j, value_type &update,
                                     const bool final) const {
@@ -908,6 +960,8 @@ struct checkScan {
       const {
     const size_type iTeam       = team.league_rank();
     const size_type iTeamOffset = iTeam * n_per_team;
+    value_type dummy;
+    Reducer reducer = {dummy};
     Kokkos::parallel_for(
         Kokkos::TeamThreadRange(team, n_team_thread_range),
         TeamThreadRangeFunctor{team, reducer, iTeamOffset, outputs, inputs});
@@ -938,7 +992,9 @@ struct checkScan {
     Kokkos::View<value_type[n], Kokkos::HostSpace> expected("expected");
     {
       value_type identity;
+      Reducer reducer = {identity};
       reducer.init(identity);
+
       for (int i = 0; i < expected.extent_int(0); ++i) {
         const int vector       = i % n_vector_range;
         const value_type accum = vector == 0 ? identity : expected(i - 1);
@@ -956,14 +1012,11 @@ struct checkScan {
 };
 }  // namespace VectorScanReducer
 
-#if !(defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) || defined(KOKKOS_ENABLE_HIP))
+#if !defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
 TEST(TEST_CATEGORY, team_vector) {
   ASSERT_TRUE((TestTeamVector::Test<TEST_EXECSPACE>(0)));
   ASSERT_TRUE((TestTeamVector::Test<TEST_EXECSPACE>(1)));
-#if !(defined(KOKKOS_ENABLE_CUDA) && \
-      defined(KOKKOS_COMPILER_NVHPC))  // FIXME_NVHPC
   ASSERT_TRUE((TestTeamVector::Test<TEST_EXECSPACE>(2)));
-#endif
   ASSERT_TRUE((TestTeamVector::Test<TEST_EXECSPACE>(3)));
   ASSERT_TRUE((TestTeamVector::Test<TEST_EXECSPACE>(4)));
   ASSERT_TRUE((TestTeamVector::Test<TEST_EXECSPACE>(5)));
@@ -973,6 +1026,7 @@ TEST(TEST_CATEGORY, team_vector) {
   ASSERT_TRUE((TestTeamVector::Test<TEST_EXECSPACE>(9)));
   ASSERT_TRUE((TestTeamVector::Test<TEST_EXECSPACE>(10)));
   ASSERT_TRUE((TestTeamVector::Test<TEST_EXECSPACE>(11)));
+  ASSERT_TRUE((TestTeamVector::Test<TEST_EXECSPACE>(12)));
 }
 #endif
 
@@ -1011,7 +1065,7 @@ TEST(TEST_CATEGORY, parallel_scan_with_reducers) {
   constexpr int n_vector_range = 100;
 
 #if defined(KOKKOS_ENABLE_CUDA) && \
-    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC
+    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC 23.7
   if constexpr (std::is_same_v<TEST_EXECSPACE, Kokkos::Cuda>) {
     GTEST_SKIP() << "All but max inclusive scan differ at index 101";
   }
diff --git a/packages/kokkos/core/unit_test/TestTeamVectorRange.hpp b/packages/kokkos/core/unit_test/TestTeamVectorRange.hpp
index bc7eb76932caed3938f578eadca62e52668ca2a0..06139eb345d9cdf3ebd100404f033e71b63683bf 100644
--- a/packages/kokkos/core/unit_test/TestTeamVectorRange.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamVectorRange.hpp
@@ -160,9 +160,8 @@ struct functor_teamvector_for {
     shared_int values         = shared_int(team.team_shmem(), shmemSize);
 
     if (values.data() == nullptr || values.extent(0) < shmemSize) {
-      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-          "FAILED to allocate shared memory of size %u\n",
-          static_cast<unsigned int>(shmemSize));
+      Kokkos::printf("FAILED to allocate shared memory of size %u\n",
+                     static_cast<unsigned int>(shmemSize));
     } else {
       // Initialize shared memory.
       Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 131),
@@ -195,10 +194,9 @@ struct functor_teamvector_for {
         }
 
         if (test != value) {
-          KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-              "FAILED teamvector_parallel_for %i %i %lf %lf\n",
-              team.league_rank(), team.team_rank(), static_cast<double>(test),
-              static_cast<double>(value));
+          Kokkos::printf("FAILED teamvector_parallel_for %i %i %lf %lf\n",
+                         team.league_rank(), team.team_rank(),
+                         static_cast<double>(test), static_cast<double>(value));
           flag() = 1;
         }
       });
@@ -262,7 +260,7 @@ struct functor_teamvector_reduce {
 
       if (test != value) {
         if (team.league_rank() == 0) {
-          KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          Kokkos::printf(
               "FAILED teamvector_parallel_reduce %i %i %lf %lf %lu\n",
               (int)team.league_rank(), (int)team.team_rank(),
               static_cast<double>(test), static_cast<double>(value),
@@ -273,7 +271,7 @@ struct functor_teamvector_reduce {
       }
       if (test != shared_value(0)) {
         if (team.league_rank() == 0) {
-          KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          Kokkos::printf(
               "FAILED teamvector_parallel_reduce with shared result %i %i %lf "
               "%lf %lu\n",
               static_cast<int>(team.league_rank()),
@@ -335,7 +333,7 @@ struct functor_teamvector_reduce_reducer {
       }
 
       if (test != value) {
-        KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+        Kokkos::printf(
             "FAILED teamvector_parallel_reduce_reducer %i %i %lf %lf\n",
             team.league_rank(), team.team_rank(), static_cast<double>(test),
             static_cast<double>(value));
@@ -343,7 +341,7 @@ struct functor_teamvector_reduce_reducer {
         flag() = 1;
       }
       if (test != shared_value(0)) {
-        KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+        Kokkos::printf(
             "FAILED teamvector_parallel_reduce_reducer shared value %i %i %lf "
             "%lf\n",
             team.league_rank(), team.team_rank(), static_cast<double>(test),
@@ -439,7 +437,7 @@ namespace Test {
 TEST(TEST_CATEGORY, team_teamvector_range) {
   ASSERT_TRUE((TestTeamVectorRange::Test<TEST_EXECSPACE>(0)));
 #if defined(KOKKOS_ENABLE_CUDA) && \
-    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC
+    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC 23.7
   if constexpr (std::is_same_v<TEST_EXECSPACE, Kokkos::Cuda>) {
     GTEST_SKIP() << "Disabling 2/3rd of the test for now";
   }
diff --git a/packages/kokkos/core/unit_test/TestUtilities.hpp b/packages/kokkos/core/unit_test/TestUtilities.hpp
index 8de5f5e0d16bee1532b557fffe042b3d33f00e87..b1f9d30c1fc95a7aefe41ee2919a68e2bf603506 100644
--- a/packages/kokkos/core/unit_test/TestUtilities.hpp
+++ b/packages/kokkos/core/unit_test/TestUtilities.hpp
@@ -41,4 +41,94 @@ void test_is_specialization_of() {
                 "");
 }
 
+namespace {
+enum Enum { EZero, EOne };
+enum EnumBool : bool { EBFalse, EBTrue };
+enum class ScopedEnum { SEZero, SEOne };
+enum class ScopedEnumShort : short { SESZero, SESOne };
+class Class {};
+
+template <typename Base, typename Derived>
+inline constexpr bool is_public_unambiguous_base_of_v =
+    std::is_convertible_v<Derived*, Base*> && !std::is_same_v<Derived, Base>;
+}  // namespace
+
+void test_to_underlying() {
+  using Kokkos::Impl::to_underlying;
+
+  constexpr auto e0 = to_underlying(EZero);
+  static_assert(e0 == 0);
+
+  constexpr auto e1 = to_underlying(EOne);
+  static_assert(e1 == 1);
+
+  constexpr auto eb0 = to_underlying(EBFalse);
+  constexpr bool b0  = false;
+  static_assert(std::is_same_v<decltype(eb0), decltype(b0)>);
+  static_assert(eb0 == b0);
+
+  constexpr auto eb1 = to_underlying(EBTrue);
+  constexpr bool b1  = true;
+  static_assert(std::is_same_v<decltype(eb1), decltype(b1)>);
+  static_assert(eb1 == b1);
+
+  constexpr auto se0 = to_underlying(ScopedEnum::SEZero);
+  static_assert(se0 == 0);
+
+  constexpr auto se1 = to_underlying(ScopedEnum::SEOne);
+  static_assert(se1 == 1);
+
+  constexpr auto ses0 = to_underlying(ScopedEnumShort::SESZero);
+  constexpr short s0  = 0;
+  static_assert(std::is_same_v<decltype(ses0), decltype(s0)>);
+  static_assert(ses0 == s0);
+
+  constexpr auto ses1 = to_underlying(ScopedEnumShort::SESOne);
+  constexpr short s1  = 1;
+  static_assert(std::is_same_v<decltype(ses1), decltype(s1)>);
+  static_assert(ses1 == s1);
+}
+
+void test_is_scoped_enum() {
+  using Kokkos::Impl::is_scoped_enum;
+  using Kokkos::Impl::is_scoped_enum_v;
+
+  static_assert(!is_scoped_enum<int>{});
+  static_assert(!is_scoped_enum<int>::value);
+  static_assert(!is_scoped_enum_v<int>);
+  static_assert(
+      is_public_unambiguous_base_of_v<std::false_type, is_scoped_enum<int>>);
+
+  static_assert(!is_scoped_enum<Class>{});
+  static_assert(!is_scoped_enum<Class>::value);
+  static_assert(!is_scoped_enum_v<Class>);
+  static_assert(
+      is_public_unambiguous_base_of_v<std::false_type, is_scoped_enum<Class>>);
+
+  static_assert(!is_scoped_enum<Enum>{});
+  static_assert(!is_scoped_enum<Enum>::value);
+  static_assert(!is_scoped_enum_v<Enum>);
+  static_assert(
+      is_public_unambiguous_base_of_v<std::false_type, is_scoped_enum<Enum>>);
+
+  static_assert(!is_scoped_enum<EnumBool>{});
+  static_assert(!is_scoped_enum<EnumBool>::value);
+  static_assert(!is_scoped_enum_v<EnumBool>);
+  static_assert(is_public_unambiguous_base_of_v<std::false_type,
+                                                is_scoped_enum<EnumBool>>);
+
+  static_assert(is_scoped_enum<ScopedEnum>{});
+  static_assert(is_scoped_enum<ScopedEnum>::value);
+  static_assert(is_scoped_enum_v<ScopedEnum>);
+  static_assert(is_public_unambiguous_base_of_v<std::true_type,
+                                                is_scoped_enum<ScopedEnum>>);
+
+  static_assert(is_scoped_enum<ScopedEnumShort>{});
+  static_assert(is_scoped_enum<ScopedEnumShort>::value);
+  static_assert(is_scoped_enum_v<ScopedEnumShort>);
+  static_assert(
+      is_public_unambiguous_base_of_v<std::true_type,
+                                      is_scoped_enum<ScopedEnumShort>>);
+}
+
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestVersionMacros.cpp b/packages/kokkos/core/unit_test/TestVersionMacros.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e18b5973100d5cad1c1714ef0b1534c192f237d0
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestVersionMacros.cpp
@@ -0,0 +1,59 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <Kokkos_Core.hpp>
+
+#ifndef KOKKOS_VERSION
+static_assert(false, "KOKKOS_VERSION macro is not defined!");
+#endif
+
+#ifndef KOKKOS_VERSION_MAJOR
+static_assert(false, "KOKKOS_VERSION_MAJOR macro is not defined!");
+#endif
+
+#ifndef KOKKOS_VERSION_MINOR
+static_assert(false, "KOKKOS_VERSION_MINOR macro is not defined!");
+#endif
+
+#ifndef KOKKOS_VERSION_PATCH
+static_assert(false, "KOKKOS_VERSION_PATCH macro is not defined!");
+#endif
+
+static_assert(KOKKOS_VERSION == KOKKOS_VERSION_MAJOR * 10000 +
+                                    KOKKOS_VERSION_MINOR * 100 +
+                                    KOKKOS_VERSION_PATCH);
+
+// clang-format off
+static_assert(!KOKKOS_VERSION_LESS            (KOKKOS_VERSION_MAJOR    , KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH));
+static_assert(!KOKKOS_VERSION_LESS            (KOKKOS_VERSION_MAJOR - 1, KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH));
+static_assert( KOKKOS_VERSION_LESS            (KOKKOS_VERSION_MAJOR + 1, KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH));
+
+static_assert( KOKKOS_VERSION_LESS_EQUAL      (KOKKOS_VERSION_MAJOR    , KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH));
+static_assert(!KOKKOS_VERSION_LESS_EQUAL      (KOKKOS_VERSION_MAJOR - 1, KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH));
+static_assert( KOKKOS_VERSION_LESS_EQUAL      (KOKKOS_VERSION_MAJOR + 1, KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH));
+
+static_assert(!KOKKOS_VERSION_GREATER         (KOKKOS_VERSION_MAJOR    , KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH));
+static_assert( KOKKOS_VERSION_GREATER         (KOKKOS_VERSION_MAJOR - 1, KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH));
+static_assert(!KOKKOS_VERSION_GREATER         (KOKKOS_VERSION_MAJOR + 1, KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH));
+
+static_assert( KOKKOS_VERSION_GREATER_EQUAL   (KOKKOS_VERSION_MAJOR    , KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH));
+static_assert( KOKKOS_VERSION_GREATER_EQUAL   (KOKKOS_VERSION_MAJOR - 1, KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH));
+static_assert(!KOKKOS_VERSION_GREATER_EQUAL   (KOKKOS_VERSION_MAJOR + 1, KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH));
+
+static_assert( KOKKOS_VERSION_EQUAL           (KOKKOS_VERSION_MAJOR    , KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH));
+static_assert(!KOKKOS_VERSION_EQUAL           (KOKKOS_VERSION_MAJOR - 1, KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH));
+static_assert(!KOKKOS_VERSION_EQUAL           (KOKKOS_VERSION_MAJOR + 1, KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH));
+// clang-format on
diff --git a/packages/kokkos/core/unit_test/TestViewAPI.hpp b/packages/kokkos/core/unit_test/TestViewAPI.hpp
index 767f6e5e49c2355d097d2f997c2856fdd3fc1319..ffc500e4a9adb184b1239586c798c9119a9a35f5 100644
--- a/packages/kokkos/core/unit_test/TestViewAPI.hpp
+++ b/packages/kokkos/core/unit_test/TestViewAPI.hpp
@@ -1227,7 +1227,7 @@ class TestViewAPI {
       Kokkos::deep_copy(typename hView4::execution_space(), dx, hx);
       Kokkos::deep_copy(typename hView4::execution_space(), dy, dx);
       Kokkos::deep_copy(typename hView4::execution_space(), hy, dy);
-      typename dView4::execution_space().fence();
+      typename hView4::execution_space().fence();
 
       for (size_t ip = 0; ip < N0; ++ip)
         for (size_t i1 = 0; i1 < N1; ++i1)
@@ -1238,7 +1238,7 @@ class TestViewAPI {
 
       Kokkos::deep_copy(typename hView4::execution_space(), dx, T(0));
       Kokkos::deep_copy(typename hView4::execution_space(), hx, dx);
-      typename dView4::execution_space().fence();
+      typename hView4::execution_space().fence();
 
       for (size_t ip = 0; ip < N0; ++ip)
         for (size_t i1 = 0; i1 < N1; ++i1)
diff --git a/packages/kokkos/core/unit_test/TestViewAPI_e.hpp b/packages/kokkos/core/unit_test/TestViewAPI_e.hpp
index df66396ab845f212edd507ec926afc2195b193b4..2e416d032055bbc760b49d5da4e62a841def9186 100644
--- a/packages/kokkos/core/unit_test/TestViewAPI_e.hpp
+++ b/packages/kokkos/core/unit_test/TestViewAPI_e.hpp
@@ -100,7 +100,7 @@ void test_left_stride(Extents... extents) {
   size_t expected_stride = 1;
   size_t all_strides[view_type::rank + 1];
   view.stride(all_strides);
-  for (int i = 0; i < view_type::rank; ++i) {
+  for (size_t i = 0; i < view_type::rank; ++i) {
     ASSERT_EQ(view.stride(i), expected_stride);
     ASSERT_EQ(all_strides[i], expected_stride);
     expected_stride *= view.extent(i);
@@ -115,7 +115,7 @@ void test_right_stride(Extents... extents) {
   size_t expected_stride = 1;
   size_t all_strides[view_type::rank + 1];
   view.stride(all_strides);
-  for (int ri = 0; ri < view_type::rank; ++ri) {
+  for (size_t ri = 0; ri < view_type::rank; ++ri) {
     auto i = view_type::rank - 1 - ri;
     ASSERT_EQ(view.stride(i), expected_stride);
     ASSERT_EQ(all_strides[i], expected_stride);
diff --git a/packages/kokkos/core/unit_test/TestViewCtorDimMatch.hpp b/packages/kokkos/core/unit_test/TestViewCtorDimMatch.hpp
index ff62eaecccf13ef456e588982cb3ce9a7359a738..d71841eef847a4bf792a91176c42f2627dbc305f 100644
--- a/packages/kokkos/core/unit_test/TestViewCtorDimMatch.hpp
+++ b/packages/kokkos/core/unit_test/TestViewCtorDimMatch.hpp
@@ -49,9 +49,7 @@ using DType = int;
 
 // Skip test execution when KOKKOS_ENABLE_OPENMPTARGET is enabled until
 // Kokkos::abort() aborts properly on that backend
-// Skip test execution when KOKKOS_COMPILER_NVHPC until fixed in GTEST
-#if defined(KOKKOS_ENABLE_OPENMPTARGET) || (KOKKOS_COMPILER_NVHPC)
-#else
+#ifndef KOKKOS_ENABLE_OPENMPTARGET
 TEST(TEST_CATEGORY_DEATH, view_construction_with_wrong_params_dyn) {
   ::testing::FLAGS_gtest_death_test_style = "threadsafe";
 
diff --git a/packages/kokkos/core/unit_test/TestViewLayoutStrideAssignment.hpp b/packages/kokkos/core/unit_test/TestViewLayoutStrideAssignment.hpp
index 865f87efae36dca5949d47801462f31ee3a144dc..d383ab223b9e346374017c43e8decf7903b81a72 100644
--- a/packages/kokkos/core/unit_test/TestViewLayoutStrideAssignment.hpp
+++ b/packages/kokkos/core/unit_test/TestViewLayoutStrideAssignment.hpp
@@ -582,7 +582,6 @@ TEST(TEST_CATEGORY, view_layoutstride_right_to_layoutright_assignment) {
   }
 }
 
-#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC
 TEST(TEST_CATEGORY_DEATH, view_layoutstride_right_to_layoutleft_assignment) {
   using exec_space = TEST_EXECSPACE;
 
@@ -886,7 +885,6 @@ TEST(TEST_CATEGORY_DEATH, view_layoutstride_left_to_layoutright_assignment) {
                  "View assignment must have compatible layouts");
   }
 }
-#endif
 
 }  // namespace Test
 
diff --git a/packages/kokkos/core/unit_test/TestViewMapping_a.hpp b/packages/kokkos/core/unit_test/TestViewMapping_a.hpp
index 9df044ec7a4b7fd70097fcdf6e6029b1fc371dd5..9173f0d4316e4d1422268316fce4d76a731de8da 100644
--- a/packages/kokkos/core/unit_test/TestViewMapping_a.hpp
+++ b/packages/kokkos/core/unit_test/TestViewMapping_a.hpp
@@ -713,7 +713,7 @@ void test_view_mapping() {
                               typename Space::memory_space>::value));
     ASSERT_TRUE((std::is_same<typename T::reference_type, int&>::value));
 
-    ASSERT_EQ(T::Rank, 1);
+    ASSERT_EQ(T::rank, size_t(1));
 
     ASSERT_TRUE((std::is_same<typename C::data_type, const int*>::value));
     ASSERT_TRUE((std::is_same<typename C::const_data_type, const int*>::value));
@@ -734,7 +734,7 @@ void test_view_mapping() {
                               typename Space::memory_space>::value));
     ASSERT_TRUE((std::is_same<typename C::reference_type, const int&>::value));
 
-    ASSERT_EQ(C::Rank, 1);
+    ASSERT_EQ(C::rank, size_t(1));
 
     ASSERT_EQ(vr1.extent(0), size_t(N));
 
@@ -781,7 +781,7 @@ void test_view_mapping() {
     ASSERT_TRUE((std::is_same<typename T::memory_space,
                               typename Space::memory_space>::value));
     ASSERT_TRUE((std::is_same<typename T::reference_type, int&>::value));
-    ASSERT_EQ(T::Rank, 1);
+    ASSERT_EQ(T::rank, size_t(1));
 
     ASSERT_EQ(vr1.extent(0), size_t(N));
 
@@ -1038,10 +1038,7 @@ void test_view_mapping() {
     ASSERT_EQ(a.use_count(), 1);
     ASSERT_EQ(b.use_count(), 0);
 
-// TODO: a.use_count() and x.use_count() are 0 with the asynchronous HPX
-// backend. Why?
-#if !defined(KOKKOS_ENABLE_CUDA_LAMBDA) && \
-    !(defined(KOKKOS_ENABLE_HPX) && defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH))
+#if !defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_ENABLE_CUDA_LAMBDA)
     // Cannot launch host lambda when CUDA lambda is enabled.
 
     using host_exec_space =
diff --git a/packages/kokkos/core/unit_test/TestViewMapping_subview.hpp b/packages/kokkos/core/unit_test/TestViewMapping_subview.hpp
index 069ad09da98f8088a35bd2a5f1103c8ad9778da9..888abf4ca8dae2cbd8d2148f5be7033f817930c0 100644
--- a/packages/kokkos/core/unit_test/TestViewMapping_subview.hpp
+++ b/packages/kokkos/core/unit_test/TestViewMapping_subview.hpp
@@ -100,8 +100,7 @@ struct TestViewMappingSubview {
 
   KOKKOS_INLINE_FUNCTION
   void operator()(const int, long& error_count) const {
-    auto Ad = Kokkos::subview<Kokkos::MemoryUnmanaged>(
-        Aa, Kokkos::pair<int, int>(1, AN - 1));
+    auto Ad = Kokkos::subview(Aa, Kokkos::pair<int, int>(1, AN - 1));
 
     for (int i = 1; i < AN - 1; ++i)
       if (&Aa[i] != &Ab[i - 1]) ++error_count;
diff --git a/packages/kokkos/core/unit_test/TestViewMemoryAccessViolation.hpp b/packages/kokkos/core/unit_test/TestViewMemoryAccessViolation.hpp
index cdd90426eb4ce2c9ae9ddc83be3e9776902e47e2..daf24ce7c0cb67de118e76764ac76022af5a4857 100644
--- a/packages/kokkos/core/unit_test/TestViewMemoryAccessViolation.hpp
+++ b/packages/kokkos/core/unit_test/TestViewMemoryAccessViolation.hpp
@@ -18,7 +18,6 @@
 
 #include <gtest/gtest.h>
 
-#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC
 template <class View, class ExecutionSpace>
 struct TestViewMemoryAccessViolation {
   View v;
@@ -143,7 +142,7 @@ void test_view_memory_access_violations_from_device() {
 }
 
 // FIXME_SYCL
-#if !(defined(KOKKOS_COMPILER_INTEL) && defined(KOKKOS_ENABLE_SYCL))
+#if !(defined(KOKKOS_COMPILER_INTEL_LLVM) && defined(KOKKOS_ENABLE_SYCL))
 TEST(TEST_CATEGORY_DEATH, view_memory_access_violations_from_host) {
   ::testing::FLAGS_gtest_death_test_style = "threadsafe";
 
@@ -182,7 +181,12 @@ TEST(TEST_CATEGORY_DEATH, view_memory_access_violations_from_device) {
                     "able to abort from the device";
   }
 #endif
+#if defined(KOKKOS_ENABLE_OPENACC)  // FIXME_OPENACC
+  if (std::is_same<ExecutionSpace, Kokkos::Experimental::OpenACC>::value) {
+    GTEST_SKIP() << "skipping because OpenACC backend is currently not "
+                    "able to abort from the device";
+  }
+#endif
 
   test_view_memory_access_violations_from_device<ExecutionSpace>();
 }
-#endif
diff --git a/packages/kokkos/core/unit_test/TestViewRank.cpp b/packages/kokkos/core/unit_test/TestViewRank.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7ea11afca37a938cdc5660116923aa33e5f05a45
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestViewRank.cpp
@@ -0,0 +1,63 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <Kokkos_Core.hpp>
+
+namespace {
+
+template <class View, size_t Rank, size_t RankDynamic>
+constexpr bool test_view_rank_and_dynamic_rank() {
+  static_assert(View::rank == Rank);
+  static_assert(View::rank() == Rank);
+  static_assert(View::rank_dynamic == RankDynamic);
+  static_assert(View::rank_dynamic() == RankDynamic);
+  static_assert(std::is_convertible_v<decltype(View::rank), size_t>);
+  static_assert(std::is_same_v<decltype(View::rank()), size_t>);
+  static_assert(std::is_convertible_v<decltype(View::rank_dynamic), size_t>);
+  static_assert(std::is_same_v<decltype(View::rank_dynamic()), size_t>);
+  auto rank = View::rank;  // not an integral type in contrast to Kokkos version
+  // less than 4.0.01
+  static_assert(!std::is_integral_v<decltype(rank)>);
+  auto rank_preferred = View::rank();  // since 4.0.01
+  static_assert(std::is_same_v<decltype(rank_preferred), size_t>);
+  (void)rank;
+  (void)rank_preferred;
+  return true;
+}
+
+// clang-format off
+static_assert(test_view_rank_and_dynamic_rank<Kokkos::View<long long>, 0, 0>());
+
+static_assert(test_view_rank_and_dynamic_rank<Kokkos::View<unsigned[1]>, 1, 0>());
+static_assert(test_view_rank_and_dynamic_rank<Kokkos::View<unsigned * >, 1, 1>());
+
+static_assert(test_view_rank_and_dynamic_rank<Kokkos::View<double[1][2]>, 2, 0>());
+static_assert(test_view_rank_and_dynamic_rank<Kokkos::View<double * [2]>, 2, 1>());
+static_assert(test_view_rank_and_dynamic_rank<Kokkos::View<double *  * >, 2, 2>());
+
+static_assert(test_view_rank_and_dynamic_rank<Kokkos::View<float[1][2][3]>, 3, 0>());
+static_assert(test_view_rank_and_dynamic_rank<Kokkos::View<float * [2][3]>, 3, 1>());
+static_assert(test_view_rank_and_dynamic_rank<Kokkos::View<float *  * [3]>, 3, 2>());
+static_assert(test_view_rank_and_dynamic_rank<Kokkos::View<float *  *  * >, 3, 3>());
+
+static_assert(test_view_rank_and_dynamic_rank<Kokkos::View<int[1][2][3][4]>, 4, 0>());
+static_assert(test_view_rank_and_dynamic_rank<Kokkos::View<int * [2][3][4]>, 4, 1>());
+static_assert(test_view_rank_and_dynamic_rank<Kokkos::View<int *  * [3][4]>, 4, 2>());
+static_assert(test_view_rank_and_dynamic_rank<Kokkos::View<int *  *  * [4]>, 4, 3>());
+static_assert(test_view_rank_and_dynamic_rank<Kokkos::View<int *  *  *  * >, 4, 4>());
+//clang-format on
+
+}  // namespace
diff --git a/packages/kokkos/core/unit_test/TestViewSubview.hpp b/packages/kokkos/core/unit_test/TestViewSubview.hpp
index eb0050a6827900385723b6f6b220f275b5510653..386887d923eacef0ec1299058cf91435b88dca60 100644
--- a/packages/kokkos/core/unit_test/TestViewSubview.hpp
+++ b/packages/kokkos/core/unit_test/TestViewSubview.hpp
@@ -487,8 +487,8 @@ void test_left_1(bool use_constr) {
       for (int i1 = 0; i1 < (int)sx4.extent(1); ++i1)
         for (int i2 = 0; i2 < (int)sx4.extent(2); ++i2)
           for (int i3 = 0; i3 < (int)sx4.extent(3); ++i3) {
-            ASSERT_TRUE(&sx4(i0, i1, i2, i3) ==
-                        &x8(0, 0 + i0, 1, 1 + i1, 1, 0 + i2, 2, 2 + i3));
+            ASSERT_EQ(&sx4(i0, i1, i2, i3),
+                      &x8(0, 0 + i0, 1, 1 + i1, 1, 0 + i2, 2, 2 + i3));
           }
   }
 }
@@ -559,8 +559,8 @@ void test_left_2() {
       for (int i1 = 0; i1 < (int)sx4.extent(1); ++i1)
         for (int i2 = 0; i2 < (int)sx4.extent(2); ++i2)
           for (int i3 = 0; i3 < (int)sx4.extent(3); ++i3) {
-            ASSERT_TRUE(&sx4(i0, i1, i2, i3) ==
-                        &x4(1 + i0, 1 + i1, 0 + i2, 2 + i3));
+            ASSERT_EQ(&sx4(i0, i1, i2, i3),
+                      &x4(1 + i0, 1 + i1, 0 + i2, 2 + i3));
           }
   }
 }
@@ -769,8 +769,8 @@ void test_right_1(bool use_constr) {
       for (int i1 = 0; i1 < (int)sx4.extent(1); ++i1)
         for (int i2 = 0; i2 < (int)sx4.extent(2); ++i2)
           for (int i3 = 0; i3 < (int)sx4.extent(3); ++i3) {
-            ASSERT_TRUE(&sx4(i0, i1, i2, i3) ==
-                        &x8(0, 0 + i0, 1, 1 + i1, 1, 0 + i2, 2, 2 + i3));
+            ASSERT_EQ(&sx4(i0, i1, i2, i3),
+                      &x8(0, 0 + i0, 1, 1 + i1, 1, 0 + i2, 2, 2 + i3));
           }
   }
 }
@@ -865,7 +865,7 @@ struct FillView_3D {
   using exec_t = typename Space::execution_space;
   using view_t = Kokkos::View<int***, Layout, Space>;
   using rank_t = Kokkos::Rank<
-      view_t::Rank,
+      view_t::rank,
       std::is_same<Layout, Kokkos::LayoutLeft>::value ? Kokkos::Iterate::Left
                                                       : Kokkos::Iterate::Right,
       std::is_same<Layout, Kokkos::LayoutLeft>::value ? Kokkos::Iterate::Left
@@ -893,7 +893,7 @@ struct FillView_4D {
   using exec_t = typename Space::execution_space;
   using view_t = Kokkos::View<int****, Layout, Space>;
   using rank_t = Kokkos::Rank<
-      view_t::Rank,
+      view_t::rank,
       std::is_same<Layout, Kokkos::LayoutLeft>::value ? Kokkos::Iterate::Left
                                                       : Kokkos::Iterate::Right,
       std::is_same<Layout, Kokkos::LayoutLeft>::value ? Kokkos::Iterate::Left
@@ -922,7 +922,7 @@ struct FillView_5D {
   using exec_t = typename Space::execution_space;
   using view_t = Kokkos::View<int*****, Layout, Space>;
   using rank_t = Kokkos::Rank<
-      view_t::Rank,
+      view_t::rank,
       std::is_same<Layout, Kokkos::LayoutLeft>::value ? Kokkos::Iterate::Left
                                                       : Kokkos::Iterate::Right,
       std::is_same<Layout, Kokkos::LayoutLeft>::value ? Kokkos::Iterate::Left
@@ -1471,30 +1471,30 @@ void test_3d_subview_5d_impl_layout() {
 }
 
 inline void test_subview_legal_args_right() {
+  ASSERT_EQ(
+      0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
+             Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::ALL_t,
+             Kokkos::ALL_t, Kokkos::pair<int, int>, int, int>::value));
+  ASSERT_EQ(
+      0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
+             Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::ALL_t,
+             Kokkos::ALL_t, Kokkos::ALL_t, int, int>::value));
+  ASSERT_EQ(
+      0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
+             Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::ALL_t,
+             Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, int>::value));
+  ASSERT_EQ(
+      0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
+             Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::ALL_t,
+             Kokkos::pair<int, int>, Kokkos::ALL_t, int, int>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0,
-                   Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t,
-                   Kokkos::pair<int, int>, int, int>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0,
-                   Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t,
-                   Kokkos::Impl::ALL_t, int, int>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0,
-                   Kokkos::Impl::ALL_t, Kokkos::pair<int, int>,
-                   Kokkos::pair<int, int>, int, int>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0,
-                   Kokkos::Impl::ALL_t, Kokkos::pair<int, int>,
-                   Kokkos::Impl::ALL_t, int, int>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0,
-                   Kokkos::pair<int, int>, Kokkos::Impl::ALL_t,
+                   Kokkos::pair<int, int>, Kokkos::ALL_t,
                    Kokkos::pair<int, int>, int, int>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0,
-                   Kokkos::pair<int, int>, Kokkos::Impl::ALL_t,
-                   Kokkos::Impl::ALL_t, int, int>::value));
+                   Kokkos::pair<int, int>, Kokkos::ALL_t, Kokkos::ALL_t, int,
+                   int>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0,
                    Kokkos::pair<int, int>, Kokkos::pair<int, int>,
@@ -1502,98 +1502,101 @@ inline void test_subview_legal_args_right() {
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0,
                    Kokkos::pair<int, int>, Kokkos::pair<int, int>,
-                   Kokkos::Impl::ALL_t, int, int>::value));
+                   Kokkos::ALL_t, int, int>::value));
 
+  ASSERT_EQ(
+      0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
+             Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::ALL_t,
+             int, Kokkos::ALL_t, Kokkos::pair<int, int>, int>::value));
+  ASSERT_EQ(
+      0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
+             Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::ALL_t,
+             int, Kokkos::ALL_t, Kokkos::ALL_t, int>::value));
+  ASSERT_EQ(
+      0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
+             Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::ALL_t,
+             int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int>::value));
+  ASSERT_EQ(
+      0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
+             Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::ALL_t,
+             int, Kokkos::pair<int, int>, Kokkos::ALL_t, int>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0,
-                   Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t,
-                   Kokkos::pair<int, int>, int>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0,
-                   Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t,
-                   Kokkos::Impl::ALL_t, int>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0,
-                   Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>,
-                   Kokkos::pair<int, int>, int>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0,
-                   Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>,
-                   Kokkos::Impl::ALL_t, int>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0,
-                   Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t,
+                   Kokkos::pair<int, int>, int, Kokkos::ALL_t,
                    Kokkos::pair<int, int>, int>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0,
-                   Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t,
-                   Kokkos::Impl::ALL_t, int>::value));
+                   Kokkos::pair<int, int>, int, Kokkos::ALL_t, Kokkos::ALL_t,
+                   int>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0,
                    Kokkos::pair<int, int>, int, Kokkos::pair<int, int>,
                    Kokkos::pair<int, int>, int>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0,
-                   Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t,
+                   Kokkos::pair<int, int>, int, Kokkos::ALL_t,
                    Kokkos::pair<int, int>, int>::value));
 
+  ASSERT_EQ(
+      0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
+             Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::ALL_t,
+             Kokkos::ALL_t, int, Kokkos::pair<int, int>, int>::value));
+  ASSERT_EQ(
+      0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
+             Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::ALL_t,
+             Kokkos::ALL_t, int, Kokkos::ALL_t, int>::value));
+  ASSERT_EQ(
+      0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
+             Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::ALL_t,
+             Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, int>::value));
+  ASSERT_EQ(
+      0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
+             Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::ALL_t,
+             Kokkos::pair<int, int>, int, Kokkos::ALL_t, int>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0,
-                   Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int,
-                   Kokkos::pair<int, int>, int>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0,
-                   Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int,
-                   Kokkos::Impl::ALL_t, int>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0,
-                   Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int,
-                   Kokkos::pair<int, int>, int>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0,
-                   Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int,
-                   Kokkos::Impl::ALL_t, int>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0,
-                   Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int,
+                   Kokkos::pair<int, int>, Kokkos::ALL_t, int,
                    Kokkos::pair<int, int>, int>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0,
-                   Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int,
-                   Kokkos::Impl::ALL_t, int>::value));
+                   Kokkos::pair<int, int>, Kokkos::ALL_t, int, Kokkos::ALL_t,
+                   int>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0,
                    Kokkos::pair<int, int>, Kokkos::pair<int, int>, int,
                    Kokkos::pair<int, int>, int>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0,
-                   Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int,
+                   Kokkos::pair<int, int>, Kokkos::ALL_t, int,
                    Kokkos::pair<int, int>, int>::value));
 
+  ASSERT_EQ(
+      0,
+      (Kokkos::Impl::SubviewLegalArgsCompileTime<
+          Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::ALL_t,
+          Kokkos::ALL_t, Kokkos::pair<int, int>, int>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int,
-                   Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t,
-                   Kokkos::pair<int, int>, int>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int,
-                   Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t,
-                   Kokkos::Impl::ALL_t, int>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int,
-                   Kokkos::Impl::ALL_t, Kokkos::pair<int, int>,
-                   Kokkos::pair<int, int>, int>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int,
-                   Kokkos::Impl::ALL_t, Kokkos::pair<int, int>,
-                   Kokkos::Impl::ALL_t, int>::value));
+                   Kokkos::ALL_t, Kokkos::ALL_t, Kokkos::ALL_t, int>::value));
+  ASSERT_EQ(
+      0,
+      (Kokkos::Impl::SubviewLegalArgsCompileTime<
+          Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::ALL_t,
+          Kokkos::pair<int, int>, Kokkos::pair<int, int>, int>::value));
+  ASSERT_EQ(
+      0,
+      (Kokkos::Impl::SubviewLegalArgsCompileTime<
+          Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::ALL_t,
+          Kokkos::pair<int, int>, Kokkos::ALL_t, int>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int,
-                   Kokkos::pair<int, int>, Kokkos::Impl::ALL_t,
+                   Kokkos::pair<int, int>, Kokkos::ALL_t,
                    Kokkos::pair<int, int>, int>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int,
-                   Kokkos::pair<int, int>, Kokkos::Impl::ALL_t,
-                   Kokkos::Impl::ALL_t, int>::value));
+  ASSERT_EQ(
+      0,
+      (Kokkos::Impl::SubviewLegalArgsCompileTime<
+          Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int,
+          Kokkos::pair<int, int>, Kokkos::ALL_t, Kokkos::ALL_t, int>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int,
                    Kokkos::pair<int, int>, Kokkos::pair<int, int>,
@@ -1601,32 +1604,35 @@ inline void test_subview_legal_args_right() {
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int,
                    Kokkos::pair<int, int>, Kokkos::pair<int, int>,
-                   Kokkos::Impl::ALL_t, int>::value));
+                   Kokkos::ALL_t, int>::value));
 
+  ASSERT_EQ(
+      0,
+      (Kokkos::Impl::SubviewLegalArgsCompileTime<
+          Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::ALL_t,
+          Kokkos::ALL_t, int, Kokkos::pair<int, int>>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int,
-                   Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int,
-                   Kokkos::pair<int, int>>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int,
-                   Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int,
-                   Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int,
-                   Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int,
-                   Kokkos::pair<int, int>>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int,
-                   Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int,
-                   Kokkos::Impl::ALL_t>::value));
+                   Kokkos::ALL_t, Kokkos::ALL_t, int, Kokkos::ALL_t>::value));
+  ASSERT_EQ(
+      0,
+      (Kokkos::Impl::SubviewLegalArgsCompileTime<
+          Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::ALL_t,
+          Kokkos::pair<int, int>, int, Kokkos::pair<int, int>>::value));
+  ASSERT_EQ(
+      0,
+      (Kokkos::Impl::SubviewLegalArgsCompileTime<
+          Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::ALL_t,
+          Kokkos::pair<int, int>, int, Kokkos::ALL_t>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int,
-                   Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int,
+                   Kokkos::pair<int, int>, Kokkos::ALL_t, int,
                    Kokkos::pair<int, int>>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int,
-                   Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int,
-                   Kokkos::Impl::ALL_t>::value));
+  ASSERT_EQ(
+      0,
+      (Kokkos::Impl::SubviewLegalArgsCompileTime<
+          Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int,
+          Kokkos::pair<int, int>, Kokkos::ALL_t, int, Kokkos::ALL_t>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int,
                    Kokkos::pair<int, int>, Kokkos::pair<int, int>, int,
@@ -1634,32 +1640,31 @@ inline void test_subview_legal_args_right() {
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int,
                    Kokkos::pair<int, int>, Kokkos::pair<int, int>, int,
-                   Kokkos::Impl::ALL_t>::value));
+                   Kokkos::ALL_t>::value));
 
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int,
-                   Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t,
-                   Kokkos::pair<int, int>>::value));
+  ASSERT_EQ(0,
+            (Kokkos::Impl::SubviewLegalArgsCompileTime<
+                Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int,
+                Kokkos::ALL_t, Kokkos::ALL_t, Kokkos::pair<int, int>>::value));
   ASSERT_EQ(1, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int,
-                   Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t,
-                   Kokkos::Impl::ALL_t>::value));
+                   Kokkos::ALL_t, Kokkos::ALL_t, Kokkos::ALL_t>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int,
-                   Kokkos::Impl::ALL_t, Kokkos::pair<int, int>,
+                   Kokkos::ALL_t, Kokkos::pair<int, int>,
                    Kokkos::pair<int, int>>::value));
+  ASSERT_EQ(0,
+            (Kokkos::Impl::SubviewLegalArgsCompileTime<
+                Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int,
+                Kokkos::ALL_t, Kokkos::pair<int, int>, Kokkos::ALL_t>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int,
-                   Kokkos::Impl::ALL_t, Kokkos::pair<int, int>,
-                   Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int,
-                   Kokkos::pair<int, int>, Kokkos::Impl::ALL_t,
+                   Kokkos::pair<int, int>, Kokkos::ALL_t,
                    Kokkos::pair<int, int>>::value));
-  ASSERT_EQ(1, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int,
-                   Kokkos::pair<int, int>, Kokkos::Impl::ALL_t,
-                   Kokkos::Impl::ALL_t>::value));
+  ASSERT_EQ(1,
+            (Kokkos::Impl::SubviewLegalArgsCompileTime<
+                Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int,
+                Kokkos::pair<int, int>, Kokkos::ALL_t, Kokkos::ALL_t>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int,
                    Kokkos::pair<int, int>, Kokkos::pair<int, int>,
@@ -1667,36 +1672,35 @@ inline void test_subview_legal_args_right() {
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int,
                    Kokkos::pair<int, int>, Kokkos::pair<int, int>,
-                   Kokkos::Impl::ALL_t>::value));
+                   Kokkos::ALL_t>::value));
 
   ASSERT_EQ(1, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0,
-                   Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t,
-                   Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0,
-                   Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t,
-                   Kokkos::pair<int, int>>::value));
-  ASSERT_EQ(1, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0,
-                   Kokkos::pair<int, int>, Kokkos::Impl::ALL_t,
-                   Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0,
-                   Kokkos::pair<int, int>, Kokkos::Impl::ALL_t,
-                   Kokkos::pair<int, int>>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0,
-                   Kokkos::Impl::ALL_t, Kokkos::pair<int, int>,
-                   Kokkos::Impl::ALL_t>::value));
+                   Kokkos::ALL_t, Kokkos::ALL_t, Kokkos::ALL_t>::value));
+  ASSERT_EQ(
+      0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
+             Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::ALL_t,
+             Kokkos::ALL_t, Kokkos::pair<int, int>>::value));
+  ASSERT_EQ(1,
+            (Kokkos::Impl::SubviewLegalArgsCompileTime<
+                Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0,
+                Kokkos::pair<int, int>, Kokkos::ALL_t, Kokkos::ALL_t>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0,
-                   Kokkos::Impl::ALL_t, Kokkos::pair<int, int>,
+                   Kokkos::pair<int, int>, Kokkos::ALL_t,
                    Kokkos::pair<int, int>>::value));
+  ASSERT_EQ(
+      0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
+             Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::ALL_t,
+             Kokkos::pair<int, int>, Kokkos::ALL_t>::value));
+  ASSERT_EQ(
+      0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
+             Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::ALL_t,
+             Kokkos::pair<int, int>, Kokkos::pair<int, int>>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0,
                    Kokkos::pair<int, int>, Kokkos::pair<int, int>,
-                   Kokkos::Impl::ALL_t>::value));
+                   Kokkos::ALL_t>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0,
                    Kokkos::pair<int, int>, Kokkos::pair<int, int>,
@@ -1704,34 +1708,30 @@ inline void test_subview_legal_args_right() {
 }
 
 inline void test_subview_legal_args_left() {
+  ASSERT_EQ(1,
+            (Kokkos::Impl::SubviewLegalArgsCompileTime<
+                Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::ALL_t,
+                Kokkos::ALL_t, Kokkos::pair<int, int>, int, int>::value));
+  ASSERT_EQ(1,
+            (Kokkos::Impl::SubviewLegalArgsCompileTime<
+                Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::ALL_t,
+                Kokkos::ALL_t, Kokkos::ALL_t, int, int>::value));
   ASSERT_EQ(
-      1,
-      (Kokkos::Impl::SubviewLegalArgsCompileTime<
-          Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t,
-          Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, int>::value));
-  ASSERT_EQ(
-      1,
-      (Kokkos::Impl::SubviewLegalArgsCompileTime<
-          Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t,
-          Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int>::value));
-  ASSERT_EQ(
-      0,
-      (Kokkos::Impl::SubviewLegalArgsCompileTime<
-          Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t,
-          Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, int>::value));
-  ASSERT_EQ(
-      0,
-      (Kokkos::Impl::SubviewLegalArgsCompileTime<
-          Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t,
-          Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, int>::value));
+      0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
+             Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::ALL_t,
+             Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, int>::value));
+  ASSERT_EQ(0,
+            (Kokkos::Impl::SubviewLegalArgsCompileTime<
+                Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::ALL_t,
+                Kokkos::pair<int, int>, Kokkos::ALL_t, int, int>::value));
   ASSERT_EQ(1, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0,
-                   Kokkos::pair<int, int>, Kokkos::Impl::ALL_t,
+                   Kokkos::pair<int, int>, Kokkos::ALL_t,
                    Kokkos::pair<int, int>, int, int>::value));
   ASSERT_EQ(1, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0,
-                   Kokkos::pair<int, int>, Kokkos::Impl::ALL_t,
-                   Kokkos::Impl::ALL_t, int, int>::value));
+                   Kokkos::pair<int, int>, Kokkos::ALL_t, Kokkos::ALL_t, int,
+                   int>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0,
                    Kokkos::pair<int, int>, Kokkos::pair<int, int>,
@@ -1739,106 +1739,101 @@ inline void test_subview_legal_args_left() {
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0,
                    Kokkos::pair<int, int>, Kokkos::pair<int, int>,
-                   Kokkos::Impl::ALL_t, int, int>::value));
-
+                   Kokkos::ALL_t, int, int>::value));
+
+  ASSERT_EQ(0,
+            (Kokkos::Impl::SubviewLegalArgsCompileTime<
+                Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::ALL_t,
+                int, Kokkos::ALL_t, Kokkos::pair<int, int>, int>::value));
+  ASSERT_EQ(0,
+            (Kokkos::Impl::SubviewLegalArgsCompileTime<
+                Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::ALL_t,
+                int, Kokkos::ALL_t, Kokkos::ALL_t, int>::value));
   ASSERT_EQ(
-      0,
-      (Kokkos::Impl::SubviewLegalArgsCompileTime<
-          Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t,
-          int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int>::value));
-  ASSERT_EQ(
-      0,
-      (Kokkos::Impl::SubviewLegalArgsCompileTime<
-          Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t,
-          int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int>::value));
-  ASSERT_EQ(
-      0,
-      (Kokkos::Impl::SubviewLegalArgsCompileTime<
-          Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t,
-          int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int>::value));
-  ASSERT_EQ(
-      0,
-      (Kokkos::Impl::SubviewLegalArgsCompileTime<
-          Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t,
-          int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int>::value));
+      0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
+             Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::ALL_t,
+             int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int>::value));
+  ASSERT_EQ(0,
+            (Kokkos::Impl::SubviewLegalArgsCompileTime<
+                Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::ALL_t,
+                int, Kokkos::pair<int, int>, Kokkos::ALL_t, int>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0,
-                   Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t,
+                   Kokkos::pair<int, int>, int, Kokkos::ALL_t,
                    Kokkos::pair<int, int>, int>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0,
-                   Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t,
-                   Kokkos::Impl::ALL_t, int>::value));
+                   Kokkos::pair<int, int>, int, Kokkos::ALL_t, Kokkos::ALL_t,
+                   int>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0,
                    Kokkos::pair<int, int>, int, Kokkos::pair<int, int>,
                    Kokkos::pair<int, int>, int>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0,
-                   Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t,
+                   Kokkos::pair<int, int>, int, Kokkos::ALL_t,
                    Kokkos::pair<int, int>, int>::value));
 
+  ASSERT_EQ(0,
+            (Kokkos::Impl::SubviewLegalArgsCompileTime<
+                Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::ALL_t,
+                Kokkos::ALL_t, int, Kokkos::pair<int, int>, int>::value));
+  ASSERT_EQ(0,
+            (Kokkos::Impl::SubviewLegalArgsCompileTime<
+                Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::ALL_t,
+                Kokkos::ALL_t, int, Kokkos::ALL_t, int>::value));
   ASSERT_EQ(
-      0,
-      (Kokkos::Impl::SubviewLegalArgsCompileTime<
-          Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t,
-          Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, int>::value));
-  ASSERT_EQ(
-      0,
-      (Kokkos::Impl::SubviewLegalArgsCompileTime<
-          Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t,
-          Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int>::value));
-  ASSERT_EQ(
-      0,
-      (Kokkos::Impl::SubviewLegalArgsCompileTime<
-          Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t,
-          Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, int>::value));
-  ASSERT_EQ(
-      0,
-      (Kokkos::Impl::SubviewLegalArgsCompileTime<
-          Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t,
-          Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, int>::value));
+      0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
+             Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::ALL_t,
+             Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, int>::value));
+  ASSERT_EQ(0,
+            (Kokkos::Impl::SubviewLegalArgsCompileTime<
+                Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::ALL_t,
+                Kokkos::pair<int, int>, int, Kokkos::ALL_t, int>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0,
-                   Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int,
+                   Kokkos::pair<int, int>, Kokkos::ALL_t, int,
                    Kokkos::pair<int, int>, int>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0,
-                   Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int,
-                   Kokkos::Impl::ALL_t, int>::value));
+                   Kokkos::pair<int, int>, Kokkos::ALL_t, int, Kokkos::ALL_t,
+                   int>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0,
                    Kokkos::pair<int, int>, Kokkos::pair<int, int>, int,
                    Kokkos::pair<int, int>, int>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0,
-                   Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int,
+                   Kokkos::pair<int, int>, Kokkos::ALL_t, int,
                    Kokkos::pair<int, int>, int>::value));
 
+  ASSERT_EQ(
+      0,
+      (Kokkos::Impl::SubviewLegalArgsCompileTime<
+          Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::ALL_t,
+          Kokkos::ALL_t, Kokkos::pair<int, int>, int>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int,
-                   Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t,
-                   Kokkos::pair<int, int>, int>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int,
-                   Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t,
-                   Kokkos::Impl::ALL_t, int>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int,
-                   Kokkos::Impl::ALL_t, Kokkos::pair<int, int>,
-                   Kokkos::pair<int, int>, int>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int,
-                   Kokkos::Impl::ALL_t, Kokkos::pair<int, int>,
-                   Kokkos::Impl::ALL_t, int>::value));
+                   Kokkos::ALL_t, Kokkos::ALL_t, Kokkos::ALL_t, int>::value));
+  ASSERT_EQ(
+      0,
+      (Kokkos::Impl::SubviewLegalArgsCompileTime<
+          Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::ALL_t,
+          Kokkos::pair<int, int>, Kokkos::pair<int, int>, int>::value));
+  ASSERT_EQ(
+      0,
+      (Kokkos::Impl::SubviewLegalArgsCompileTime<
+          Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::ALL_t,
+          Kokkos::pair<int, int>, Kokkos::ALL_t, int>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int,
-                   Kokkos::pair<int, int>, Kokkos::Impl::ALL_t,
+                   Kokkos::pair<int, int>, Kokkos::ALL_t,
                    Kokkos::pair<int, int>, int>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int,
-                   Kokkos::pair<int, int>, Kokkos::Impl::ALL_t,
-                   Kokkos::Impl::ALL_t, int>::value));
+  ASSERT_EQ(
+      0,
+      (Kokkos::Impl::SubviewLegalArgsCompileTime<
+          Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int,
+          Kokkos::pair<int, int>, Kokkos::ALL_t, Kokkos::ALL_t, int>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int,
                    Kokkos::pair<int, int>, Kokkos::pair<int, int>,
@@ -1846,32 +1841,35 @@ inline void test_subview_legal_args_left() {
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int,
                    Kokkos::pair<int, int>, Kokkos::pair<int, int>,
-                   Kokkos::Impl::ALL_t, int>::value));
+                   Kokkos::ALL_t, int>::value));
 
+  ASSERT_EQ(
+      0,
+      (Kokkos::Impl::SubviewLegalArgsCompileTime<
+          Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::ALL_t,
+          Kokkos::ALL_t, int, Kokkos::pair<int, int>>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int,
-                   Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int,
-                   Kokkos::pair<int, int>>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int,
-                   Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int,
-                   Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int,
-                   Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int,
-                   Kokkos::pair<int, int>>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int,
-                   Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int,
-                   Kokkos::Impl::ALL_t>::value));
+                   Kokkos::ALL_t, Kokkos::ALL_t, int, Kokkos::ALL_t>::value));
+  ASSERT_EQ(
+      0,
+      (Kokkos::Impl::SubviewLegalArgsCompileTime<
+          Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::ALL_t,
+          Kokkos::pair<int, int>, int, Kokkos::pair<int, int>>::value));
+  ASSERT_EQ(
+      0,
+      (Kokkos::Impl::SubviewLegalArgsCompileTime<
+          Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::ALL_t,
+          Kokkos::pair<int, int>, int, Kokkos::ALL_t>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int,
-                   Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int,
+                   Kokkos::pair<int, int>, Kokkos::ALL_t, int,
                    Kokkos::pair<int, int>>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int,
-                   Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int,
-                   Kokkos::Impl::ALL_t>::value));
+  ASSERT_EQ(
+      0,
+      (Kokkos::Impl::SubviewLegalArgsCompileTime<
+          Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int,
+          Kokkos::pair<int, int>, Kokkos::ALL_t, int, Kokkos::ALL_t>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int,
                    Kokkos::pair<int, int>, Kokkos::pair<int, int>, int,
@@ -1879,32 +1877,31 @@ inline void test_subview_legal_args_left() {
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int,
                    Kokkos::pair<int, int>, Kokkos::pair<int, int>, int,
-                   Kokkos::Impl::ALL_t>::value));
+                   Kokkos::ALL_t>::value));
 
+  ASSERT_EQ(0,
+            (Kokkos::Impl::SubviewLegalArgsCompileTime<
+                Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int,
+                Kokkos::ALL_t, Kokkos::ALL_t, Kokkos::pair<int, int>>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int,
-                   Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t,
-                   Kokkos::pair<int, int>>::value));
+                   Kokkos::ALL_t, Kokkos::ALL_t, Kokkos::ALL_t>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int,
-                   Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t,
-                   Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int,
-                   Kokkos::Impl::ALL_t, Kokkos::pair<int, int>,
+                   Kokkos::ALL_t, Kokkos::pair<int, int>,
                    Kokkos::pair<int, int>>::value));
+  ASSERT_EQ(0,
+            (Kokkos::Impl::SubviewLegalArgsCompileTime<
+                Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int,
+                Kokkos::ALL_t, Kokkos::pair<int, int>, Kokkos::ALL_t>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int,
-                   Kokkos::Impl::ALL_t, Kokkos::pair<int, int>,
-                   Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int,
-                   Kokkos::pair<int, int>, Kokkos::Impl::ALL_t,
+                   Kokkos::pair<int, int>, Kokkos::ALL_t,
                    Kokkos::pair<int, int>>::value));
-  ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
-                   Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int,
-                   Kokkos::pair<int, int>, Kokkos::Impl::ALL_t,
-                   Kokkos::Impl::ALL_t>::value));
+  ASSERT_EQ(0,
+            (Kokkos::Impl::SubviewLegalArgsCompileTime<
+                Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int,
+                Kokkos::pair<int, int>, Kokkos::ALL_t, Kokkos::ALL_t>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int,
                    Kokkos::pair<int, int>, Kokkos::pair<int, int>,
@@ -1912,40 +1909,35 @@ inline void test_subview_legal_args_left() {
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int,
                    Kokkos::pair<int, int>, Kokkos::pair<int, int>,
-                   Kokkos::Impl::ALL_t>::value));
+                   Kokkos::ALL_t>::value));
 
-  ASSERT_EQ(
-      1,
-      (Kokkos::Impl::SubviewLegalArgsCompileTime<
-          Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t,
-          Kokkos::Impl::ALL_t, Kokkos::pair<int, int>>::value));
-  ASSERT_EQ(
-      1,
-      (Kokkos::Impl::SubviewLegalArgsCompileTime<
-          Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t,
-          Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t>::value));
+  ASSERT_EQ(1,
+            (Kokkos::Impl::SubviewLegalArgsCompileTime<
+                Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::ALL_t,
+                Kokkos::ALL_t, Kokkos::pair<int, int>>::value));
   ASSERT_EQ(1, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0,
-                   Kokkos::pair<int, int>, Kokkos::Impl::ALL_t,
-                   Kokkos::pair<int, int>>::value));
+                   Kokkos::ALL_t, Kokkos::ALL_t, Kokkos::ALL_t>::value));
   ASSERT_EQ(1, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0,
-                   Kokkos::pair<int, int>, Kokkos::Impl::ALL_t,
-                   Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(
-      0,
-      (Kokkos::Impl::SubviewLegalArgsCompileTime<
-          Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t,
-          Kokkos::pair<int, int>, Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(
-      0,
-      (Kokkos::Impl::SubviewLegalArgsCompileTime<
-          Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t,
-          Kokkos::pair<int, int>, Kokkos::pair<int, int>>::value));
+                   Kokkos::pair<int, int>, Kokkos::ALL_t,
+                   Kokkos::pair<int, int>>::value));
+  ASSERT_EQ(1,
+            (Kokkos::Impl::SubviewLegalArgsCompileTime<
+                Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0,
+                Kokkos::pair<int, int>, Kokkos::ALL_t, Kokkos::ALL_t>::value));
+  ASSERT_EQ(0,
+            (Kokkos::Impl::SubviewLegalArgsCompileTime<
+                Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::ALL_t,
+                Kokkos::pair<int, int>, Kokkos::ALL_t>::value));
+  ASSERT_EQ(0,
+            (Kokkos::Impl::SubviewLegalArgsCompileTime<
+                Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::ALL_t,
+                Kokkos::pair<int, int>, Kokkos::pair<int, int>>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0,
                    Kokkos::pair<int, int>, Kokkos::pair<int, int>,
-                   Kokkos::Impl::ALL_t>::value));
+                   Kokkos::ALL_t>::value));
   ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime<
                    Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0,
                    Kokkos::pair<int, int>, Kokkos::pair<int, int>,
@@ -2118,15 +2110,33 @@ void test_unmanaged_subview_reset() {
 template <std::underlying_type_t<Kokkos::MemoryTraitsFlags> MTF>
 struct TestSubviewMemoryTraitsConstruction {
   void operator()() const noexcept {
-    using view_type          = Kokkos::View<double*, Kokkos::HostSpace>;
-    using size_type          = view_type::size_type;
     using memory_traits_type = Kokkos::MemoryTraits<MTF>;
-
-    view_type v("v", 7);
+    using view_type =
+        Kokkos::View<double*, Kokkos::HostSpace, memory_traits_type>;
+    using size_type = typename view_type::size_type;
+
+    // Create a managed View first and then apply the desired memory traits to
+    // an unmanaged version of it since a managed View can't use the Unmanaged
+    // trait.
+    Kokkos::View<double*, Kokkos::HostSpace> v_original("v", 7);
+    view_type v(v_original.data(), v_original.size());
     for (size_type i = 0; i != v.size(); ++i) v[i] = static_cast<double>(i);
 
     std::pair<int, int> range(3, 5);
-    auto sv = Kokkos::subview<memory_traits_type>(v, range);
+    auto sv = Kokkos::subview(v, range);
+
+    // check that the subview memory traits are the same as the original view
+    // (with the Aligned trait stripped).
+    using view_memory_traits    = typename decltype(v)::memory_traits;
+    using subview_memory_traits = typename decltype(sv)::memory_traits;
+    static_assert(view_memory_traits::impl_value ==
+                  memory_traits_type::impl_value);
+    if constexpr (memory_traits_type::is_aligned)
+      static_assert(subview_memory_traits::impl_value + Kokkos::Aligned ==
+                    memory_traits_type::impl_value);
+    else
+      static_assert(subview_memory_traits::impl_value ==
+                    memory_traits_type::impl_value);
 
     ASSERT_EQ(2u, sv.size());
     EXPECT_EQ(3., sv[0]);
@@ -2140,6 +2150,7 @@ inline void test_subview_memory_traits_construction() {
   // RandomAccess (2)
   // Atomic (4)
   // Restricted (8)
+  // Aligned (16)
   TestSubviewMemoryTraitsConstruction<0>()();
   TestSubviewMemoryTraitsConstruction<1>()();
   TestSubviewMemoryTraitsConstruction<2>()();
@@ -2156,6 +2167,22 @@ inline void test_subview_memory_traits_construction() {
   TestSubviewMemoryTraitsConstruction<13>()();
   TestSubviewMemoryTraitsConstruction<14>()();
   TestSubviewMemoryTraitsConstruction<15>()();
+  TestSubviewMemoryTraitsConstruction<16>()();
+  TestSubviewMemoryTraitsConstruction<17>()();
+  TestSubviewMemoryTraitsConstruction<18>()();
+  TestSubviewMemoryTraitsConstruction<19>()();
+  TestSubviewMemoryTraitsConstruction<20>()();
+  TestSubviewMemoryTraitsConstruction<21>()();
+  TestSubviewMemoryTraitsConstruction<22>()();
+  TestSubviewMemoryTraitsConstruction<23>()();
+  TestSubviewMemoryTraitsConstruction<24>()();
+  TestSubviewMemoryTraitsConstruction<25>()();
+  TestSubviewMemoryTraitsConstruction<26>()();
+  TestSubviewMemoryTraitsConstruction<27>()();
+  TestSubviewMemoryTraitsConstruction<28>()();
+  TestSubviewMemoryTraitsConstruction<29>()();
+  TestSubviewMemoryTraitsConstruction<30>()();
+  TestSubviewMemoryTraitsConstruction<31>()();
 }
 
 //----------------------------------------------------------------------------
diff --git a/packages/kokkos/core/unit_test/UnitTest_DeviceAndThreads.cpp b/packages/kokkos/core/unit_test/UnitTest_DeviceAndThreads.cpp
index 2b3c90ef2aed7825dd3200f7a6635861616ec06f..b522ac3e69b748165d2c553739d54819388e7bc6 100644
--- a/packages/kokkos/core/unit_test/UnitTest_DeviceAndThreads.cpp
+++ b/packages/kokkos/core/unit_test/UnitTest_DeviceAndThreads.cpp
@@ -38,21 +38,24 @@ int get_device_count() {
 }
 
 int get_device_id() {
+  int device_id;
 #if defined(KOKKOS_ENABLE_CUDA)
-  int device;
-  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDevice(&device));
-  return device;
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDevice(&device_id));
 #elif defined(KOKKOS_ENABLE_HIP)
-  int device_id;
   KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDevice(&device_id));
-  return device_id;
 #elif defined(KOKKOS_ENABLE_OPENMPTARGET)
-  return omp_get_device_num();
+  device_id = omp_get_device_num();
 #elif defined(KOKKOS_ENABLE_OPENACC)
-  return acc_get_device_num(acc_get_device_type());
+  device_id = acc_get_device_num(acc_get_device_type());
+#elif defined(KOKKOS_ENABLE_SYCL)
+  // FIXME_SYCL ?
+  assert(false);
+  return -2;
 #else
-  return -1;
+  device_id = -1;
 #endif
+  assert(device_id == Kokkos::device_id());
+  return device_id;
 }
 
 int get_max_threads() {
@@ -66,7 +69,9 @@ int get_max_threads() {
 }
 
 int get_num_threads() {
-  return Kokkos::DefaultHostExecutionSpace().concurrency();
+  int const num_threads = Kokkos::DefaultHostExecutionSpace().concurrency();
+  assert(num_threads == Kokkos::num_threads());
+  return num_threads;
 }
 
 int get_disable_warnings() { return !Kokkos::show_warnings(); }
diff --git a/packages/kokkos/core/unit_test/category_files/TestHIP_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestHIP_Category.hpp
index adaed3281ab4a7d64ce73d349e682e2d3a15b75b..6086f7e05bb3888dfbea4887d7140caada013dbb 100644
--- a/packages/kokkos/core/unit_test/category_files/TestHIP_Category.hpp
+++ b/packages/kokkos/core/unit_test/category_files/TestHIP_Category.hpp
@@ -23,5 +23,6 @@
 #define TEST_CATEGORY_NUMBER 6
 #define TEST_CATEGORY_DEATH hip_DeathTest
 #define TEST_EXECSPACE Kokkos::HIP
+#define TEST_CATEGORY_FIXTURE(name) hip_##name
 
 #endif
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp
index 407aa60a0a645172120ac644fbd47caacbe7d8ca..ae603101abb32ce2b701886f11307ff1b10ac210 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp
@@ -338,11 +338,6 @@ struct TestViewCudaTexture {
 };
 
 TEST(cuda, impl_view_texture) {
-#if defined(KOKKOS_ENABLE_CUDA) && \
-    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC
-  GTEST_SKIP()
-      << "Getting error_count of 1000 meaning all assertions are failing";
-#endif
   TestViewCudaTexture<Kokkos::CudaSpace>::run();
   TestViewCudaTexture<Kokkos::CudaUVMSpace>::run();
 }
@@ -374,7 +369,8 @@ template <typename View>
 View create_view() {
   using execution_space = typename View::execution_space;
   View view("", 10);
-  InitFunctor iota(view);
+  // MSVC+CUDA errors on CTAD here
+  InitFunctor<View> iota(view);
   Kokkos::parallel_for("test_view_subview_const_randomaccess",
                        Kokkos::RangePolicy<execution_space>(0, view.extent(0)),
                        iota);
@@ -406,11 +402,6 @@ void test_view_subview_const_randomaccess() {
 }  // namespace issue_5594
 
 TEST(cuda, view_subview_const_randomaccess) {
-#if defined(KOKKOS_ENABLE_CUDA) && \
-    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC (similar failure to
-                                    // TestViewCudaTexture?)
-  GTEST_SKIP() << "RandomAccess view not working on NVHPC?";
-#endif
   issue_5594::test_view_subview_const_randomaccess<Kokkos::Cuda,
                                                    Kokkos::CudaSpace>();
   issue_5594::test_view_subview_const_randomaccess<Kokkos::Cuda,
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeViewAPI.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeViewAPI.cpp
index 96fffa0dc7ee22350bed9fa7250a32134f26decb..d81c71499fa14fe79e613171d5398514287f2364 100644
--- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeViewAPI.cpp
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeViewAPI.cpp
@@ -113,11 +113,17 @@ TYPED_TEST(TestViewAPI, sizes) {
   static_assert(view_t::rank == TestFixture::expected_rank,
                 "TestViewAPI: Error: rank mismatch");
   size_t expected_span = 1;
-  for (int r = 0; r < view_t::rank; r++) expected_span *= this->all_sizes[r];
+  // avoid pointless comparison of unsigned integer with zero warning
+  if constexpr (view_t::rank > 0) {
+    for (size_t r = 0; r < view_t::rank; r++)
+      expected_span *= this->all_sizes[r];
+  }
 
   EXPECT_EQ(expected_span, a.span());
-  for (int r = 0; r < view_t::rank; r++) {
-    EXPECT_EQ(this->all_sizes[r], a.extent(r));
-    EXPECT_EQ(this->all_sizes[r], size_t(a.extent_int(r)));
+  if constexpr (view_t::rank > 0) {
+    for (size_t r = 0; r < view_t::rank; r++) {
+      EXPECT_EQ(this->all_sizes[r], a.extent(r));
+      EXPECT_EQ(this->all_sizes[r], size_t(a.extent_int(r)));
+    }
   }
 }
diff --git a/packages/kokkos/core/unit_test/headers_self_contained/CMakeLists.txt b/packages/kokkos/core/unit_test/headers_self_contained/CMakeLists.txt
index 5a0c589ac7556d6b6a9048fc6499a60f72634f9f..f792b03ed8807856627eddd1ba02ce3ef75aadfb 100644
--- a/packages/kokkos/core/unit_test/headers_self_contained/CMakeLists.txt
+++ b/packages/kokkos/core/unit_test/headers_self_contained/CMakeLists.txt
@@ -10,6 +10,10 @@ file(GLOB KOKKOS_CONTAINERS_HEADERS RELATIVE ${BASE_DIR}/containers/src
 file(GLOB KOKKOS_ALGORITHMS_HEADERS RELATIVE  ${BASE_DIR}/algorithms/src
      ${BASE_DIR}/algorithms/src/*.hpp)
 
+if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4)
+  list(REMOVE_ITEM KOKKOS_CONTAINERS_HEADERS "Kokkos_Vector.hpp")
+endif()
+
 foreach (_header ${KOKKOS_CORE_HEADERS} ${KOKKOS_CONTAINERS_HEADERS} ${KOKKOS_ALGORITHMS_HEADERS})
   string(REGEX REPLACE "[\./]" "_" header_test_name ${_header})
   set(header_test_name Kokkos_HeaderSelfContained_${header_test_name})
diff --git a/packages/kokkos/core/src/dummy.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_Graph.cpp
similarity index 73%
rename from packages/kokkos/core/src/dummy.cpp
rename to packages/kokkos/core/unit_test/hip/TestHIP_Graph.cpp
index 929380b6c33b3ea6b581f3d5c137de88293e4bb9..405cb76c643cc90e3e0228d41d8439e36aa7a500 100644
--- a/packages/kokkos/core/src/dummy.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIP_Graph.cpp
@@ -14,11 +14,5 @@
 //
 //@HEADER
 
-namespace Kokkos {
-namespace AvoidCompilerWarnings {
-int dontComplain() {
-  // keep the compiler from complaining about emptiness
-  return 0;
-}
-}  // namespace AvoidCompilerWarnings
-}  // namespace Kokkos
+#include <TestHIP_Category.hpp>
+#include <TestGraph.hpp>
diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_ScanUnit.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_ScanUnit.cpp
index 0b46d9742ac720dfddd5c2490da00e1886e66b06..fe3a14d2b8dbae5286ce82fb895e4fd9e1247e60 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIP_ScanUnit.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIP_ScanUnit.cpp
@@ -33,7 +33,8 @@ __global__ void start_intra_block_scan()
   DummyFunctor f;
   typename Kokkos::Impl::FunctorAnalysis<
       Kokkos::Impl::FunctorPatternInterface::SCAN,
-      Kokkos::RangePolicy<Kokkos::HIP>, DummyFunctor>::Reducer reducer(&f);
+      Kokkos::RangePolicy<Kokkos::HIP>, DummyFunctor,
+      DummyFunctor::value_type>::Reducer reducer(f);
   Kokkos::Impl::hip_intra_block_reduce_scan<true>(reducer, values);
 
   __syncthreads();
diff --git a/packages/kokkos/core/unit_test/hpx/TestHPX_InParallel.cpp b/packages/kokkos/core/unit_test/hpx/TestHPX_InParallel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..438c81c880085d2a7faaea0f113ceae7a215b532
--- /dev/null
+++ b/packages/kokkos/core/unit_test/hpx/TestHPX_InParallel.cpp
@@ -0,0 +1,183 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <gtest/gtest.h>
+#include <Kokkos_Core.hpp>
+
+// These tests specifically check that work dispatched to independent instances
+// is synchronized correctly on fences. A previous bug that this protects
+// against is work being mistakenly dispatched to the default instance, but the
+// fence fencing the independent instance. In that case these tests will fail.
+
+namespace {
+inline constexpr int n = 1 << 10;
+
+TEST(hpx, in_parallel_for_range_policy) {
+  Kokkos::View<bool *, Kokkos::Experimental::HPX> a("a", n);
+
+  ASSERT_FALSE(Kokkos::Experimental::HPX::in_parallel());
+
+  Kokkos::RangePolicy<Kokkos::Experimental::HPX> policy(0, n);
+  Kokkos::parallel_for(
+      "parallel_for_range_policy", policy, KOKKOS_LAMBDA(const int i) {
+        a(i) = Kokkos::Experimental::HPX::in_parallel();
+      });
+
+  ASSERT_FALSE(Kokkos::Experimental::HPX::in_parallel());
+  Kokkos::fence();
+  ASSERT_FALSE(Kokkos::Experimental::HPX::in_parallel());
+
+  for (int i = 0; i < n; ++i) {
+    ASSERT_TRUE(a(i));
+  }
+}
+
+TEST(hpx, in_parallel_for_mdrange_policy) {
+  Kokkos::View<bool *, Kokkos::Experimental::HPX> a("a", n);
+
+  ASSERT_FALSE(Kokkos::Experimental::HPX::in_parallel());
+
+  Kokkos::MDRangePolicy<Kokkos::Experimental::HPX, Kokkos::Rank<2>> policy(
+      {0, 0}, {n, 1});
+  Kokkos::parallel_for(
+      "parallel_for_mdrange_policy", policy,
+      KOKKOS_LAMBDA(const int i, const int) {
+        a(i) = Kokkos::Experimental::HPX::in_parallel();
+      });
+
+  ASSERT_FALSE(Kokkos::Experimental::HPX::in_parallel());
+  Kokkos::fence();
+  ASSERT_FALSE(Kokkos::Experimental::HPX::in_parallel());
+
+  for (int i = 0; i < n; ++i) {
+    ASSERT_TRUE(a(i));
+  }
+}
+
+TEST(hpx, in_parallel_for_team_policy) {
+  Kokkos::View<bool *, Kokkos::Experimental::HPX> a("a", n);
+
+  ASSERT_FALSE(Kokkos::Experimental::HPX::in_parallel());
+
+  Kokkos::TeamPolicy<Kokkos::Experimental::HPX> policy(n, 1);
+  using member_type = decltype(policy)::member_type;
+  Kokkos::parallel_for(
+      "parallel_for_team_policy", policy,
+      KOKKOS_LAMBDA(const member_type &handle) {
+        a(handle.league_rank()) = Kokkos::Experimental::HPX::in_parallel();
+      });
+
+  ASSERT_FALSE(Kokkos::Experimental::HPX::in_parallel());
+  Kokkos::fence();
+  ASSERT_FALSE(Kokkos::Experimental::HPX::in_parallel());
+
+  for (int i = 0; i < n; ++i) {
+    ASSERT_TRUE(a(i));
+  }
+}
+
+TEST(hpx, in_parallel_reduce_range_policy) {
+  Kokkos::View<int, Kokkos::Experimental::HPX> a("a");
+
+  ASSERT_FALSE(Kokkos::Experimental::HPX::in_parallel());
+
+  Kokkos::RangePolicy<Kokkos::Experimental::HPX> policy(0, n);
+  Kokkos::parallel_reduce(
+      "parallel_reduce_range_policy", policy,
+      KOKKOS_LAMBDA(const int, int &x) {
+        if (!Kokkos::Experimental::HPX::in_parallel()) {
+          ++x;
+        }
+      },
+      a);
+
+  ASSERT_FALSE(Kokkos::Experimental::HPX::in_parallel());
+  Kokkos::fence();
+  ASSERT_FALSE(Kokkos::Experimental::HPX::in_parallel());
+
+  ASSERT_EQ(a(), 0);
+}
+
+TEST(hpx, in_parallel_reduce_mdrange_policy) {
+  Kokkos::View<int, Kokkos::Experimental::HPX> a("a");
+
+  ASSERT_FALSE(Kokkos::Experimental::HPX::in_parallel());
+
+  Kokkos::MDRangePolicy<Kokkos::Experimental::HPX, Kokkos::Rank<2>> policy(
+      {0, 0}, {n, 1});
+  Kokkos::parallel_reduce(
+      "parallel_reduce_mdrange_policy", policy,
+      KOKKOS_LAMBDA(const int, const int, int &x) {
+        if (!Kokkos::Experimental::HPX::in_parallel()) {
+          ++x;
+        }
+      },
+      a);
+
+  ASSERT_FALSE(Kokkos::Experimental::HPX::in_parallel());
+  Kokkos::fence();
+  ASSERT_FALSE(Kokkos::Experimental::HPX::in_parallel());
+
+  ASSERT_EQ(a(), 0);
+}
+
+TEST(hpx, in_parallel_reduce_team_policy) {
+  Kokkos::View<int, Kokkos::Experimental::HPX> a("a");
+
+  ASSERT_FALSE(Kokkos::Experimental::HPX::in_parallel());
+
+  Kokkos::TeamPolicy<Kokkos::Experimental::HPX> policy(n, 1);
+  using member_type = decltype(policy)::member_type;
+  Kokkos::parallel_reduce(
+      "parallel_reduce_team_policy", policy,
+      KOKKOS_LAMBDA(const member_type &, int &x) {
+        if (!Kokkos::Experimental::HPX::in_parallel()) {
+          ++x;
+        }
+      },
+      a);
+
+  ASSERT_FALSE(Kokkos::Experimental::HPX::in_parallel());
+  Kokkos::fence();
+  ASSERT_FALSE(Kokkos::Experimental::HPX::in_parallel());
+
+  ASSERT_EQ(a(), 0);
+}
+
+TEST(hpx, in_parallel_scan_range_policy) {
+  Kokkos::View<int *, Kokkos::Experimental::HPX> a("a", n);
+
+  ASSERT_FALSE(Kokkos::Experimental::HPX::in_parallel());
+
+  Kokkos::RangePolicy<Kokkos::Experimental::HPX> policy(0, n);
+  Kokkos::parallel_scan(
+      "parallel_scan_range_policy", policy,
+      KOKKOS_LAMBDA(const int, int &x, bool) {
+        if (!Kokkos::Experimental::HPX::in_parallel()) {
+          ++x;
+        }
+      },
+      a);
+
+  ASSERT_FALSE(Kokkos::Experimental::HPX::in_parallel());
+  Kokkos::fence();
+  ASSERT_FALSE(Kokkos::Experimental::HPX::in_parallel());
+
+  for (int i = 0; i < n; ++i) {
+    ASSERT_EQ(a(i), 0);
+  }
+}
+}  // namespace
diff --git a/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstances.cpp b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstances.cpp
index 8f349cf8045bb38b8fce7562e693b63ecbca077b..b36f56501bbf126c0f34f58dd052d42fa3b64d85 100644
--- a/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstances.cpp
+++ b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstances.cpp
@@ -18,9 +18,8 @@
 #include <TestHPX_Category.hpp>
 
 #include <hpx/config.hpp>
-#include <hpx/local/future.hpp>
+#include <hpx/future.hpp>
 
-#ifdef KOKKOS_ENABLE_HPX_ASYNC_DISPATCH
 #ifndef HPX_COMPUTE_DEVICE_CODE
 
 namespace {
@@ -99,7 +98,7 @@ TEST(hpx, independent_instances) {
           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
       FunctorInitConstant(v1, c));
 
-  Kokkos::Experimental::HPX hpx2(hpx1.impl_get_future());
+  Kokkos::Experimental::HPX hpx2(hpx1.get_sender());
   Kokkos::parallel_for(
       "Test::hpx::independent_instances::add",
       Kokkos::Experimental::require(
@@ -107,7 +106,7 @@ TEST(hpx, independent_instances) {
           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
       FunctorAdd(v1, v2, d));
 
-  Kokkos::Experimental::HPX hpx3(hpx1.impl_get_future());
+  Kokkos::Experimental::HPX hpx3(hpx1.get_sender());
   Kokkos::parallel_for(
       "Test::hpx::independent_instances::add_index",
       Kokkos::Experimental::require(
@@ -115,12 +114,8 @@ TEST(hpx, independent_instances) {
           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
       FunctorAddIndex(v1, v3));
 
-  // NOTE: This monstrosity is used to collapse a future<tuple<future<void>,
-  // future<void>>> (return type of when_all) into a future<void> which is
-  // ready whenever the un-collapsed future would've been ready. HPX does not
-  // currently have the functionality to collapse this automatically.
-  Kokkos::Experimental::HPX hpx4(hpx::get<0>(hpx::split_future(
-      hpx::when_all(hpx2.impl_get_future(), hpx3.impl_get_future()))));
+  Kokkos::Experimental::HPX hpx4(hpx::execution::experimental::when_all(
+      hpx2.get_sender(), hpx3.get_sender()));
   Kokkos::parallel_for(
       "Test::hpx::independent_instances::pointwise_sum",
       Kokkos::Experimental::require(
@@ -137,11 +132,6 @@ TEST(hpx, independent_instances) {
 
   hpx4.fence();
 
-  ASSERT_EQ(true, hpx1.impl_get_future().is_ready());
-  ASSERT_EQ(true, hpx2.impl_get_future().is_ready());
-  ASSERT_EQ(true, hpx3.impl_get_future().is_ready());
-  ASSERT_EQ(true, hpx4.impl_get_future().is_ready());
-
   const int expected_sum = n * (2 * c + d) + (n * (n - 1) / 2);
   ASSERT_EQ(expected_sum, sum_v());
 }
@@ -149,4 +139,3 @@ TEST(hpx, independent_instances) {
 }  // namespace
 
 #endif
-#endif
diff --git a/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesDelayedExecution.cpp b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesDelayedExecution.cpp
index 177d87c1c86e468360589b23da85d32c648576fd..dc6b5d7afb0bfef634899ad4933b41e855743612 100644
--- a/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesDelayedExecution.cpp
+++ b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesDelayedExecution.cpp
@@ -17,18 +17,20 @@
 #include <Kokkos_Core.hpp>
 #include <TestHPX_Category.hpp>
 
-#include <hpx/local/future.hpp>
-
-#ifdef KOKKOS_ENABLE_HPX_ASYNC_DISPATCH
+#include <hpx/execution.hpp>
 
 namespace {
 
 TEST(hpx, independent_instances_delayed_execution) {
   Kokkos::View<bool, Kokkos::Experimental::HPX> ran("ran");
-  hpx::lcos::local::promise<void> p;
-  hpx::shared_future<void> f = p.get_future();
 
-  Kokkos::Experimental::HPX hpx(f);
+  // Create a sender that will call set_value on a receiver after a delay.
+  hpx::execution::experimental::unique_any_sender<> s{
+      hpx::execution::experimental::schedule(
+          hpx::execution::experimental::thread_pool_scheduler{}) |
+      hpx::execution::experimental::then(
+          [] { hpx::this_thread::sleep_for(std::chrono::milliseconds(500)); })};
+  Kokkos::Experimental::HPX hpx(std::move(s));
   Kokkos::parallel_for(
       "Test::hpx::independent_instances::delay_execution",
       Kokkos::Experimental::require(
@@ -36,15 +38,13 @@ TEST(hpx, independent_instances_delayed_execution) {
           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
       KOKKOS_LAMBDA(int) { ran() = true; });
 
+#if defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH)
   ASSERT_FALSE(ran());
-  ASSERT_FALSE(hpx.impl_get_future().is_ready());
-
-  p.set_value();
-
+#else
+  ASSERT_TRUE(ran());
+#endif
   hpx.fence();
-  ASSERT_TRUE(hpx.impl_get_future().is_ready());
+  ASSERT_TRUE(ran());
 }
 
 }  // namespace
-
-#endif
diff --git a/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesInstanceIds.cpp b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesInstanceIds.cpp
index 6441666e64f35df133a9dccfa4aec1638c285d42..9059a1d9eab6e289de843fcecf81a3111170bce8 100644
--- a/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesInstanceIds.cpp
+++ b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesInstanceIds.cpp
@@ -17,9 +17,7 @@
 #include <Kokkos_Core.hpp>
 #include <TestHPX_Category.hpp>
 
-#include <hpx/local/future.hpp>
-
-#ifdef KOKKOS_ENABLE_HPX_ASYNC_DISPATCH
+#include <hpx/execution.hpp>
 
 namespace {
 
@@ -59,25 +57,24 @@ TEST(hpx, independent_instances_instance_ids) {
   ASSERT_EQ(hpx_independent1.impl_instance_id(),
             hpx_independent4.impl_instance_id());
 
-  hpx::shared_future<void> f = hpx::make_ready_future<void>();
-  Kokkos::Experimental::HPX hpx_independent_future1(f);
-  Kokkos::Experimental::HPX hpx_independent_future2 = hpx_independent_future1;
-  Kokkos::Experimental::HPX hpx_independent_future3{hpx_independent_future1};
-  Kokkos::Experimental::HPX hpx_independent_future4;
-  hpx_independent_future4 = hpx_independent_future1;
+  Kokkos::Experimental::HPX hpx_independent_sender1(
+      hpx::execution::experimental::unique_any_sender<>{
+          hpx::execution::experimental::just()});
+  Kokkos::Experimental::HPX hpx_independent_sender2 = hpx_independent_sender1;
+  Kokkos::Experimental::HPX hpx_independent_sender3{hpx_independent_sender1};
+  Kokkos::Experimental::HPX hpx_independent_sender4;
+  hpx_independent_sender4 = hpx_independent_sender1;
 
   ASSERT_NE(hpx_default1.impl_instance_id(),
-            hpx_independent1.impl_instance_id());
+            hpx_independent_sender1.impl_instance_id());
   ASSERT_NE(hpx_independent1.impl_instance_id(),
-            hpx_independent_future1.impl_instance_id());
-  ASSERT_EQ(hpx_independent_future1.impl_instance_id(),
-            hpx_independent_future2.impl_instance_id());
-  ASSERT_EQ(hpx_independent_future1.impl_instance_id(),
-            hpx_independent_future3.impl_instance_id());
-  ASSERT_EQ(hpx_independent_future1.impl_instance_id(),
-            hpx_independent_future4.impl_instance_id());
+            hpx_independent_sender1.impl_instance_id());
+  ASSERT_EQ(hpx_independent_sender1.impl_instance_id(),
+            hpx_independent_sender2.impl_instance_id());
+  ASSERT_EQ(hpx_independent_sender1.impl_instance_id(),
+            hpx_independent_sender3.impl_instance_id());
+  ASSERT_EQ(hpx_independent_sender1.impl_instance_id(),
+            hpx_independent_sender4.impl_instance_id());
 }
 
 }  // namespace
-
-#endif
diff --git a/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesRefCounting.cpp b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesRefCounting.cpp
index 2e9ca081bc03b8c581ea71a4ea3e439e0ce3293a..3fc7d839090fa4fd8c8630294959177f4ca95468 100644
--- a/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesRefCounting.cpp
+++ b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesRefCounting.cpp
@@ -17,13 +17,12 @@
 #include <Kokkos_Core.hpp>
 #include <TestHPX_Category.hpp>
 
-#ifdef KOKKOS_ENABLE_HPX_ASYNC_DISPATCH
-
 namespace {
 std::atomic<int> dummy_count;
 
 struct dummy {
   dummy() { ++dummy_count; }
+  dummy(dummy &&) { ++dummy_count; }
   dummy(dummy const &) { ++dummy_count; }
   ~dummy() { --dummy_count; }
   void f() const {}
@@ -32,28 +31,26 @@ struct dummy {
 // This test makes sure the independent HPX instances don't hold on to captured
 // data after destruction.
 TEST(hpx, independent_instances_reference_counting) {
-  dummy d;
-  Kokkos::Experimental::HPX hpx(
-      Kokkos::Experimental::HPX::instance_mode::independent);
-  Kokkos::parallel_for(
-      "Test::hpx::reference_counting::dummy",
-      Kokkos::RangePolicy<Kokkos::Experimental::HPX>(hpx, 0, 1),
-      KOKKOS_LAMBDA(int) {
-        // Make sure dummy struct is captured.
-        d.f();
-      });
-
-  hpx.fence();
-
-  // The fence above makes sure that copies of dummy get released. However,
-  // all copies are not guaranteed to be released as soon as fence returns.
-  // Therefore we wait for a short time to make it almost guaranteed that all
-  // copies have been released.
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));
-
-  ASSERT_EQ(1, dummy_count);
+  ASSERT_EQ(0, dummy_count);
+
+  {
+    dummy d;
+    ASSERT_EQ(1, dummy_count);
+    Kokkos::Experimental::HPX hpx(
+        Kokkos::Experimental::HPX::instance_mode::independent);
+    Kokkos::parallel_for(
+        "Test::hpx::reference_counting::dummy",
+        Kokkos::RangePolicy<Kokkos::Experimental::HPX>(hpx, 0, 1),
+        KOKKOS_LAMBDA(int) {
+          // Make sure dummy struct is captured.
+          d.f();
+        });
+
+    hpx.fence();
+    ASSERT_EQ(1, dummy_count);
+  }
+
+  ASSERT_EQ(0, dummy_count);
 }
 
 }  // namespace
-
-#endif
diff --git a/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesSynchronization.cpp b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesSynchronization.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..24eb642f6b466761595af06d8de5164b4e3d72d5
--- /dev/null
+++ b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesSynchronization.cpp
@@ -0,0 +1,162 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <Kokkos_Core.hpp>
+#include <TestHPX_Category.hpp>
+
+// These tests specifically check that work dispatched to independent instances
+// is synchronized correctly on fences. A previous bug that this protects
+// against is work being mistakenly dispatched to the default instance, but the
+// fence fencing the independent instance. In that case these tests will fail.
+
+namespace {
+inline constexpr int n = 1 << 10;
+
+TEST(hpx, independent_instances_synchronization_parallel_for_range_policy) {
+  Kokkos::View<int *, Kokkos::Experimental::HPX> a("a", n);
+
+  Kokkos::Experimental::HPX instance{
+      Kokkos::Experimental::HPX::instance_mode::independent};
+  Kokkos::RangePolicy<Kokkos::Experimental::HPX> policy(instance, 0, n);
+  Kokkos::parallel_for(
+      "parallel_for_range_policy", policy,
+      KOKKOS_LAMBDA(const auto i) { a[i] = i; });
+
+  instance.fence();
+
+  for (int i = 0; i < n; ++i) {
+    ASSERT_EQ(a[i], i);
+  }
+}
+
+TEST(hpx, independent_instances_synchronization_parallel_for_mdrange_policy) {
+  Kokkos::View<int *, Kokkos::Experimental::HPX> a("a", n);
+
+  Kokkos::Experimental::HPX instance{
+      Kokkos::Experimental::HPX::instance_mode::independent};
+  Kokkos::MDRangePolicy<Kokkos::Experimental::HPX, Kokkos::Rank<2>> policy(
+      instance, {{0, 0}}, {{n, 1}});
+  Kokkos::parallel_for(
+      "parallel_for_mdrange_policy", policy,
+      KOKKOS_LAMBDA(const auto i, const auto) { a[i] = i; });
+
+  instance.fence();
+
+  for (int i = 0; i < n; ++i) {
+    ASSERT_EQ(a[i], i);
+  }
+}
+
+TEST(hpx, independent_instances_synchronization_parallel_for_team_policy) {
+  Kokkos::View<int *, Kokkos::Experimental::HPX> a("a", n);
+
+  Kokkos::Experimental::HPX instance{
+      Kokkos::Experimental::HPX::instance_mode::independent};
+  Kokkos::TeamPolicy<Kokkos::Experimental::HPX> policy(instance, n, 1);
+  Kokkos::parallel_for(
+      "parallel_for_team_policy", policy, KOKKOS_LAMBDA(const auto &handle) {
+        a[handle.league_rank()] = handle.league_rank();
+      });
+
+  instance.fence();
+
+  for (int i = 0; i < n; ++i) {
+    ASSERT_EQ(a[i], i);
+  }
+}
+
+TEST(hpx, independent_instances_synchronization_parallel_reduce_range_policy) {
+  Kokkos::View<int *, Kokkos::Experimental::HPX> a("a", n);
+  Kokkos::View<int, Kokkos::Experimental::HPX> b("b");
+
+  Kokkos::Experimental::HPX instance{
+      Kokkos::Experimental::HPX::instance_mode::independent};
+  Kokkos::RangePolicy<Kokkos::Experimental::HPX> policy(instance, 0, n);
+  Kokkos::parallel_reduce(
+      "parallel_reduce_range_policy", policy,
+      KOKKOS_LAMBDA(const int i, int &) { a[i] = i; }, b);
+
+  instance.fence();
+
+  for (int i = 0; i < n; ++i) {
+    ASSERT_EQ(a[i], i);
+  }
+}
+
+TEST(hpx,
+     independent_instances_synchronization_parallel_reduce_mdrange_policy) {
+  Kokkos::View<int *, Kokkos::Experimental::HPX> a("a", n);
+  Kokkos::View<int, Kokkos::Experimental::HPX> b("b");
+
+  Kokkos::Experimental::HPX instance{
+      Kokkos::Experimental::HPX::instance_mode::independent};
+  Kokkos::MDRangePolicy<Kokkos::Experimental::HPX, Kokkos::Rank<2>> policy(
+      instance, {{0, 0}}, {{n, 1}});
+  Kokkos::parallel_reduce(
+      "parallel_reduce_mdrange_policy", policy,
+      KOKKOS_LAMBDA(const int i, const int, int &) { a[i] = i; }, b);
+
+  instance.fence();
+
+  for (int i = 0; i < n; ++i) {
+    ASSERT_EQ(a[i], i);
+  }
+}
+
+TEST(hpx, independent_instances_synchronization_parallel_reduce_team_policy) {
+  Kokkos::View<int *, Kokkos::Experimental::HPX> a("a", n);
+  Kokkos::View<int, Kokkos::Experimental::HPX> b("b");
+
+  Kokkos::Experimental::HPX instance{
+      Kokkos::Experimental::HPX::instance_mode::independent};
+  Kokkos::TeamPolicy<Kokkos::Experimental::HPX> policy(instance, n, 1);
+  Kokkos::parallel_reduce(
+      "parallel_reduce_team_policy", policy,
+      KOKKOS_LAMBDA(const decltype(policy)::member_type &handle, int &) {
+        a[handle.league_rank()] = handle.league_rank();
+      },
+      b);
+
+  instance.fence();
+
+  for (int i = 0; i < n; ++i) {
+    ASSERT_EQ(a[i], i);
+  }
+}
+
+TEST(hpx, independent_instances_synchronization_parallel_scan_range_policy) {
+  Kokkos::View<int *, Kokkos::Experimental::HPX> a("a", n);
+  Kokkos::View<int *, Kokkos::Experimental::HPX> b("b", n);
+
+  Kokkos::Experimental::HPX instance{
+      Kokkos::Experimental::HPX::instance_mode::independent};
+  Kokkos::RangePolicy<Kokkos::Experimental::HPX> policy(instance, 0, n);
+  Kokkos::parallel_scan(
+      "parallel_scan_range_policy", policy,
+      KOKKOS_LAMBDA(const int i, int &, bool final) {
+        if (!final) {
+          a[i] = i;
+        }
+      },
+      b);
+
+  instance.fence();
+
+  for (int i = 0; i < n; ++i) {
+    ASSERT_EQ(a[i], i);
+  }
+}
+}  // namespace
diff --git a/packages/kokkos/core/unit_test/incremental/Test05_ParallelReduce_RangePolicy.hpp b/packages/kokkos/core/unit_test/incremental/Test05_ParallelReduce_RangePolicy.hpp
index 4235c73c8e69bd1f7b454a6fd75925b920a2c54f..ed22c22d709f18a38480f551352de1f538cef5f9 100644
--- a/packages/kokkos/core/unit_test/incremental/Test05_ParallelReduce_RangePolicy.hpp
+++ b/packages/kokkos/core/unit_test/incremental/Test05_ParallelReduce_RangePolicy.hpp
@@ -28,8 +28,11 @@ using value_type       = double;
 constexpr double value = 0.5;
 
 struct ReduceFunctor {
-  KOKKOS_INLINE_FUNCTION
-  void operator()(const int i, double &UpdateSum) const {
+  // The functor is templated on purpose to check that the value_type deduction
+  // in parallel_reduce even works in this case.
+  template <typename IndexType, typename ValueType>
+  KOKKOS_INLINE_FUNCTION void operator()(const IndexType i,
+                                         ValueType &UpdateSum) const {
     UpdateSum += (i + 1) * value;
   }
 };
@@ -45,6 +48,7 @@ struct NonTrivialReduceFunctor {
   NonTrivialReduceFunctor(NonTrivialReduceFunctor &&)      = default;
   NonTrivialReduceFunctor &operator=(NonTrivialReduceFunctor &&) = default;
   NonTrivialReduceFunctor &operator=(NonTrivialReduceFunctor const &) = default;
+  // Also make sure that it's OK if the destructor is not device-callable.
   ~NonTrivialReduceFunctor() {}
 };
 
diff --git a/packages/kokkos/core/unit_test/incremental/Test12a_ThreadScratch.hpp b/packages/kokkos/core/unit_test/incremental/Test12a_ThreadScratch.hpp
index a4cd4fc56f509c3a08e7042f2142613dd0ea251e..8c97043f30087e78f18783a05e0ac6cb591e3bf3 100644
--- a/packages/kokkos/core/unit_test/incremental/Test12a_ThreadScratch.hpp
+++ b/packages/kokkos/core/unit_test/incremental/Test12a_ThreadScratch.hpp
@@ -98,6 +98,10 @@ struct ThreadScratch {
 
 TEST(TEST_CATEGORY, IncrTest_12a_ThreadScratch) {
   ThreadScratch<TEST_EXECSPACE> test;
+#ifdef KOKKOS_ENABLE_OPENACC  // FIXME_OPENACC
+  GTEST_SKIP() << "skipping since scratch memory is not yet implemented in the "
+                  "OpenACC backend";
+#endif
   // FIXME_OPENMPTARGET - team_size has to be a multiple of 32 for the tests to
   // pass in the Release and RelWithDebInfo builds. Does not need the team_size
   // to be a multiple of 32 for the Debug builds.
diff --git a/packages/kokkos/core/unit_test/incremental/Test12b_TeamScratch.hpp b/packages/kokkos/core/unit_test/incremental/Test12b_TeamScratch.hpp
index 9b27e35dfe9b51dfe5e025d50c88299788de20d6..0ebb5c50fbce40c0a92b96050d3952e4640d9f98 100644
--- a/packages/kokkos/core/unit_test/incremental/Test12b_TeamScratch.hpp
+++ b/packages/kokkos/core/unit_test/incremental/Test12b_TeamScratch.hpp
@@ -88,6 +88,10 @@ struct TeamScratch {
 
 TEST(TEST_CATEGORY, IncrTest_12b_TeamScratch) {
   TeamScratch<TEST_EXECSPACE> test;
+#ifdef KOKKOS_ENABLE_OPENACC  // FIXME_OPENACC
+  GTEST_SKIP() << "skipping since scratch memory is not yet implemented in the "
+                  "OpenACC backend";
+#endif
   // FIXME_OPENMPTARGET - team_size has to be a multiple of 32 for the tests to
   // pass in the Release and RelWithDebInfo builds. Does not need the team_size
   // to be a multiple of 32 for the Debug builds.
diff --git a/packages/kokkos/core/unit_test/incremental/Test14_MDRangeReduce.hpp b/packages/kokkos/core/unit_test/incremental/Test14_MDRangeReduce.hpp
index a1f307c0c82de68b5debf7bd0c2ae4a51c342fd2..f79d4f8ea37a0247e1f7478781132ce2d0d61f34 100644
--- a/packages/kokkos/core/unit_test/incremental/Test14_MDRangeReduce.hpp
+++ b/packages/kokkos/core/unit_test/incremental/Test14_MDRangeReduce.hpp
@@ -38,6 +38,13 @@ struct MyComplex {
   KOKKOS_INLINE_FUNCTION
   MyComplex(const MyComplex& src) : _re(src._re), _im(src._im) {}
 
+  KOKKOS_INLINE_FUNCTION
+  MyComplex& operator=(const MyComplex& src) {
+    _re = src._re;
+    _im = src._im;
+    return *this;
+  }
+
   KOKKOS_INLINE_FUNCTION
   void operator+=(const MyComplex& src) {
     _re += src._re;
@@ -93,6 +100,8 @@ struct TestMDRangeReduce {
         },
         d_result);
 
+// FIXME_OPENACC: scalar reduction variable on the device is not yet supported.
+#if !defined(KOKKOS_ENABLE_OPENACC)
     // Parallel reduce on a view.
     Kokkos::parallel_reduce(
         mdPolicy_2D,
@@ -100,16 +109,23 @@ struct TestMDRangeReduce {
           update_value += d_data(i, j);
         },
         d_resultView);
+#endif
 
     // Check correctness.
     ASSERT_EQ(h_result, d_result);
 
+// FIXME_OPENACC: scalar reduction variable on the device is not yet supported.
+#if !defined(KOKKOS_ENABLE_OPENACC)
     // Copy view back to host.
     value_type view_result = 0.0;
     Kokkos::deep_copy(view_result, d_resultView);
     ASSERT_EQ(h_result, view_result);
+#endif
   }
 
+// FIXME_OPENACC: custom reductions are not yet supported in the
+// OpenACC backend.
+#if !defined(KOKKOS_ENABLE_OPENACC)
   // Custom Reduction
   void reduce_custom() {
     Complex_View_1D d_data("complex array", N);
@@ -136,6 +152,7 @@ struct TestMDRangeReduce {
     ASSERT_EQ(result._re, sum * 0.5);
     ASSERT_EQ(result._im, -sum * 0.5);
   }
+#endif
 };
 
 // Reductions tests for MDRange policy and customized reduction.
@@ -144,9 +161,13 @@ TEST(TEST_CATEGORY, incr_14_MDrangeReduce) {
   test.reduce_MDRange();
 // FIXME_OPENMPTARGET: custom reductions are not yet supported in the
 // OpenMPTarget backend.
+// FIXME_OPENACC: custom reductions are not yet supported in the
+// OpenACC backend.
 #if !defined(KOKKOS_ENABLE_OPENMPTARGET)
+#if !defined(KOKKOS_ENABLE_OPENACC)
   test.reduce_custom();
 #endif
+#endif
 }
 
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/incremental/Test16_ParallelScan.hpp b/packages/kokkos/core/unit_test/incremental/Test16_ParallelScan.hpp
index 78f791914a93c10bcb62e3c911385224b969fba8..efcb19a5c6b654c4bdca9a245674456d3a165768 100644
--- a/packages/kokkos/core/unit_test/incremental/Test16_ParallelScan.hpp
+++ b/packages/kokkos/core/unit_test/incremental/Test16_ParallelScan.hpp
@@ -26,11 +26,60 @@ namespace Test {
 using value_type = double;
 const int N      = 10;
 
+template <typename ExecSpace>
+struct TrivialScanFunctor {
+  Kokkos::View<value_type *, ExecSpace> d_data;
+
+  KOKKOS_FUNCTION
+  void operator()(const int i, value_type &update_value,
+                  const bool final) const {
+    const value_type val_i = d_data(i);
+    if (final) d_data(i) = update_value;
+    update_value += val_i;
+  }
+};
+
+template <typename ExecSpace>
+struct NonTrivialScanFunctor {
+  Kokkos::View<value_type *, ExecSpace> d_data;
+
+  KOKKOS_FUNCTION
+  void operator()(const int i, value_type &update_value,
+                  const bool final) const {
+    const value_type val_i = d_data(i);
+    if (final) d_data(i) = update_value;
+    update_value += val_i;
+  }
+
+  NonTrivialScanFunctor(const Kokkos::View<value_type *, ExecSpace> &data)
+      : d_data(data) {}
+  NonTrivialScanFunctor(NonTrivialScanFunctor const &) = default;
+  NonTrivialScanFunctor(NonTrivialScanFunctor &&)      = default;
+  NonTrivialScanFunctor &operator=(NonTrivialScanFunctor &&) = default;
+  NonTrivialScanFunctor &operator=(NonTrivialScanFunctor const &) = default;
+  // Also make sure that it's OK if the destructor is not device-callable.
+  ~NonTrivialScanFunctor() {}
+};
+
+template <typename ExecSpace>
+struct GenericExclusiveScanFunctor {
+  Kokkos::View<value_type *, ExecSpace> d_data;
+
+  template <typename IndexType, typename ValueType>
+  KOKKOS_FUNCTION void operator()(const IndexType i, ValueType &update_value,
+                                  const bool final) const {
+    const ValueType val_i = d_data(i);
+    if (final) d_data(i) = update_value;
+    update_value += val_i;
+  }
+};
+
 template <class ExecSpace>
 struct TestScan {
   // 1D  View of double
   using View_1D = typename Kokkos::View<value_type *, ExecSpace>;
 
+  template <typename FunctorType>
   void parallel_scan() {
     View_1D d_data("data", N);
 
@@ -39,15 +88,44 @@ struct TestScan {
         Kokkos::RangePolicy<ExecSpace>(0, N),
         KOKKOS_LAMBDA(const int i) { d_data(i) = i * 0.5; });
 
-    // Exclusive parallel_scan call.
-    Kokkos::parallel_scan(
-        Kokkos::RangePolicy<ExecSpace>(0, N),
-        KOKKOS_LAMBDA(const int i, value_type &update_value, const bool final) {
-          const value_type val_i = d_data(i);
-          if (final) d_data(i) = update_value;
+    // Exclusive parallel_scan call
+    Kokkos::parallel_scan(Kokkos::RangePolicy<ExecSpace>(0, N),
+                          FunctorType{d_data});
+
+    // Copy back the data.
+    auto h_data =
+        Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), d_data);
+
+    // Check Correctness
+    ASSERT_EQ(h_data(0), 0.0);
+    value_type upd = h_data(0);
+    for (int i = 1; i < N; ++i) {
+      upd += (i - 1) * 0.5;
+      ASSERT_EQ(h_data(i), upd);
+    }
+  }
+};
+
+template <class ExecSpace>
+struct TestScanWithTotal {
+  // 1D  View of double
+  using View_1D  = typename Kokkos::View<value_type *, ExecSpace>;
+  View_1D d_data = View_1D("data", N);
+
+  template <typename IndexType>
+  KOKKOS_FUNCTION void operator()(IndexType i) const {
+    d_data(i) = i * 0.5;
+  }
+
+  template <typename FunctorType>
+  void parallel_scan() {
+    // Initialize data.
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, N), *this);
 
-          update_value += val_i;
-        });
+    value_type total;
+    // Exclusive parallel_scan call
+    Kokkos::parallel_scan(Kokkos::RangePolicy<ExecSpace>(0, N),
+                          FunctorType{d_data}, total);
 
     // Copy back the data.
     auto h_data =
@@ -60,12 +138,18 @@ struct TestScan {
       upd += (i - 1) * 0.5;
       ASSERT_EQ(h_data(i), upd);
     }
+    ASSERT_EQ(total, N * (N - 1) * 0.25);
   }
 };
 
 TEST(TEST_CATEGORY, IncrTest_16_parallelscan) {
   TestScan<TEST_EXECSPACE> test;
-  test.parallel_scan();
+  test.parallel_scan<TrivialScanFunctor<TEST_EXECSPACE>>();
+  test.parallel_scan<NonTrivialScanFunctor<TEST_EXECSPACE>>();
+  TestScanWithTotal<TEST_EXECSPACE> test_total;
+  test_total.parallel_scan<TrivialScanFunctor<TEST_EXECSPACE>>();
+  test_total.parallel_scan<NonTrivialScanFunctor<TEST_EXECSPACE>>();
+  test_total.parallel_scan<GenericExclusiveScanFunctor<TEST_EXECSPACE>>();
 }
 
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP.hpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP.hpp
deleted file mode 100644
index 3a974d517cea0fed3e2e28d6fe80fe9842c6a999..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/openmp/TestOpenMP.hpp
+++ /dev/null
@@ -1,80 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#ifndef KOKKOS_TEST_OPENMP_HPP
-#define KOKKOS_TEST_OPENMP_HPP
-
-#include <gtest/gtest.h>
-
-#include <Kokkos_Macros.hpp>
-
-#ifdef KOKKOS_LAMBDA
-#undef KOKKOS_LAMBDA
-#endif
-#define KOKKOS_LAMBDA [=]
-
-#include <Kokkos_Core.hpp>
-
-#include <TestViewMapping.hpp>
-#include <TestViewAPI.hpp>
-#include <TestViewOfClass.hpp>
-#include <TestViewSubview.hpp>
-#include <TestAtomic.hpp>
-#include <TestAtomicOperations.hpp>
-#include <TestAtomicViews.hpp>
-#include <TestRange.hpp>
-#include <TestTeam.hpp>
-#include <TestReduce.hpp>
-#include <TestScan.hpp>
-#include <TestAggregate.hpp>
-#include <TestCompilerMacros.hpp>
-#include <TestTaskScheduler.hpp>
-#include <TestMemoryPool.hpp>
-#include <TestCXX11.hpp>
-#include <TestCXX11Deduction.hpp>
-#include <TestTeamVector.hpp>
-#include <TestPolicyConstruction.hpp>
-#include <TestMDRange.hpp>
-#include <TestConcurrentBitset.hpp>
-
-namespace Test {
-
-class openmp : public ::testing::Test {
- protected:
-  static void SetUpTestCase() {
-    int threads_count = 0;
-#pragma omp parallel
-    {
-#pragma omp atomic
-      ++threads_count;
-    }
-
-    if (threads_count > 3) {
-      threads_count /= 2;
-    }
-
-    Kokkos::OpenMP::initialize(threads_count);
-    Kokkos::print_configuration(std::cout, true);
-
-    srand(10231);
-  }
-
-  static void TearDownTestCase() { Kokkos::OpenMP::finalize(); }
-};
-
-}  // namespace Test
-
-#endif
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp
index 6983fabb2acb226b50e1baad6cdd71e70f19555a..92b8032bf0c44eed543e974e0d8e706d4a50d9f5 100644
--- a/packages/kokkos/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp
@@ -29,7 +29,7 @@ TEST(openmp, partition_master) {
   int errors = 0;
 
   auto master = [&errors, &mtx](int /*partition_id*/, int /*num_partitions*/) {
-    const int pool_size = Kokkos::OpenMP::impl_thread_pool_size();
+    const int pool_size = Kokkos::OpenMP().impl_thread_pool_size();
 
     {
       std::unique_lock<Mutex> lock(mtx);
@@ -46,7 +46,7 @@ TEST(openmp, partition_master) {
       Kokkos::parallel_reduce(
           Kokkos::RangePolicy<Kokkos::OpenMP>(0, 1000),
           [pool_size](const int, int& errs) {
-            if (Kokkos::OpenMP::impl_thread_pool_size() != pool_size) {
+            if (Kokkos::OpenMP().impl_thread_pool_size() != pool_size) {
               ++errs;
             }
           },
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget.hpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget.hpp
deleted file mode 100644
index 6ae45620f256043787c148fd647664fc684f991e..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget.hpp
+++ /dev/null
@@ -1,79 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#ifndef KOKKOS_TEST_OPENMPTARGET_HPP
-#define KOKKOS_TEST_OPENMPTARGET_HPP
-
-#include <gtest/gtest.h>
-
-#include <Kokkos_Macros.hpp>
-
-#ifdef KOKKOS_LAMBDA
-#undef KOKKOS_LAMBDA
-#endif
-#define KOKKOS_LAMBDA [=]
-
-#include <Kokkos_Core.hpp>
-
-//#include <TestViewAPI.hpp>
-//#include <TestViewOfClass.hpp>
-//#include <TestViewSubview.hpp>
-//#include <TestAtomic.hpp>
-//#include <TestAtomicOperations.hpp>
-//#include <TestAtomicViews.hpp>
-#include <TestRange.hpp>
-#include <TestTeam.hpp>
-//#include <TestReduce.hpp>
-//#include <TestScan.hpp>
-//#include <TestAggregate.hpp>
-//#include <TestCompilerMacros.hpp>
-
-// TODO enable task scheduler tests for openmptarget
-//#include <TestTaskScheduler.hpp>
-
-//#include <TestMemoryPool.hpp>
-//#include <TestCXX11.hpp>
-//#include <TestCXX11Deduction.hpp>
-#include <TestTeamVector.hpp>
-//#include <TestPolicyConstruction.hpp>
-//#include <TestMDRange.hpp>
-
-namespace Test {
-
-class openmptarget : public ::testing::Test {
- protected:
-  static void SetUpTestCase() {
-    const unsigned numa_count = Kokkos::hwloc::get_available_numa_count();
-    const unsigned cores_per_numa =
-        Kokkos::hwloc::get_available_cores_per_numa();
-    const unsigned openmptarget_per_core =
-        Kokkos::hwloc::get_available_openmptarget_per_core();
-
-    unsigned openmptarget_count = 0;
-
-    openmptarget_count = std::max(1u, numa_count) *
-                         std::max(2u, cores_per_numa * openmptarget_per_core);
-
-    Kokkos::OpenMPTarget::initialize(openmptarget_count);
-    Kokkos::print_configuration(std::cout, true /* detailed */);
-  }
-
-  static void TearDownTestCase() { Kokkos::OpenMPTarget::finalize(); }
-};
-
-}  // namespace Test
-
-#endif
diff --git a/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp
index 25c5c9a50ceed9031b600a992737d5e47b22aed8..8b6f08c14ad634f2a9dc269ba6e7d85e577dc915 100644
--- a/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp
+++ b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp
@@ -29,8 +29,8 @@ TEST(sycl, raw_sycl_interop) {
   Kokkos::Experimental::SYCL default_space;
   sycl::context default_context = default_space.sycl_queue().get_context();
 
-  sycl::default_selector device_selector;
-  sycl::queue queue(default_context, device_selector);
+  sycl::queue queue(default_context, sycl::default_selector_v,
+                    sycl::property::queue::in_order());
   constexpr int n = 100;
   int* p          = sycl::malloc_device<int>(n, queue);
   {
diff --git a/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp
index 336a5d59c32fc0d3043102a50e2a51569633d1d2..4811fb6d976544610ad10c13aa8cfc78da7d4b7c 100644
--- a/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp
+++ b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp
@@ -27,8 +27,8 @@ TEST(sycl, raw_sycl_interop_context_1) {
   Kokkos::Experimental::SYCL default_space;
   sycl::context default_context = default_space.sycl_queue().get_context();
 
-  sycl::default_selector device_selector;
-  sycl::queue queue(default_context, device_selector);
+  sycl::queue queue(default_context, sycl::default_selector_v,
+                    sycl::property::queue::in_order());
   constexpr int n = 100;
   int* p          = sycl::malloc_device<int>(n, queue);
 
@@ -61,8 +61,8 @@ TEST(sycl, raw_sycl_interop_context_2) {
   Kokkos::Experimental::SYCL default_space;
   sycl::context default_context = default_space.sycl_queue().get_context();
 
-  sycl::default_selector device_selector;
-  sycl::queue queue(default_context, device_selector);
+  sycl::queue queue(default_context, sycl::default_selector_v,
+                    sycl::property::queue::in_order());
   constexpr int n = 100;
 
   Kokkos::Experimental::SYCL space(queue);
diff --git a/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp
index 13810d861ca0b4cb45844e9e602a53de243148bc..c0070adb0cb2a400441dc2bd571b5f2142d2403e 100644
--- a/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp
+++ b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp
@@ -25,8 +25,8 @@ TEST(sycl, raw_sycl_queues) {
   Kokkos::Experimental::SYCL default_space;
   sycl::context default_context = default_space.sycl_queue().get_context();
 
-  sycl::default_selector device_selector;
-  sycl::queue queue(default_context, device_selector);
+  sycl::queue queue(default_context, sycl::default_selector_v,
+                    sycl::property::queue::in_order());
   int* p            = sycl::malloc_device<int>(100, queue);
   using MemorySpace = typename TEST_EXECSPACE::memory_space;
 
diff --git a/packages/kokkos/core/unit_test/sycl/TestSYCL_TeamScratchStreams.cpp b/packages/kokkos/core/unit_test/sycl/TestSYCL_TeamScratchStreams.cpp
index 11207a5480f0345a24911cf4d8f967e715889508..9ab89df977a05e073dabb887c41dc72a70b2d503 100644
--- a/packages/kokkos/core/unit_test/sycl/TestSYCL_TeamScratchStreams.cpp
+++ b/packages/kokkos/core/unit_test/sycl/TestSYCL_TeamScratchStreams.cpp
@@ -76,13 +76,14 @@ void sycl_queue_scratch_test(
   Kokkos::Experimental::SYCL default_space;
   sycl::context default_context = default_space.sycl_queue().get_context();
 
-  sycl::default_selector device_selector;
-  sycl::queue queue(default_context, device_selector);
+  sycl::queue queue(default_context, sycl::default_selector_v,
+                    sycl::property::queue::in_order());
 
   std::array<Kokkos::Experimental::SYCL, K> sycl;
   for (int i = 0; i < K; i++) {
     sycl[i] = Kokkos::Experimental::SYCL(
-        sycl::queue(default_context, device_selector));
+        sycl::queue(default_context, sycl::default_selector_v,
+                    sycl::property::queue::in_order()));
   }
 
   // Test that growing scratch size in subsequent calls doesn't crash things
diff --git a/packages/kokkos/core/unit_test/tools/TestEventCorrectness.hpp b/packages/kokkos/core/unit_test/tools/TestEventCorrectness.hpp
index 408160411327cc67ef9c1b535cff1f47fe38956b..3c85f661aaeafcff4016ba831cede90b19cb4b44 100644
--- a/packages/kokkos/core/unit_test/tools/TestEventCorrectness.hpp
+++ b/packages/kokkos/core/unit_test/tools/TestEventCorrectness.hpp
@@ -197,7 +197,7 @@ TEST(kokkosp, test_multiple_default_instances) {
       ex1.fence("named_instance_fence_one");
       ex2.fence("named_instance_fence_two");
     });
-    ASSERT_TRUE(found_payloads[0].dev_id == found_payloads[1].dev_id);
+    ASSERT_EQ(found_payloads[0].dev_id, found_payloads[1].dev_id);
   });
 }
 
@@ -393,6 +393,13 @@ TEST(kokkosp, parallel_scan_no_fence) {
 #ifdef KOKKOS_ENABLE_THREADS
   if (std::is_same<Kokkos::DefaultExecutionSpace, Kokkos::Threads>::value)
     GTEST_SKIP() << "skipping since the Thread backend always fences";
+#endif
+#if defined(KOKKOS_ENABLE_HPX) && \
+    !defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH)
+  if (std::is_same<Kokkos::DefaultExecutionSpace,
+                   Kokkos::Experimental::HPX>::value)
+    GTEST_SKIP() << "skipping since the HPX backend always fences with async "
+                    "dispatch disabled";
 #endif
     // FIXME_OPENMPTARGET
 #ifdef KOKKOS_ENABLE_OPENMPTARGET
@@ -427,6 +434,13 @@ TEST(kokkosp, parallel_scan_no_fence_view) {
 #ifdef KOKKOS_ENABLE_THREADS
   if (std::is_same<Kokkos::DefaultExecutionSpace, Kokkos::Threads>::value)
     GTEST_SKIP() << "skipping since the Thread backend always fences";
+#endif
+#if defined(KOKKOS_ENABLE_HPX) && \
+    !defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH)
+  if (std::is_same<Kokkos::DefaultExecutionSpace,
+                   Kokkos::Experimental::HPX>::value)
+    GTEST_SKIP() << "skipping since the HPX backend always fences with async "
+                    "dispatch disabled";
 #endif
     // FIXME_OPENMPTARGET
 #ifdef KOKKOS_ENABLE_OPENMPTARGET
@@ -716,7 +730,7 @@ TEST(kokkosp, get_events) {
   });
   for (const auto& ptr : event_vector) {
     auto ptr_as_begin = std::dynamic_pointer_cast<BeginParallelForEvent>(ptr);
-    ASSERT_TRUE(ptr_as_begin == nullptr);
+    ASSERT_EQ(ptr_as_begin, nullptr);
   }
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/tools/TestLogicalSpaces.hpp b/packages/kokkos/core/unit_test/tools/TestLogicalSpaces.hpp
index 1b20bd5933885c936efe2e09c75875c0b53d2e11..4e56f8996a03e2da0821b5b083b20285bbe8d7f8 100644
--- a/packages/kokkos/core/unit_test/tools/TestLogicalSpaces.hpp
+++ b/packages/kokkos/core/unit_test/tools/TestLogicalSpaces.hpp
@@ -165,7 +165,7 @@ TEST(defaultdevicetype, access_allowed) {
   test_allowed_access<fake_memory_space>();
 }
 // FIXME_SYCL
-#if !(defined(KOKKOS_COMPILER_INTEL) && defined(KOKKOS_ENABLE_SYCL))
+#if !(defined(KOKKOS_COMPILER_INTEL_LLVM) && defined(KOKKOS_ENABLE_SYCL))
 TEST(defaultdevicetype_DeathTest, access_forbidden) {
   ::testing::FLAGS_gtest_death_test_style = "threadsafe";
   ASSERT_DEATH(
diff --git a/packages/kokkos/core/unit_test/tools/TestScopedRegion.cpp b/packages/kokkos/core/unit_test/tools/TestScopedRegion.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5306496d764d7b3b7038a329d81c46bc3eea86f7
--- /dev/null
+++ b/packages/kokkos/core/unit_test/tools/TestScopedRegion.cpp
@@ -0,0 +1,72 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <Kokkos_Profiling_ScopedRegion.hpp>
+
+#include <gtest/gtest.h>
+
+#include <string>
+#include <stack>
+
+namespace {
+
+std::stack<std::string> test_region_stack;
+
+// NOTE: cannot use lambdas because they can only be converted to function
+// pointers if they don't capture anything
+void test_push_region(char const *label) { test_region_stack.push(label); }
+
+void test_pop_region() { test_region_stack.pop(); }
+
+TEST(defaultdevicetype, scoped_profile_region) {
+  Kokkos::Tools::Experimental::set_push_region_callback(test_push_region);
+  Kokkos::Tools::Experimental::set_pop_region_callback(test_pop_region);
+
+  ASSERT_TRUE(test_region_stack.empty());
+
+  {
+    std::string outer_identifier = "outer";
+    Kokkos::Profiling::ScopedRegion guard_outer(outer_identifier);
+
+    ASSERT_EQ(test_region_stack.size(), 1u);
+    ASSERT_EQ(test_region_stack.top(), outer_identifier);
+
+    {
+      std::string inner_identifier = "inner";
+      Kokkos::Profiling::ScopedRegion guard_inner(inner_identifier);
+      ASSERT_EQ(test_region_stack.size(), 2u);
+      ASSERT_EQ(test_region_stack.top(), inner_identifier);
+    }
+
+    ASSERT_EQ(test_region_stack.size(), 1u);
+    ASSERT_EQ(test_region_stack.top(), outer_identifier);
+  }
+
+  ASSERT_TRUE(test_region_stack.empty());
+
+  // Unset callbacks
+  Kokkos::Tools::Experimental::set_push_region_callback(nullptr);
+  Kokkos::Tools::Experimental::set_pop_region_callback(nullptr);
+}
+
+using Kokkos::Profiling::ScopedRegion;
+static_assert(!std::is_default_constructible<ScopedRegion>::value);
+static_assert(!std::is_copy_constructible<ScopedRegion>::value);
+static_assert(!std::is_move_constructible<ScopedRegion>::value);
+static_assert(!std::is_copy_assignable<ScopedRegion>::value);
+static_assert(!std::is_move_assignable<ScopedRegion>::value);
+
+}  // namespace
diff --git a/packages/kokkos/core/unit_test/tools/TestToolsInitialization.cpp b/packages/kokkos/core/unit_test/tools/TestToolsInitialization.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1ae974ca7147e2ae76ca732e08b83a66cffb35ae
--- /dev/null
+++ b/packages/kokkos/core/unit_test/tools/TestToolsInitialization.cpp
@@ -0,0 +1,216 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+// This file calls most of the basic Kokkos primitives. When combined with a
+// testing library this tests that our shared-library loading based profiling
+// mechanisms work
+
+#include <Kokkos_Core.hpp>
+#include <gtest/gtest.h>
+
+bool init_callback                               = false;
+bool finalize_callback                           = false;
+bool begin_parallel_for_callback                 = false;
+bool end_parallel_for_callback                   = false;
+bool begin_parallel_reduce_callback              = false;
+bool end_parallel_reduce_callback                = false;
+bool begin_parallel_scan_callback                = false;
+bool end_parallel_scan_callback                  = false;
+bool push_region_callback                        = false;
+bool pop_region_callback                         = false;
+bool allocate_data_callback                      = false;
+bool deallocate_data_callback                    = false;
+bool create_profile_section_callback             = false;
+bool start_profile_section_callback              = false;
+bool stop_profile_section_callback               = false;
+bool destroy_profile_section_callback            = false;
+bool profile_event_callback                      = false;
+bool begin_deep_copy_callback                    = false;
+bool end_deep_copy_callback                      = false;
+bool begin_fence_callback                        = false;
+bool end_fence_callback                          = false;
+bool declare_metadata_callback                   = false;
+bool request_tool_settings_callback              = false;
+bool provide_tool_programming_interface_callback = false;
+
+void test_tools_initialization_with_callbacks() {
+  Kokkos::Tools::Experimental::set_init_callback(
+      [](const int /*loadseq*/, const uint64_t /*version*/,
+         const uint32_t /*num_infos*/,
+         Kokkos::Profiling::KokkosPDeviceInfo* /*infos*/) {
+        init_callback = true;
+      });
+  Kokkos::Tools::Experimental::set_finalize_callback(
+      []() { finalize_callback = true; });
+  Kokkos::Tools::Experimental::set_begin_parallel_for_callback(
+      [](const char* /*n*/, const uint32_t /*d*/, uint64_t* /*k*/) {
+        begin_parallel_for_callback = true;
+      });
+  Kokkos::Tools::Experimental::set_end_parallel_for_callback(
+      [](const uint64_t /*k*/) { end_parallel_for_callback = true; });
+  Kokkos::Tools::Experimental::set_begin_parallel_reduce_callback(
+      [](const char* /*n*/, const uint32_t /*d*/, uint64_t* /*k*/) {
+        begin_parallel_reduce_callback = true;
+      });
+  Kokkos::Tools::Experimental::set_end_parallel_reduce_callback(
+      [](const uint64_t /*k*/) { end_parallel_reduce_callback = true; });
+  Kokkos::Tools::Experimental::set_begin_parallel_scan_callback(
+      [](const char* /*n*/, const uint32_t /*d*/, uint64_t* /*k*/) {
+        begin_parallel_scan_callback = true;
+      });
+  Kokkos::Tools::Experimental::set_end_parallel_scan_callback(
+      [](const uint64_t /*k*/) { end_parallel_scan_callback = true; });
+  Kokkos::Tools::Experimental::set_push_region_callback(
+      [](const char* /*name*/) { push_region_callback = true; });
+  Kokkos::Tools::Experimental::set_pop_region_callback(
+      []() { pop_region_callback = true; });
+  Kokkos::Tools::Experimental::set_allocate_data_callback(
+      [](Kokkos::Tools::SpaceHandle /*handle*/, const char* /*name*/,
+         const void* /*ptr*/,
+         const uint64_t /*size*/) { allocate_data_callback = true; });
+  Kokkos::Tools::Experimental::set_deallocate_data_callback(
+      [](Kokkos::Tools::SpaceHandle /*handle*/, const char* /*name*/,
+         const void* /*ptr*/,
+         const uint64_t /*size*/) { deallocate_data_callback = true; });
+  Kokkos::Tools::Experimental::set_create_profile_section_callback(
+      [](const char* /*name*/, uint32_t* /*id*/) {
+        create_profile_section_callback = true;
+      });
+  Kokkos::Tools::Experimental::set_destroy_profile_section_callback(
+      [](const uint32_t /*id*/) { destroy_profile_section_callback = true; });
+  Kokkos::Tools::Experimental::set_start_profile_section_callback(
+      [](uint32_t /*id*/) { start_profile_section_callback = true; });
+  Kokkos::Tools::Experimental::set_stop_profile_section_callback(
+      [](uint32_t /*id*/) { stop_profile_section_callback = true; });
+  Kokkos::Tools::Experimental::set_profile_event_callback(
+      [](const char* /*name*/) { profile_event_callback = true; });
+  Kokkos::Tools::Experimental::set_begin_deep_copy_callback(
+      [](Kokkos::Tools::SpaceHandle /*dst_handle*/, const char* /*dst_name*/,
+         const void* /*dst_ptr*/, Kokkos::Tools::SpaceHandle /*src_handle*/,
+         const char* /*src_name*/, const void* /*src_ptr*/,
+         uint64_t /*size*/) { begin_deep_copy_callback = true; });
+  Kokkos::Tools::Experimental::set_end_deep_copy_callback(
+      []() { end_deep_copy_callback = true; });
+  Kokkos::Tools::Experimental::set_begin_fence_callback(
+      [](const char* /*n*/, const uint32_t /*d*/, uint64_t* /*k*/) {
+        begin_fence_callback = true;
+      });
+  Kokkos::Tools::Experimental::set_end_fence_callback(
+      [](const uint64_t /*k*/) { end_fence_callback = true; });
+  Kokkos::Tools::Experimental::set_declare_metadata_callback(
+      [](const char* /*key*/, const char* /*value*/) {
+        declare_metadata_callback = true;
+      });
+  Kokkos::Tools::Experimental::set_request_tool_settings_callback(
+      [](const uint32_t /*num_settings*/,
+         Kokkos::Tools::Experimental::ToolSettings* /*settings*/) {
+        request_tool_settings_callback = true;
+      });
+  Kokkos::Tools::Experimental::set_provide_tool_programming_interface_callback(
+      [](const uint32_t /*num_functions*/,
+         Kokkos::Tools::Experimental::ToolProgrammingInterface /*interface*/) {
+        provide_tool_programming_interface_callback = true;
+      });
+
+  Kokkos::initialize();
+  {
+    ASSERT_TRUE(init_callback);
+    ASSERT_FALSE(finalize_callback);
+    ASSERT_FALSE(begin_parallel_for_callback);
+    ASSERT_FALSE(end_parallel_for_callback);
+    ASSERT_FALSE(begin_parallel_reduce_callback);
+    ASSERT_FALSE(end_parallel_reduce_callback);
+    ASSERT_FALSE(begin_parallel_scan_callback);
+    ASSERT_FALSE(end_parallel_scan_callback);
+    ASSERT_FALSE(push_region_callback);
+    ASSERT_FALSE(pop_region_callback);
+    ASSERT_FALSE(allocate_data_callback);
+    ASSERT_FALSE(deallocate_data_callback);
+    ASSERT_FALSE(create_profile_section_callback);
+    ASSERT_FALSE(start_profile_section_callback);
+    ASSERT_FALSE(stop_profile_section_callback);
+    ASSERT_FALSE(destroy_profile_section_callback);
+    ASSERT_FALSE(profile_event_callback);
+    ASSERT_FALSE(begin_deep_copy_callback);
+    ASSERT_FALSE(end_deep_copy_callback);
+    ASSERT_FALSE(begin_fence_callback);
+    ASSERT_FALSE(end_fence_callback);
+    ASSERT_TRUE(declare_metadata_callback);
+    ASSERT_TRUE(request_tool_settings_callback);
+    ASSERT_TRUE(provide_tool_programming_interface_callback);
+  }
+  {
+    using execution_space = Kokkos::DefaultExecutionSpace;
+    using memory_space    = typename execution_space::memory_space;
+    Kokkos::View<int*, memory_space> src_view("source", 10);
+    Kokkos::View<int*, memory_space> dst_view("destination", 10);
+    Kokkos::deep_copy(dst_view, src_view);
+    Kokkos::parallel_for(
+        "parallel_for", Kokkos::RangePolicy<execution_space>(0, 1),
+        KOKKOS_LAMBDA(int i) { (void)i; });
+    int result;
+    Kokkos::parallel_reduce(
+        "parallel_reduce", Kokkos::RangePolicy<execution_space>(0, 1),
+        KOKKOS_LAMBDA(int i, int& hold_result) { hold_result += i; }, result);
+    Kokkos::parallel_scan(
+        "parallel_scan", Kokkos::RangePolicy<execution_space>(0, 1),
+        KOKKOS_LAMBDA(const int i, int& hold_result, const bool final) {
+          if (final) {
+            hold_result += i;
+          }
+        });
+    Kokkos::Profiling::pushRegion("push_region");
+    Kokkos::Profiling::popRegion();
+    uint32_t sectionId;
+    Kokkos::Profiling::createProfileSection("created_section", &sectionId);
+    Kokkos::Profiling::startSection(sectionId);
+    Kokkos::Profiling::stopSection(sectionId);
+    Kokkos::Profiling::destroyProfileSection(sectionId);
+    Kokkos::Profiling::markEvent("profiling_event");
+    Kokkos::Tools::declareMetadata("dogs", "good");
+  }
+  Kokkos::finalize();
+  {
+    ASSERT_TRUE(init_callback);
+    ASSERT_TRUE(finalize_callback);
+    ASSERT_TRUE(begin_parallel_for_callback);
+    ASSERT_TRUE(end_parallel_for_callback);
+    ASSERT_TRUE(begin_parallel_reduce_callback);
+    ASSERT_TRUE(end_parallel_reduce_callback);
+    ASSERT_TRUE(begin_parallel_scan_callback);
+    ASSERT_TRUE(end_parallel_scan_callback);
+    ASSERT_TRUE(push_region_callback);
+    ASSERT_TRUE(pop_region_callback);
+    ASSERT_TRUE(allocate_data_callback);
+    ASSERT_TRUE(deallocate_data_callback);
+    ASSERT_TRUE(create_profile_section_callback);
+    ASSERT_TRUE(start_profile_section_callback);
+    ASSERT_TRUE(stop_profile_section_callback);
+    ASSERT_TRUE(destroy_profile_section_callback);
+    ASSERT_TRUE(profile_event_callback);
+    ASSERT_TRUE(begin_deep_copy_callback);
+    ASSERT_TRUE(end_deep_copy_callback);
+    ASSERT_TRUE(begin_fence_callback);
+    ASSERT_TRUE(end_fence_callback);
+    ASSERT_TRUE(declare_metadata_callback);
+    ASSERT_TRUE(request_tool_settings_callback);
+    ASSERT_TRUE(provide_tool_programming_interface_callback);
+  }
+}
+
+TEST(tools, initialization_with_callbacks) {
+  test_tools_initialization_with_callbacks();
+}
diff --git a/packages/kokkos/core/unit_test/view/TestExtentsDatatypeConversion.cpp b/packages/kokkos/core/unit_test/view/TestExtentsDatatypeConversion.cpp
index a48052ab8ae5ab7e837c9ef845221690dd089e1d..b95890614e0a60ff31b1e23b392236d96e63b101 100644
--- a/packages/kokkos/core/unit_test/view/TestExtentsDatatypeConversion.cpp
+++ b/packages/kokkos/core/unit_test/view/TestExtentsDatatypeConversion.cpp
@@ -35,55 +35,48 @@ constexpr bool extent_matches_datatype =
 
 // Conversion from DataType to extents
 // 0-rank view
-static_assert(
-    datatype_matches_extent<double, std::experimental::extents<std::size_t>>);
+static_assert(datatype_matches_extent<double, Kokkos::extents<std::size_t>>);
 
 // Only dynamic
 static_assert(datatype_matches_extent<
-              double***, std::experimental::extents<
-                             std::size_t, std::experimental::dynamic_extent,
-                             std::experimental::dynamic_extent,
-                             std::experimental::dynamic_extent>>);
+              double***,
+              Kokkos::extents<std::size_t, Kokkos::dynamic_extent,
+                              Kokkos::dynamic_extent, Kokkos::dynamic_extent>>);
 // Only static
-static_assert(datatype_matches_extent<
-              double[2][3][17],
-              std::experimental::extents<std::size_t, std::size_t{2},
-                                         std::size_t{3}, std::size_t{17}>>);
+static_assert(
+    datatype_matches_extent<double[2][3][17],
+                            Kokkos::extents<std::size_t, std::size_t{2},
+                                            std::size_t{3}, std::size_t{17}>>);
 
 // Both dynamic and static
 static_assert(datatype_matches_extent<
               double* * [3][2][8],
-              std::experimental::extents<
-                  std::size_t, std::experimental::dynamic_extent,
-                  std::experimental::dynamic_extent, std::size_t{3},
-                  std::size_t{2}, std::size_t{8}>>);
+              Kokkos::extents<std::size_t, Kokkos::dynamic_extent,
+                              Kokkos::dynamic_extent, std::size_t{3},
+                              std::size_t{2}, std::size_t{8}>>);
 
 // Conversion from extents to DataType
 // 0-rank extents
-static_assert(extent_matches_datatype<double, double,
-                                      std::experimental::extents<std::size_t>>);
+static_assert(
+    extent_matches_datatype<double, double, Kokkos::extents<std::size_t>>);
 
 // only dynamic
-static_assert(
-    extent_matches_datatype<double****, double,
-                            std::experimental::extents<
-                                std::size_t, std::experimental::dynamic_extent,
-                                std::experimental::dynamic_extent,
-                                std::experimental::dynamic_extent,
-                                std::experimental::dynamic_extent>>);
+static_assert(extent_matches_datatype<
+              double****, double,
+              Kokkos::extents<std::size_t, Kokkos::dynamic_extent,
+                              Kokkos::dynamic_extent, Kokkos::dynamic_extent,
+                              Kokkos::dynamic_extent>>);
 
 // only static
-static_assert(
-    extent_matches_datatype<double[7][5][3], double,
-                            std::experimental::extents<std::size_t, 7, 5, 3>>);
+static_assert(extent_matches_datatype<double[7][5][3], double,
+                                      Kokkos::extents<std::size_t, 7, 5, 3>>);
 
 // both dynamic and static
 static_assert(
     extent_matches_datatype<double** * [20][45], double,
-                            std::experimental::extents<
-                                std::size_t, std::experimental::dynamic_extent,
-                                std::experimental::dynamic_extent,
-                                std::experimental::dynamic_extent, 20, 45>>);
+                            Kokkos::extents<std::size_t, Kokkos::dynamic_extent,
+                                            Kokkos::dynamic_extent,
+                                            Kokkos::dynamic_extent, 20, 45>>);
 }  // namespace
 
 #endif  // KOKKOS_ENABLE_IMPL_MDSPAN
diff --git a/packages/kokkos/example/CMakeLists.txt b/packages/kokkos/example/CMakeLists.txt
index 7ecaec0f241da106de7571697b94dd3987783140..3920dc9a2776444ceacd1a69a956f5a9453c1266 100644
--- a/packages/kokkos/example/CMakeLists.txt
+++ b/packages/kokkos/example/CMakeLists.txt
@@ -1,10 +1,2 @@
-
-
-# Subpackage name must match what appears in kokkos/cmake/Dependencies.cmake
-#
-KOKKOS_SUBPACKAGE(Example)
-
 KOKKOS_ADD_EXAMPLE_DIRECTORIES(query_device)
 KOKKOS_ADD_EXAMPLE_DIRECTORIES(tutorial)
-
-KOKKOS_SUBPACKAGE_POSTPROCESS()
diff --git a/packages/kokkos/example/build_cmake_in_tree/cmake_example.cpp b/packages/kokkos/example/build_cmake_in_tree/cmake_example.cpp
index b345c48f53556c3354fd893b3e125f03617a20a5..2b9a263f87d9c8fbeec677715eed6b72b9392dd5 100644
--- a/packages/kokkos/example/build_cmake_in_tree/cmake_example.cpp
+++ b/packages/kokkos/example/build_cmake_in_tree/cmake_example.cpp
@@ -15,7 +15,9 @@
 //@HEADER
 
 #include <Kokkos_Core.hpp>
+
 #include <cstdio>
+#include <iostream>
 
 int main(int argc, char* argv[]) {
   Kokkos::initialize(argc, argv);
diff --git a/packages/kokkos/example/build_cmake_installed/cmake_example.cpp b/packages/kokkos/example/build_cmake_installed/cmake_example.cpp
index ca11250edd8b28c40d7fb58bdab1156362f275fb..ba501659791cb9fbf07efbdd6164b45d25d55165 100644
--- a/packages/kokkos/example/build_cmake_installed/cmake_example.cpp
+++ b/packages/kokkos/example/build_cmake_installed/cmake_example.cpp
@@ -15,7 +15,9 @@
 //@HEADER
 
 #include <Kokkos_Core.hpp>
+
 #include <cstdio>
+#include <iostream>
 
 extern "C" void print_fortran_();
 
diff --git a/packages/kokkos/example/build_cmake_installed_different_compiler/foo.cpp b/packages/kokkos/example/build_cmake_installed_different_compiler/foo.cpp
index e17d34aab15b8c038b57bb4868673d15ae663d3a..7630802ae95e329b7703a7262c8641b794afc57e 100644
--- a/packages/kokkos/example/build_cmake_installed_different_compiler/foo.cpp
+++ b/packages/kokkos/example/build_cmake_installed_different_compiler/foo.cpp
@@ -15,7 +15,7 @@
 //@HEADER
 
 #include <Kokkos_Core.hpp>
-#include <cstdio>
+#include <iostream>
 
 struct CountFunctor {
   KOKKOS_FUNCTION void operator()(const long i, long& lcount) const {
diff --git a/packages/kokkos/example/build_cmake_installed_kk_as_language/cmake_example.cpp b/packages/kokkos/example/build_cmake_installed_kk_as_language/cmake_example.cpp
index c7f24bd5ab6eaa8fe7360028a0fe893d3d595549..c71d75eeb8cde1df388cdfbea5f26efb5e4b5f17 100644
--- a/packages/kokkos/example/build_cmake_installed_kk_as_language/cmake_example.cpp
+++ b/packages/kokkos/example/build_cmake_installed_kk_as_language/cmake_example.cpp
@@ -15,7 +15,9 @@
 //@HEADER
 
 #include <Kokkos_Core.hpp>
+
 #include <cstdio>
+#include <iostream>
 
 extern "C" void print_fortran_();
 void print_cxx();
diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/vectorization.cpp b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/vectorization.cpp
index ae9430a93b0ec79b37a4ddc999284a976db2c196..4a7a2cebe337ab082b68a62407f6ff3e464a1393 100644
--- a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/vectorization.cpp
+++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/vectorization.cpp
@@ -56,7 +56,8 @@ struct SomeCorrelation {
 
     // With each team run a parallel_for with its threads
     Kokkos::parallel_for(
-        Kokkos::TeamThreadRange(thread, data.extent(1)), [=](const int& j) {
+        Kokkos::TeamThreadRange(thread, data.extent(1)),
+        [=, *this](const int& j) {
           int tsum;
           // Run a vector loop reduction over the inner dimension of data
           // Count how many values are multiples of 4
@@ -64,7 +65,7 @@ struct SomeCorrelation {
           // broadcast to all vector lanes
           Kokkos::parallel_reduce(
               Kokkos::ThreadVectorRange(thread, data.extent(2)),
-              [=](const int& k, int& vsum) {
+              [=, *this](const int& k, int& vsum) {
                 vsum += (data(i, j, k) % 4 == 0) ? 1 : 0;
               },
               tsum);
@@ -103,7 +104,7 @@ struct SomeCorrelation {
     // Add with one thread and vectorlane of the team the team_sum to the global
     // value
     Kokkos::single(Kokkos::PerTeam(thread),
-                   [=]() { Kokkos::atomic_add(&gsum(), team_sum); });
+                   [=, *this]() { Kokkos::atomic_add(&gsum(), team_sum); });
   }
 
   // The functor needs to define how much shared memory it requests given a
diff --git a/packages/kokkos/generate_makefile.bash b/packages/kokkos/generate_makefile.bash
index 018426c9b8e9dca8d612a68ca0e6a29b657fccc4..1b216d9fe35843878bdb6c85c121ddb70efd7e2f 100755
--- a/packages/kokkos/generate_makefile.bash
+++ b/packages/kokkos/generate_makefile.bash
@@ -157,11 +157,12 @@ display_help_text() {
       echo "                 ZEN2            = AMD Zen2-Core CPU"
       echo "                 ZEN3            = AMD Zen3-Core CPU"
       echo "               [AMD: GPU]"
-      echo "                 VEGA906         = AMD GPU MI50/MI60 GFX906"
-      echo "                 VEGA908         = AMD GPU MI100 GFX908"
-      echo "                 VEGA90A         = AMD GPU MI200 GFX90A"
-      echo "                 NAVI1030        = AMD GPU V620/W6800 GFX1030"
-      echo "                 NAVI1100        = AMD GPU RX 7900 XT(X) GFX1100"
+      echo "                 AMD_GFX906      = AMD GPU MI50/MI60 GFX906"
+      echo "                 AMD_GFX908      = AMD GPU MI100 GFX908"
+      echo "                 AMD_GFX90A      = AMD GPU MI200 GFX90A"
+      echo "                 AMD_GFX942      = AMD GPU MI300 GFX942"
+      echo "                 AMD_GFX1030     = AMD GPU V620/W6800 GFX1030"
+      echo "                 AMD_GFX1100     = AMD GPU RX 7900 XT(X) GFX1100"
       echo "               [ARM]"
       echo "                 ARMV80          = ARMv8.0 Compatible CPU"
       echo "                 ARMV81          = ARMv8.1 Compatible CPU"
diff --git a/packages/kokkos/master_history.txt b/packages/kokkos/master_history.txt
index 73e48268b5a6dbeda0bbaa99cd1adf7badc94640..fd0020b8d53b7bd584a6d63bece6e428d1220cd1 100644
--- a/packages/kokkos/master_history.txt
+++ b/packages/kokkos/master_history.txt
@@ -32,3 +32,5 @@ tag:  3.7.00     date: 08:25:2022    master: d19aab99    release: 0018e5fb
 tag:  3.7.01     date: 12:01:2022    master: 61d7db55    release: d3bb8cfe
 tag:  4.0.00     date: 02:23:2023    master: 5ad60966    release: 52ea2953
 tag:  4.0.01     date: 04:26:2023    master: aa1f48f3    release: 5893754f
+tag:  4.1.00     date: 06:20:2023    master: 62d2b6c8    release: adde1e6a
+tag:  4.2.00     date: 11:09:2023    master: 1a3ea28f    release: abe01c88
diff --git a/packages/kokkos/scripts/check-copyright b/packages/kokkos/scripts/check-copyright
index be696d069aa964df2a187bacb2d509fac8578718..9c4ba6a84292981734fa72198309e0df234ba486 100755
--- a/packages/kokkos/scripts/check-copyright
+++ b/packages/kokkos/scripts/check-copyright
@@ -1,16 +1,15 @@
-files=`git ls-files | grep -e '.*\.\(cc\|cpp\|hpp\)' | grep -v 'tpls/'`
+files=$(git ls-files | grep -e '.*\.\(cc\|cpp\|hpp\)' | grep -v 'tpls/')
 echo "" &> scripts/diff_files
-tmp=`cat LICENSE_FILE_HEADER | wc -l`
+tmp=$(wc -l < LICENSE_FILE_HEADER)
 NNEW=$(($tmp))
-for file in $files; do
-  head -n +$NNEW $file &> header
-  diff header LICENSE_FILE_HEADER &> header_diff
-  count=`cat header_diff | wc -l`
-  #echo $file " " COUNT " " $count >> diff_headers
-  if [ "$count" -ne "0" ]; then
-    echo $file >> scripts/diff_files
+for file in $files
+do
+  head -n +$NNEW "${file}" | diff -q - LICENSE_FILE_HEADER > /dev/null
+  if [[ "${?}" == 1 ]]
+  then
+    echo "${file}" >> scripts/diff_files
   fi
 done
 tmpfile=$(mktemp -t kokkos_diff_files.XXXX)
-cat scripts/diff_files | sort &> $tmpfile
-mv $tmpfile scripts/diff_files
+sort < scripts/diff_files &> "${tmpfile}"
+mv "${tmpfile}" scripts/diff_files
diff --git a/packages/kokkos/scripts/diff_files b/packages/kokkos/scripts/diff_files
index 125568d34557b7d8b087a5ef8bf9934b60f0813a..8b137891791fe96927ad78e64b0aad7bded08bdc 100644
--- a/packages/kokkos/scripts/diff_files
+++ b/packages/kokkos/scripts/diff_files
@@ -1,2 +1 @@
 
-core/src/Cuda/Kokkos_Cuda_Atomic_Intrinsics.hpp
diff --git a/packages/kokkos/scripts/docker/Dockerfile.clang b/packages/kokkos/scripts/docker/Dockerfile.clang
index 9df93b57545ae5fe1f0e5cb51ac46aef7d4e8de3..5c6abc1c6de53df90d27186b05450fd9eb370dd3 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.clang
+++ b/packages/kokkos/scripts/docker/Dockerfile.clang
@@ -1,8 +1,9 @@
-FROM nvidia/cuda:9.2-devel
+FROM ubuntu:18.04
 
 RUN apt-get update && apt-get install -y \
         bc \
         git \
+        build-essential \
         wget \
         ccache \
         && \
@@ -34,7 +35,7 @@ ENV PATH=${CMAKE_DIR}/bin:$PATH
 
 ENV LLVM_DIR=/opt/llvm
 RUN LLVM_VERSION=8.0.0 && \
-    LLVM_URL=http://releases.llvm.org/${LLVM_VERSION}/clang+llvm-${LLVM_VERSION}-x86_64-linux-gnu-ubuntu-16.04.tar.xz && \
+    LLVM_URL=https://releases.llvm.org/${LLVM_VERSION}/clang+llvm-${LLVM_VERSION}-x86_64-linux-gnu-ubuntu-18.04.tar.xz && \
     LLVM_ARCHIVE=llvm-${LLVM_VERSION}.tar.xz && \
     SCRATCH_DIR=/scratch && mkdir -p ${SCRATCH_DIR} && cd ${SCRATCH_DIR} && \
     wget --quiet ${LLVM_URL} --output-document=${LLVM_ARCHIVE} && \
diff --git a/packages/kokkos/scripts/docker/Dockerfile.kokkosllvmproject b/packages/kokkos/scripts/docker/Dockerfile.kokkosllvmproject
index 9086cba5e8edd734f4d81cd6a18702ed4549ba00..a46d00f019da02718df10f3cad268cf2560b52d0 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.kokkosllvmproject
+++ b/packages/kokkos/scripts/docker/Dockerfile.kokkosllvmproject
@@ -1,8 +1,8 @@
-FROM nvidia/cuda:10.1-devel
+FROM nvidia/cuda:11.0.3-devel
 
 RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
 
-RUN apt-get update && apt-get install -y \
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
         bc \
         git \
         wget \
@@ -15,10 +15,10 @@ RUN apt-get update && apt-get install -y \
 
 # unbuntu18.04-based images have libstdc++ that is lacking filesystem support
 RUN apt-get update && \
-    apt-get install -y software-properties-common && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y software-properties-common && \
     add-apt-repository ppa:ubuntu-toolchain-r/test -y && \
     apt-get update && \
-    apt-get install -y g++-9 && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y g++-9 && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
@@ -49,7 +49,7 @@ ARG NPROC=8
 
 # Clone Kokkos fork of the LLVM Project and build Clang
 ENV LLVM_DIR=/opt/llvm
-RUN LLVM_VERSION=55b3bcf643685c63fcc529d434bed112fdf03939 && \
+RUN LLVM_VERSION=32413084ecbb5e739c6b35d8bf13ad972985acb3 && \
     LLVM_URL=https://github.com/kokkos/llvm-project/archive/${LLVM_VERSION}.tar.gz &&\
     LLVM_ARCHIVE=llvm.tar.xz && \
     SCRATCH_DIR=/scratch && mkdir -p ${SCRATCH_DIR} && cd ${SCRATCH_DIR} && \
diff --git a/packages/kokkos/scripts/docker/Dockerfile.nvhpc b/packages/kokkos/scripts/docker/Dockerfile.nvhpc
index c0b8cc19d155eb8d9eab1cf5cfaa8cfaf61f664c..88e59de2827af09a36f368c1d8522e7eae958f74 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.nvhpc
+++ b/packages/kokkos/scripts/docker/Dockerfile.nvhpc
@@ -1,4 +1,4 @@
-ARG BASE=nvcr.io/nvidia/nvhpc:22.3-devel-cuda11.6-ubuntu20.04
+ARG BASE=nvcr.io/nvidia/nvhpc:23.7-devel-cuda12.2-ubuntu20.04
 FROM $BASE
 
 RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \
diff --git a/packages/kokkos/scripts/docker/Dockerfile.openmptarget b/packages/kokkos/scripts/docker/Dockerfile.openmptarget
index 44c53fef1dd78f6aafe37321df0004f51b2971ff..708cf533b8a64862a4f4dc1818f28bc2bba495c4 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.openmptarget
+++ b/packages/kokkos/scripts/docker/Dockerfile.openmptarget
@@ -23,7 +23,7 @@ RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \
     gpg --verify ${KEYDUMP_FILE}.sig ${KEYDUMP_FILE} && \
     rm ${KEYDUMP_FILE}*
 
-ARG CMAKE_VERSION=3.18.5
+ARG CMAKE_VERSION=3.27.7
 ENV CMAKE_DIR=/opt/cmake
 RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
     CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \
@@ -38,7 +38,7 @@ RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSIO
     rm ${CMAKE_SCRIPT}
 ENV PATH=${CMAKE_DIR}/bin:$PATH
 
-ARG LLVM_VERSION=llvmorg-15.0.0
+ARG LLVM_VERSION=llvmorg-17.0.1
 ENV LLVM_DIR=/opt/llvm
 RUN LLVM_URL=https://github.com/llvm/llvm-project/archive &&\
     LLVM_ARCHIVE=${LLVM_VERSION}.tar.gz &&\
diff --git a/packages/kokkos/scripts/docker/Dockerfile.sycl b/packages/kokkos/scripts/docker/Dockerfile.sycl
index d7d764e8aa53f89ecb3f93a685a6946ad08f52a2..714461bfe6a53ceb0c83656447bb89267d31213d 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.sycl
+++ b/packages/kokkos/scripts/docker/Dockerfile.sycl
@@ -1,4 +1,4 @@
-ARG BASE=nvidia/cuda:10.2-devel
+ARG BASE=nvidia/cuda:11.7.1-devel-ubuntu22.04
 FROM $BASE
 
 RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
@@ -10,19 +10,11 @@ RUN apt-get update && apt-get install -y \
         ninja-build \
         python3 \
         git \
+        libomp-dev \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-# unbuntu18.04-based images have libstdc++ that is lacking filesystem support
-RUN apt-get update && \
-    apt-get install -y software-properties-common && \
-    add-apt-repository ppa:ubuntu-toolchain-r/test -y && \
-    apt-get update && \
-    apt-get install -y g++-9 && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
 RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \
     KEYDUMP_FILE=keydump && \
     wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE} && \
@@ -46,19 +38,20 @@ RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSIO
     rm cmake*
 ENV PATH=${CMAKE_DIR}/bin:$PATH
 
-ENV SYCL_DIR=/opt/sycl
-RUN SYCL_VERSION=20220112 && \
-    SYCL_URL=https://github.com/intel/llvm/archive/sycl-nightly && \
-    SYCL_ARCHIVE=${SYCL_VERSION}.tar.gz && \
-    SCRATCH_DIR=/scratch && mkdir -p ${SCRATCH_DIR} && cd ${SCRATCH_DIR} && \
-    wget --quiet ${SYCL_URL}/${SYCL_ARCHIVE} && \
-    mkdir llvm && \
-    tar -xf ${SYCL_ARCHIVE} -C llvm --strip-components=1 && \
-    cd llvm && \
-    python3 buildbot/configure.py --cuda && \
-    python3 buildbot/compile.py && \
-    mkdir -p ${SYCL_DIR} && \
-    mv ${SCRATCH_DIR}/llvm/build/install/* ${SYCL_DIR} && \
-    echo "${SYCL_DIR}/lib" > /etc/ld.so.conf.d/sycl.conf && ldconfig && \
-    rm -rf ${SCRATCH_DIR}
-ENV PATH=${SYCL_DIR}/bin:$PATH
+RUN wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB && \
+    apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB && \
+    echo "deb https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
+    apt-get update -o Dir::Etc::sourcelist="sources.list.d/oneAPI.list" -o APT::Get::List-Cleanup="0" && \
+    apt-get install -y intel-oneapi-compiler-dpcpp-cpp-and-cpp-classic-2023.0.0 && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN wget https://cloud.cees.ornl.gov/download/oneapi-for-nvidia-gpus-2023.0.0-linux.sh && \
+    chmod +x oneapi-for-nvidia-gpus-2023.0.0-linux.sh && \
+    ./oneapi-for-nvidia-gpus-2023.0.0-linux.sh -y && \
+    rm oneapi-for-nvidia-gpus-2023.0.0-linux.sh
+
+RUN wget https://registrationcenter-download.intel.com/akdlm/irc_nas/19133/l_oneDPL_p_2022.0.0.25335.sh &&\
+    chmod +x ./l_oneDPL_p_2022.0.0.25335.sh && \
+    ./l_oneDPL_p_2022.0.0.25335.sh -a -s --eula accept && \
+    rm l_oneDPL_p_2022.0.0.25335.sh
diff --git a/packages/kokkos/scripts/testing_scripts/test_all_sandia b/packages/kokkos/scripts/testing_scripts/test_all_sandia
index 40c30ba7f4588c19365a0f8df2a683ab4c22e998..6e463addb95e17b233ce1fca3deac135f1592531 100755
--- a/packages/kokkos/scripts/testing_scripts/test_all_sandia
+++ b/packages/kokkos/scripts/testing_scripts/test_all_sandia
@@ -75,6 +75,7 @@ CUDA_ENABLE_CMD=
 
 if [[ "$HOSTNAME" =~ weaver.* ]]; then
   MACHINE=weaver
+  source /etc/profile.d/modules.sh
   module load git
 fi
 
@@ -105,8 +106,12 @@ if [[ "$HOSTNAME" == caraway* ]]; then
   MACHINE=caraway
 fi
 
-if [[ "$HOSTNAME" == kokkos-dev\.sandia\.gov* ]]; then
-  MACHINE=kokkos-dev
+if [[ "$HOSTNAME" == fat* ]]; then # Caraway MI250 queues
+  MACHINE=caraway
+fi
+
+if [[ "$HOSTNAME" == lean* ]]; then # Caraway MI210 queues
+  MACHINE=caraway
 fi
 
 if [[ "$HOSTNAME" == sogpu01* ]]; then
@@ -272,18 +277,12 @@ fi
 #
 
 if [ "$MACHINE" = "sems" ]; then
-  source /projects/sems/modulefiles/utils/sems-archive-modules-init.sh
-
-  # On unnamed sems machines, assume more restricted rhel7 environment
-  # On rhel7 sems machines gcc/7.3.0, clang/4.0.1, and intel/16.0.3 are missing
-  # Remove kokkkos-env module use
+  module purge
+  MODULE_ENVIRONMENT="sh /projects/sems/modulefiles/utils/sems-v2-modules-init.sh"
+  eval "$MODULE_ENVIRONMENT"
 
-  module load sems-archive-cmake/3.17.1
-  BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-<COMPILER_NAME>/<COMPILER_VERSION>"
-  OLDINTEL_BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/6.4.0,sems-archive-<COMPILER_NAME>/<COMPILER_VERSION>"
-  INTEL_BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/7.2.0,sems-archive-<COMPILER_NAME>/<COMPILER_VERSION>"
-  CLANG_BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/9.2.0,sems-archive-<COMPILER_NAME>/<COMPILER_VERSION>"
-  CUDA9_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/7.2.0,sems-archive-<COMPILER_NAME>/<COMPILER_VERSION>"
+  module load sems-cmake sems-git
+  BASE_MODULE_LIST="sems-cmake,sems-<COMPILER_NAME>/<COMPILER_VERSION>"
   SKIP_HWLOC=True
   # No sems hwloc module
 
@@ -291,136 +290,63 @@ if [ "$MACHINE" = "sems" ]; then
     ARCH_FLAG=""
   fi
 
-  if [ "$SPOT_CHECK" = "True" ]; then
-    # Format: (compiler module-list build-list exe-name warning-flag)
-    COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS"
-               "gcc/7.2.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS"
-               "intel/17.0.1 $INTEL_BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS"
-               "cuda/9.2 $CUDA9_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
-    )
-  else
-    # Format: (compiler module-list build-list exe-name warning-flag)
-    COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-               "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-               "gcc/6.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-               "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-               "gcc/7.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-               "gcc/8.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-               "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-               "clang/5.0.1 $CLANG_BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
-               "clang/7.0.1 $CLANG_BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
-               "clang/9.0.0 $CLANG_BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
-               "clang/10.0.0 $CLANG_BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
-               "intel/17.0.1 $OLDINTEL_BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
-               "intel/18.0.5 $OLDINTEL_BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
-               "intel/19.0.5 $INTEL_BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
-               "cuda/9.2 $CUDA9_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
-    )
-  fi
+  # Format: (compiler module-list build-list exe-name warning-flag)
+  COMPILERS=("gcc/8.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+             "gcc/10.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+             "clang/11.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
+             "clang/14.0.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
+             "intel/19.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+             "intel/19.1.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+             "intel/2021.3 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+  )
+
 elif [ "$MACHINE" = "sogpu" ]; then
-  source /projects/sems/modulefiles/utils/sems-archive-modules-init.sh
+  MODULE_ENVIRONMENT="sh /projects/sems/modulefiles/utils/sems-v2-modules-init.sh"
+  eval "$MODULE_ENVIRONMENT"
 
-  module load sems-archive-cmake/3.17.1 sems-archive-git
-  BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-<COMPILER_NAME>/<COMPILER_VERSION>"
-  OLDINTEL_BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/6.4.0,sems-archive-<COMPILER_NAME>/<COMPILER_VERSION>"
-  INTEL_BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/7.2.0,sems-archive-<COMPILER_NAME>/<COMPILER_VERSION>"
-  CLANG_BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/9.2.0,sems-archive-<COMPILER_NAME>/<COMPILER_VERSION>"
-  CUDA_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/7.2.0,sems-archive-<COMPILER_NAME>/<COMPILER_VERSION>"
-  CUDA11_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/8.3.0,sems-archive-<COMPILER_NAME>/<COMPILER_VERSION>"
+  module load sems-cmake sems-git
+  BASE_MODULE_LIST="sems-cmake,sems-<COMPILER_NAME>/<COMPILER_VERSION>"
+  CUDA11_MODULE_LIST="sems-cmake,sems-gcc/8.3.0,sems-<COMPILER_NAME>/<COMPILER_VERSION>"
   SKIP_HWLOC=True
   # No sems hwloc module
 
+  echo "."
+  module list
   if [ -z "$ARCH_FLAG" ]; then
     ARCH_FLAG="--arch=Volta70"
   fi
 
-    # Format: (compiler module-list build-list exe-name warning-flag)
-    COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-               "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-               "gcc/6.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-               "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-               "gcc/7.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-               "gcc/8.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-               "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-               "clang/5.0.1 $CLANG_BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
-               "clang/7.0.1 $CLANG_BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
-               "clang/9.0.0 $CLANG_BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
-               "clang/10.0.0 $CLANG_BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
-               "intel/17.0.1 $OLDINTEL_BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
-               "intel/18.0.5 $OLDINTEL_BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
-               "intel/19.0.5 $INTEL_BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
-               "cuda/10.1 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
-               "cuda/11.1 $CUDA11_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
-              )
-elif [ "$MACHINE" = "kokkos-dev" ]; then
-  source /projects/sems/modulefiles/utils/sems-archive-modules-init.sh
-
-  module load sems-archive-cmake/3.17.1
-  BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-<COMPILER_NAME>/<COMPILER_VERSION>"
-  OLDINTEL_BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/6.4.0,sems-archive-<COMPILER_NAME>/<COMPILER_VERSION>"
-  INTEL_BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/7.2.0,sems-archive-<COMPILER_NAME>/<COMPILER_VERSION>"
-  CLANG_BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/9.2.0,sems-archive-<COMPILER_NAME>/<COMPILER_VERSION>"
-  CUDA9_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/6.1.0,sems-archive-<COMPILER_NAME>/<COMPILER_VERSION>"
-  CUDA10_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/7.2.0,sems-archive-<COMPILER_NAME>/<COMPILER_VERSION>"
-  CUDA11_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/9.2.0,sems-archive-<COMPILER_NAME>/<COMPILER_VERSION>"
-  CLANG7_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/7.2.0,sems-archive-cuda/9.2,sems-archive-<COMPILER_NAME>/<COMPILER_VERSION>"
-  SKIP_HWLOC=True
-
-  if [ -z "$ARCH_FLAG" ]; then
-    ARCH_FLAG="--arch=Kepler35"
-  fi
+  echo "."
+  # Format: (compiler module-list build-list exe-name warning-flag)
+  COMPILERS=("gcc/8.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+             "gcc/10.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+             "clang/11.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
+             "clang/14.0.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
+             "intel/19.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+             "intel/19.1.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+             "intel/2021.3 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+             "cuda/11.1.0 $CUDA11_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
+             "cuda/11.4.2 $CUDA11_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
+  )
 
-  if [ "$SPOT_CHECK" = "True" ]; then
-    # Format: (compiler module-list build-list exe-name warning-flag)
-    COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS"
-               "gcc/7.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS"
-               "intel/17.0.1 $OLDINTEL_BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS"
-               "intel/18.0.5 $OLDINTEL_BASE_MODULE_LIST "Serial" icpc $INTEL_WARNING_FLAGS"
-               "intel/19.0.5 $INTEL_BASE_MODULE_LIST "Pthread_Serial" icpc $INTEL_WARNING_FLAGS"
-               "clang/5.0.1 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS"
-               "clang/7.0.1 $CLANG7_MODULE_LIST "Cuda_OpenMP" clang++ $CLANG_WARNING_FLAGS"
-               "cuda/9.2 $CUDA9_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
-    )
-  else
-    # Format: (compiler module-list build-list exe-name warning-flag)
-    COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-               "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-               "gcc/7.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-               "gcc/8.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-               "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-               "intel/17.0.1 $OLDINTEL_BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
-               "intel/18.0.5 $OLDINTEL_BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
-               "intel/19.0.5 $INTEL_BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
-               "clang/5.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
-               "clang/7.0.1 $CLANG7_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
-               "clang/9.0.0 $CLANG_BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
-               "clang/10.0.0 $CLANG_BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
-               "cuda/10.1 $CUDA10_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
-               "cuda/11.1 $CUDA11_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
-               "cuda/9.2 $CUDA9_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
-    )
-  fi
+  echo "."
+  echo "COMPILER VAR : $COMPILERS"
 
 elif [ "$MACHINE" = "weaver" ]; then
-  source /etc/profile.d/modules.sh
+  # Use the legacy env for now until all modules are part of the new system
+  source /projects/ppc64le-pwr9-rhel8/legacy-env.sh
   SKIP_HWLOC=True
 
-  # For rhel7W queue
-  BASE_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>"
-
-  # For rhel8 queue
-  # Cuda/11 modules available only on the rhel8 queue (rhel8 OS)
-  RHEL8_BASE_MODULE_LIST="cmake/3.21.2,<COMPILER_NAME>/<COMPILER_VERSION>"
-  RHEL8_CUDA11_MODULE_LIST="cmake/3.21.2,<COMPILER_NAME>/<COMPILER_VERSION>"
+  # Cuda/11 modules available only on the dev queue (rhel8 OS); gcc/8.3.1 loaded by default
+  CUDA11_MODULE_LIST="cmake/3.23.1,<COMPILER_NAME>/<COMPILER_VERSION>"
 
   # Don't do pthread with Power
-  GCC_IBM_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
+  GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
 
   # Format: (compiler module-list build-list exe-name warning-flag)
-  COMPILERS=("gcc/9.3.0 $BASE_MODULE_LIST $GCC_IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-             "gcc/8.3.1 $RHEL8_BASE_MODULE_LIST $GCC_IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-             "cuda/11.2.2 $RHEL8_CUDA11_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
+  COMPILERS=("cuda/11.2.2 $CUDA11_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
   )
+
   if [ -z "$ARCH_FLAG" ]; then
     ARCH_FLAG="--arch=Power9,Volta70"
   fi
@@ -432,9 +358,7 @@ elif [ "$MACHINE" = "voltrino" ]; then
   BASE_MODULE_LIST="PrgEnv-intel,craype-mic-knl,cmake/3.16.2,slurm/20.11.4a,<COMPILER_NAME>/<COMPILER_VERSION>,gcc/9.3.0"
 
   # Format: (compiler module-list build-list exe-name warning-flag)
-  COMPILERS=("intel/17.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
-             "intel/18.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
-             "intel/19.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+  COMPILERS=("intel/19.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
   )
 
   if [ -z "$ARCH_FLAG" ]; then
@@ -448,8 +372,7 @@ elif [ "$MACHINE" = "mayer" ]; then
   BASE_MODULE_LIST="cmake/3.17.1,<COMPILER_NAME>/<COMPILER_VERSION>"
 
   # Format: (compiler module-list build-list exe-name warning-flag)
-  COMPILERS=("gnu7/7.2.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-             "gnu9/9.3.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+  COMPILERS=("gnu9/9.3.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
              "arm/20.1 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST armclang++ $CLANG_WARNING_FLAGS")
 
   if [ -z "$ARCH_FLAG" ]; then
@@ -460,15 +383,12 @@ elif [ "$MACHINE" = "caraway" ]; then
   SKIP_HWLOC=True
 
   BASE_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>"
-  # Cuda11 usage available on the V100 queue
-  CUDA11_MODULE_LIST="cmake/3.22.2,<COMPILER_NAME>/<COMPILER_VERSION>,gcc/8.2.0"
 
   HIPCLANG_BUILD_LIST="Hip_Serial,Hip_OpenMP"
   HIPCLANG_WARNING_FLAGS="-Werror -Wno-unused-command-line-argument -DNDEBUG"
 
   # Format: (compiler module-list build-list exe-name warning-flag)
   COMPILERS=("rocm/5.2.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS"
-             "cuda/11.4 $CUDA11_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
              "gcc/8.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
              "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
              "gcc/10.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
@@ -476,7 +396,7 @@ elif [ "$MACHINE" = "caraway" ]; then
   )
 
   if [ -z "$ARCH_FLAG" ]; then
-    ARCH_FLAG="--arch=VEGA908"
+    ARCH_FLAG="--arch=AMD_GFX908"
   fi
 
 elif [ "$MACHINE" = "blake" ]; then
@@ -488,95 +408,62 @@ elif [ "$MACHINE" = "blake" ]; then
 
   BASE_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>"
   BASE_MODULE_LIST_INTEL="cmake/3.19.3,<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
-  BASE_MODULE_LIST_ONEAPI="cmake/3.19.3,<COMPILER_NAME>/oneAPI/base-toolkit/<COMPILER_VERSION>"
+  BASE_MODULE_LIST_ONEAPI="cmake/3.19.3,<COMPILER_NAME>/oneAPI/base-toolkit/<COMPILER_VERSION>,<COMPILER_NAME>/oneAPI/hpc-toolkit/<COMPILER_VERSION>"
   ONEAPI_WARNING_FLAGS=""
 
-  if [ "$SPOT_CHECK" = "True" ]; then
-
-  # Format: (compiler module-list build-list exe-name warning-flag)
-  COMPILERS=("intel/18.1.163 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
-             "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-  )
-  else
   COMPILERS=("intel/19.5.281 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
              "intel/2021.1.1 $BASE_MODULE_LIST_ONEAPI $INTEL_BUILD_LIST icpx $ONEAPI_WARNING_FLAGS"
-             "gcc/8.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+             "intel/2021.2.0 $BASE_MODULE_LIST_ONEAPI $INTEL_BUILD_LIST icpx $ONEAPI_WARNING_FLAGS"
+             "intel/2021.4.0 $BASE_MODULE_LIST_ONEAPI $INTEL_BUILD_LIST icpx $ONEAPI_WARNING_FLAGS"
+             "intel/2022.1.2 $BASE_MODULE_LIST_ONEAPI $INTEL_BUILD_LIST icpx $ONEAPI_WARNING_FLAGS"
              "gcc/8.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+             "gcc/8.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
              "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
              "gcc/10.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+             "gcc/11.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+             "clang/10.0.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
   )
 
-  fi
   if [ -z "$ARCH_FLAG" ]; then
     ARCH_FLAG="--arch=SKX"
   fi
 
 elif [ "$MACHINE" = "kokkos-dev-2" ]; then
+  module purge
   source /projects/sems/modulefiles/utils/sems-archive-modules-init.sh
   module use /home/projects/x86-64/modulefiles/local
-  module purge
   module load sems-archive-env
 
   module load sems-archive-git
-  module load sems-archive-tex
   module load sems-archive-cmake/3.17.1
   module load sems-archive-gdb
 
   SKIP_HWLOC=True
 
   BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-<COMPILER_NAME>/<COMPILER_VERSION>"
-  OLDINTEL_BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/6.4.0,sems-archive-<COMPILER_NAME>/<COMPILER_VERSION>"
-  INTEL_BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/7.2.0,sems-archive-<COMPILER_NAME>/<COMPILER_VERSION>"
+  INTEL_BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/8.3.0,sems-archive-<COMPILER_NAME>/<COMPILER_VERSION>"
   CLANG_BASE_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/9.2.0,sems-archive-<COMPILER_NAME>/<COMPILER_VERSION>"
-  CLANG8_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/7.2.0,cuda/10.0,<COMPILER_NAME>/<COMPILER_VERSION>"
   GCC91_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,<COMPILER_NAME>/<COMPILER_VERSION>"
-  NVCC9_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/5.3.0,sems-archive-<COMPILER_NAME>/<COMPILER_VERSION>"
-  NVCC_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/7.3.0,<COMPILER_NAME>/<COMPILER_VERSION>"
   NVCC_SEMSMODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/7.3.0,sems-archive-<COMPILER_NAME>/<COMPILER_VERSION>"
-  NVCC11_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/9.2.0,<COMPILER_NAME>/<COMPILER_VERSION>"
+  NVCC_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/9.2.0,<COMPILER_NAME>/<COMPILER_VERSION>"
 
-  BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_Pthread"
-  BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_OpenMP"
   BUILD_LIST_CLANG="Serial,Pthread,OpenMP"
 
-  if [ "$SPOT_CHECK" = "True" ]; then
-    # Format: (compiler module-list build-list exe-name warning-flag)
-    COMPILERS=("gcc/7.3.0 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS"
-               "gcc/8.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS"
-               "gcc/9.1 $GCC91_MODULE_LIST "OpenMP,Serial" g++ $GCC_WARNING_FLAGS"
-               "intel/18.0.5 $OLDINTEL_BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS"
-               "clang/8.0 $CLANG8_MODULE_LIST "Cuda_OpenMP,Pthread_Serial" clang++ $CLANG_WARNING_FLAGS"
-               "cuda/10.1 $NVCC_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
-    )
-  else
-    # Format: (compiler module-list build-list exe-name warning-flag)
-    COMPILERS=("cuda/10.0 $NVCC_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
-               "cuda/10.1 $NVCC_SEMSMODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
-               "cuda/11.0 $NVCC11_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
-               "cuda/11.1 $NVCC_SEMSMODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
-               "cuda/11.2 $NVCC11_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
-               "cuda/9.2 $NVCC9_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
-               "clang/8.0 $CLANG8_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS"
-               "clang/8.0 $CLANG8_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS"
-               "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-               "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-               "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-               "gcc/7.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-               "gcc/8.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-               "gcc/9.1 $GCC91_MODULE_LIST "$GCC_BUILD_LIST" g++ $GCC_WARNING_FLAGS"
-               "gcc/9.2.0 $BASE_MODULE_LIST "$GCC_BUILD_LIST" g++ $GCC_WARNING_FLAGS"
-               "intel/17.0.1 $OLDINTEL_BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
-               "intel/18.0.5 $OLDINTEL_BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
-               "intel/19.0.5 $INTEL_BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
-               "clang/5.0.1 $CLANG_BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
-               "clang/7.0.1 $CLANG_BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
-               "clang/9.0.0 $CLANG_BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
-               "clang/10.0.0 $CLANG_BASE_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS"
-    )
-  fi
+  # Format: (compiler module-list build-list exe-name warning-flag)
+  COMPILERS=("cuda/11.1 $NVCC_SEMSMODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
+             "cuda/11.2 $NVCC_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
+             "cuda/11.7 $NVCC_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
+             "cuda/12.0 $NVCC_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
+             "gcc/8.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+             "gcc/9.1 $GCC91_MODULE_LIST "$GCC_BUILD_LIST" g++ $GCC_WARNING_FLAGS"
+             "gcc/9.2.0 $BASE_MODULE_LIST "$GCC_BUILD_LIST" g++ $GCC_WARNING_FLAGS"
+             "intel/19.0.5 $INTEL_BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+             "clang/9.0.0 $CLANG_BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
+             "clang/10.0.0 $CLANG_BASE_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS"
+  )
 
   if [ -z "$ARCH_FLAG" ]; then
-    ARCH_FLAG="--arch=SNB,Volta70"
+    ARCH_FLAG="--arch=Volta70"
   fi
 
 else
diff --git a/packages/kokkos/simd/CMakeLists.txt b/packages/kokkos/simd/CMakeLists.txt
index 83557e61e627f1d8d8a4a7f99c4e8709d924cb27..59e09b85ac3df05da80fd102cee7e51fc5fac9fe 100644
--- a/packages/kokkos/simd/CMakeLists.txt
+++ b/packages/kokkos/simd/CMakeLists.txt
@@ -1,10 +1,5 @@
-
-KOKKOS_SUBPACKAGE(Simd)
-
 IF (NOT Kokkos_INSTALL_TESTING)
   ADD_SUBDIRECTORY(src)
 ENDIF()
 
 KOKKOS_ADD_TEST_DIRECTORIES(unit_tests)
-
-KOKKOS_SUBPACKAGE_POSTPROCESS()
diff --git a/packages/kokkos/simd/cmake/Dependencies.cmake b/packages/kokkos/simd/cmake/Dependencies.cmake
deleted file mode 100644
index 1d71d8af341181f689a6a8bf63036b67584cb138..0000000000000000000000000000000000000000
--- a/packages/kokkos/simd/cmake/Dependencies.cmake
+++ /dev/null
@@ -1,5 +0,0 @@
-TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
-  LIB_REQUIRED_PACKAGES KokkosCore
-  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
-  TEST_OPTIONAL_TPLS CUSPARSE
-  )
diff --git a/packages/kokkos/simd/src/Kokkos_SIMD.hpp b/packages/kokkos/simd/src/Kokkos_SIMD.hpp
index 92807634076ee1719f6c9b2926811cd9fada2f5f..57d4afd88beee86e9bc3c20a412fcd206ee03045 100644
--- a/packages/kokkos/simd/src/Kokkos_SIMD.hpp
+++ b/packages/kokkos/simd/src/Kokkos_SIMD.hpp
@@ -19,19 +19,62 @@
 
 #include <Kokkos_SIMD_Common.hpp>
 
+// suppress NVCC warnings with the [[nodiscard]] attribute on overloaded
+// operators implemented as hidden friends
+#if defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC < 1130
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wattributes"
+#endif
+
 #include <Kokkos_SIMD_Scalar.hpp>
 
-#ifdef KOKKOS_ARCH_AVX2
+#include <Kokkos_Macros.hpp>
+
+// FIXME_OPENMPTARGET The device pass disables all compiler macros checked
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+#if defined(KOKKOS_ARCH_AVX2)
+#include <Kokkos_SIMD_AVX2.hpp>
+#endif
+
+#if defined(KOKKOS_ARCH_AVX512XEON)
+#include <Kokkos_SIMD_AVX512.hpp>
+#endif
+
+#if defined(KOKKOS_ARCH_ARM_NEON)
+#include <Kokkos_SIMD_NEON.hpp>
+#endif
+#else  // KOKKOS_ENABLE_OPENMPTARGET
+#if defined(KOKKOS_ARCH_AVX) && !defined(__AVX__)
+#error "__AVX__ must be defined for KOKKOS_ARCH_AVX"
+#endif
+
+#if defined(KOKKOS_ARCH_AVX2)
+#if !defined(__AVX2__)
+#error "__AVX2__ must be defined for KOKKOS_ARCH_AVX2"
+#endif
 #include <Kokkos_SIMD_AVX2.hpp>
 #endif
 
-#ifdef KOKKOS_ARCH_AVX512XEON
+#if defined(KOKKOS_ARCH_AVX512XEON)
+#if !defined(__AVX512F__)
+#error "__AVX512F__ must be defined for KOKKOS_ARCH_AVX512XEON"
+#endif
 #include <Kokkos_SIMD_AVX512.hpp>
 #endif
 
-#ifdef __ARM_NEON
+#if defined(KOKKOS_ARCH_ARM_NEON)
+#if !defined(__ARM_NEON)
+#error "__ARM_NEON must be definded for KOKKOS_ARCH_ARM_NEON"
+#endif
 #include <Kokkos_SIMD_NEON.hpp>
 #endif
+#endif
+
+#if defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC < 1130
+#pragma GCC diagnostic pop
+#endif
+
+#include <Kokkos_SIMD_Common_Math.hpp>
 
 namespace Kokkos {
 namespace Experimental {
@@ -44,10 +87,10 @@ namespace Impl {
 using host_native = avx512_fixed_size<8>;
 #elif defined(KOKKOS_ARCH_AVX2)
 using host_native  = avx2_fixed_size<4>;
-#elif defined(__ARM_NEON)
+#elif defined(KOKKOS_ARCH_ARM_NEON)
 using host_native  = neon_fixed_size<2>;
 #else
-using host_native  = scalar;
+using host_native   = scalar;
 #endif
 
 template <class T>
@@ -136,14 +179,25 @@ namespace Impl {
 template <class... Abis>
 class abi_set {};
 
+template <typename... Ts>
+class data_types {};
+
 #if defined(KOKKOS_ARCH_AVX512XEON)
-using host_abi_set = abi_set<simd_abi::scalar, simd_abi::avx512_fixed_size<8>>;
+using host_abi_set  = abi_set<simd_abi::scalar, simd_abi::avx512_fixed_size<8>>;
+using data_type_set = data_types<std::int32_t, std::uint32_t, std::int64_t,
+                                 std::uint64_t, double, float>;
 #elif defined(KOKKOS_ARCH_AVX2)
 using host_abi_set = abi_set<simd_abi::scalar, simd_abi::avx2_fixed_size<4>>;
-#elif defined(__ARM_NEON)
+using data_type_set =
+    data_types<std::int32_t, std::int64_t, std::uint64_t, double, float>;
+#elif defined(KOKKOS_ARCH_ARM_NEON)
 using host_abi_set = abi_set<simd_abi::scalar, simd_abi::neon_fixed_size<2>>;
+using data_type_set =
+    data_types<std::int32_t, std::int64_t, std::uint64_t, double, float>;
 #else
-using host_abi_set = abi_set<simd_abi::scalar>;
+using host_abi_set  = abi_set<simd_abi::scalar>;
+using data_type_set = data_types<std::int32_t, std::uint32_t, std::int64_t,
+                                 std::uint64_t, double, float>;
 #endif
 
 using device_abi_set = abi_set<simd_abi::scalar>;
diff --git a/packages/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp b/packages/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp
index 86b944efa5408fdf9a886081cc31b81e5441eb1c..521160b76fc421e4c957c650d3e1527dc878ecbc 100644
--- a/packages/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp
+++ b/packages/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp
@@ -21,9 +21,21 @@
 #include <type_traits>
 
 #include <Kokkos_SIMD_Common.hpp>
+#include <Kokkos_BitManipulation.hpp>  // bit_cast
 
 #include <immintrin.h>
 
+#ifdef KOKKOS_SIMD_COMMON_MATH_HPP
+#error \
+    "Kokkos_SIMD_AVX2.hpp must be included before Kokkos_SIMD_Common_Math.hpp!"
+#endif
+
+// FIXME_HIP ROCm 5.6 and 5.7 can't compile with the intrinsic used here.
+#if defined(__HIPCC__) && (HIP_VERSION_MAJOR == 5) && \
+    ((HIP_VERSION_MINOR == 6) || (HIP_VERSION_MINOR == 7))
+#define KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE
+#endif
+
 namespace Kokkos {
 
 namespace Experimental {
@@ -72,6 +84,18 @@ class simd_mask<double, simd_abi::avx2_fixed_size<4>> {
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd_mask(value_type value)
       : m_value(_mm256_castsi256_pd(_mm256_set1_epi64x(-std::int64_t(value)))) {
   }
+  template <class G,
+            std::enable_if_t<
+                std::is_invocable_r_v<value_type, G,
+                                      std::integral_constant<std::size_t, 0>>,
+                bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd_mask(
+      G&& gen) noexcept
+      : m_value(_mm256_castsi256_pd(_mm256_setr_epi64x(
+            -std::int64_t(gen(std::integral_constant<std::size_t, 0>())),
+            -std::int64_t(gen(std::integral_constant<std::size_t, 1>())),
+            -std::int64_t(gen(std::integral_constant<std::size_t, 2>())),
+            -std::int64_t(gen(std::integral_constant<std::size_t, 3>()))))) {}
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(
       simd_mask<std::int32_t, simd_abi::avx2_fixed_size<4>> const& i32_mask);
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() {
@@ -114,6 +138,94 @@ class simd_mask<double, simd_abi::avx2_fixed_size<4>> {
   }
 };
 
+template <>
+class simd_mask<float, simd_abi::avx2_fixed_size<4>> {
+  __m128 m_value;
+
+ public:
+  class reference {
+    __m128& m_mask;
+    int m_lane;
+    KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION __m128 bit_mask() const {
+      return _mm_castsi128_ps(_mm_setr_epi32(
+          -std::int32_t(m_lane == 0), -std::int32_t(m_lane == 1),
+          -std::int32_t(m_lane == 2), -std::int32_t(m_lane == 3)));
+    }
+
+   public:
+    KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference(__m128& mask_arg,
+                                                    int lane_arg)
+        : m_mask(mask_arg), m_lane(lane_arg) {}
+    KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference
+    operator=(bool value) const {
+      if (value) {
+        m_mask = _mm_or_ps(bit_mask(), m_mask);
+      } else {
+        m_mask = _mm_andnot_ps(bit_mask(), m_mask);
+      }
+      return *this;
+    }
+    KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION operator bool() const {
+      return (_mm_movemask_ps(m_mask) & (1 << m_lane)) != 0;
+    }
+  };
+  using value_type = bool;
+  using abi_type   = simd_abi::avx2_fixed_size<4>;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd_mask(value_type value)
+      : m_value(_mm_castsi128_ps(_mm_set1_epi32(-std::int32_t(value)))) {}
+  template <class G,
+            std::enable_if_t<
+                std::is_invocable_r_v<value_type, G,
+                                      std::integral_constant<std::size_t, 0>>,
+                bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd_mask(
+      G&& gen) noexcept
+      : m_value(_mm_castsi128_ps(_mm_setr_epi32(
+            -std::int32_t(gen(std::integral_constant<std::size_t, 0>())),
+            -std::int32_t(gen(std::integral_constant<std::size_t, 1>())),
+            -std::int32_t(gen(std::integral_constant<std::size_t, 2>())),
+            -std::int32_t(gen(std::integral_constant<std::size_t, 3>()))))) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() {
+    return 4;
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd_mask(
+      __m128 const& value_in)
+      : m_value(value_in) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m128()
+      const {
+    return m_value;
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) {
+    return reference(m_value, int(i));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type
+  operator[](std::size_t i) const {
+    return static_cast<value_type>(
+        reference(const_cast<__m128&>(m_value), int(i)));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask
+  operator||(simd_mask const& other) const {
+    return simd_mask(_mm_or_ps(m_value, other.m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask
+  operator&&(simd_mask const& other) const {
+    return simd_mask(_mm_and_ps(m_value, other.m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask operator!() const {
+    auto const true_value = static_cast<__m128>(simd_mask(true));
+    return simd_mask(_mm_andnot_ps(m_value, true_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION bool operator==(
+      simd_mask const& other) const {
+    return _mm_movemask_ps(m_value) == _mm_movemask_ps(other.m_value);
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION bool operator!=(
+      simd_mask const& other) const {
+    return !operator==(other);
+  }
+};
+
 template <>
 class simd_mask<std::int32_t, simd_abi::avx2_fixed_size<4>> {
   __m128i m_value;
@@ -158,6 +270,18 @@ class simd_mask<std::int32_t, simd_abi::avx2_fixed_size<4>> {
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd_mask(
       __m128i const& value_in)
       : m_value(value_in) {}
+  template <class G,
+            std::enable_if_t<
+                std::is_invocable_r_v<value_type, G,
+                                      std::integral_constant<std::size_t, 0>>,
+                bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd_mask(
+      G&& gen) noexcept
+      : m_value(_mm_setr_epi32(
+            -std::int32_t(gen(std::integral_constant<std::size_t, 0>())),
+            -std::int32_t(gen(std::integral_constant<std::size_t, 1>())),
+            -std::int32_t(gen(std::integral_constant<std::size_t, 2>())),
+            -std::int32_t(gen(std::integral_constant<std::size_t, 3>())))) {}
   template <class U>
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(
       simd_mask<U, abi_type> const& other) {
@@ -243,6 +367,18 @@ class simd_mask<std::int64_t, simd_abi::avx2_fixed_size<4>> {
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd_mask(
       __m256i const& value_in)
       : m_value(value_in) {}
+  template <class G,
+            std::enable_if_t<
+                std::is_invocable_r_v<value_type, G,
+                                      std::integral_constant<std::size_t, 0>>,
+                bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd_mask(
+      G&& gen) noexcept
+      : m_value(_mm256_setr_epi64x(
+            -std::int64_t(gen(std::integral_constant<std::size_t, 0>())),
+            -std::int64_t(gen(std::integral_constant<std::size_t, 1>())),
+            -std::int64_t(gen(std::integral_constant<std::size_t, 2>())),
+            -std::int64_t(gen(std::integral_constant<std::size_t, 3>())))) {}
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(
       simd_mask<std::int32_t, abi_type> const& other)
       : m_value(_mm256_cvtepi32_epi64(static_cast<__m128i>(other))) {}
@@ -327,6 +463,18 @@ class simd_mask<std::uint64_t, simd_abi::avx2_fixed_size<4>> {
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd_mask(
       __m256i const& value_in)
       : m_value(value_in) {}
+  template <class G,
+            std::enable_if_t<
+                std::is_invocable_r_v<value_type, G,
+                                      std::integral_constant<std::size_t, 0>>,
+                bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd_mask(
+      G&& gen) noexcept
+      : m_value(_mm256_setr_epi64x(
+            -std::int64_t(gen(std::integral_constant<std::size_t, 0>())),
+            -std::int64_t(gen(std::integral_constant<std::size_t, 1>())),
+            -std::int64_t(gen(std::integral_constant<std::size_t, 2>())),
+            -std::int64_t(gen(std::integral_constant<std::size_t, 3>())))) {}
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i()
       const {
     return m_value;
@@ -389,12 +537,21 @@ class simd<double, simd_abi::avx2_fixed_size<4>> {
                                       bool> = false>
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value)
       : m_value(_mm256_set1_pd(value_type(value))) {}
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(double a, double b, double c,
-                                             double d)
-      : m_value(_mm256_setr_pd(a, b, c, d)) {}
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd(
       __m256d const& value_in)
       : m_value(value_in) {}
+  template <class G,
+            std::enable_if_t<
+                std::is_invocable_r_v<value_type, G,
+                                      std::integral_constant<std::size_t, 0>>,
+                bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd(
+      G&& gen) noexcept
+      : m_value(_mm256_setr_pd(gen(std::integral_constant<std::size_t, 0>()),
+                               gen(std::integral_constant<std::size_t, 1>()),
+                               gen(std::integral_constant<std::size_t, 2>()),
+                               gen(std::integral_constant<std::size_t, 3>()))) {
+  }
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) {
     return reinterpret_cast<value_type*>(&m_value)[i];
   }
@@ -414,163 +571,214 @@ class simd<double, simd_abi::avx2_fixed_size<4>> {
       const {
     return m_value;
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator<(simd const& other) const {
-    return mask_type(_mm256_cmp_pd(m_value, other.m_value, _CMP_LT_OS));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const
+      noexcept {
+    return simd(
+        _mm256_sub_pd(_mm256_set1_pd(0.0), static_cast<__m256d>(m_value)));
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        _mm256_mul_pd(static_cast<__m256d>(lhs), static_cast<__m256d>(rhs)));
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator/(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        _mm256_div_pd(static_cast<__m256d>(lhs), static_cast<__m256d>(rhs)));
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        _mm256_add_pd(static_cast<__m256d>(lhs), static_cast<__m256d>(rhs)));
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        _mm256_sub_pd(static_cast<__m256d>(lhs), static_cast<__m256d>(rhs)));
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator<(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm256_cmp_pd(static_cast<__m256d>(lhs),
+                                   static_cast<__m256d>(rhs), _CMP_LT_OS));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator>(simd const& other) const {
-    return mask_type(_mm256_cmp_pd(m_value, other.m_value, _CMP_GT_OS));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator>(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm256_cmp_pd(static_cast<__m256d>(lhs),
+                                   static_cast<__m256d>(rhs), _CMP_GT_OS));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator<=(simd const& other) const {
-    return mask_type(_mm256_cmp_pd(m_value, other.m_value, _CMP_LE_OS));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator<=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm256_cmp_pd(static_cast<__m256d>(lhs),
+                                   static_cast<__m256d>(rhs), _CMP_LE_OS));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator>=(simd const& other) const {
-    return mask_type(_mm256_cmp_pd(m_value, other.m_value, _CMP_GE_OS));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator>=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm256_cmp_pd(static_cast<__m256d>(lhs),
+                                   static_cast<__m256d>(rhs), _CMP_GE_OS));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator==(simd const& other) const {
-    return mask_type(_mm256_cmp_pd(m_value, other.m_value, _CMP_EQ_OS));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator==(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm256_cmp_pd(static_cast<__m256d>(lhs),
+                                   static_cast<__m256d>(rhs), _CMP_EQ_OS));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator!=(simd const& other) const {
-    return mask_type(_mm256_cmp_pd(m_value, other.m_value, _CMP_NEQ_OS));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator!=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm256_cmp_pd(static_cast<__m256d>(lhs),
+                                   static_cast<__m256d>(rhs), _CMP_NEQ_OS));
   }
 };
 
-[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<double, simd_abi::avx2_fixed_size<4>>
-    operator*(simd<double, simd_abi::avx2_fixed_size<4>> const& lhs,
-              simd<double, simd_abi::avx2_fixed_size<4>> const& rhs) {
-  return simd<double, simd_abi::avx2_fixed_size<4>>(
-      _mm256_mul_pd(static_cast<__m256d>(lhs), static_cast<__m256d>(rhs)));
-}
+}  // namespace Experimental
 
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<double, simd_abi::avx2_fixed_size<4>>
-    operator/(simd<double, simd_abi::avx2_fixed_size<4>> const& lhs,
-              simd<double, simd_abi::avx2_fixed_size<4>> const& rhs) {
-  return simd<double, simd_abi::avx2_fixed_size<4>>(
-      _mm256_div_pd(static_cast<__m256d>(lhs), static_cast<__m256d>(rhs)));
+    Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>
+    copysign(Experimental::simd<
+                 double, Experimental::simd_abi::avx2_fixed_size<4>> const& a,
+             Experimental::simd<
+                 double, Experimental::simd_abi::avx2_fixed_size<4>> const& b) {
+  __m256d const sign_mask = _mm256_set1_pd(-0.0);
+  return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm256_xor_pd(_mm256_andnot_pd(sign_mask, static_cast<__m256d>(a)),
+                    _mm256_and_pd(sign_mask, static_cast<__m256d>(b))));
 }
 
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<double, simd_abi::avx2_fixed_size<4>>
-    operator+(simd<double, simd_abi::avx2_fixed_size<4>> const& lhs,
-              simd<double, simd_abi::avx2_fixed_size<4>> const& rhs) {
-  return simd<double, simd_abi::avx2_fixed_size<4>>(
-      _mm256_add_pd(static_cast<__m256d>(lhs), static_cast<__m256d>(rhs)));
+    Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>
+    abs(Experimental::simd<
+        double, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  __m256d const sign_mask = _mm256_set1_pd(-0.0);
+  return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm256_andnot_pd(sign_mask, static_cast<__m256d>(a)));
 }
 
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<double, simd_abi::avx2_fixed_size<4>>
-    operator-(simd<double, simd_abi::avx2_fixed_size<4>> const& lhs,
-              simd<double, simd_abi::avx2_fixed_size<4>> const& rhs) {
-  return simd<double, simd_abi::avx2_fixed_size<4>>(
-      _mm256_sub_pd(static_cast<__m256d>(lhs), static_cast<__m256d>(rhs)));
+    Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>
+    floor(Experimental::simd<
+          double, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm256_round_pd(static_cast<__m256d>(a),
+                      (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)));
 }
 
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<double, simd_abi::avx2_fixed_size<4>>
-    operator-(simd<double, simd_abi::avx2_fixed_size<4>> const& a) {
-  return simd<double, simd_abi::avx2_fixed_size<4>>(
-      _mm256_sub_pd(_mm256_set1_pd(0.0), static_cast<__m256d>(a)));
+    Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>
+    ceil(Experimental::simd<
+         double, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm256_round_pd(static_cast<__m256d>(a),
+                      (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)));
 }
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<double, simd_abi::avx2_fixed_size<4>> copysign(
-    simd<double, simd_abi::avx2_fixed_size<4>> const& a,
-    simd<double, simd_abi::avx2_fixed_size<4>> const& b) {
-  __m256d const sign_mask = _mm256_set1_pd(-0.0);
-  return simd<double, simd_abi::avx2_fixed_size<4>>(
-      _mm256_xor_pd(_mm256_andnot_pd(sign_mask, static_cast<__m256d>(a)),
-                    _mm256_and_pd(sign_mask, static_cast<__m256d>(b))));
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>
+    round(Experimental::simd<
+          double, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm256_round_pd(static_cast<__m256d>(a),
+                      (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)));
 }
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<double, simd_abi::avx2_fixed_size<4>> abs(
-    simd<double, simd_abi::avx2_fixed_size<4>> const& a) {
-  __m256d const sign_mask = _mm256_set1_pd(-0.0);
-  return simd<double, simd_abi::avx2_fixed_size<4>>(
-      _mm256_andnot_pd(sign_mask, static_cast<__m256d>(a)));
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>
+    trunc(Experimental::simd<
+          double, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm256_round_pd(static_cast<__m256d>(a),
+                      (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)));
 }
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<double, simd_abi::avx2_fixed_size<4>> sqrt(
-    simd<double, simd_abi::avx2_fixed_size<4>> const& a) {
-  return simd<double, simd_abi::avx2_fixed_size<4>>(
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>
+    sqrt(Experimental::simd<
+         double, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
       _mm256_sqrt_pd(static_cast<__m256d>(a)));
 }
 
 #ifdef __INTEL_COMPILER
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<double, simd_abi::avx2_fixed_size<4>> cbrt(
-    simd<double, simd_abi::avx2_fixed_size<4>> const& a) {
-  return simd<double, simd_abi::avx2_fixed_size<4>>(
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>
+    cbrt(Experimental::simd<
+         double, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
       _mm256_cbrt_pd(static_cast<__m256d>(a)));
 }
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<double, simd_abi::avx2_fixed_size<4>> exp(
-    simd<double, simd_abi::avx2_fixed_size<4>> const& a) {
-  return simd<double, simd_abi::avx2_fixed_size<4>>(
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>
+    exp(Experimental::simd<
+        double, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
       _mm256_exp_pd(static_cast<__m256d>(a)));
 }
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<double, simd_abi::avx2_fixed_size<4>> log(
-    simd<double, simd_abi::avx2_fixed_size<4>> const& a) {
-  return simd<double, simd_abi::avx2_fixed_size<4>>(
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>
+    log(Experimental::simd<
+        double, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
       _mm256_log_pd(static_cast<__m256d>(a)));
 }
 
 #endif
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<double, simd_abi::avx2_fixed_size<4>> fma(
-    simd<double, simd_abi::avx2_fixed_size<4>> const& a,
-    simd<double, simd_abi::avx2_fixed_size<4>> const& b,
-    simd<double, simd_abi::avx2_fixed_size<4>> const& c) {
-  return simd<double, simd_abi::avx2_fixed_size<4>>(
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>
+    fma(Experimental::simd<double,
+                           Experimental::simd_abi::avx2_fixed_size<4>> const& a,
+        Experimental::simd<double,
+                           Experimental::simd_abi::avx2_fixed_size<4>> const& b,
+        Experimental::simd<
+            double, Experimental::simd_abi::avx2_fixed_size<4>> const& c) {
+  return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
       _mm256_fmadd_pd(static_cast<__m256d>(a), static_cast<__m256d>(b),
                       static_cast<__m256d>(c)));
 }
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<double, simd_abi::avx2_fixed_size<4>> max(
-    simd<double, simd_abi::avx2_fixed_size<4>> const& a,
-    simd<double, simd_abi::avx2_fixed_size<4>> const& b) {
-  return simd<double, simd_abi::avx2_fixed_size<4>>(
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>
+    max(Experimental::simd<double,
+                           Experimental::simd_abi::avx2_fixed_size<4>> const& a,
+        Experimental::simd<
+            double, Experimental::simd_abi::avx2_fixed_size<4>> const& b) {
+  return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
       _mm256_max_pd(static_cast<__m256d>(a), static_cast<__m256d>(b)));
 }
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<double, simd_abi::avx2_fixed_size<4>> min(
-    simd<double, simd_abi::avx2_fixed_size<4>> const& a,
-    simd<double, simd_abi::avx2_fixed_size<4>> const& b) {
-  return simd<double, simd_abi::avx2_fixed_size<4>>(
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>
+    min(Experimental::simd<double,
+                           Experimental::simd_abi::avx2_fixed_size<4>> const& a,
+        Experimental::simd<
+            double, Experimental::simd_abi::avx2_fixed_size<4>> const& b) {
+  return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
       _mm256_min_pd(static_cast<__m256d>(a), static_cast<__m256d>(b)));
 }
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<double, simd_abi::avx2_fixed_size<4>> condition(
-    simd_mask<double, simd_abi::avx2_fixed_size<4>> const& a,
-    simd<double, simd_abi::avx2_fixed_size<4>> const& b,
-    simd<double, simd_abi::avx2_fixed_size<4>> const& c) {
+namespace Experimental {
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<double, simd_abi::avx2_fixed_size<4>>
+    condition(simd_mask<double, simd_abi::avx2_fixed_size<4>> const& a,
+              simd<double, simd_abi::avx2_fixed_size<4>> const& b,
+              simd<double, simd_abi::avx2_fixed_size<4>> const& c) {
   return simd<double, simd_abi::avx2_fixed_size<4>>(
       _mm256_blendv_pd(static_cast<__m256d>(c), static_cast<__m256d>(b),
                        static_cast<__m256d>(a)));
 }
 
 template <>
-class simd<std::int32_t, simd_abi::avx2_fixed_size<4>> {
-  __m128i m_value;
+class simd<float, simd_abi::avx2_fixed_size<4>> {
+  __m128 m_value;
 
  public:
-  using value_type = std::int32_t;
+  using value_type = float;
   using abi_type   = simd_abi::avx2_fixed_size<4>;
   using mask_type  = simd_mask<value_type, abi_type>;
   using reference  = value_type&;
@@ -582,28 +790,23 @@ class simd<std::int32_t, simd_abi::avx2_fixed_size<4>> {
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() {
     return 4;
   }
-  template <class U, std::enable_if_t<std::is_convertible_v<U, value_type>,
-                                      bool> = false>
+  template <typename U, std::enable_if_t<std::is_convertible_v<U, value_type>,
+                                         bool> = false>
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value)
-      : m_value(_mm_set1_epi32(value_type(value))) {}
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(std::int32_t a, std::int32_t b,
-                                             std::int32_t c, std::int32_t d)
-      : m_value(_mm_setr_epi32(a, b, c, d)) {}
-  template <class G,
+      : m_value(_mm_set1_ps(value_type(value))) {}
+  template <typename G,
             std::enable_if_t<
                 std::is_invocable_r_v<value_type, G,
                                       std::integral_constant<std::size_t, 0>>,
                 bool> = false>
-  KOKKOS_FORCEINLINE_FUNCTION simd(G&& gen)
-      : simd(gen(std::integral_constant<std::size_t, 0>()),
-             gen(std::integral_constant<std::size_t, 1>()),
-             gen(std::integral_constant<std::size_t, 2>()),
-             gen(std::integral_constant<std::size_t, 3>())) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(G&& gen)
+      : m_value(_mm_setr_ps(gen(std::integral_constant<std::size_t, 0>()),
+                            gen(std::integral_constant<std::size_t, 1>()),
+                            gen(std::integral_constant<std::size_t, 2>()),
+                            gen(std::integral_constant<std::size_t, 3>()))) {}
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd(
-      __m128i const& value_in)
+      __m128 const& value_in)
       : m_value(value_in) {}
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd(
-      simd<std::uint64_t, abi_type> const& other);
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) {
     return reinterpret_cast<value_type*>(&m_value)[i];
   }
@@ -613,75 +816,208 @@ class simd<std::int32_t, simd_abi::avx2_fixed_size<4>> {
   }
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr,
                                                        element_aligned_tag) {
-    m_value = _mm_maskload_epi32(ptr, static_cast<__m128i>(mask_type(true)));
+    m_value = _mm_loadu_ps(ptr);
   }
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(
       value_type* ptr, element_aligned_tag) const {
-    _mm_maskstore_epi32(ptr, static_cast<__m128i>(mask_type(true)), m_value);
+    _mm_storeu_ps(ptr, m_value);
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m128i()
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m128()
       const {
     return m_value;
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator==(simd const& other) const {
-    return mask_type(_mm_cmpeq_epi32(m_value, other.m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const
+      noexcept {
+    return simd(_mm_sub_ps(_mm_set1_ps(0.0), m_value));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator>(simd const& other) const {
-    return mask_type(_mm_cmpgt_epi32(m_value, other.m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(_mm_mul_ps(lhs.m_value, rhs.m_value));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator<(simd const& other) const {
-    return mask_type(_mm_cmplt_epi32(m_value, other.m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator/(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(_mm_div_ps(lhs.m_value, rhs.m_value));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator<=(simd const& other) const {
-    return ((*this) < other) || ((*this) == other);
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(_mm_add_ps(lhs.m_value, rhs.m_value));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator>=(simd const& other) const {
-    return ((*this) > other) || ((*this) == other);
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(_mm_sub_ps(lhs.m_value, rhs.m_value));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator!=(simd const& other) const {
-    return !((*this) == other);
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator<(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm_cmplt_ps(lhs.m_value, rhs.m_value));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator>(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm_cmpgt_ps(lhs.m_value, rhs.m_value));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator<=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm_cmple_ps(lhs.m_value, rhs.m_value));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator>=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm_cmpge_ps(lhs.m_value, rhs.m_value));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator==(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm_cmpeq_ps(lhs.m_value, rhs.m_value));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator!=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm_cmpneq_ps(lhs.m_value, rhs.m_value));
   }
 };
 
+}  // namespace Experimental
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    float, Experimental::simd_abi::avx2_fixed_size<4>>
+copysign(
+    Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> const&
+        a,
+    Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> const&
+        b) {
+  __m128 const sign_mask = _mm_set1_ps(-0.0);
+  return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm_xor_ps(_mm_andnot_ps(sign_mask, static_cast<__m128>(a)),
+                 _mm_and_ps(sign_mask, static_cast<__m128>(b))));
+}
+
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<std::int32_t, simd_abi::avx2_fixed_size<4>>
-    operator-(simd<std::int32_t, simd_abi::avx2_fixed_size<4>> const& lhs,
-              simd<std::int32_t, simd_abi::avx2_fixed_size<4>> const& rhs) {
-  return simd<std::int32_t, simd_abi::avx2_fixed_size<4>>(
-      _mm_sub_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs)));
+    Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>
+    abs(Experimental::simd<
+        float, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  __m128 const sign_mask = _mm_set1_ps(-0.0);
+  return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm_andnot_ps(sign_mask, static_cast<__m128>(a)));
 }
 
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<std::int32_t, simd_abi::avx2_fixed_size<4>>
-    operator+(simd<std::int32_t, simd_abi::avx2_fixed_size<4>> const& lhs,
-              simd<std::int32_t, simd_abi::avx2_fixed_size<4>> const& rhs) {
-  return simd<std::int32_t, simd_abi::avx2_fixed_size<4>>(
-      _mm_add_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs)));
+    Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>
+    floor(Experimental::simd<
+          float, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm_round_ps(static_cast<__m128>(a),
+                   (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)));
 }
 
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<std::int32_t, simd_abi::avx2_fixed_size<4>>
-    condition(simd_mask<std::int32_t, simd_abi::avx2_fixed_size<4>> const& a,
-              simd<std::int32_t, simd_abi::avx2_fixed_size<4>> const& b,
-              simd<std::int32_t, simd_abi::avx2_fixed_size<4>> const& c) {
-  return simd<std::int32_t, simd_abi::avx2_fixed_size<4>>(_mm_castps_si128(
-      _mm_blendv_ps(_mm_castsi128_ps(static_cast<__m128i>(c)),
-                    _mm_castsi128_ps(static_cast<__m128i>(b)),
-                    _mm_castsi128_ps(static_cast<__m128i>(a)))));
+    Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>
+    ceil(Experimental::simd<
+         float, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm_round_ps(static_cast<__m128>(a),
+                   (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>
+    round(Experimental::simd<
+          float, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm_round_ps(static_cast<__m128>(a),
+                   (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>
+    trunc(Experimental::simd<
+          float, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm_round_ps(static_cast<__m128>(a),
+                   (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>
+    sqrt(Experimental::simd<
+         float, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm_sqrt_ps(static_cast<__m128>(a)));
+}
+
+#ifdef __INTEL_COMPILER
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>
+    cbrt(Experimental::simd<
+         float, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm_cbrt_ps(static_cast<__m128>(a)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>
+    exp(Experimental::simd<
+        float, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm_exp_ps(static_cast<__m128>(a)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>
+    log(Experimental::simd<
+        float, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm_log_ps(static_cast<__m128>(a)));
+}
+
+#endif
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    float, Experimental::simd_abi::avx2_fixed_size<4>>
+fma(Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> const&
+        a,
+    Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> const&
+        b,
+    Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> const&
+        c) {
+  return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm_fmadd_ps(static_cast<__m128>(a), static_cast<__m128>(b),
+                   static_cast<__m128>(c)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    float, Experimental::simd_abi::avx2_fixed_size<4>>
+max(Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> const&
+        a,
+    Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> const&
+        b) {
+  return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm_max_ps(static_cast<__m128>(a), static_cast<__m128>(b)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    float, Experimental::simd_abi::avx2_fixed_size<4>>
+min(Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> const&
+        a,
+    Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>> const&
+        b) {
+  return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm_min_ps(static_cast<__m128>(a), static_cast<__m128>(b)));
+}
+
+namespace Experimental {
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<float, simd_abi::avx2_fixed_size<4>>
+    condition(simd_mask<float, simd_abi::avx2_fixed_size<4>> const& a,
+              simd<float, simd_abi::avx2_fixed_size<4>> const& b,
+              simd<float, simd_abi::avx2_fixed_size<4>> const& c) {
+  return simd<float, simd_abi::avx2_fixed_size<4>>(_mm_blendv_ps(
+      static_cast<__m128>(c), static_cast<__m128>(b), static_cast<__m128>(a)));
 }
 
 template <>
-class simd<std::int64_t, simd_abi::avx2_fixed_size<4>> {
-  __m256i m_value;
+class simd<std::int32_t, simd_abi::avx2_fixed_size<4>> {
+  __m128i m_value;
 
  public:
-  using value_type = std::int64_t;
+  using value_type = std::int32_t;
   using abi_type   = simd_abi::avx2_fixed_size<4>;
   using mask_type  = simd_mask<value_type, abi_type>;
   using reference  = value_type&;
@@ -696,28 +1032,24 @@ class simd<std::int64_t, simd_abi::avx2_fixed_size<4>> {
   template <class U, std::enable_if_t<std::is_convertible_v<U, value_type>,
                                       bool> = false>
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value)
-      : m_value(_mm256_set1_epi64x(value_type(value))) {}
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(std::int64_t a, std::int64_t b,
-                                             std::int64_t c, std::int64_t d)
-      : m_value(_mm256_setr_epi64x(a, b, c, d)) {}
+      : m_value(_mm_set1_epi32(value_type(value))) {}
   template <class G,
             std::enable_if_t<
                 std::is_invocable_r_v<value_type, G,
                                       std::integral_constant<std::size_t, 0>>,
                 bool> = false>
-  KOKKOS_FORCEINLINE_FUNCTION simd(G&& gen)
-      : simd(gen(std::integral_constant<std::size_t, 0>()),
-             gen(std::integral_constant<std::size_t, 1>()),
-             gen(std::integral_constant<std::size_t, 2>()),
-             gen(std::integral_constant<std::size_t, 3>())) {}
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd(
-      __m256i const& value_in)
+      G&& gen) noexcept
+      : m_value(_mm_setr_epi32(gen(std::integral_constant<std::size_t, 0>()),
+                               gen(std::integral_constant<std::size_t, 1>()),
+                               gen(std::integral_constant<std::size_t, 2>()),
+                               gen(std::integral_constant<std::size_t, 3>()))) {
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd(
+      __m128i const& value_in)
       : m_value(value_in) {}
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd(
       simd<std::uint64_t, abi_type> const& other);
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(
-      simd<std::int32_t, abi_type> const& other)
-      : m_value(_mm256_cvtepi32_epi64(static_cast<__m128i>(other))) {}
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) {
     return reinterpret_cast<value_type*>(&m_value)[i];
   }
@@ -727,62 +1059,329 @@ class simd<std::int64_t, simd_abi::avx2_fixed_size<4>> {
   }
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr,
                                                        element_aligned_tag) {
-    m_value = _mm256_maskload_epi64(ptr, static_cast<__m256i>(mask_type(true)));
+    // FIXME_HIP ROCm 5.6 can't compile with the intrinsic used here.
+#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE
+    m_value = _mm_loadu_si128(reinterpret_cast<__m128i const*>(ptr));
+#else
+    m_value = _mm_maskload_epi32(ptr, static_cast<__m128i>(mask_type(true)));
+#endif
   }
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(
       value_type* ptr, element_aligned_tag) const {
-    _mm256_maskstore_epi64(ptr, static_cast<__m256i>(mask_type(true)), m_value);
+    _mm_maskstore_epi32(ptr, static_cast<__m128i>(mask_type(true)), m_value);
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i()
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m128i()
       const {
     return m_value;
   }
-  // AVX2 only has eq and gt comparisons for int64
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator==(simd const& other) const {
-    return mask_type(_mm256_cmpeq_epi64(m_value, other.m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator==(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(
+        _mm_cmpeq_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator>(simd const& other) const {
-    return mask_type(_mm256_cmpgt_epi64(m_value, other.m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator>(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(
+        _mm_cmpgt_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator<(simd const& other) const {
-    return other > (*this);
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator<(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(
+        _mm_cmplt_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator<=(simd const& other) const {
-    return ((*this) < other) || ((*this) == other);
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator<=(simd const& lhs, simd const& rhs) noexcept {
+    return (lhs < rhs) || (lhs == rhs);
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator>=(simd const& other) const {
-    return ((*this) > other) || ((*this) == other);
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator>=(simd const& lhs, simd const& rhs) noexcept {
+    return (lhs > rhs) || (lhs == rhs);
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator!=(simd const& other) const {
-    return !((*this) == other);
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator!=(simd const& lhs, simd const& rhs) noexcept {
+    return !(lhs == rhs);
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        _mm_sub_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        _mm_add_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs)));
   }
-};
 
-[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<std::int64_t, simd_abi::avx2_fixed_size<4>>
-    operator-(simd<std::int64_t, simd_abi::avx2_fixed_size<4>> const& lhs,
-              simd<std::int64_t, simd_abi::avx2_fixed_size<4>> const& rhs) {
-  return simd<std::int64_t, simd_abi::avx2_fixed_size<4>>(
-      _mm256_sub_epi64(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs)));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>(
+      simd const& lhs, int rhs) noexcept {
+    return simd(_mm_srai_epi32(static_cast<__m128i>(lhs), rhs));
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        _mm_srav_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs)));
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<(
+      simd const& lhs, int rhs) noexcept {
+    return simd(_mm_slli_epi32(static_cast<__m128i>(lhs), rhs));
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        _mm_sllv_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs)));
+  }
+};
+
+}  // namespace Experimental
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<std::int32_t, Experimental::simd_abi::avx2_fixed_size<4>>
+    abs(Experimental::simd<
+        std::int32_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  __m128i const rhs = static_cast<__m128i>(a);
+  return Experimental::simd<std::int32_t,
+                            Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm_abs_epi32(rhs));
 }
 
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<std::int64_t, simd_abi::avx2_fixed_size<4>>
-    operator-(simd<std::int64_t, simd_abi::avx2_fixed_size<4>> const& a) {
-  return simd<std::int64_t, simd_abi::avx2_fixed_size<4>>(0) - a;
+    Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>
+    floor(Experimental::simd<
+          std::int32_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm256_cvtepi32_pd(static_cast<__m128i>(a)));
 }
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<std::int64_t, simd_abi::avx2_fixed_size<4>> condition(
-    simd_mask<std::int64_t, simd_abi::avx2_fixed_size<4>> const& a,
-    simd<std::int64_t, simd_abi::avx2_fixed_size<4>> const& b,
-    simd<std::int64_t, simd_abi::avx2_fixed_size<4>> const& c) {
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>
+    ceil(Experimental::simd<
+         std::int32_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm256_cvtepi32_pd(static_cast<__m128i>(a)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>
+    round(Experimental::simd<
+          std::int32_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm256_cvtepi32_pd(static_cast<__m128i>(a)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>
+    trunc(Experimental::simd<
+          std::int32_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm256_cvtepi32_pd(static_cast<__m128i>(a)));
+}
+
+namespace Experimental {
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<std::int32_t, simd_abi::avx2_fixed_size<4>>
+    condition(simd_mask<std::int32_t, simd_abi::avx2_fixed_size<4>> const& a,
+              simd<std::int32_t, simd_abi::avx2_fixed_size<4>> const& b,
+              simd<std::int32_t, simd_abi::avx2_fixed_size<4>> const& c) {
+  return simd<std::int32_t, simd_abi::avx2_fixed_size<4>>(_mm_castps_si128(
+      _mm_blendv_ps(_mm_castsi128_ps(static_cast<__m128i>(c)),
+                    _mm_castsi128_ps(static_cast<__m128i>(b)),
+                    _mm_castsi128_ps(static_cast<__m128i>(a)))));
+}
+
+template <>
+class simd<std::int64_t, simd_abi::avx2_fixed_size<4>> {
+  __m256i m_value;
+
+  static_assert(sizeof(long long) == 8);
+
+ public:
+  using value_type = std::int64_t;
+  using abi_type   = simd_abi::avx2_fixed_size<4>;
+  using mask_type  = simd_mask<value_type, abi_type>;
+  using reference  = value_type&;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd()            = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&)      = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() {
+    return 4;
+  }
+  template <class U, std::enable_if_t<std::is_convertible_v<U, value_type>,
+                                      bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value)
+      : m_value(_mm256_set1_epi64x(value_type(value))) {}
+  template <class G,
+            std::enable_if_t<
+                std::is_invocable_r_v<value_type, G,
+                                      std::integral_constant<std::size_t, 0>>,
+                bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd(
+      G&& gen) noexcept
+      : m_value(_mm256_setr_epi64x(
+            gen(std::integral_constant<std::size_t, 0>()),
+            gen(std::integral_constant<std::size_t, 1>()),
+            gen(std::integral_constant<std::size_t, 2>()),
+            gen(std::integral_constant<std::size_t, 3>()))) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd(
+      __m256i const& value_in)
+      : m_value(value_in) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(
+      simd<std::uint64_t, abi_type> const& other);
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(
+      simd<std::int32_t, abi_type> const& other)
+      : m_value(_mm256_cvtepi32_epi64(static_cast<__m128i>(other))) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) {
+    return reinterpret_cast<value_type*>(&m_value)[i];
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type
+  operator[](std::size_t i) const {
+    return reinterpret_cast<value_type const*>(&m_value)[i];
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr,
+                                                       element_aligned_tag) {
+#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE
+    m_value = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(ptr));
+#else
+    m_value = _mm256_maskload_epi64(reinterpret_cast<long long const*>(ptr),
+                                    static_cast<__m256i>(mask_type(true)));
+#endif
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(
+      value_type* ptr, element_aligned_tag) const {
+    _mm256_maskstore_epi64(reinterpret_cast<long long*>(ptr),
+                           static_cast<__m256i>(mask_type(true)), m_value);
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i()
+      const {
+    return m_value;
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const
+      noexcept {
+    return simd(
+        _mm256_sub_epi64(_mm256_set1_epi64x(0), static_cast<__m256i>(m_value)));
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        _mm256_sub_epi64(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        _mm256_add_epi64(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs)));
+  }
+
+  // AVX2 only has eq and gt comparisons for int64
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator==(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm256_cmpeq_epi64(static_cast<__m256i>(lhs),
+                                        static_cast<__m256i>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator>(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm256_cmpgt_epi64(static_cast<__m256i>(lhs),
+                                        static_cast<__m256i>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator<(simd const& lhs, simd const& rhs) noexcept {
+    return rhs > lhs;
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator<=(simd const& lhs, simd const& rhs) noexcept {
+    return (lhs < rhs) || (lhs == rhs);
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator>=(simd const& lhs, simd const& rhs) noexcept {
+    return (lhs > rhs) || (lhs == rhs);
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator!=(simd const& lhs, simd const& rhs) noexcept {
+    return !(lhs == rhs);
+  }
+
+  // Shift right arithmetic for 64bit packed ints is not availalbe in AVX2
+  // [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd(
+  //     simd const& lhs, int rhs) noexcept {
+  //   return simd(_mm256_srai_epi64(static_cast<__m256i>(lhs), rhs));
+  // }
+
+  // [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd(
+  //     simd const& lhs, simd const& rhs) noexcept {
+  //   return simd(_mm256_srav_epi64(static_cast<__m256i>(lhs),
+  //                                 static_cast<__m256i>(rhs))));
+  // }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<(
+      simd const& lhs, int rhs) noexcept {
+    return simd(_mm256_slli_epi64(static_cast<__m256i>(lhs), rhs));
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(_mm256_sllv_epi64(static_cast<__m256i>(lhs),
+                                  static_cast<__m256i>(rhs)));
+  }
+};
+
+}  // namespace Experimental
+
+// Manually computing absolute values, because _mm256_abs_epi64
+// is not in AVX2; it's available in AVX512.
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<std::int64_t, Experimental::simd_abi::avx2_fixed_size<4>>
+    abs(Experimental::simd<
+        std::int64_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  return Experimental::simd<std::int64_t,
+                            Experimental::simd_abi::avx2_fixed_size<4>>(
+      [&](std::size_t i) { return (a[i] < 0) ? -a[i] : a[i]; });
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>
+    floor(Experimental::simd<
+          std::int64_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm256_setr_pd(a[0], a[1], a[2], a[3]));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>
+    ceil(Experimental::simd<
+         std::int64_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm256_setr_pd(a[0], a[1], a[2], a[3]));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>
+    round(Experimental::simd<
+          std::int64_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm256_setr_pd(a[0], a[1], a[2], a[3]));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>
+    trunc(Experimental::simd<
+          std::int64_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm256_setr_pd(a[0], a[1], a[2], a[3]));
+}
+
+namespace Experimental {
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<std::int64_t, simd_abi::avx2_fixed_size<4>>
+    condition(simd_mask<std::int64_t, simd_abi::avx2_fixed_size<4>> const& a,
+              simd<std::int64_t, simd_abi::avx2_fixed_size<4>> const& b,
+              simd<std::int64_t, simd_abi::avx2_fixed_size<4>> const& c) {
   return simd<std::int64_t, simd_abi::avx2_fixed_size<4>>(_mm256_castpd_si256(
       _mm256_blendv_pd(_mm256_castsi256_pd(static_cast<__m256i>(c)),
                        _mm256_castsi256_pd(static_cast<__m256i>(b)),
@@ -809,8 +1408,20 @@ class simd<std::uint64_t, simd_abi::avx2_fixed_size<4>> {
   template <class U, std::enable_if_t<std::is_convertible_v<U, value_type>,
                                       bool> = false>
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value)
-      : m_value(_mm256_set1_epi64x(bit_cast<std::int64_t>(value_type(value)))) {
-  }
+      : m_value(_mm256_set1_epi64x(
+            Kokkos::bit_cast<std::int64_t>(value_type(value)))) {}
+  template <class G,
+            std::enable_if_t<
+                std::is_invocable_r_v<value_type, G,
+                                      std::integral_constant<std::size_t, 0>>,
+                bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd(
+      G&& gen) noexcept
+      : m_value(_mm256_setr_epi64x(
+            gen(std::integral_constant<std::size_t, 0>()),
+            gen(std::integral_constant<std::size_t, 1>()),
+            gen(std::integral_constant<std::size_t, 2>()),
+            gen(std::integral_constant<std::size_t, 3>()))) {}
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr simd(__m256i const& value_in)
       : m_value(value_in) {}
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd(
@@ -826,43 +1437,65 @@ class simd<std::uint64_t, simd_abi::avx2_fixed_size<4>> {
   operator[](std::size_t i) const {
     return reinterpret_cast<value_type const*>(&m_value)[i];
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd
-  operator>>(unsigned int rhs) const {
-    return _mm256_srli_epi64(m_value, rhs);
-  }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator>>(
-      simd<std::int32_t, simd_abi::avx2_fixed_size<4>> const& rhs) const {
-    return _mm256_srlv_epi64(m_value,
-                             _mm256_cvtepi32_epi64(static_cast<__m128i>(rhs)));
-  }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd
-  operator<<(unsigned int rhs) const {
-    return _mm256_slli_epi64(m_value, rhs);
-  }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator<<(
-      simd<std::int32_t, simd_abi::avx2_fixed_size<4>> const& rhs) const {
-    return _mm256_sllv_epi64(m_value,
-                             _mm256_cvtepi32_epi64(static_cast<__m128i>(rhs)));
-  }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd
-  operator&(simd const& other) const {
-    return _mm256_and_si256(m_value, other.m_value);
-  }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd
-  operator|(simd const& other) const {
-    return _mm256_or_si256(m_value, other.m_value);
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr,
+                                                       element_aligned_tag) {
+#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE
+    m_value = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(ptr));
+#else
+    m_value = _mm256_maskload_epi64(reinterpret_cast<long long const*>(ptr),
+                                    static_cast<__m256i>(mask_type(true)));
+#endif
   }
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i()
       const {
     return m_value;
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator==(simd const& other) const {
-    return mask_type(_mm256_cmpeq_epi64(m_value, other.m_value));
-  }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator!=(simd const& other) const {
-    return !((*this) == other);
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        _mm256_add_epi64(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        _mm256_sub_epi64(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>(
+      simd const& lhs, int rhs) noexcept {
+    return _mm256_srli_epi64(static_cast<__m256i>(lhs), rhs);
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>(
+      simd const& lhs, simd const& rhs) noexcept {
+    return _mm256_srlv_epi64(static_cast<__m256i>(lhs),
+                             static_cast<__m256i>(rhs));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<(
+      simd const& lhs, int rhs) noexcept {
+    return _mm256_slli_epi64(static_cast<__m256i>(lhs), rhs);
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<(
+      simd const& lhs, simd const& rhs) noexcept {
+    return _mm256_sllv_epi64(static_cast<__m256i>(lhs),
+                             static_cast<__m256i>(rhs));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator&(
+      simd const& lhs, simd const& rhs) noexcept {
+    return _mm256_and_si256(static_cast<__m256i>(lhs),
+                            static_cast<__m256i>(rhs));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator|(
+      simd const& lhs, simd const& rhs) noexcept {
+    return _mm256_or_si256(static_cast<__m256i>(lhs),
+                           static_cast<__m256i>(rhs));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator==(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm256_cmpeq_epi64(static_cast<__m256i>(lhs),
+                                        static_cast<__m256i>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator!=(simd const& lhs, simd const& rhs) noexcept {
+    return !(lhs == rhs);
   }
 };
 
@@ -871,11 +1504,54 @@ simd<std::int64_t, simd_abi::avx2_fixed_size<4>>::simd(
     simd<std::uint64_t, simd_abi::avx2_fixed_size<4>> const& other)
     : m_value(static_cast<__m256i>(other)) {}
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<std::uint64_t, simd_abi::avx2_fixed_size<4>> condition(
-    simd_mask<std::uint64_t, simd_abi::avx2_fixed_size<4>> const& a,
-    simd<std::uint64_t, simd_abi::avx2_fixed_size<4>> const& b,
-    simd<std::uint64_t, simd_abi::avx2_fixed_size<4>> const& c) {
+}  // namespace Experimental
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    std::uint64_t, Experimental::simd_abi::avx2_fixed_size<4>>
+abs(Experimental::simd<std::uint64_t,
+                       Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  return a;
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>
+    floor(Experimental::simd<
+          std::uint64_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm256_setr_pd(a[0], a[1], a[2], a[3]));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>
+    ceil(Experimental::simd<
+         std::uint64_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm256_setr_pd(a[0], a[1], a[2], a[3]));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>
+    round(Experimental::simd<
+          std::uint64_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm256_setr_pd(a[0], a[1], a[2], a[3]));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>
+    trunc(Experimental::simd<
+          std::uint64_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) {
+  return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>(
+      _mm256_setr_pd(a[0], a[1], a[2], a[3]));
+}
+
+namespace Experimental {
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<std::uint64_t, simd_abi::avx2_fixed_size<4>>
+    condition(simd_mask<std::uint64_t, simd_abi::avx2_fixed_size<4>> const& a,
+              simd<std::uint64_t, simd_abi::avx2_fixed_size<4>> const& b,
+              simd<std::uint64_t, simd_abi::avx2_fixed_size<4>> const& c) {
   return simd<std::uint64_t, simd_abi::avx2_fixed_size<4>>(_mm256_castpd_si256(
       _mm256_blendv_pd(_mm256_castsi256_pd(static_cast<__m256i>(c)),
                        _mm256_castsi256_pd(static_cast<__m256i>(b)),
@@ -905,14 +1581,7 @@ class const_where_expression<simd_mask<double, simd_abi::avx2_fixed_size<4>>,
  public:
   const_where_expression(mask_type const& mask_arg, value_type const& value_arg)
       : m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {}
-  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr mask_type const&
-  mask() const {
-    return m_mask;
-  }
-  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr value_type const&
-  value() const {
-    return m_value;
-  }
+
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
   void copy_to(double* mem, element_aligned_tag) const {
     _mm256_maskstore_pd(mem, _mm256_castpd_si256(static_cast<__m256d>(m_mask)),
@@ -926,6 +1595,16 @@ class const_where_expression<simd_mask<double, simd_abi::avx2_fixed_size<4>>,
       if (m_mask[lane]) mem[index[lane]] = m_value[lane];
     }
   }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const&
+  impl_get_value() const {
+    return m_value;
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const&
+  impl_get_mask() const {
+    return m_mask;
+  }
 };
 
 template <>
@@ -949,7 +1628,7 @@ class where_expression<simd_mask<double, simd_abi::avx2_fixed_size<4>>,
       double const* mem,
       simd<std::int32_t, simd_abi::avx2_fixed_size<4>> const& index) {
     m_value = value_type(_mm256_mask_i32gather_pd(
-        _mm256_set1_pd(0.0), mem, static_cast<__m128i>(index),
+        static_cast<__m256d>(m_value), mem, static_cast<__m128i>(index),
         static_cast<__m256d>(m_mask), 8));
   }
   template <class U,
@@ -966,6 +1645,85 @@ class where_expression<simd_mask<double, simd_abi::avx2_fixed_size<4>>,
   }
 };
 
+template <>
+class const_where_expression<simd_mask<float, simd_abi::avx2_fixed_size<4>>,
+                             simd<float, simd_abi::avx2_fixed_size<4>>> {
+ public:
+  using abi_type   = simd_abi::avx2_fixed_size<4>;
+  using value_type = simd<float, abi_type>;
+  using mask_type  = simd_mask<float, abi_type>;
+
+ protected:
+  value_type& m_value;
+  mask_type const& m_mask;
+
+ public:
+  const_where_expression(mask_type const& mask_arg, value_type const& value_arg)
+      : m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {}
+
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void copy_to(float* mem, element_aligned_tag) const {
+    _mm_maskstore_ps(mem, _mm_castps_si128(static_cast<__m128>(m_mask)),
+                     static_cast<__m128>(m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void scatter_to(
+      float* mem,
+      simd<std::int32_t, simd_abi::avx2_fixed_size<4>> const& index) const {
+    for (std::size_t lane = 0; lane < 4; ++lane) {
+      if (m_mask[lane]) mem[index[lane]] = m_value[lane];
+    }
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const&
+  impl_get_value() const {
+    return m_value;
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const&
+  impl_get_mask() const {
+    return m_mask;
+  }
+};
+
+template <>
+class where_expression<simd_mask<float, simd_abi::avx2_fixed_size<4>>,
+                       simd<float, simd_abi::avx2_fixed_size<4>>>
+    : public const_where_expression<
+          simd_mask<float, simd_abi::avx2_fixed_size<4>>,
+          simd<float, simd_abi::avx2_fixed_size<4>>> {
+ public:
+  where_expression(
+      simd_mask<float, simd_abi::avx2_fixed_size<4>> const& mask_arg,
+      simd<float, simd_abi::avx2_fixed_size<4>>& value_arg)
+      : const_where_expression(mask_arg, value_arg) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void copy_from(float const* mem, element_aligned_tag) {
+    m_value = value_type(
+        _mm_maskload_ps(mem, _mm_castps_si128(static_cast<__m128>(m_mask))));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void gather_from(
+      float const* mem,
+      simd<std::int32_t, simd_abi::avx2_fixed_size<4>> const& index) {
+    m_value = value_type(_mm_mask_i32gather_ps(static_cast<__m128>(m_value),
+                                               mem, static_cast<__m128i>(index),
+                                               static_cast<__m128>(m_mask), 4));
+  }
+  template <class U,
+            std::enable_if_t<std::is_convertible_v<
+                                 U, simd<float, simd_abi::avx2_fixed_size<4>>>,
+                             bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(U&& x) {
+    auto const x_as_value_type =
+        static_cast<simd<float, simd_abi::avx2_fixed_size<4>>>(
+            std::forward<U>(x));
+    m_value = simd<float, simd_abi::avx2_fixed_size<4>>(_mm_blendv_ps(
+        static_cast<__m128>(m_value), static_cast<__m128>(x_as_value_type),
+        static_cast<__m128>(m_mask)));
+  }
+};
+
 template <>
 class const_where_expression<
     simd_mask<std::int32_t, simd_abi::avx2_fixed_size<4>>,
@@ -982,19 +1740,30 @@ class const_where_expression<
  public:
   const_where_expression(mask_type const& mask_arg, value_type const& value_arg)
       : m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {}
-  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr mask_type const&
-  mask() const {
-    return m_mask;
-  }
-  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr value_type const&
-  value() const {
-    return m_value;
-  }
+
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
   void copy_to(std::int32_t* mem, element_aligned_tag) const {
     _mm_maskstore_epi32(mem, static_cast<__m128i>(m_mask),
                         static_cast<__m128i>(m_value));
   }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void scatter_to(
+      std::int32_t* mem,
+      simd<std::int32_t, simd_abi::avx2_fixed_size<4>> const& index) const {
+    for (std::size_t lane = 0; lane < 4; ++lane) {
+      if (m_mask[lane]) mem[index[lane]] = m_value[lane];
+    }
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const&
+  impl_get_value() const {
+    return m_value;
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const&
+  impl_get_mask() const {
+    return m_mask;
+  }
 };
 
 template <>
@@ -1010,11 +1779,218 @@ class where_expression<simd_mask<std::int32_t, simd_abi::avx2_fixed_size<4>>,
       : const_where_expression(mask_arg, value_arg) {}
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
   void copy_from(std::int32_t const* mem, element_aligned_tag) {
+#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE
+    __m128i tmp = _mm_loadu_si128(reinterpret_cast<__m128i const*>(mem));
+    m_value     = value_type(_mm_and_si128(tmp, static_cast<__m128i>(m_mask)));
+#else
     m_value = value_type(_mm_maskload_epi32(mem, static_cast<__m128i>(m_mask)));
+#endif
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void gather_from(
+      std::int32_t const* mem,
+      simd<std::int32_t, simd_abi::avx2_fixed_size<4>> const& index) {
+    m_value = value_type(_mm_mask_i32gather_epi32(
+        static_cast<__m128i>(m_value), mem, static_cast<__m128i>(index),
+        static_cast<__m128i>(m_mask), 4));
+  }
+  template <
+      class U,
+      std::enable_if_t<std::is_convertible_v<
+                           U, simd<std::int32_t, simd_abi::avx2_fixed_size<4>>>,
+                       bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(U&& x) {
+    auto const x_as_value_type =
+        static_cast<simd<std::int32_t, simd_abi::avx2_fixed_size<4>>>(
+            std::forward<U>(x));
+    m_value = simd<std::int32_t, simd_abi::avx2_fixed_size<4>>(_mm_castps_si128(
+        _mm_blendv_ps(_mm_castsi128_ps(static_cast<__m128i>(m_value)),
+                      _mm_castsi128_ps(static_cast<__m128i>(x_as_value_type)),
+                      _mm_castsi128_ps(static_cast<__m128i>(m_mask)))));
+  }
+};
+
+template <>
+class const_where_expression<
+    simd_mask<std::int64_t, simd_abi::avx2_fixed_size<4>>,
+    simd<std::int64_t, simd_abi::avx2_fixed_size<4>>> {
+ public:
+  using abi_type   = simd_abi::avx2_fixed_size<4>;
+  using value_type = simd<std::int64_t, abi_type>;
+  using mask_type  = simd_mask<std::int64_t, abi_type>;
+
+ protected:
+  value_type& m_value;
+  mask_type const& m_mask;
+
+ public:
+  const_where_expression(mask_type const& mask_arg, value_type const& value_arg)
+      : m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {}
+
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(
+      std::int64_t* mem, element_aligned_tag) const {
+    _mm256_maskstore_epi64(reinterpret_cast<long long*>(mem),
+                           static_cast<__m256i>(m_mask),
+                           static_cast<__m256i>(m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void scatter_to(
+      std::int64_t* mem,
+      simd<std::int32_t, simd_abi::avx2_fixed_size<4>> const& index) const {
+    for (std::size_t lane = 0; lane < 4; ++lane) {
+      if (m_mask[lane]) mem[index[lane]] = m_value[lane];
+    }
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const&
+  impl_get_value() const {
+    return m_value;
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const&
+  impl_get_mask() const {
+    return m_mask;
+  }
+};
+
+template <>
+class where_expression<simd_mask<std::int64_t, simd_abi::avx2_fixed_size<4>>,
+                       simd<std::int64_t, simd_abi::avx2_fixed_size<4>>>
+    : public const_where_expression<
+          simd_mask<std::int64_t, simd_abi::avx2_fixed_size<4>>,
+          simd<std::int64_t, simd_abi::avx2_fixed_size<4>>> {
+ public:
+  where_expression(
+      simd_mask<std::int64_t, simd_abi::avx2_fixed_size<4>> const& mask_arg,
+      simd<std::int64_t, simd_abi::avx2_fixed_size<4>>& value_arg)
+      : const_where_expression(mask_arg, value_arg) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(std::int64_t const* mem,
+                                                       element_aligned_tag) {
+#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE
+    __m256i tmp = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(mem));
+    m_value = value_type(_mm256_and_si256(tmp, static_cast<__m256i>(m_mask)));
+#else
+    m_value = value_type(_mm256_maskload_epi64(
+        reinterpret_cast<long long const*>(mem), static_cast<__m256i>(m_mask)));
+#endif
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void gather_from(
+      std::int64_t const* mem,
+      simd<std::int32_t, simd_abi::avx2_fixed_size<4>> const& index) {
+    m_value = value_type(_mm256_mask_i32gather_epi64(
+        static_cast<__m256i>(m_value), reinterpret_cast<long long const*>(mem),
+        static_cast<__m128i>(index), static_cast<__m256i>(m_mask), 8));
+  }
+  template <
+      class u,
+      std::enable_if_t<std::is_convertible_v<
+                           u, simd<std::int64_t, simd_abi::avx2_fixed_size<4>>>,
+                       bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(u&& x) {
+    auto const x_as_value_type =
+        static_cast<simd<std::int64_t, simd_abi::avx2_fixed_size<4>>>(
+            std::forward<u>(x));
+    m_value = simd<std::int64_t, simd_abi::avx2_fixed_size<4>>(
+        _mm256_castpd_si256(_mm256_blendv_pd(
+            _mm256_castsi256_pd(static_cast<__m256i>(m_value)),
+            _mm256_castsi256_pd(static_cast<__m256i>(x_as_value_type)),
+            _mm256_castsi256_pd(static_cast<__m256i>(m_mask)))));
+  }
+};
+
+template <>
+class const_where_expression<
+    simd_mask<std::uint64_t, simd_abi::avx2_fixed_size<4>>,
+    simd<std::uint64_t, simd_abi::avx2_fixed_size<4>>> {
+ public:
+  using abi_type   = simd_abi::avx2_fixed_size<4>;
+  using value_type = simd<std::uint64_t, abi_type>;
+  using mask_type  = simd_mask<std::uint64_t, abi_type>;
+
+ protected:
+  value_type& m_value;
+  mask_type const& m_mask;
+
+ public:
+  const_where_expression(mask_type const& mask_arg, value_type const& value_arg)
+      : m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {}
+
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(
+      std::uint64_t* mem, element_aligned_tag) const {
+    _mm256_maskstore_epi64(reinterpret_cast<long long*>(mem),
+                           static_cast<__m256i>(m_mask),
+                           static_cast<__m256i>(m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void scatter_to(
+      std::uint64_t* mem,
+      simd<std::int32_t, simd_abi::avx2_fixed_size<4>> const& index) const {
+    for (std::size_t lane = 0; lane < 4; ++lane) {
+      if (m_mask[lane]) mem[index[lane]] = m_value[lane];
+    }
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const&
+  impl_get_value() const {
+    return m_value;
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const&
+  impl_get_mask() const {
+    return m_mask;
+  }
+};
+
+template <>
+class where_expression<simd_mask<std::uint64_t, simd_abi::avx2_fixed_size<4>>,
+                       simd<std::uint64_t, simd_abi::avx2_fixed_size<4>>>
+    : public const_where_expression<
+          simd_mask<std::uint64_t, simd_abi::avx2_fixed_size<4>>,
+          simd<std::uint64_t, simd_abi::avx2_fixed_size<4>>> {
+ public:
+  where_expression(
+      simd_mask<std::uint64_t, simd_abi::avx2_fixed_size<4>> const& mask_arg,
+      simd<std::uint64_t, simd_abi::avx2_fixed_size<4>>& value_arg)
+      : const_where_expression(mask_arg, value_arg) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(std::uint64_t const* mem,
+                                                       element_aligned_tag) {
+#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE
+    __m256i tmp = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(mem));
+    m_value = value_type(_mm256_and_si256(tmp, static_cast<__m256i>(m_mask)));
+#else
+    m_value = value_type(_mm256_maskload_epi64(
+        reinterpret_cast<long long const*>(mem), static_cast<__m256i>(m_mask)));
+#endif
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void gather_from(
+      std::uint64_t const* mem,
+      simd<std::int32_t, simd_abi::avx2_fixed_size<4>> const& index) {
+    m_value = value_type(_mm256_mask_i32gather_epi64(
+        static_cast<__m256i>(m_value), reinterpret_cast<long long const*>(mem),
+        static_cast<__m128i>(index), static_cast<__m256i>(m_mask), 8));
+  }
+  template <class u,
+            std::enable_if_t<
+                std::is_convertible_v<
+                    u, simd<std::uint64_t, simd_abi::avx2_fixed_size<4>>>,
+                bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(u&& x) {
+    auto const x_as_value_type =
+        static_cast<simd<std::uint64_t, simd_abi::avx2_fixed_size<4>>>(
+            std::forward<u>(x));
+    m_value = simd<std::uint64_t, simd_abi::avx2_fixed_size<4>>(
+        _mm256_castpd_si256(_mm256_blendv_pd(
+            _mm256_castsi256_pd(static_cast<__m256i>(m_value)),
+            _mm256_castsi256_pd(static_cast<__m256i>(x_as_value_type)),
+            _mm256_castsi256_pd(static_cast<__m256i>(m_mask)))));
   }
 };
 
 }  // namespace Experimental
 }  // namespace Kokkos
 
+#undef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE
+
 #endif
diff --git a/packages/kokkos/simd/src/Kokkos_SIMD_AVX512.hpp b/packages/kokkos/simd/src/Kokkos_SIMD_AVX512.hpp
index b63bb569bb7573282db1d298988d0041f2929474..c5d1717ad4ea765bf7648c68b1430cac7d3fae95 100644
--- a/packages/kokkos/simd/src/Kokkos_SIMD_AVX512.hpp
+++ b/packages/kokkos/simd/src/Kokkos_SIMD_AVX512.hpp
@@ -21,9 +21,15 @@
 #include <type_traits>
 
 #include <Kokkos_SIMD_Common.hpp>
+#include <Kokkos_BitManipulation.hpp>  // bit_cast
 
 #include <immintrin.h>
 
+#ifdef KOKKOS_SIMD_COMMON_MATH_HPP
+#error \
+    "Kokkos_SIMD_AVX512.hpp must be included before Kokkos_SIMD_Common_Math.hpp!"
+#endif
+
 namespace Kokkos {
 namespace Experimental {
 
@@ -71,6 +77,29 @@ class simd_mask<T, simd_abi::avx512_fixed_size<8>> {
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(
       simd_mask<U, simd_abi::avx512_fixed_size<8>> const& other)
       : m_value(static_cast<__mmask8>(other)) {}
+  template <class G,
+            std::enable_if_t<
+                std::is_invocable_r_v<value_type, G,
+                                      std::integral_constant<std::size_t, 0>>,
+                bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(G&& gen) : m_value(false) {
+    reference(m_value, int(0)) =
+        static_cast<bool>(gen(std::integral_constant<std::size_t, 0>()));
+    reference(m_value, int(1)) =
+        static_cast<bool>(gen(std::integral_constant<std::size_t, 1>()));
+    reference(m_value, int(2)) =
+        static_cast<bool>(gen(std::integral_constant<std::size_t, 2>()));
+    reference(m_value, int(3)) =
+        static_cast<bool>(gen(std::integral_constant<std::size_t, 3>()));
+    reference(m_value, int(4)) =
+        static_cast<bool>(gen(std::integral_constant<std::size_t, 4>()));
+    reference(m_value, int(5)) =
+        static_cast<bool>(gen(std::integral_constant<std::size_t, 5>()));
+    reference(m_value, int(6)) =
+        static_cast<bool>(gen(std::integral_constant<std::size_t, 6>()));
+    reference(m_value, int(7)) =
+        static_cast<bool>(gen(std::integral_constant<std::size_t, 7>()));
+  }
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() {
     return 8;
   }
@@ -86,7 +115,8 @@ class simd_mask<T, simd_abi::avx512_fixed_size<8>> {
   }
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type
   operator[](std::size_t i) const {
-    return static_cast<value_type>(reference(m_value, int(i)));
+    auto const bit_mask = __mmask8(std::int16_t(1 << i));
+    return (m_value & bit_mask) != 0;
   }
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask
   operator||(simd_mask const& other) const {
@@ -111,11 +141,11 @@ class simd_mask<T, simd_abi::avx512_fixed_size<8>> {
 };
 
 template <>
-class simd<std::int32_t, simd_abi::avx512_fixed_size<8>> {
-  __m256i m_value;
+class simd<double, simd_abi::avx512_fixed_size<8>> {
+  __m512d m_value;
 
  public:
-  using value_type = std::int32_t;
+  using value_type = double;
   using abi_type   = simd_abi::avx512_fixed_size<8>;
   using mask_type  = simd_mask<value_type, abi_type>;
   using reference  = value_type&;
@@ -130,12 +160,10 @@ class simd<std::int32_t, simd_abi::avx512_fixed_size<8>> {
   template <class U, std::enable_if_t<std::is_convertible_v<U, value_type>,
                                       bool> = false>
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value)
-      : m_value(_mm256_set1_epi32(value_type(value))) {}
+      : m_value(_mm512_set1_pd(value_type(value))) {}
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd(
-      __m256i const& value_in)
+      __m512d const& value_in)
       : m_value(value_in) {}
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd(
-      simd<std::uint64_t, abi_type> const& other);
   template <class G,
             std::enable_if_t<
                 // basically, can you do { value_type r =
@@ -143,16 +171,17 @@ class simd<std::int32_t, simd_abi::avx512_fixed_size<8>> {
                 std::is_invocable_r_v<value_type, G,
                                       std::integral_constant<std::size_t, 0>>,
                 bool> = false>
-  KOKKOS_FORCEINLINE_FUNCTION simd(G&& gen)
-      : m_value(
-            _mm256_setr_epi32(gen(std::integral_constant<std::size_t, 0>()),
-                              gen(std::integral_constant<std::size_t, 1>()),
-                              gen(std::integral_constant<std::size_t, 2>()),
-                              gen(std::integral_constant<std::size_t, 3>()),
-                              gen(std::integral_constant<std::size_t, 4>()),
-                              gen(std::integral_constant<std::size_t, 5>()),
-                              gen(std::integral_constant<std::size_t, 6>()),
-                              gen(std::integral_constant<std::size_t, 7>()))) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd(
+      G&& gen) noexcept
+      : m_value(_mm512_setr_pd(gen(std::integral_constant<std::size_t, 0>()),
+                               gen(std::integral_constant<std::size_t, 1>()),
+                               gen(std::integral_constant<std::size_t, 2>()),
+                               gen(std::integral_constant<std::size_t, 3>()),
+                               gen(std::integral_constant<std::size_t, 4>()),
+                               gen(std::integral_constant<std::size_t, 5>()),
+                               gen(std::integral_constant<std::size_t, 6>()),
+                               gen(std::integral_constant<std::size_t, 7>()))) {
+  }
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) {
     return reinterpret_cast<value_type*>(&m_value)[i];
   }
@@ -160,92 +189,248 @@ class simd<std::int32_t, simd_abi::avx512_fixed_size<8>> {
   operator[](std::size_t i) const {
     return reinterpret_cast<value_type const*>(&m_value)[i];
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(
-      value_type* ptr, element_aligned_tag) const {
-    _mm256_mask_storeu_epi32(ptr, static_cast<__mmask8>(mask_type(true)),
-                             m_value);
-  }
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr,
                                                        element_aligned_tag) {
-    m_value = _mm256_mask_loadu_epi32(
-        _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr);
+    m_value = _mm512_loadu_pd(ptr);
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i()
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(
+      value_type* ptr, element_aligned_tag) const {
+    _mm512_storeu_pd(ptr, m_value);
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512d()
       const {
     return m_value;
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator<(simd const& other) const {
-    return mask_type(_mm256_cmplt_epi32_mask(m_value, other.m_value));
-  }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator>(simd const& other) const {
-    return mask_type(_mm256_cmplt_epi32_mask(other.m_value, m_value));
-  }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator<=(simd const& other) const {
-    return mask_type(_mm256_cmple_epi32_mask(m_value, other.m_value));
-  }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator>=(simd const& other) const {
-    return mask_type(_mm256_cmple_epi32_mask(other.m_value, m_value));
-  }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator==(simd const& other) const {
-    return mask_type(_mm256_cmpeq_epi32_mask(m_value, other.m_value));
-  }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator!=(simd const& other) const {
-    return mask_type(_mm256_cmpneq_epi32_mask(m_value, other.m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const
+      noexcept {
+    return simd(_mm512_sub_pd(_mm512_set1_pd(0.0), m_value));
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        _mm512_mul_pd(static_cast<__m512d>(lhs), static_cast<__m512d>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator/(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        _mm512_div_pd(static_cast<__m512d>(lhs), static_cast<__m512d>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        _mm512_add_pd(static_cast<__m512d>(lhs), static_cast<__m512d>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        _mm512_sub_pd(static_cast<__m512d>(lhs), static_cast<__m512d>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator<(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm512_cmp_pd_mask(static_cast<__m512d>(lhs),
+                                        static_cast<__m512d>(rhs), _CMP_LT_OS));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator>(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm512_cmp_pd_mask(static_cast<__m512d>(rhs),
+                                        static_cast<__m512d>(lhs), _CMP_GT_OS));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator<=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm512_cmp_pd_mask(static_cast<__m512d>(lhs),
+                                        static_cast<__m512d>(rhs), _CMP_LE_OS));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator>=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm512_cmp_pd_mask(static_cast<__m512d>(rhs),
+                                        static_cast<__m512d>(lhs), _CMP_GE_OS));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator==(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm512_cmp_pd_mask(static_cast<__m512d>(lhs),
+                                        static_cast<__m512d>(rhs), _CMP_EQ_OS));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator!=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm512_cmp_pd_mask(
+        static_cast<__m512d>(lhs), static_cast<__m512d>(rhs), _CMP_NEQ_OS));
   }
 };
 
+}  // namespace Experimental
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    double, Experimental::simd_abi::avx512_fixed_size<8>>
+copysign(
+    Experimental::simd<double,
+                       Experimental::simd_abi::avx512_fixed_size<8>> const& a,
+    Experimental::simd<double,
+                       Experimental::simd_abi::avx512_fixed_size<8>> const& b) {
+  static const __m512i sign_mask =
+      reinterpret_cast<__m512i>(static_cast<__m512d>(
+          Experimental::simd<
+              double, Experimental::simd_abi::avx512_fixed_size<8>>(-0.0)));
+  return Experimental::simd<double,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      reinterpret_cast<__m512d>(_mm512_xor_epi64(
+          _mm512_andnot_epi64(
+              sign_mask, reinterpret_cast<__m512i>(static_cast<__m512d>(a))),
+          _mm512_and_epi64(
+              sign_mask, reinterpret_cast<__m512i>(static_cast<__m512d>(b))))));
+}
+
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<std::int32_t, simd_abi::avx512_fixed_size<8>>
-    operator*(simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& lhs,
-              simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& rhs) {
-  return simd<std::int32_t, simd_abi::avx512_fixed_size<8>>(
-      _mm256_mullo_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs)));
+    Experimental::simd<double, Experimental::simd_abi::avx512_fixed_size<8>>
+    abs(Experimental::simd<
+        double, Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  __m512d const rhs = static_cast<__m512d>(a);
+#if defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 830)
+  return Experimental::simd<double,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      (__m512d)_mm512_and_epi64((__m512i)rhs,
+                                _mm512_set1_epi64(0x7fffffffffffffffLL)));
+#else
+  return Experimental::simd<double,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm512_abs_pd(rhs));
+#endif
 }
 
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<std::int32_t, simd_abi::avx512_fixed_size<8>>
-    operator+(simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& lhs,
-              simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& rhs) {
-  return simd<std::int32_t, simd_abi::avx512_fixed_size<8>>(
-      _mm256_add_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs)));
+    Experimental::simd<double, Experimental::simd_abi::avx512_fixed_size<8>>
+    floor(Experimental::simd<
+          double, Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  __m512d const val = static_cast<__m512d>(a);
+  return Experimental::simd<double,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm512_roundscale_pd(val, _MM_FROUND_TO_NEG_INF));
 }
 
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<std::int32_t, simd_abi::avx512_fixed_size<8>>
-    operator-(simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& lhs,
-              simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& rhs) {
-  return simd<std::int32_t, simd_abi::avx512_fixed_size<8>>(
-      _mm256_sub_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs)));
+    Experimental::simd<double, Experimental::simd_abi::avx512_fixed_size<8>>
+    ceil(Experimental::simd<
+         double, Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  __m512d const val = static_cast<__m512d>(a);
+  return Experimental::simd<double,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm512_roundscale_pd(val, _MM_FROUND_TO_POS_INF));
 }
 
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<std::int32_t, simd_abi::avx512_fixed_size<8>>
-    operator-(simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& a) {
-  return simd<std::int32_t, simd_abi::avx512_fixed_size<8>>(0) - a;
+    Experimental::simd<double, Experimental::simd_abi::avx512_fixed_size<8>>
+    round(Experimental::simd<
+          double, Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  __m512d const val = static_cast<__m512d>(a);
+  return Experimental::simd<double,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm512_roundscale_pd(val, _MM_FROUND_TO_NEAREST_INT));
 }
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<std::int32_t, simd_abi::avx512_fixed_size<8>> condition(
-    simd_mask<std::int32_t, simd_abi::avx512_fixed_size<8>> const& a,
-    simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& b,
-    simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& c) {
-  return simd<std::int32_t, simd_abi::avx512_fixed_size<8>>(
-      _mm256_mask_blend_epi32(static_cast<__mmask8>(a), static_cast<__m256i>(c),
-                              static_cast<__m256i>(b)));
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::avx512_fixed_size<8>>
+    trunc(Experimental::simd<
+          double, Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  __m512d const val = static_cast<__m512d>(a);
+  return Experimental::simd<double,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm512_roundscale_pd(val, _MM_FROUND_TO_ZERO));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::avx512_fixed_size<8>>
+    sqrt(Experimental::simd<
+         double, Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  return Experimental::simd<double,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm512_sqrt_pd(static_cast<__m512d>(a)));
+}
+
+#ifdef __INTEL_COMPILER
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::avx512_fixed_size<8>>
+    cbrt(Experimental::simd<
+         double, Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  return Experimental::simd<double,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm512_cbrt_pd(static_cast<__m512d>(a)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::avx512_fixed_size<8>>
+    exp(Experimental::simd<
+        double, Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  return Experimental::simd<double,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm512_exp_pd(static_cast<__m512d>(a)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::avx512_fixed_size<8>>
+    log(Experimental::simd<
+        double, Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  return Experimental::simd<double,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm512_log_pd(static_cast<__m512d>(a)));
+}
+
+#endif
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    double, Experimental::simd_abi::avx512_fixed_size<8>>
+fma(Experimental::simd<double,
+                       Experimental::simd_abi::avx512_fixed_size<8>> const& a,
+    Experimental::simd<double,
+                       Experimental::simd_abi::avx512_fixed_size<8>> const& b,
+    Experimental::simd<double,
+                       Experimental::simd_abi::avx512_fixed_size<8>> const& c) {
+  return Experimental::simd<double,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm512_fmadd_pd(static_cast<__m512d>(a), static_cast<__m512d>(b),
+                      static_cast<__m512d>(c)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    double, Experimental::simd_abi::avx512_fixed_size<8>>
+max(Experimental::simd<double,
+                       Experimental::simd_abi::avx512_fixed_size<8>> const& a,
+    Experimental::simd<double,
+                       Experimental::simd_abi::avx512_fixed_size<8>> const& b) {
+  return Experimental::simd<double,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm512_max_pd(static_cast<__m512d>(a), static_cast<__m512d>(b)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    double, Experimental::simd_abi::avx512_fixed_size<8>>
+min(Experimental::simd<double,
+                       Experimental::simd_abi::avx512_fixed_size<8>> const& a,
+    Experimental::simd<double,
+                       Experimental::simd_abi::avx512_fixed_size<8>> const& b) {
+  return Experimental::simd<double,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm512_min_pd(static_cast<__m512d>(a), static_cast<__m512d>(b)));
+}
+
+namespace Experimental {
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<double, simd_abi::avx512_fixed_size<8>>
+    condition(simd_mask<double, simd_abi::avx512_fixed_size<8>> const& a,
+              simd<double, simd_abi::avx512_fixed_size<8>> const& b,
+              simd<double, simd_abi::avx512_fixed_size<8>> const& c) {
+  return simd<double, simd_abi::avx512_fixed_size<8>>(
+      _mm512_mask_blend_pd(static_cast<__mmask8>(a), static_cast<__m512d>(c),
+                           static_cast<__m512d>(b)));
 }
 
 template <>
-class simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> {
-  __m256i m_value;
+class simd<float, simd_abi::avx512_fixed_size<8>> {
+  __m256 m_value;
 
  public:
-  using value_type = std::uint32_t;
+  using value_type = float;
   using abi_type   = simd_abi::avx512_fixed_size<8>;
   using mask_type  = simd_mask<value_type, abi_type>;
   using reference  = value_type&;
@@ -260,13 +445,25 @@ class simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> {
   template <class U, std::enable_if_t<std::is_convertible_v<U, value_type>,
                                       bool> = false>
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value)
-      : m_value(_mm256_set1_epi32(bit_cast<std::int32_t>(value_type(value)))) {}
+      : m_value(_mm256_set1_ps(value_type(value))) {}
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd(
-      __m256i const& value_in)
+      __m256 const& value_in)
       : m_value(value_in) {}
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd(
-      simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& other)
-      : m_value(static_cast<__m256i>(other)) {}
+  template <class G,
+            std::enable_if_t<
+                std::is_invocable_r_v<value_type, G,
+                                      std::integral_constant<std::size_t, 0>>,
+                bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(G&& gen)
+      : m_value(_mm256_setr_ps(gen(std::integral_constant<std::size_t, 0>()),
+                               gen(std::integral_constant<std::size_t, 1>()),
+                               gen(std::integral_constant<std::size_t, 2>()),
+                               gen(std::integral_constant<std::size_t, 3>()),
+                               gen(std::integral_constant<std::size_t, 4>()),
+                               gen(std::integral_constant<std::size_t, 5>()),
+                               gen(std::integral_constant<std::size_t, 6>()),
+                               gen(std::integral_constant<std::size_t, 7>()))) {
+  }
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) {
     return reinterpret_cast<value_type*>(&m_value)[i];
   }
@@ -274,76 +471,225 @@ class simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> {
   operator[](std::size_t i) const {
     return reinterpret_cast<value_type const*>(&m_value)[i];
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i()
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr,
+                                                       element_aligned_tag) {
+    m_value = _mm256_loadu_ps(ptr);
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(
+      value_type* ptr, element_aligned_tag) const {
+    _mm256_storeu_ps(ptr, m_value);
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256()
       const {
     return m_value;
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator<(simd const& other) const {
-    return mask_type(_mm256_cmplt_epu32_mask(m_value, other.m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const
+      noexcept {
+    return simd(_mm256_sub_ps(_mm256_set1_ps(0.0), m_value));
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(_mm256_mul_ps(lhs.m_value, rhs.m_value));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator/(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(_mm256_div_ps(lhs.m_value, rhs.m_value));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(_mm256_add_ps(lhs.m_value, rhs.m_value));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator>(simd const& other) const {
-    return mask_type(_mm256_cmplt_epu32_mask(other.m_value, m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(_mm256_sub_ps(lhs.m_value, rhs.m_value));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator<=(simd const& other) const {
-    return mask_type(_mm256_cmple_epu32_mask(m_value, other.m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator<(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm256_cmp_ps_mask(lhs.m_value, rhs.m_value, _CMP_LT_OS));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator>=(simd const& other) const {
-    return mask_type(_mm256_cmple_epu32_mask(other.m_value, m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator>(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm256_cmp_ps_mask(lhs.m_value, rhs.m_value, _CMP_GT_OS));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator==(simd const& other) const {
-    return mask_type(_mm256_cmpeq_epu32_mask(m_value, other.m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator<=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm256_cmp_ps_mask(lhs.m_value, rhs.m_value, _CMP_LE_OS));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator!=(simd const& other) const {
-    return mask_type(_mm256_cmpneq_epu32_mask(m_value, other.m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator>=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm256_cmp_ps_mask(lhs.m_value, rhs.m_value, _CMP_GE_OS));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator==(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm256_cmp_ps_mask(lhs.m_value, rhs.m_value, _CMP_EQ_OS));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator!=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm256_cmp_ps_mask(lhs.m_value, rhs.m_value, _CMP_NEQ_OS));
   }
 };
 
+}  // namespace Experimental
+
+KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+Experimental::simd<float, Experimental::simd_abi::avx512_fixed_size<8>>
+copysign(
+    Experimental::simd<float,
+                       Experimental::simd_abi::avx512_fixed_size<8>> const& a,
+    Experimental::simd<float,
+                       Experimental::simd_abi::avx512_fixed_size<8>> const& b) {
+  __m256 const sign_mask = _mm256_set1_ps(-0.0);
+  return Experimental::simd<float,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm256_xor_ps(_mm256_andnot_ps(sign_mask, static_cast<__m256>(a)),
+                    _mm256_and_ps(sign_mask, static_cast<__m256>(b))));
+}
+
+KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+Experimental::simd<float, Experimental::simd_abi::avx512_fixed_size<8>> abs(
+    Experimental::simd<float,
+                       Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  __m256 const sign_mask = _mm256_set1_ps(-0.0);
+  return Experimental::simd<float,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm256_andnot_ps(sign_mask, static_cast<__m256>(a)));
+}
+
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<std::uint32_t, simd_abi::avx512_fixed_size<8>>
-    operator*(simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> const& lhs,
-              simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> const& rhs) {
-  return simd<std::uint32_t, simd_abi::avx512_fixed_size<8>>(
-      _mm256_mullo_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs)));
+    Experimental::simd<float, Experimental::simd_abi::avx512_fixed_size<8>>
+    floor(Experimental::simd<
+          float, Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  __m256 const val = static_cast<__m256>(a);
+  return Experimental::simd<float,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm256_roundscale_ps(val, _MM_FROUND_TO_NEG_INF));
 }
 
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<std::uint32_t, simd_abi::avx512_fixed_size<8>>
-    operator+(simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> const& lhs,
-              simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> const& rhs) {
-  return simd<std::uint32_t, simd_abi::avx512_fixed_size<8>>(
-      _mm256_add_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs)));
+    Experimental::simd<float, Experimental::simd_abi::avx512_fixed_size<8>>
+    ceil(Experimental::simd<
+         float, Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  __m256 const val = static_cast<__m256>(a);
+  return Experimental::simd<float,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm256_roundscale_ps(val, _MM_FROUND_TO_POS_INF));
 }
 
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<std::uint32_t, simd_abi::avx512_fixed_size<8>>
-    operator-(simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> const& lhs,
-              simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> const& rhs) {
-  return simd<std::uint32_t, simd_abi::avx512_fixed_size<8>>(
-      _mm256_sub_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs)));
+    Experimental::simd<float, Experimental::simd_abi::avx512_fixed_size<8>>
+    round(Experimental::simd<
+          float, Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  __m256 const val = static_cast<__m256>(a);
+  return Experimental::simd<float,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm256_roundscale_ps(val, _MM_FROUND_TO_NEAREST_INT));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<float, Experimental::simd_abi::avx512_fixed_size<8>>
+    trunc(Experimental::simd<
+          float, Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  __m256 const val = static_cast<__m256>(a);
+  return Experimental::simd<float,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm256_roundscale_ps(val, _MM_FROUND_TO_ZERO));
 }
 
 KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> condition(
-    simd_mask<std::uint32_t, simd_abi::avx512_fixed_size<8>> const& a,
-    simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> const& b,
-    simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> const& c) {
-  return simd<std::uint32_t, simd_abi::avx512_fixed_size<8>>(
-      _mm256_mask_blend_epi32(static_cast<__mmask8>(a), static_cast<__m256i>(c),
-                              static_cast<__m256i>(b)));
+Experimental::simd<float, Experimental::simd_abi::avx512_fixed_size<8>> sqrt(
+    Experimental::simd<float,
+                       Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  return Experimental::simd<float,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm256_sqrt_ps(static_cast<__m256>(a)));
+}
+
+#ifdef __INTEL_COMPILER
+
+KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+Experimental::simd<float, Experimental::simd_abi::avx512_fixed_size<8>> cbrt(
+    Experimental::simd<float,
+                       Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  return Experimental::simd<float,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm256_cbrt_ps(static_cast<__m256>(a)));
+}
+
+KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+Experimental::simd<float, Experimental::simd_abi::avx512_fixed_size<8>> exp(
+    Experimental::simd<float,
+                       Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  return Experimental::simd<float,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm256_exp_ps(static_cast<__m256>(a)));
+}
+
+KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+Experimental::simd<float, Experimental::simd_abi::avx512_fixed_size<8>> log(
+    Experimental::simd<float,
+                       Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  return Experimental::simd<float,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm256_log_ps(static_cast<__m256>(a)));
+}
+
+#endif
+
+KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+Experimental::simd<float, Experimental::simd_abi::avx512_fixed_size<8>> fma(
+    Experimental::simd<float,
+                       Experimental::simd_abi::avx512_fixed_size<8>> const& a,
+    Experimental::simd<float,
+                       Experimental::simd_abi::avx512_fixed_size<8>> const& b,
+    Experimental::simd<float,
+                       Experimental::simd_abi::avx512_fixed_size<8>> const& c) {
+  return Experimental::simd<float,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm256_fmadd_ps(static_cast<__m256>(a), static_cast<__m256>(b),
+                      static_cast<__m256>(c)));
+}
+
+KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+Experimental::simd<float, Experimental::simd_abi::avx512_fixed_size<8>> max(
+    Experimental::simd<float,
+                       Experimental::simd_abi::avx512_fixed_size<8>> const& a,
+    Experimental::simd<float,
+                       Experimental::simd_abi::avx512_fixed_size<8>> const& b) {
+  return Experimental::simd<float,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm256_max_ps(static_cast<__m256>(a), static_cast<__m256>(b)));
+}
+
+KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+Experimental::simd<float, Experimental::simd_abi::avx512_fixed_size<8>> min(
+    Experimental::simd<float,
+                       Experimental::simd_abi::avx512_fixed_size<8>> const& a,
+    Experimental::simd<float,
+                       Experimental::simd_abi::avx512_fixed_size<8>> const& b) {
+  return Experimental::simd<float,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm256_min_ps(static_cast<__m256>(a), static_cast<__m256>(b)));
+}
+
+namespace Experimental {
+
+KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+simd<float, simd_abi::avx512_fixed_size<8>> condition(
+    simd_mask<float, simd_abi::avx512_fixed_size<8>> const& a,
+    simd<float, simd_abi::avx512_fixed_size<8>> const& b,
+    simd<float, simd_abi::avx512_fixed_size<8>> const& c) {
+  return simd<float, simd_abi::avx512_fixed_size<8>>(
+      _mm256_mask_blend_ps(static_cast<__mmask8>(a), static_cast<__m256>(c),
+                           static_cast<__m256>(b)));
 }
 
 template <>
-class simd<std::int64_t, simd_abi::avx512_fixed_size<8>> {
-  __m512i m_value;
+class simd<std::int32_t, simd_abi::avx512_fixed_size<8>> {
+  __m256i m_value;
 
  public:
-  using value_type = std::int64_t;
+  using value_type = std::int32_t;
   using abi_type   = simd_abi::avx512_fixed_size<8>;
   using mask_type  = simd_mask<value_type, abi_type>;
   using reference  = value_type&;
@@ -358,14 +704,30 @@ class simd<std::int64_t, simd_abi::avx512_fixed_size<8>> {
   template <class U, std::enable_if_t<std::is_convertible_v<U, value_type>,
                                       bool> = false>
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value)
-      : m_value(_mm512_set1_epi64(value_type(value))) {}
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd(
-      simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& other)
-      : m_value(_mm512_cvtepi32_epi64(static_cast<__m256i>(other))) {}
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd(
-      simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& other);
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr simd(__m512i const& value_in)
+      : m_value(_mm256_set1_epi32(value_type(value))) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd(
+      __m256i const& value_in)
       : m_value(value_in) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd(
+      simd<std::uint64_t, abi_type> const& other);
+  template <class G,
+            std::enable_if_t<
+                // basically, can you do { value_type r =
+                // gen(std::integral_constant<std::size_t, i>()); }
+                std::is_invocable_r_v<value_type, G,
+                                      std::integral_constant<std::size_t, 0>>,
+                bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd(
+      G&& gen) noexcept
+      : m_value(
+            _mm256_setr_epi32(gen(std::integral_constant<std::size_t, 0>()),
+                              gen(std::integral_constant<std::size_t, 1>()),
+                              gen(std::integral_constant<std::size_t, 2>()),
+                              gen(std::integral_constant<std::size_t, 3>()),
+                              gen(std::integral_constant<std::size_t, 4>()),
+                              gen(std::integral_constant<std::size_t, 5>()),
+                              gen(std::integral_constant<std::size_t, 6>()),
+                              gen(std::integral_constant<std::size_t, 7>()))) {}
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) {
     return reinterpret_cast<value_type*>(&m_value)[i];
   }
@@ -375,101 +737,156 @@ class simd<std::int64_t, simd_abi::avx512_fixed_size<8>> {
   }
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(
       value_type* ptr, element_aligned_tag) const {
-    _mm512_mask_storeu_epi64(ptr, static_cast<__mmask8>(mask_type(true)),
+    _mm256_mask_storeu_epi32(ptr, static_cast<__mmask8>(mask_type(true)),
                              m_value);
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator>>(int rhs) const {
-    return _mm512_srai_epi64(m_value, rhs);
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr,
+                                                       element_aligned_tag) {
+    m_value = _mm256_mask_loadu_epi32(
+        _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr);
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i()
+      const {
+    return m_value;
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd
-  operator>>(simd<int, simd_abi::avx512_fixed_size<8>> const& rhs) const {
-    return _mm512_srav_epi64(m_value,
-                             _mm512_cvtepi32_epi64(static_cast<__m256i>(rhs)));
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const
+      noexcept {
+    return simd(_mm256_sub_epi32(_mm256_set1_epi32(0), m_value));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator<<(int rhs) const {
-    return _mm512_slli_epi64(m_value, rhs);
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(_mm256_mullo_epi32(static_cast<__m256i>(lhs),
+                                   static_cast<__m256i>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd
-  operator<<(simd<int, simd_abi::avx512_fixed_size<8>> const& rhs) const {
-    return _mm512_sllv_epi64(m_value,
-                             _mm512_cvtepi32_epi64(static_cast<__m256i>(rhs)));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd<std::int32_t, simd_abi::avx512_fixed_size<8>>(
+        _mm256_add_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512i()
-      const {
-    return m_value;
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd<std::int32_t, simd_abi::avx512_fixed_size<8>>(
+        _mm256_sub_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator<(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm256_cmplt_epi32_mask(static_cast<__m256i>(lhs),
+                                             static_cast<__m256i>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator>(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm256_cmplt_epi32_mask(static_cast<__m256i>(rhs),
+                                             static_cast<__m256i>(lhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator<(simd const& other) const {
-    return mask_type(_mm512_cmplt_epi64_mask(m_value, other.m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator<=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm256_cmple_epi32_mask(static_cast<__m256i>(lhs),
+                                             static_cast<__m256i>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator>(simd const& other) const {
-    return mask_type(_mm512_cmplt_epi64_mask(other.m_value, m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator>=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm256_cmple_epi32_mask(static_cast<__m256i>(rhs),
+                                             static_cast<__m256i>(lhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator<=(simd const& other) const {
-    return mask_type(_mm512_cmple_epi64_mask(m_value, other.m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator==(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm256_cmpeq_epi32_mask(static_cast<__m256i>(lhs),
+                                             static_cast<__m256i>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator>=(simd const& other) const {
-    return mask_type(_mm512_cmple_epi64_mask(other.m_value, m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator!=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm256_cmpneq_epi32_mask(static_cast<__m256i>(lhs),
+                                              static_cast<__m256i>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator==(simd const& other) const {
-    return mask_type(_mm512_cmpeq_epi64_mask(m_value, other.m_value));
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>(
+      simd const& lhs, int rhs) noexcept {
+    return simd(_mm256_srai_epi32(static_cast<__m256i>(lhs), rhs));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(_mm256_srav_epi32(static_cast<__m256i>(lhs),
+                                  static_cast<__m256i>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator!=(simd const& other) const {
-    return mask_type(_mm512_cmpneq_epi64_mask(m_value, other.m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<(
+      simd const& lhs, int rhs) noexcept {
+    return simd(_mm256_slli_epi32(static_cast<__m256i>(lhs), rhs));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(_mm256_sllv_epi32(static_cast<__m256i>(lhs),
+                                  static_cast<__m256i>(rhs)));
   }
 };
 
-[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<std::int64_t, simd_abi::avx512_fixed_size<8>>
-    operator*(simd<std::int64_t, simd_abi::avx512_fixed_size<8>> const& lhs,
-              simd<std::int64_t, simd_abi::avx512_fixed_size<8>> const& rhs) {
-  return simd<std::int64_t, simd_abi::avx512_fixed_size<8>>(
-      _mm512_mullo_epi64(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs)));
+}  // namespace Experimental
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    std::int32_t, Experimental::simd_abi::avx512_fixed_size<8>>
+abs(Experimental::simd<std::int32_t,
+                       Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  __m256i const rhs = static_cast<__m256i>(a);
+  return Experimental::simd<std::int32_t,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm256_abs_epi32(rhs));
 }
 
-[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<std::int64_t, simd_abi::avx512_fixed_size<8>>
-    operator+(simd<std::int64_t, simd_abi::avx512_fixed_size<8>> const& lhs,
-              simd<std::int64_t, simd_abi::avx512_fixed_size<8>> const& rhs) {
-  return simd<std::int64_t, simd_abi::avx512_fixed_size<8>>(
-      _mm512_add_epi64(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs)));
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    double, Experimental::simd_abi::avx512_fixed_size<8>>
+floor(Experimental::simd<
+      std::int32_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  return Experimental::simd<double,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm512_cvtepi32_pd(static_cast<__m256i>(a)));
 }
 
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<std::int64_t, simd_abi::avx512_fixed_size<8>>
-    operator-(simd<std::int64_t, simd_abi::avx512_fixed_size<8>> const& lhs,
-              simd<std::int64_t, simd_abi::avx512_fixed_size<8>> const& rhs) {
-  return simd<std::int64_t, simd_abi::avx512_fixed_size<8>>(
-      _mm512_sub_epi64(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs)));
+    Experimental::simd<double, Experimental::simd_abi::avx512_fixed_size<8>>
+    ceil(Experimental::simd<
+         std::int32_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  return Experimental::simd<double,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm512_cvtepi32_pd(static_cast<__m256i>(a)));
 }
 
-[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<std::int64_t, simd_abi::avx512_fixed_size<8>>
-    operator-(simd<std::int64_t, simd_abi::avx512_fixed_size<8>> const& a) {
-  return simd<std::int64_t, simd_abi::avx512_fixed_size<8>>(0) - a;
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    double, Experimental::simd_abi::avx512_fixed_size<8>>
+round(Experimental::simd<
+      std::int32_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  return Experimental::simd<double,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm512_cvtepi32_pd(static_cast<__m256i>(a)));
 }
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<std::int64_t, simd_abi::avx512_fixed_size<8>> condition(
-    simd_mask<std::int64_t, simd_abi::avx512_fixed_size<8>> const& a,
-    simd<std::int64_t, simd_abi::avx512_fixed_size<8>> const& b,
-    simd<std::int64_t, simd_abi::avx512_fixed_size<8>> const& c) {
-  return simd<std::int64_t, simd_abi::avx512_fixed_size<8>>(
-      _mm512_mask_blend_epi64(static_cast<__mmask8>(a), static_cast<__m512i>(c),
-                              static_cast<__m512i>(b)));
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    double, Experimental::simd_abi::avx512_fixed_size<8>>
+trunc(Experimental::simd<
+      std::int32_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  return Experimental::simd<double,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm512_cvtepi32_pd(static_cast<__m256i>(a)));
 }
 
-template <>
-class simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> {
-  __m512i m_value;
+namespace Experimental {
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<std::int32_t, simd_abi::avx512_fixed_size<8>>
+    condition(simd_mask<std::int32_t, simd_abi::avx512_fixed_size<8>> const& a,
+              simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& b,
+              simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& c) {
+  return simd<std::int32_t, simd_abi::avx512_fixed_size<8>>(
+      _mm256_mask_blend_epi32(static_cast<__mmask8>(a), static_cast<__m256i>(c),
+                              static_cast<__m256i>(b)));
+}
+
+template <>
+class simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> {
+  __m256i m_value;
 
  public:
-  using value_type = std::uint64_t;
+  using value_type = std::uint32_t;
   using abi_type   = simd_abi::avx512_fixed_size<8>;
   using mask_type  = simd_mask<value_type, abi_type>;
   using reference  = value_type&;
@@ -484,15 +901,32 @@ class simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> {
   template <class U, std::enable_if_t<std::is_convertible_v<U, value_type>,
                                       bool> = false>
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value)
-      : m_value(_mm512_set1_epi64(bit_cast<std::int64_t>(value_type(value)))) {}
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr simd(__m512i const& value_in)
+      : m_value(_mm256_set1_epi32(
+            Kokkos::bit_cast<std::int32_t>(value_type(value)))) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd(
+      __m256i const& value_in)
       : m_value(value_in) {}
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd(
-      simd<std::int32_t, abi_type> const& other)
-      : m_value(_mm512_cvtepi32_epi64(static_cast<__m256i>(other))) {}
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd(
-      simd<std::int64_t, abi_type> const& other)
-      : m_value(static_cast<__m512i>(other)) {}
+      simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& other)
+      : m_value(static_cast<__m256i>(other)) {}
+  template <class G,
+            std::enable_if_t<
+                // basically, can you do { value_type r =
+                // gen(std::integral_constant<std::size_t, i>()); }
+                std::is_invocable_r_v<value_type, G,
+                                      std::integral_constant<std::size_t, 0>>,
+                bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd(
+      G&& gen) noexcept
+      : m_value(
+            _mm256_setr_epi32(gen(std::integral_constant<std::size_t, 0>()),
+                              gen(std::integral_constant<std::size_t, 1>()),
+                              gen(std::integral_constant<std::size_t, 2>()),
+                              gen(std::integral_constant<std::size_t, 3>()),
+                              gen(std::integral_constant<std::size_t, 4>()),
+                              gen(std::integral_constant<std::size_t, 5>()),
+                              gen(std::integral_constant<std::size_t, 6>()),
+                              gen(std::integral_constant<std::size_t, 7>()))) {}
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) {
     return reinterpret_cast<value_type*>(&m_value)[i];
   }
@@ -500,112 +934,151 @@ class simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> {
   operator[](std::size_t i) const {
     return reinterpret_cast<value_type const*>(&m_value)[i];
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd
-  operator>>(unsigned int rhs) const {
-    return _mm512_srli_epi64(m_value, rhs);
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(
+      value_type* ptr, element_aligned_tag) const {
+    _mm256_mask_storeu_epi32(ptr, static_cast<__mmask8>(mask_type(true)),
+                             m_value);
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr,
+                                                       element_aligned_tag) {
+    m_value = _mm256_mask_loadu_epi32(
+        _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr);
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator>>(
-      simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& rhs) const {
-    return _mm512_srlv_epi64(m_value,
-                             _mm512_cvtepi32_epi64(static_cast<__m256i>(rhs)));
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i()
+      const {
+    return m_value;
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd
-  operator<<(unsigned int rhs) const {
-    return _mm512_slli_epi64(m_value, rhs);
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(_mm256_mullo_epi32(static_cast<__m256i>(lhs),
+                                   static_cast<__m256i>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator<<(
-      simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& rhs) const {
-    return _mm512_sllv_epi64(m_value,
-                             _mm512_cvtepi32_epi64(static_cast<__m256i>(rhs)));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        _mm256_add_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd
-  operator&(simd const& other) const {
-    return _mm512_and_epi64(m_value, other.m_value);
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        _mm256_sub_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd
-  operator|(simd const& other) const {
-    return _mm512_or_epi64(m_value, other.m_value);
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator<(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm256_cmplt_epu32_mask(static_cast<__m256i>(lhs),
+                                             static_cast<__m256i>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512i()
-      const {
-    return m_value;
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator>(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm256_cmplt_epu32_mask(static_cast<__m256i>(rhs),
+                                             static_cast<__m256i>(lhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator<=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm256_cmple_epu32_mask(static_cast<__m256i>(lhs),
+                                             static_cast<__m256i>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator<(simd const& other) const {
-    return mask_type(_mm512_cmplt_epu64_mask(m_value, other.m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator>=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm256_cmple_epu32_mask(static_cast<__m256i>(rhs),
+                                             static_cast<__m256i>(lhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator>(simd const& other) const {
-    return mask_type(_mm512_cmplt_epu64_mask(other.m_value, m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator==(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm256_cmpeq_epu32_mask(static_cast<__m256i>(lhs),
+                                             static_cast<__m256i>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator<=(simd const& other) const {
-    return mask_type(_mm512_cmple_epu64_mask(m_value, other.m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator!=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm256_cmpneq_epu32_mask(static_cast<__m256i>(lhs),
+                                              static_cast<__m256i>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator>=(simd const& other) const {
-    return mask_type(_mm512_cmple_epu64_mask(other.m_value, m_value));
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>(
+      simd const& lhs, int rhs) noexcept {
+    return simd(_mm256_srli_epi32(static_cast<__m256i>(lhs), rhs));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(_mm256_srlv_epi32(static_cast<__m256i>(lhs),
+                                  static_cast<__m256i>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator==(simd const& other) const {
-    return mask_type(_mm512_cmpeq_epu64_mask(m_value, other.m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<(
+      simd const& lhs, int rhs) noexcept {
+    return simd(_mm256_slli_epi32(static_cast<__m256i>(lhs), rhs));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator!=(simd const& other) const {
-    return mask_type(_mm512_cmpneq_epu64_mask(m_value, other.m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(_mm256_sllv_epi32(static_cast<__m256i>(lhs),
+                                  static_cast<__m256i>(rhs)));
   }
 };
 
-[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<std::uint64_t, simd_abi::avx512_fixed_size<8>>
-    operator*(simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& lhs,
-              simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& rhs) {
-  return simd<std::uint64_t, simd_abi::avx512_fixed_size<8>>(
-      _mm512_mullo_epi64(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs)));
+}  // namespace Experimental
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    std::uint32_t, Experimental::simd_abi::avx512_fixed_size<8>>
+abs(Experimental::simd<std::uint32_t,
+                       Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  return a;
 }
 
-[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<std::uint64_t, simd_abi::avx512_fixed_size<8>>
-    operator+(simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& lhs,
-              simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& rhs) {
-  return simd<std::uint64_t, simd_abi::avx512_fixed_size<8>>(
-      _mm512_add_epi64(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs)));
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    double, Experimental::simd_abi::avx512_fixed_size<8>>
+floor(Experimental::simd<
+      std::uint32_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  return Experimental::simd<double,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm512_cvtepu32_pd(static_cast<__m256i>(a)));
 }
 
-[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<std::uint64_t, simd_abi::avx512_fixed_size<8>>
-    operator-(simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& lhs,
-              simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& rhs) {
-  return simd<std::uint64_t, simd_abi::avx512_fixed_size<8>>(
-      _mm512_sub_epi64(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs)));
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    double, Experimental::simd_abi::avx512_fixed_size<8>>
+ceil(Experimental::simd<
+     std::uint32_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  return Experimental::simd<double,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm512_cvtepu32_pd(static_cast<__m256i>(a)));
 }
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> condition(
-    simd_mask<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& a,
-    simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& b,
-    simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& c) {
-  return simd<std::uint64_t, simd_abi::avx512_fixed_size<8>>(
-      _mm512_mask_blend_epi64(static_cast<__mmask8>(a), static_cast<__m512i>(c),
-                              static_cast<__m512i>(b)));
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    double, Experimental::simd_abi::avx512_fixed_size<8>>
+round(Experimental::simd<
+      std::uint32_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  return Experimental::simd<double,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm512_cvtepu32_pd(static_cast<__m256i>(a)));
 }
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<std::int32_t, simd_abi::avx512_fixed_size<8>>::simd(
-    simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& other)
-    : m_value(_mm512_cvtepi64_epi32(static_cast<__m512i>(other))) {}
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    double, Experimental::simd_abi::avx512_fixed_size<8>>
+trunc(Experimental::simd<
+      std::uint32_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  return Experimental::simd<double,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm512_cvtepu32_pd(static_cast<__m256i>(a)));
+}
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<std::int64_t, simd_abi::avx512_fixed_size<8>>::simd(
-    simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& other)
-    : m_value(static_cast<__m512i>(other)) {}
+namespace Experimental {
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<std::uint32_t, simd_abi::avx512_fixed_size<8>>
+    condition(simd_mask<std::uint32_t, simd_abi::avx512_fixed_size<8>> const& a,
+              simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> const& b,
+              simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> const& c) {
+  return simd<std::uint32_t, simd_abi::avx512_fixed_size<8>>(
+      _mm256_mask_blend_epi32(static_cast<__mmask8>(a), static_cast<__m256i>(c),
+                              static_cast<__m256i>(b)));
+}
 
 template <>
-class simd<double, simd_abi::avx512_fixed_size<8>> {
-  __m512d m_value;
+class simd<std::int64_t, simd_abi::avx512_fixed_size<8>> {
+  __m512i m_value;
 
  public:
-  using value_type = double;
+  using value_type = std::int64_t;
   using abi_type   = simd_abi::avx512_fixed_size<8>;
   using mask_type  = simd_mask<value_type, abi_type>;
   using reference  = value_type&;
@@ -620,13 +1093,31 @@ class simd<double, simd_abi::avx512_fixed_size<8>> {
   template <class U, std::enable_if_t<std::is_convertible_v<U, value_type>,
                                       bool> = false>
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value)
-      : m_value(_mm512_set1_pd(value_type(value))) {}
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(double a, double b, double c,
-                                             double d, double e, double f,
-                                             double g, double h)
-      : m_value(_mm512_setr_pd(a, b, c, d, e, f, g, h)) {}
+      : m_value(_mm512_set1_epi64(value_type(value))) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd(
+      simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& other)
+      : m_value(_mm512_cvtepi32_epi64(static_cast<__m256i>(other))) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd(
+      simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& other);
+  template <class G,
+            std::enable_if_t<
+                // basically, can you do { value_type r =
+                // gen(std::integral_constant<std::size_t, i>()); }
+                std::is_invocable_r_v<value_type, G,
+                                      std::integral_constant<std::size_t, 0>>,
+                bool> = false>
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd(
-      __m512d const& value_in)
+      G&& gen) noexcept
+      : m_value(
+            _mm512_setr_epi64(gen(std::integral_constant<std::size_t, 0>()),
+                              gen(std::integral_constant<std::size_t, 1>()),
+                              gen(std::integral_constant<std::size_t, 2>()),
+                              gen(std::integral_constant<std::size_t, 3>()),
+                              gen(std::integral_constant<std::size_t, 4>()),
+                              gen(std::integral_constant<std::size_t, 5>()),
+                              gen(std::integral_constant<std::size_t, 6>()),
+                              gen(std::integral_constant<std::size_t, 7>()))) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr simd(__m512i const& value_in)
       : m_value(value_in) {}
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) {
     return reinterpret_cast<value_type*>(&m_value)[i];
@@ -637,171 +1128,360 @@ class simd<double, simd_abi::avx512_fixed_size<8>> {
   }
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr,
                                                        element_aligned_tag) {
-    m_value = _mm512_loadu_pd(ptr);
+    m_value = _mm512_loadu_si512(ptr);
   }
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(
       value_type* ptr, element_aligned_tag) const {
-    _mm512_storeu_pd(ptr, m_value);
+    _mm512_storeu_si512(ptr, m_value);
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512d()
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512i()
       const {
     return m_value;
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator<(simd const& other) const {
-    return mask_type(_mm512_cmp_pd_mask(m_value, other.m_value, _CMP_LT_OS));
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const
+      noexcept {
+    return simd(_mm512_sub_epi64(_mm512_set1_epi64(0), m_value));
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(_mm512_mullo_epi64(static_cast<__m512i>(lhs),
+                                   static_cast<__m512i>(rhs)));
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        _mm512_add_epi64(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs)));
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        _mm512_sub_epi64(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator<(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm512_cmplt_epi64_mask(static_cast<__m512i>(lhs),
+                                             static_cast<__m512i>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator>(simd const& other) const {
-    return mask_type(_mm512_cmp_pd_mask(m_value, other.m_value, _CMP_GT_OS));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator>(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm512_cmplt_epi64_mask(static_cast<__m512i>(rhs),
+                                             static_cast<__m512i>(lhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator<=(simd const& other) const {
-    return mask_type(_mm512_cmp_pd_mask(m_value, other.m_value, _CMP_LE_OS));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator<=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm512_cmple_epi64_mask(static_cast<__m512i>(lhs),
+                                             static_cast<__m512i>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator>=(simd const& other) const {
-    return mask_type(_mm512_cmp_pd_mask(m_value, other.m_value, _CMP_GE_OS));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator>=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm512_cmple_epi64_mask(static_cast<__m512i>(rhs),
+                                             static_cast<__m512i>(lhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator==(simd const& other) const {
-    return mask_type(_mm512_cmp_pd_mask(m_value, other.m_value, _CMP_EQ_OS));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator==(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm512_cmpeq_epi64_mask(static_cast<__m512i>(lhs),
+                                             static_cast<__m512i>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator!=(simd const& other) const {
-    return mask_type(_mm512_cmp_pd_mask(m_value, other.m_value, _CMP_NEQ_OS));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator!=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm512_cmpneq_epi64_mask(static_cast<__m512i>(lhs),
+                                              static_cast<__m512i>(rhs)));
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>(
+      simd const& lhs, int rhs) {
+    return simd(_mm512_srai_epi64(static_cast<__m512i>(lhs), rhs));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>(
+      simd const& lhs, simd const& rhs) {
+    return simd(_mm512_srav_epi64(static_cast<__m512i>(lhs),
+                                  static_cast<__m512i>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<(
+      simd const& lhs, int rhs) {
+    return simd(_mm512_slli_epi64(static_cast<__m512i>(lhs), rhs));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<(
+      simd const& lhs, simd const& rhs) {
+    return simd(_mm512_sllv_epi64(static_cast<__m512i>(lhs),
+                                  static_cast<__m512i>(rhs)));
   }
 };
 
-[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<double, simd_abi::avx512_fixed_size<8>>
-    operator*(simd<double, simd_abi::avx512_fixed_size<8>> const& lhs,
-              simd<double, simd_abi::avx512_fixed_size<8>> const& rhs) {
-  return simd<double, simd_abi::avx512_fixed_size<8>>(
-      _mm512_mul_pd(static_cast<__m512d>(lhs), static_cast<__m512d>(rhs)));
-}
+}  // namespace Experimental
 
-[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<double, simd_abi::avx512_fixed_size<8>>
-    operator/(simd<double, simd_abi::avx512_fixed_size<8>> const& lhs,
-              simd<double, simd_abi::avx512_fixed_size<8>> const& rhs) {
-  return simd<double, simd_abi::avx512_fixed_size<8>>(
-      _mm512_div_pd(static_cast<__m512d>(lhs), static_cast<__m512d>(rhs)));
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    std::int64_t, Experimental::simd_abi::avx512_fixed_size<8>>
+abs(Experimental::simd<std::int64_t,
+                       Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  __m512i const rhs = static_cast<__m512i>(a);
+  return Experimental::simd<std::int64_t,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm512_abs_epi64(rhs));
 }
 
-[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<double, simd_abi::avx512_fixed_size<8>>
-    operator+(simd<double, simd_abi::avx512_fixed_size<8>> const& lhs,
-              simd<double, simd_abi::avx512_fixed_size<8>> const& rhs) {
-  return simd<double, simd_abi::avx512_fixed_size<8>>(
-      _mm512_add_pd(static_cast<__m512d>(lhs), static_cast<__m512d>(rhs)));
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    double, Experimental::simd_abi::avx512_fixed_size<8>>
+floor(Experimental::simd<
+      std::int64_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  return Experimental::simd<double,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm512_cvtepi64_pd(static_cast<__m512i>(a)));
 }
 
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<double, simd_abi::avx512_fixed_size<8>>
-    operator-(simd<double, simd_abi::avx512_fixed_size<8>> const& lhs,
-              simd<double, simd_abi::avx512_fixed_size<8>> const& rhs) {
-  return simd<double, simd_abi::avx512_fixed_size<8>>(
-      _mm512_sub_pd(static_cast<__m512d>(lhs), static_cast<__m512d>(rhs)));
+    Experimental::simd<double, Experimental::simd_abi::avx512_fixed_size<8>>
+    ceil(Experimental::simd<
+         std::int64_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  return Experimental::simd<double,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm512_cvtepi64_pd(static_cast<__m512i>(a)));
 }
 
-[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<double, simd_abi::avx512_fixed_size<8>>
-    operator-(simd<double, simd_abi::avx512_fixed_size<8>> const& a) {
-  return simd<double, simd_abi::avx512_fixed_size<8>>(
-      _mm512_sub_pd(_mm512_set1_pd(0.0), static_cast<__m512d>(a)));
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    double, Experimental::simd_abi::avx512_fixed_size<8>>
+round(Experimental::simd<
+      std::int64_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  return Experimental::simd<double,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm512_cvtepi64_pd(static_cast<__m512i>(a)));
 }
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<double, simd_abi::avx512_fixed_size<8>> copysign(
-    simd<double, simd_abi::avx512_fixed_size<8>> const& a,
-    simd<double, simd_abi::avx512_fixed_size<8>> const& b) {
-  static const __m512i sign_mask = reinterpret_cast<__m512i>(
-      static_cast<__m512d>(simd<double, simd_abi::avx512_fixed_size<8>>(-0.0)));
-  return simd<double, simd_abi::avx512_fixed_size<8>>(
-      reinterpret_cast<__m512d>(_mm512_xor_epi64(
-          _mm512_andnot_epi64(
-              sign_mask, reinterpret_cast<__m512i>(static_cast<__m512d>(a))),
-          _mm512_and_epi64(
-              sign_mask, reinterpret_cast<__m512i>(static_cast<__m512d>(b))))));
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    double, Experimental::simd_abi::avx512_fixed_size<8>>
+trunc(Experimental::simd<
+      std::int64_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  return Experimental::simd<double,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm512_cvtepi64_pd(static_cast<__m512i>(a)));
 }
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<double, simd_abi::avx512_fixed_size<8>> abs(
-    simd<double, simd_abi::avx512_fixed_size<8>> const& a) {
-  __m512d const rhs = static_cast<__m512d>(a);
-  return simd<double, simd_abi::avx512_fixed_size<8>>(reinterpret_cast<__m512d>(
-      _mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),
-                       reinterpret_cast<__m512i>(rhs))));
-}
+namespace Experimental {
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<double, simd_abi::avx512_fixed_size<8>> sqrt(
-    simd<double, simd_abi::avx512_fixed_size<8>> const& a) {
-  return simd<double, simd_abi::avx512_fixed_size<8>>(
-      _mm512_sqrt_pd(static_cast<__m512d>(a)));
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<std::int64_t, simd_abi::avx512_fixed_size<8>>
+    condition(simd_mask<std::int64_t, simd_abi::avx512_fixed_size<8>> const& a,
+              simd<std::int64_t, simd_abi::avx512_fixed_size<8>> const& b,
+              simd<std::int64_t, simd_abi::avx512_fixed_size<8>> const& c) {
+  return simd<std::int64_t, simd_abi::avx512_fixed_size<8>>(
+      _mm512_mask_blend_epi64(static_cast<__mmask8>(a), static_cast<__m512i>(c),
+                              static_cast<__m512i>(b)));
 }
 
-#ifdef __INTEL_COMPILER
+template <>
+class simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> {
+  __m512i m_value;
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<double, simd_abi::avx512_fixed_size<8>> cbrt(
-    simd<double, simd_abi::avx512_fixed_size<8>> const& a) {
-  return simd<double, simd_abi::avx512_fixed_size<8>>(
-      _mm512_cbrt_pd(static_cast<__m512d>(a)));
+ public:
+  using value_type = std::uint64_t;
+  using abi_type   = simd_abi::avx512_fixed_size<8>;
+  using mask_type  = simd_mask<value_type, abi_type>;
+  using reference  = value_type&;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd()            = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&)      = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() {
+    return 8;
+  }
+  template <class U, std::enable_if_t<std::is_convertible_v<U, value_type>,
+                                      bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value)
+      : m_value(_mm512_set1_epi64(
+            Kokkos::bit_cast<std::int64_t>(value_type(value)))) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr simd(__m512i const& value_in)
+      : m_value(value_in) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd(
+      simd<std::int32_t, abi_type> const& other)
+      : m_value(_mm512_cvtepi32_epi64(static_cast<__m256i>(other))) {}
+  template <class G,
+            std::enable_if_t<
+                // basically, can you do { value_type r =
+                // gen(std::integral_constant<std::size_t, i>()); }
+                std::is_invocable_r_v<value_type, G,
+                                      std::integral_constant<std::size_t, 0>>,
+                bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd(
+      G&& gen) noexcept
+      : m_value(
+            _mm512_setr_epi64(gen(std::integral_constant<std::size_t, 0>()),
+                              gen(std::integral_constant<std::size_t, 1>()),
+                              gen(std::integral_constant<std::size_t, 2>()),
+                              gen(std::integral_constant<std::size_t, 3>()),
+                              gen(std::integral_constant<std::size_t, 4>()),
+                              gen(std::integral_constant<std::size_t, 5>()),
+                              gen(std::integral_constant<std::size_t, 6>()),
+                              gen(std::integral_constant<std::size_t, 7>()))) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd(
+      simd<std::int64_t, abi_type> const& other)
+      : m_value(static_cast<__m512i>(other)) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) {
+    return reinterpret_cast<value_type*>(&m_value)[i];
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type
+  operator[](std::size_t i) const {
+    return reinterpret_cast<value_type const*>(&m_value)[i];
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr,
+                                                       element_aligned_tag) {
+    m_value = _mm512_loadu_si512(ptr);
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(
+      value_type* ptr, element_aligned_tag) const {
+    _mm512_storeu_si512(ptr, m_value);
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512i()
+      const {
+    return m_value;
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(_mm512_mullo_epi64(static_cast<__m512i>(lhs),
+                                   static_cast<__m512i>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        _mm512_add_epi64(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        _mm512_sub_epi64(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>(
+      simd const& lhs, int rhs) noexcept {
+    return _mm512_srli_epi64(static_cast<__m512i>(lhs), rhs);
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>(
+      simd const& lhs, simd const& rhs) noexcept {
+    return _mm512_srlv_epi64(static_cast<__m512i>(lhs),
+                             static_cast<__m512i>(rhs));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<(
+      simd const& lhs, int rhs) noexcept {
+    return _mm512_slli_epi64(static_cast<__m512i>(lhs), rhs);
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<(
+      simd const& lhs, simd const& rhs) noexcept {
+    return _mm512_sllv_epi64(static_cast<__m512i>(lhs),
+                             static_cast<__m512i>(rhs));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator&(
+      simd const& lhs, simd const& rhs) noexcept {
+    return _mm512_and_epi64(static_cast<__m512i>(lhs),
+                            static_cast<__m512i>(rhs));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator|(
+      simd const& lhs, simd const& rhs) noexcept {
+    return _mm512_or_epi64(static_cast<__m512i>(lhs),
+                           static_cast<__m512i>(rhs));
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator<(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm512_cmplt_epu64_mask(static_cast<__m512i>(lhs),
+                                             static_cast<__m512i>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator>(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm512_cmplt_epu64_mask(static_cast<__m512i>(rhs),
+                                             static_cast<__m512i>(lhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator<=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm512_cmple_epu64_mask(static_cast<__m512i>(lhs),
+                                             static_cast<__m512i>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator>=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm512_cmple_epu64_mask(static_cast<__m512i>(rhs),
+                                             static_cast<__m512i>(lhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator==(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm512_cmpeq_epu64_mask(static_cast<__m512i>(lhs),
+                                             static_cast<__m512i>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator!=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(_mm512_cmpneq_epu64_mask(static_cast<__m512i>(lhs),
+                                              static_cast<__m512i>(rhs)));
+  }
+};
+
+}  // namespace Experimental
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    std::uint64_t, Experimental::simd_abi::avx512_fixed_size<8>>
+abs(Experimental::simd<std::uint64_t,
+                       Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  return a;
 }
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<double, simd_abi::avx512_fixed_size<8>> exp(
-    simd<double, simd_abi::avx512_fixed_size<8>> const& a) {
-  return simd<double, simd_abi::avx512_fixed_size<8>>(
-      _mm512_exp_pd(static_cast<__m512d>(a)));
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    double, Experimental::simd_abi::avx512_fixed_size<8>>
+floor(Experimental::simd<
+      std::uint64_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  return Experimental::simd<double,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm512_cvtepu64_pd(static_cast<__m512i>(a)));
 }
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<double, simd_abi::avx512_fixed_size<8>> log(
-    simd<double, simd_abi::avx512_fixed_size<8>> const& a) {
-  return simd<double, simd_abi::avx512_fixed_size<8>>(
-      _mm512_log_pd(static_cast<__m512d>(a)));
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    double, Experimental::simd_abi::avx512_fixed_size<8>>
+ceil(Experimental::simd<
+     std::uint64_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  return Experimental::simd<double,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm512_cvtepu64_pd(static_cast<__m512i>(a)));
 }
 
-#endif
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    double, Experimental::simd_abi::avx512_fixed_size<8>>
+round(Experimental::simd<
+      std::uint64_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  return Experimental::simd<double,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm512_cvtepu64_pd(static_cast<__m512i>(a)));
+}
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<double, simd_abi::avx512_fixed_size<8>> fma(
-    simd<double, simd_abi::avx512_fixed_size<8>> const& a,
-    simd<double, simd_abi::avx512_fixed_size<8>> const& b,
-    simd<double, simd_abi::avx512_fixed_size<8>> const& c) {
-  return simd<double, simd_abi::avx512_fixed_size<8>>(
-      _mm512_fmadd_pd(static_cast<__m512d>(a), static_cast<__m512d>(b),
-                      static_cast<__m512d>(c)));
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    double, Experimental::simd_abi::avx512_fixed_size<8>>
+trunc(Experimental::simd<
+      std::uint64_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) {
+  return Experimental::simd<double,
+                            Experimental::simd_abi::avx512_fixed_size<8>>(
+      _mm512_cvtepu64_pd(static_cast<__m512i>(a)));
 }
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<double, simd_abi::avx512_fixed_size<8>> max(
-    simd<double, simd_abi::avx512_fixed_size<8>> const& a,
-    simd<double, simd_abi::avx512_fixed_size<8>> const& b) {
-  return simd<double, simd_abi::avx512_fixed_size<8>>(
-      _mm512_max_pd(static_cast<__m512d>(a), static_cast<__m512d>(b)));
+namespace Experimental {
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<std::uint64_t, simd_abi::avx512_fixed_size<8>>
+    condition(simd_mask<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& a,
+              simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& b,
+              simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& c) {
+  return simd<std::uint64_t, simd_abi::avx512_fixed_size<8>>(
+      _mm512_mask_blend_epi64(static_cast<__mmask8>(a), static_cast<__m512i>(c),
+                              static_cast<__m512i>(b)));
 }
 
 KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<double, simd_abi::avx512_fixed_size<8>> min(
-    simd<double, simd_abi::avx512_fixed_size<8>> const& a,
-    simd<double, simd_abi::avx512_fixed_size<8>> const& b) {
-  return simd<double, simd_abi::avx512_fixed_size<8>>(
-      _mm512_min_pd(static_cast<__m512d>(a), static_cast<__m512d>(b)));
-}
+simd<std::int32_t, simd_abi::avx512_fixed_size<8>>::simd(
+    simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& other)
+    : m_value(_mm512_cvtepi64_epi32(static_cast<__m512i>(other))) {}
 
 KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<double, simd_abi::avx512_fixed_size<8>> condition(
-    simd_mask<double, simd_abi::avx512_fixed_size<8>> const& a,
-    simd<double, simd_abi::avx512_fixed_size<8>> const& b,
-    simd<double, simd_abi::avx512_fixed_size<8>> const& c) {
-  return simd<double, simd_abi::avx512_fixed_size<8>>(
-      _mm512_mask_blend_pd(static_cast<__mmask8>(a), static_cast<__m512d>(c),
-                           static_cast<__m512d>(b)));
-}
+simd<std::int64_t, simd_abi::avx512_fixed_size<8>>::simd(
+    simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& other)
+    : m_value(static_cast<__m512i>(other)) {}
 
 template <>
 class const_where_expression<simd_mask<double, simd_abi::avx512_fixed_size<8>>,
@@ -818,14 +1498,7 @@ class const_where_expression<simd_mask<double, simd_abi::avx512_fixed_size<8>>,
  public:
   const_where_expression(mask_type const& mask_arg, value_type const& value_arg)
       : m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {}
-  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr mask_type const&
-  mask() const {
-    return m_mask;
-  }
-  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr value_type const&
-  value() const {
-    return m_value;
-  }
+
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
   void copy_to(double* mem, element_aligned_tag) const {
     _mm512_mask_storeu_pd(mem, static_cast<__mmask8>(m_mask),
@@ -839,6 +1512,16 @@ class const_where_expression<simd_mask<double, simd_abi::avx512_fixed_size<8>>,
                               static_cast<__m256i>(index),
                               static_cast<__m512d>(m_value), 8);
   }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const&
+  impl_get_value() const {
+    return m_value;
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const&
+  impl_get_mask() const {
+    return m_mask;
+  }
 };
 
 template <>
@@ -862,7 +1545,7 @@ class where_expression<simd_mask<double, simd_abi::avx512_fixed_size<8>>,
       double const* mem,
       simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& index) {
     m_value = value_type(_mm512_mask_i32gather_pd(
-        _mm512_set1_pd(0.0), static_cast<__mmask8>(m_mask),
+        static_cast<__m512d>(m_value), static_cast<__mmask8>(m_mask),
         static_cast<__m256i>(index), mem, 8));
   }
   template <class U, std::enable_if_t<
@@ -879,6 +1562,88 @@ class where_expression<simd_mask<double, simd_abi::avx512_fixed_size<8>>,
   }
 };
 
+template <>
+class const_where_expression<simd_mask<float, simd_abi::avx512_fixed_size<8>>,
+                             simd<float, simd_abi::avx512_fixed_size<8>>> {
+ public:
+  using abi_type   = simd_abi::avx512_fixed_size<8>;
+  using value_type = simd<float, abi_type>;
+  using mask_type  = simd_mask<float, abi_type>;
+
+ protected:
+  value_type& m_value;
+  mask_type const& m_mask;
+
+ public:
+  const_where_expression(mask_type const& mask_arg, value_type const& value_arg)
+      : m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {}
+
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void copy_to(float* mem, element_aligned_tag) const {
+    _mm256_mask_storeu_ps(mem, static_cast<__mmask8>(m_mask),
+                          static_cast<__m256>(m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void scatter_to(
+      float* mem,
+      simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& index) const {
+    _mm256_mask_i32scatter_ps(mem, static_cast<__mmask8>(m_mask),
+                              static_cast<__m256i>(index),
+                              static_cast<__m256>(m_value), 4);
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const&
+  impl_get_value() const {
+    return m_value;
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const&
+  impl_get_mask() const {
+    return m_mask;
+  }
+};
+
+template <>
+class where_expression<simd_mask<float, simd_abi::avx512_fixed_size<8>>,
+                       simd<float, simd_abi::avx512_fixed_size<8>>>
+    : public const_where_expression<
+          simd_mask<float, simd_abi::avx512_fixed_size<8>>,
+          simd<float, simd_abi::avx512_fixed_size<8>>> {
+ public:
+  where_expression(
+      simd_mask<float, simd_abi::avx512_fixed_size<8>> const& mask_arg,
+      simd<float, simd_abi::avx512_fixed_size<8>>& value_arg)
+      : const_where_expression(mask_arg, value_arg) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void copy_from(float const* mem, element_aligned_tag) {
+    m_value = value_type(_mm256_mask_loadu_ps(
+        _mm256_set1_ps(0.0), static_cast<__mmask8>(m_mask), mem));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void gather_from(
+      float const* mem,
+      simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& index) {
+    __m256 on   = _mm256_castsi256_ps(_mm256_set1_epi32(-1));
+    __m256 mask = _mm256_maskz_mov_ps(static_cast<__mmask8>(m_mask), on);
+    m_value     = value_type(
+        _mm256_mask_i32gather_ps(static_cast<__m256>(m_value), mem,
+                                 static_cast<__m256i>(index), mask, 4));
+  }
+  template <
+      class U,
+      std::enable_if_t<
+          std::is_convertible_v<U, simd<float, simd_abi::avx512_fixed_size<8>>>,
+          bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(U&& x) {
+    auto const x_as_value_type =
+        static_cast<simd<float, simd_abi::avx512_fixed_size<8>>>(
+            std::forward<U>(x));
+    m_value = simd<float, simd_abi::avx512_fixed_size<8>>(_mm256_mask_blend_ps(
+        static_cast<__mmask8>(m_mask), static_cast<__m256>(m_value),
+        static_cast<__m256>(x_as_value_type)));
+  }
+};
+
 template <>
 class const_where_expression<
     simd_mask<std::int32_t, simd_abi::avx512_fixed_size<8>>,
@@ -895,19 +1660,30 @@ class const_where_expression<
  public:
   const_where_expression(mask_type const& mask_arg, value_type const& value_arg)
       : m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {}
-  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr mask_type const&
-  mask() const {
-    return m_mask;
-  }
-  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr value_type const&
-  value() const {
-    return m_value;
-  }
+
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
   void copy_to(std::int32_t* mem, element_aligned_tag) const {
     _mm256_mask_storeu_epi32(mem, static_cast<__mmask8>(m_mask),
                              static_cast<__m256i>(m_value));
   }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void scatter_to(
+      std::int32_t* mem,
+      simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& index) const {
+    _mm256_mask_i32scatter_epi32(mem, static_cast<__mmask8>(m_mask),
+                                 static_cast<__m256i>(index),
+                                 static_cast<__m256i>(m_value), 4);
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const&
+  impl_get_value() const {
+    return m_value;
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const&
+  impl_get_mask() const {
+    return m_mask;
+  }
 };
 
 template <>
@@ -926,6 +1702,110 @@ class where_expression<simd_mask<std::int32_t, simd_abi::avx512_fixed_size<8>>,
     m_value = value_type(_mm256_mask_loadu_epi32(
         _mm256_set1_epi32(0), static_cast<__mmask8>(m_mask), mem));
   }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void gather_from(
+      std::int32_t const* mem,
+      simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& index) {
+    m_value = value_type(_mm256_mmask_i32gather_epi32(
+        static_cast<__m256i>(m_value), static_cast<__mmask8>(m_mask),
+        static_cast<__m256i>(index), mem, 4));
+  }
+  template <class U,
+            std::enable_if_t<
+                std::is_convertible_v<
+                    U, simd<std::int32_t, simd_abi::avx512_fixed_size<8>>>,
+                bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(U&& x) {
+    auto const x_as_value_type =
+        static_cast<simd<std::int32_t, simd_abi::avx512_fixed_size<8>>>(
+            std::forward<U>(x));
+    m_value = simd<std::int32_t, simd_abi::avx512_fixed_size<8>>(
+        _mm256_mask_blend_epi32(static_cast<__mmask8>(m_mask),
+                                static_cast<__m256i>(m_value),
+                                static_cast<__m256i>(x_as_value_type)));
+  }
+};
+
+template <>
+class const_where_expression<
+    simd_mask<std::uint32_t, simd_abi::avx512_fixed_size<8>>,
+    simd<std::uint32_t, simd_abi::avx512_fixed_size<8>>> {
+ public:
+  using abi_type   = simd_abi::avx512_fixed_size<8>;
+  using value_type = simd<std::uint32_t, abi_type>;
+  using mask_type  = simd_mask<std::uint32_t, abi_type>;
+
+ protected:
+  value_type& m_value;
+  mask_type const& m_mask;
+
+ public:
+  const_where_expression(mask_type const& mask_arg, value_type const& value_arg)
+      : m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {}
+
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void copy_to(std::uint32_t* mem, element_aligned_tag) const {
+    _mm256_mask_storeu_epi32(mem, static_cast<__mmask8>(m_mask),
+                             static_cast<__m256i>(m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void scatter_to(
+      std::uint32_t* mem,
+      simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& index) const {
+    _mm256_mask_i32scatter_epi32(mem, static_cast<__mmask8>(m_mask),
+                                 static_cast<__m256i>(index),
+                                 static_cast<__m256i>(m_value), 4);
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const&
+  impl_get_value() const {
+    return m_value;
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const&
+  impl_get_mask() const {
+    return m_mask;
+  }
+};
+
+template <>
+class where_expression<simd_mask<std::uint32_t, simd_abi::avx512_fixed_size<8>>,
+                       simd<std::uint32_t, simd_abi::avx512_fixed_size<8>>>
+    : public const_where_expression<
+          simd_mask<std::uint32_t, simd_abi::avx512_fixed_size<8>>,
+          simd<std::uint32_t, simd_abi::avx512_fixed_size<8>>> {
+ public:
+  where_expression(
+      simd_mask<std::uint32_t, simd_abi::avx512_fixed_size<8>> const& mask_arg,
+      simd<std::uint32_t, simd_abi::avx512_fixed_size<8>>& value_arg)
+      : const_where_expression(mask_arg, value_arg) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void copy_from(std::uint32_t const* mem, element_aligned_tag) {
+    m_value = value_type(_mm256_mask_loadu_epi32(
+        _mm256_set1_epi32(0), static_cast<__mmask8>(m_mask), mem));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void gather_from(
+      std::uint32_t const* mem,
+      simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& index) {
+    m_value = value_type(_mm256_mmask_i32gather_epi32(
+        static_cast<__m256i>(m_value), static_cast<__mmask8>(m_mask),
+        static_cast<__m256i>(index), mem, 4));
+  }
+  template <class U,
+            std::enable_if_t<
+                std::is_convertible_v<
+                    U, simd<std::uint32_t, simd_abi::avx512_fixed_size<8>>>,
+                bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(U&& x) {
+    auto const x_as_value_type =
+        static_cast<simd<std::uint32_t, simd_abi::avx512_fixed_size<8>>>(
+            std::forward<U>(x));
+    m_value = simd<std::uint32_t, simd_abi::avx512_fixed_size<8>>(
+        _mm256_mask_blend_epi32(static_cast<__mmask8>(m_mask),
+                                static_cast<__m256i>(m_value),
+                                static_cast<__m256i>(x_as_value_type)));
+  }
 };
 
 template <>
@@ -944,14 +1824,152 @@ class const_where_expression<
  public:
   const_where_expression(mask_type const& mask_arg, value_type const& value_arg)
       : m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {}
-  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr mask_type const&
-  mask() const {
+
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void copy_to(std::int64_t* mem, element_aligned_tag) const {
+    _mm512_mask_storeu_epi64(mem, static_cast<__mmask8>(m_mask),
+                             static_cast<__m512i>(m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void scatter_to(
+      std::int64_t* mem,
+      simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& index) const {
+    _mm512_mask_i32scatter_epi64(mem, static_cast<__mmask8>(m_mask),
+                                 static_cast<__m256i>(index),
+                                 static_cast<__m512i>(m_value), 8);
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const&
+  impl_get_value() const {
+    return m_value;
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const&
+  impl_get_mask() const {
     return m_mask;
   }
-  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr value_type const&
-  value() const {
+};
+
+template <>
+class where_expression<simd_mask<std::int64_t, simd_abi::avx512_fixed_size<8>>,
+                       simd<std::int64_t, simd_abi::avx512_fixed_size<8>>>
+    : public const_where_expression<
+          simd_mask<std::int64_t, simd_abi::avx512_fixed_size<8>>,
+          simd<std::int64_t, simd_abi::avx512_fixed_size<8>>> {
+ public:
+  where_expression(
+      simd_mask<std::int64_t, simd_abi::avx512_fixed_size<8>> const& mask_arg,
+      simd<std::int64_t, simd_abi::avx512_fixed_size<8>>& value_arg)
+      : const_where_expression(mask_arg, value_arg) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void copy_from(std::int64_t const* mem, element_aligned_tag) {
+    m_value = value_type(_mm512_mask_loadu_epi64(
+        _mm512_set1_epi64(0.0), static_cast<__mmask8>(m_mask), mem));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void gather_from(
+      std::int64_t const* mem,
+      simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& index) {
+    m_value = value_type(_mm512_mask_i32gather_epi64(
+        static_cast<__m512i>(m_value), static_cast<__mmask8>(m_mask),
+        static_cast<__m256i>(index), mem, 8));
+  }
+  template <class U,
+            std::enable_if_t<
+                std::is_convertible_v<
+                    U, simd<std::int64_t, simd_abi::avx512_fixed_size<8>>>,
+                bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(U&& x) {
+    auto const x_as_value_type =
+        static_cast<simd<std::int64_t, simd_abi::avx512_fixed_size<8>>>(
+            std::forward<U>(x));
+    m_value = simd<std::int64_t, simd_abi::avx512_fixed_size<8>>(
+        _mm512_mask_blend_epi64(static_cast<__mmask8>(m_mask),
+                                static_cast<__m512i>(m_value),
+                                static_cast<__m512i>(x_as_value_type)));
+  }
+};
+
+template <>
+class const_where_expression<
+    simd_mask<std::uint64_t, simd_abi::avx512_fixed_size<8>>,
+    simd<std::uint64_t, simd_abi::avx512_fixed_size<8>>> {
+ public:
+  using abi_type   = simd_abi::avx512_fixed_size<8>;
+  using value_type = simd<std::uint64_t, abi_type>;
+  using mask_type  = simd_mask<std::uint64_t, abi_type>;
+
+ protected:
+  value_type& m_value;
+  mask_type const& m_mask;
+
+ public:
+  const_where_expression(mask_type const& mask_arg, value_type const& value_arg)
+      : m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {}
+
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void copy_to(std::uint64_t* mem, element_aligned_tag) const {
+    _mm512_mask_storeu_epi64(mem, static_cast<__mmask8>(m_mask),
+                             static_cast<__m512i>(m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void scatter_to(
+      std::uint64_t* mem,
+      simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& index) const {
+    _mm512_mask_i32scatter_epi64(mem, static_cast<__mmask8>(m_mask),
+                                 static_cast<__m256i>(index),
+                                 static_cast<__m512i>(m_value), 8);
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const&
+  impl_get_value() const {
     return m_value;
   }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const&
+  impl_get_mask() const {
+    return m_mask;
+  }
+};
+
+template <>
+class where_expression<simd_mask<std::uint64_t, simd_abi::avx512_fixed_size<8>>,
+                       simd<std::uint64_t, simd_abi::avx512_fixed_size<8>>>
+    : public const_where_expression<
+          simd_mask<std::uint64_t, simd_abi::avx512_fixed_size<8>>,
+          simd<std::uint64_t, simd_abi::avx512_fixed_size<8>>> {
+ public:
+  where_expression(
+      simd_mask<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& mask_arg,
+      simd<std::uint64_t, simd_abi::avx512_fixed_size<8>>& value_arg)
+      : const_where_expression(mask_arg, value_arg) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void copy_from(std::uint64_t const* mem, element_aligned_tag) {
+    m_value = value_type(_mm512_mask_loadu_epi64(
+        _mm512_set1_epi64(0.0), static_cast<__mmask8>(m_mask), mem));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void gather_from(
+      std::uint64_t const* mem,
+      simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& index) {
+    m_value = value_type(_mm512_mask_i32gather_epi64(
+        static_cast<__m512i>(m_value), static_cast<__mmask8>(m_mask),
+        static_cast<__m256i>(index), mem, 8));
+  }
+  template <class U,
+            std::enable_if_t<
+                std::is_convertible_v<
+                    U, simd<std::uint64_t, simd_abi::avx512_fixed_size<8>>>,
+                bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(U&& x) {
+    auto const x_as_value_type =
+        static_cast<simd<std::uint64_t, simd_abi::avx512_fixed_size<8>>>(
+            std::forward<U>(x));
+    m_value = simd<std::uint64_t, simd_abi::avx512_fixed_size<8>>(
+        _mm512_mask_blend_epi64(static_cast<__mmask8>(m_mask),
+                                static_cast<__m512i>(m_value),
+                                static_cast<__m512i>(x_as_value_type)));
+  }
 };
 
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::int32_t hmax(
@@ -959,16 +1977,16 @@ class const_where_expression<
         simd_mask<std::int32_t, simd_abi::avx512_fixed_size<8>>,
         simd<std::int32_t, simd_abi::avx512_fixed_size<8>>> const& x) {
   return _mm512_mask_reduce_max_epi32(
-      static_cast<__mmask8>(x.mask()),
-      _mm512_castsi256_si512(static_cast<__m256i>(x.value())));
+      static_cast<__mmask8>(x.impl_get_mask()),
+      _mm512_castsi256_si512(static_cast<__m256i>(x.impl_get_value())));
 }
 
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION double hmin(
     const_where_expression<simd_mask<double, simd_abi::avx512_fixed_size<8>>,
                            simd<double, simd_abi::avx512_fixed_size<8>>> const&
         x) {
-  return _mm512_mask_reduce_min_pd(static_cast<__mmask8>(x.mask()),
-                                   static_cast<__m512d>(x.value()));
+  return _mm512_mask_reduce_min_pd(static_cast<__mmask8>(x.impl_get_mask()),
+                                   static_cast<__m512d>(x.impl_get_value()));
 }
 
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::int64_t reduce(
@@ -976,8 +1994,8 @@ class const_where_expression<
         simd_mask<std::int64_t, simd_abi::avx512_fixed_size<8>>,
         simd<std::int64_t, simd_abi::avx512_fixed_size<8>>> const& x,
     std::int64_t, std::plus<>) {
-  return _mm512_mask_reduce_add_epi64(static_cast<__mmask8>(x.mask()),
-                                      static_cast<__m512i>(x.value()));
+  return _mm512_mask_reduce_add_epi64(static_cast<__mmask8>(x.impl_get_mask()),
+                                      static_cast<__m512i>(x.impl_get_value()));
 }
 
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION double reduce(
@@ -985,8 +2003,8 @@ class const_where_expression<
                            simd<double, simd_abi::avx512_fixed_size<8>>> const&
         x,
     double, std::plus<>) {
-  return _mm512_mask_reduce_add_pd(static_cast<__mmask8>(x.mask()),
-                                   static_cast<__m512d>(x.value()));
+  return _mm512_mask_reduce_add_pd(static_cast<__mmask8>(x.impl_get_mask()),
+                                   static_cast<__m512d>(x.impl_get_value()));
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/simd/src/Kokkos_SIMD_Common.hpp b/packages/kokkos/simd/src/Kokkos_SIMD_Common.hpp
index c29d49fb3ab4a781605f0be164b39413ffe9c989..87edf994533df8c79cae87bf3e38e560d0fec84e 100644
--- a/packages/kokkos/simd/src/Kokkos_SIMD_Common.hpp
+++ b/packages/kokkos/simd/src/Kokkos_SIMD_Common.hpp
@@ -17,7 +17,6 @@
 #ifndef KOKKOS_SIMD_COMMON_HPP
 #define KOKKOS_SIMD_COMMON_HPP
 
-#include <cmath>
 #include <cstring>
 
 #include <Kokkos_Core.hpp>
@@ -26,14 +25,6 @@ namespace Kokkos {
 
 namespace Experimental {
 
-template <class To, class From>
-[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION constexpr To bit_cast(
-    From const& src) {
-  To dst;
-  std::memcpy(&dst, &src, sizeof(To));
-  return dst;
-}
-
 template <class T, class Abi>
 class simd;
 
@@ -100,14 +91,14 @@ class where_expression<bool, T> : public const_where_expression<bool, T> {
 };
 
 template <class T, class Abi>
-[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
     where_expression<simd_mask<T, Abi>, simd<T, Abi>>
     where(typename simd<T, Abi>::mask_type const& mask, simd<T, Abi>& value) {
   return where_expression(mask, value);
 }
 
 template <class T, class Abi>
-[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
     const_where_expression<simd_mask<T, Abi>, simd<T, Abi>>
     where(typename simd<T, Abi>::mask_type const& mask,
           simd<T, Abi> const& value) {
@@ -137,30 +128,34 @@ template <class T, class Abi>
 }
 
 // fallback simd shift using generator constructor
-// At the time of this writing, these fallbacks are only used
-// to shift vectors of 64-bit unsigned integers for the NEON backend
+// At the time of this edit, only the fallback for shift vectors of
+// 64-bit signed integers for the AVX2 backend is used
 
-template <class T, class U, class Abi>
+template <typename T, typename Abi,
+          typename = std::enable_if_t<std::is_integral_v<T>>>
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd<T, Abi> operator>>(
-    simd<T, Abi> const& lhs, unsigned int rhs) {
+    simd<T, Abi> const& lhs, int rhs) {
   return simd<T, Abi>([&](std::size_t i) { return lhs[i] >> rhs; });
 }
 
-template <class T, class U, class Abi>
+template <typename T, typename Abi,
+          typename = std::enable_if_t<std::is_integral_v<T>>>
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd<T, Abi> operator<<(
-    simd<T, Abi> const& lhs, unsigned int rhs) {
+    simd<T, Abi> const& lhs, int rhs) {
   return simd<T, Abi>([&](std::size_t i) { return lhs[i] << rhs; });
 }
 
-template <class T, class U, class Abi>
+template <typename T, typename Abi,
+          typename = std::enable_if_t<std::is_integral_v<T>>>
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd<T, Abi> operator>>(
-    simd<T, Abi> const& lhs, simd<U, Abi> const& rhs) {
+    simd<T, Abi> const& lhs, simd<T, Abi> const& rhs) {
   return simd<T, Abi>([&](std::size_t i) { return lhs[i] >> rhs[i]; });
 }
 
-template <class T, class U, class Abi>
+template <typename T, typename Abi,
+          typename = std::enable_if_t<std::is_integral_v<T>>>
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd<T, Abi> operator<<(
-    simd<T, Abi> const& lhs, simd<U, Abi> const& rhs) {
+    simd<T, Abi> const& lhs, simd<T, Abi> const& rhs) {
   return simd<T, Abi>([&](std::size_t i) { return lhs[i] << rhs[i]; });
 }
 
@@ -316,160 +311,38 @@ KOKKOS_FORCEINLINE_FUNCTION where_expression<M, T>& operator/=(
 // fallback implementations of reductions across simd_mask:
 
 template <class T, class Abi>
-[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION bool all_of(
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION bool all_of(
     simd_mask<T, Abi> const& a) {
   return a == simd_mask<T, Abi>(true);
 }
 
 template <class T, class Abi>
-[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION bool any_of(
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION bool any_of(
     simd_mask<T, Abi> const& a) {
   return a != simd_mask<T, Abi>(false);
 }
 
 template <class T, class Abi>
-[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION bool none_of(
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION bool none_of(
     simd_mask<T, Abi> const& a) {
   return a == simd_mask<T, Abi>(false);
 }
 
-template <class T, class Abi>
-[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION T
-hmin(const_where_expression<simd_mask<T, Abi>, simd<T, Abi>> const& x) {
-  auto const& v = x.value();
-  auto const& m = x.mask();
-  auto result   = Kokkos::reduction_identity<T>::min();
-  for (std::size_t i = 0; i < v.size(); ++i) {
-    if (m[i]) result = Kokkos::min(result, v[i]);
-  }
-  return result;
-}
-
-template <class T, class Abi>
-[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION T
-hmax(const_where_expression<simd_mask<T, Abi>, simd<T, Abi>> const& x) {
-  auto const& v = x.value();
-  auto const& m = x.mask();
-  auto result   = Kokkos::reduction_identity<T>::max();
-  for (std::size_t i = 0; i < v.size(); ++i) {
-    if (m[i]) result = Kokkos::max(result, v[i]);
-  }
-  return result;
-}
+// A temporary device-callable implemenation of round half to nearest even
+template <typename T>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto round_half_to_nearest_even(
+    T const& x) {
+  auto ceil  = Kokkos::ceil(x);
+  auto floor = Kokkos::floor(x);
 
-template <class T, class Abi>
-[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION T
-reduce(const_where_expression<simd_mask<T, Abi>, simd<T, Abi>> const& x, T,
-       std::plus<>) {
-  auto const& v = x.value();
-  auto const& m = x.mask();
-  auto result   = Kokkos::reduction_identity<T>::sum();
-  for (std::size_t i = 0; i < v.size(); ++i) {
-    if (m[i]) result += v[i];
+  if (Kokkos::abs(ceil - x) == Kokkos::abs(floor - x)) {
+    auto rem = Kokkos::remainder(ceil, 2.0);
+    return (rem == 0) ? ceil : floor;
   }
-  return result;
+  return Kokkos::round(x);
 }
 
 }  // namespace Experimental
-
-template <class T, class Abi>
-[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd<T, Abi> min(
-    Experimental::simd<T, Abi> const& a, Experimental::simd<T, Abi> const& b) {
-  Experimental::simd<T, Abi> result;
-  for (std::size_t i = 0; i < Experimental::simd<T, Abi>::size(); ++i) {
-    result[i] = Kokkos::min(a[i], b[i]);
-  }
-  return result;
-}
-
-template <class T, class Abi>
-[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd<T, Abi> max(
-    Experimental::simd<T, Abi> const& a, Experimental::simd<T, Abi> const& b) {
-  Experimental::simd<T, Abi> result;
-  for (std::size_t i = 0; i < Experimental::simd<T, Abi>::size(); ++i) {
-    result[i] = Kokkos::max(a[i], b[i]);
-  }
-  return result;
-}
-
-// fallback implementations of <cmath> functions.
-// individual Abi types may provide overloads with more efficient
-// implementations.
-// These are not in the Experimental namespace because their double
-// overloads are not either
-
-#define KOKKOS_IMPL_SIMD_UNARY_FUNCTION(FUNC)                               \
-  template <class Abi>                                                      \
-  [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd<double, Abi> \
-  FUNC(Experimental::simd<double, Abi> const& a) {                          \
-    Experimental::simd<double, Abi> result;                                 \
-    for (std::size_t i = 0; i < Experimental::simd<double, Abi>::size();    \
-         ++i) {                                                             \
-      result[i] = Kokkos::FUNC(a[i]);                                       \
-    }                                                                       \
-    return result;                                                          \
-  }
-
-KOKKOS_IMPL_SIMD_UNARY_FUNCTION(abs)
-KOKKOS_IMPL_SIMD_UNARY_FUNCTION(exp)
-KOKKOS_IMPL_SIMD_UNARY_FUNCTION(exp2)
-KOKKOS_IMPL_SIMD_UNARY_FUNCTION(log)
-KOKKOS_IMPL_SIMD_UNARY_FUNCTION(log10)
-KOKKOS_IMPL_SIMD_UNARY_FUNCTION(log2)
-KOKKOS_IMPL_SIMD_UNARY_FUNCTION(sqrt)
-KOKKOS_IMPL_SIMD_UNARY_FUNCTION(cbrt)
-KOKKOS_IMPL_SIMD_UNARY_FUNCTION(sin)
-KOKKOS_IMPL_SIMD_UNARY_FUNCTION(cos)
-KOKKOS_IMPL_SIMD_UNARY_FUNCTION(tan)
-KOKKOS_IMPL_SIMD_UNARY_FUNCTION(asin)
-KOKKOS_IMPL_SIMD_UNARY_FUNCTION(acos)
-KOKKOS_IMPL_SIMD_UNARY_FUNCTION(atan)
-KOKKOS_IMPL_SIMD_UNARY_FUNCTION(sinh)
-KOKKOS_IMPL_SIMD_UNARY_FUNCTION(cosh)
-KOKKOS_IMPL_SIMD_UNARY_FUNCTION(tanh)
-KOKKOS_IMPL_SIMD_UNARY_FUNCTION(asinh)
-KOKKOS_IMPL_SIMD_UNARY_FUNCTION(acosh)
-KOKKOS_IMPL_SIMD_UNARY_FUNCTION(atanh)
-KOKKOS_IMPL_SIMD_UNARY_FUNCTION(erf)
-KOKKOS_IMPL_SIMD_UNARY_FUNCTION(erfc)
-KOKKOS_IMPL_SIMD_UNARY_FUNCTION(tgamma)
-KOKKOS_IMPL_SIMD_UNARY_FUNCTION(lgamma)
-
-#define KOKKOS_IMPL_SIMD_BINARY_FUNCTION(FUNC)                              \
-  template <class Abi>                                                      \
-  [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd<double, Abi> \
-  FUNC(Experimental::simd<double, Abi> const& a,                            \
-       Experimental::simd<double, Abi> const& b) {                          \
-    Experimental::simd<double, Abi> result;                                 \
-    for (std::size_t i = 0; i < Experimental::simd<double, Abi>::size();    \
-         ++i) {                                                             \
-      result[i] = Kokkos::FUNC(a[i], b[i]);                                 \
-    }                                                                       \
-    return result;                                                          \
-  }
-
-KOKKOS_IMPL_SIMD_BINARY_FUNCTION(pow)
-KOKKOS_IMPL_SIMD_BINARY_FUNCTION(hypot)
-KOKKOS_IMPL_SIMD_BINARY_FUNCTION(atan2)
-KOKKOS_IMPL_SIMD_BINARY_FUNCTION(copysign)
-
-#define KOKKOS_IMPL_SIMD_TERNARY_FUNCTION(FUNC)                             \
-  template <class Abi>                                                      \
-  [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd<double, Abi> \
-  FUNC(Experimental::simd<double, Abi> const& a,                            \
-       Experimental::simd<double, Abi> const& b,                            \
-       Experimental::simd<double, Abi> const& c) {                          \
-    Experimental::simd<double, Abi> result;                                 \
-    for (std::size_t i = 0; i < Experimental::simd<double, Abi>::size();    \
-         ++i) {                                                             \
-      result[i] = Kokkos::FUNC(a[i], b[i], c[i]);                           \
-    }                                                                       \
-    return result;                                                          \
-  }
-
-KOKKOS_IMPL_SIMD_TERNARY_FUNCTION(fma)
-KOKKOS_IMPL_SIMD_TERNARY_FUNCTION(hypot)
-
 }  // namespace Kokkos
 
 #endif
diff --git a/packages/kokkos/simd/src/Kokkos_SIMD_Common_Math.hpp b/packages/kokkos/simd/src/Kokkos_SIMD_Common_Math.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8c6a95596043d156e5baa29f1a7cb04f613cd047
--- /dev/null
+++ b/packages/kokkos/simd/src/Kokkos_SIMD_Common_Math.hpp
@@ -0,0 +1,260 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_SIMD_COMMON_MATH_HPP
+#define KOKKOS_SIMD_COMMON_MATH_HPP
+
+#include <Kokkos_Core.hpp>  // Kokkos::min, etc.
+
+namespace Kokkos {
+
+namespace Experimental {
+
+template <class T, class Abi>
+class simd;
+
+template <class T, class Abi>
+class simd_mask;
+
+template <class M, class T>
+class const_where_expression;
+
+template <typename T, typename Abi>
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION T
+hmin(const_where_expression<simd_mask<T, Abi>, simd<T, Abi>> const& x) {
+  auto const& v = x.impl_get_value();
+  auto const& m = x.impl_get_mask();
+  auto result   = Kokkos::reduction_identity<T>::min();
+  for (std::size_t i = 0; i < v.size(); ++i) {
+    if (m[i]) result = Kokkos::min(result, v[i]);
+  }
+  return result;
+}
+
+template <class T, class Abi>
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION T
+hmax(const_where_expression<simd_mask<T, Abi>, simd<T, Abi>> const& x) {
+  auto const& v = x.impl_get_value();
+  auto const& m = x.impl_get_mask();
+  auto result   = Kokkos::reduction_identity<T>::max();
+  for (std::size_t i = 0; i < v.size(); ++i) {
+    if (m[i]) result = Kokkos::max(result, v[i]);
+  }
+  return result;
+}
+
+template <class T, class Abi>
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION T
+reduce(const_where_expression<simd_mask<T, Abi>, simd<T, Abi>> const& x, T,
+       std::plus<>) {
+  auto const& v = x.impl_get_value();
+  auto const& m = x.impl_get_mask();
+  auto result   = Kokkos::reduction_identity<T>::sum();
+  for (std::size_t i = 0; i < v.size(); ++i) {
+    if (m[i]) result += v[i];
+  }
+  return result;
+}
+
+}  // namespace Experimental
+
+template <class T, class Abi>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd<T, Abi> min(
+    Experimental::simd<T, Abi> const& a, Experimental::simd<T, Abi> const& b) {
+  Experimental::simd<T, Abi> result;
+  for (std::size_t i = 0; i < Experimental::simd<T, Abi>::size(); ++i) {
+    result[i] = Kokkos::min(a[i], b[i]);
+  }
+  return result;
+}
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
+namespace Experimental {
+template <class T, class Abi>
+[[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<T, Abi>
+    min(Experimental::simd<T, Abi> const& a,
+        Experimental::simd<T, Abi> const& b) {
+  return Kokkos::min(a, b);
+}
+}  // namespace Experimental
+#endif
+
+template <class T, class Abi>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd<T, Abi> max(
+    Experimental::simd<T, Abi> const& a, Experimental::simd<T, Abi> const& b) {
+  Experimental::simd<T, Abi> result;
+  for (std::size_t i = 0; i < Experimental::simd<T, Abi>::size(); ++i) {
+    result[i] = Kokkos::max(a[i], b[i]);
+  }
+  return result;
+}
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
+namespace Experimental {
+template <class T, class Abi>
+[[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<T, Abi>
+    max(Experimental::simd<T, Abi> const& a,
+        Experimental::simd<T, Abi> const& b) {
+  return Kokkos::max(a, b);
+}
+}  // namespace Experimental
+#endif
+
+// fallback implementations of <cmath> functions.
+// individual Abi types may provide overloads with more efficient
+// implementations.
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
+#define KOKKOS_IMPL_SIMD_UNARY_FUNCTION(FUNC)                                \
+  template <class T, class Abi>                                              \
+  [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd<T, Abi> FUNC( \
+      Experimental::simd<T, Abi> const& a) {                                 \
+    Experimental::simd<T, Abi> result;                                       \
+    for (std::size_t i = 0; i < Experimental::simd<T, Abi>::size(); ++i) {   \
+      result[i] = Kokkos::FUNC(a[i]);                                        \
+    }                                                                        \
+    return result;                                                           \
+  }                                                                          \
+  namespace Experimental {                                                   \
+  template <class T, class Abi>                                              \
+  [[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION      \
+      simd<T, Abi>                                                           \
+      FUNC(simd<T, Abi> const& a) {                                          \
+    return Kokkos::FUNC(a);                                                  \
+  }                                                                          \
+  }
+#else
+#define KOKKOS_IMPL_SIMD_UNARY_FUNCTION(FUNC)                                \
+  template <class T, class Abi>                                              \
+  [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd<T, Abi> FUNC( \
+      Experimental::simd<T, Abi> const& a) {                                 \
+    Experimental::simd<T, Abi> result;                                       \
+    for (std::size_t i = 0; i < Experimental::simd<T, Abi>::size(); ++i) {   \
+      result[i] = Kokkos::FUNC(a[i]);                                        \
+    }                                                                        \
+    return result;                                                           \
+  }
+#endif
+
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(abs)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(exp)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(exp2)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(log)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(log10)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(log2)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(sqrt)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(cbrt)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(sin)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(cos)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(tan)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(asin)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(acos)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(atan)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(sinh)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(cosh)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(tanh)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(asinh)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(acosh)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(atanh)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(erf)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(erfc)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(tgamma)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(lgamma)
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
+#define KOKKOS_IMPL_SIMD_BINARY_FUNCTION(FUNC)                               \
+  template <class T, class Abi>                                              \
+  [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd<T, Abi> FUNC( \
+      Experimental::simd<T, Abi> const& a,                                   \
+      Experimental::simd<T, Abi> const& b) {                                 \
+    Experimental::simd<T, Abi> result;                                       \
+    for (std::size_t i = 0; i < Experimental::simd<T, Abi>::size(); ++i) {   \
+      result[i] = Kokkos::FUNC(a[i], b[i]);                                  \
+    }                                                                        \
+    return result;                                                           \
+  }                                                                          \
+  namespace Experimental {                                                   \
+  template <class T, class Abi>                                              \
+  [[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION      \
+      simd<T, Abi>                                                           \
+      FUNC(simd<T, Abi> const& a, simd<T, Abi> const& b) {                   \
+    Kokkos::FUNC(a, b);                                                      \
+  }                                                                          \
+  }
+#else
+#define KOKKOS_IMPL_SIMD_BINARY_FUNCTION(FUNC)                               \
+  template <class T, class Abi>                                              \
+  [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd<T, Abi> FUNC( \
+      Experimental::simd<T, Abi> const& a,                                   \
+      Experimental::simd<T, Abi> const& b) {                                 \
+    Experimental::simd<T, Abi> result;                                       \
+    for (std::size_t i = 0; i < Experimental::simd<T, Abi>::size(); ++i) {   \
+      result[i] = Kokkos::FUNC(a[i], b[i]);                                  \
+    }                                                                        \
+    return result;                                                           \
+  }
+#endif
+
+KOKKOS_IMPL_SIMD_BINARY_FUNCTION(pow)
+KOKKOS_IMPL_SIMD_BINARY_FUNCTION(hypot)
+KOKKOS_IMPL_SIMD_BINARY_FUNCTION(atan2)
+KOKKOS_IMPL_SIMD_BINARY_FUNCTION(copysign)
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
+#define KOKKOS_IMPL_SIMD_TERNARY_FUNCTION(FUNC)                              \
+  template <class T, class Abi>                                              \
+  [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd<T, Abi> FUNC( \
+      Experimental::simd<T, Abi> const& a,                                   \
+      Experimental::simd<T, Abi> const& b,                                   \
+      Experimental::simd<T, Abi> const& c) {                                 \
+    Experimental::simd<T, Abi> result;                                       \
+    for (std::size_t i = 0; i < Experimental::simd<T, Abi>::size(); ++i) {   \
+      result[i] = Kokkos::FUNC(a[i], b[i], c[i]);                            \
+    }                                                                        \
+    return result;                                                           \
+  }                                                                          \
+  namespace Experimental {                                                   \
+  template <class T, class Abi>                                              \
+  [[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION      \
+      simd<T, Abi>                                                           \
+      FUNC(simd<T, Abi> const& a, simd<T, Abi> const& b,                     \
+           simd<T, Abi> const& c) {                                          \
+    return Kokkos::FUNC(a, b, c);                                            \
+  }                                                                          \
+  }
+#else
+#define KOKKOS_IMPL_SIMD_TERNARY_FUNCTION(FUNC)                              \
+  template <class T, class Abi>                                              \
+  [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd<T, Abi> FUNC( \
+      Experimental::simd<T, Abi> const& a,                                   \
+      Experimental::simd<T, Abi> const& b,                                   \
+      Experimental::simd<T, Abi> const& c) {                                 \
+    Experimental::simd<T, Abi> result;                                       \
+    for (std::size_t i = 0; i < Experimental::simd<T, Abi>::size(); ++i) {   \
+      result[i] = Kokkos::FUNC(a[i], b[i], c[i]);                            \
+    }                                                                        \
+    return result;                                                           \
+  }
+#endif
+
+KOKKOS_IMPL_SIMD_TERNARY_FUNCTION(fma)
+KOKKOS_IMPL_SIMD_TERNARY_FUNCTION(hypot)
+
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/simd/src/Kokkos_SIMD_NEON.hpp b/packages/kokkos/simd/src/Kokkos_SIMD_NEON.hpp
index 2473004098e23e0be990ef9535a1c23d3970aeed..43ece2038903e1139485f14973f8816c3e2f24b1 100644
--- a/packages/kokkos/simd/src/Kokkos_SIMD_NEON.hpp
+++ b/packages/kokkos/simd/src/Kokkos_SIMD_NEON.hpp
@@ -24,6 +24,11 @@
 
 #include <arm_neon.h>
 
+#ifdef KOKKOS_SIMD_COMMON_MATH_HPP
+#error \
+    "Kokkos_SIMD_NEON.hpp must be included before Kokkos_SIMD_Common_Math.hpp!"
+#endif
+
 namespace Kokkos {
 
 namespace Experimental {
@@ -81,6 +86,22 @@ class neon_mask<Derived, 64> {
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION neon_mask() = default;
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit neon_mask(value_type value)
       : m_value(vmovq_n_u64(value ? 0xFFFFFFFFFFFFFFFFULL : 0)) {}
+  template <class G,
+            std::enable_if_t<
+                std::is_invocable_r_v<value_type, G,
+                                      std::integral_constant<std::size_t, 0>>,
+                bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit neon_mask(
+      G&& gen) noexcept {
+    m_value = vsetq_lane_u64(
+        (gen(std::integral_constant<std::size_t, 0>()) ? 0xFFFFFFFFFFFFFFFFULL
+                                                       : 0),
+        m_value, 0);
+    m_value = vsetq_lane_u64(
+        (gen(std::integral_constant<std::size_t, 1>()) ? 0xFFFFFFFFFFFFFFFFULL
+                                                       : 0),
+        m_value, 1);
+  }
   template <class U>
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION neon_mask(
       neon_mask<U, 32> const& other) {
@@ -175,6 +196,20 @@ class neon_mask<Derived, 32> {
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION neon_mask() = default;
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit neon_mask(value_type value)
       : m_value(vmov_n_u32(value ? 0xFFFFFFFFU : 0)) {}
+  template <class G,
+            std::enable_if_t<
+                std::is_invocable_r_v<value_type, G,
+                                      std::integral_constant<std::size_t, 0>>,
+                bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit neon_mask(
+      G&& gen) noexcept {
+    m_value = vset_lane_u32(
+        (gen(std::integral_constant<std::size_t, 0>()) ? 0xFFFFFFFFU : 0),
+        m_value, 0);
+    m_value = vset_lane_u32(
+        (gen(std::integral_constant<std::size_t, 1>()) ? 0xFFFFFFFFU : 0),
+        m_value, 1);
+  }
   template <class U>
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION neon_mask(neon_mask<U, 64> const& other)
       : m_value(vqmovn_u64(static_cast<uint64x2_t>(other))) {}
@@ -246,6 +281,14 @@ class simd_mask<T, simd_abi::neon_fixed_size<2>>
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd_mask(
       implementation_type const& value)
       : base_type(value) {}
+  template <class G,
+            std::enable_if_t<
+                std::is_invocable_r_v<typename base_type::value_type, G,
+                                      std::integral_constant<std::size_t, 0>>,
+                bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd_mask(
+      G&& gen) noexcept
+      : base_type(gen) {}
 };
 
 template <>
@@ -299,7 +342,8 @@ class simd<double, simd_abi::neon_fixed_size<2>> {
                 std::is_invocable_r_v<value_type, G,
                                       std::integral_constant<std::size_t, 0>>,
                 bool> = false>
-  KOKKOS_FORCEINLINE_FUNCTION simd(G&& gen) {
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd(
+      G&& gen) noexcept {
     m_value = vsetq_lane_f64(gen(std::integral_constant<std::size_t, 0>()),
                              m_value, 0);
     m_value = vsetq_lane_f64(gen(std::integral_constant<std::size_t, 1>()),
@@ -327,122 +371,163 @@ class simd<double, simd_abi::neon_fixed_size<2>> {
   operator float64x2_t() const {
     return m_value;
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator<(simd const& other) const {
-    return mask_type(vcltq_f64(m_value, other.m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const
+      noexcept {
+    return simd(vnegq_f64(m_value));
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(vmulq_f64(static_cast<float64x2_t>(lhs),
+                          static_cast<float64x2_t>(rhs)));
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator/(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(vdivq_f64(static_cast<float64x2_t>(lhs),
+                          static_cast<float64x2_t>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(vaddq_f64(static_cast<float64x2_t>(lhs),
+                          static_cast<float64x2_t>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(vsubq_f64(static_cast<float64x2_t>(lhs),
+                          static_cast<float64x2_t>(rhs)));
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator<(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(vcltq_f64(static_cast<float64x2_t>(lhs),
+                               static_cast<float64x2_t>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator>(simd const& other) const {
-    return mask_type(vcgtq_f64(m_value, other.m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator>(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(vcgtq_f64(static_cast<float64x2_t>(lhs),
+                               static_cast<float64x2_t>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator<=(simd const& other) const {
-    return mask_type(vcleq_f64(m_value, other.m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator<=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(vcleq_f64(static_cast<float64x2_t>(lhs),
+                               static_cast<float64x2_t>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator>=(simd const& other) const {
-    return mask_type(vcgeq_f64(m_value, other.m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator>=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(vcgeq_f64(static_cast<float64x2_t>(lhs),
+                               static_cast<float64x2_t>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator==(simd const& other) const {
-    return mask_type(vceqq_f64(m_value, other.m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator==(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(vceqq_f64(static_cast<float64x2_t>(lhs),
+                               static_cast<float64x2_t>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator!=(simd const& other) const {
-    return !(operator==(other));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator!=(simd const& lhs, simd const& rhs) noexcept {
+    return !(operator==(lhs, rhs));
   }
 };
 
-[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<double, simd_abi::neon_fixed_size<2>>
-    operator*(simd<double, simd_abi::neon_fixed_size<2>> const& lhs,
-              simd<double, simd_abi::neon_fixed_size<2>> const& rhs) {
-  return simd<double, simd_abi::neon_fixed_size<2>>(
-      vmulq_f64(static_cast<float64x2_t>(lhs), static_cast<float64x2_t>(rhs)));
-}
+}  // namespace Experimental
 
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<double, simd_abi::neon_fixed_size<2>>
-    operator/(simd<double, simd_abi::neon_fixed_size<2>> const& lhs,
-              simd<double, simd_abi::neon_fixed_size<2>> const& rhs) {
-  return simd<double, simd_abi::neon_fixed_size<2>>(
-      vdivq_f64(static_cast<float64x2_t>(lhs), static_cast<float64x2_t>(rhs)));
+    Experimental::simd<double, Experimental::simd_abi::neon_fixed_size<2>>
+    abs(Experimental::simd<
+        double, Experimental::simd_abi::neon_fixed_size<2>> const& a) {
+  return Experimental::simd<double, Experimental::simd_abi::neon_fixed_size<2>>(
+      vabsq_f64(static_cast<float64x2_t>(a)));
 }
 
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<double, simd_abi::neon_fixed_size<2>>
-    operator+(simd<double, simd_abi::neon_fixed_size<2>> const& lhs,
-              simd<double, simd_abi::neon_fixed_size<2>> const& rhs) {
-  return simd<double, simd_abi::neon_fixed_size<2>>(
-      vaddq_f64(static_cast<float64x2_t>(lhs), static_cast<float64x2_t>(rhs)));
+    Experimental::simd<double, Experimental::simd_abi::neon_fixed_size<2>>
+    floor(Experimental::simd<
+          double, Experimental::simd_abi::neon_fixed_size<2>> const& a) {
+  return Experimental::simd<double, Experimental::simd_abi::neon_fixed_size<2>>(
+      vrndmq_f64(static_cast<float64x2_t>(a)));
 }
 
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<double, simd_abi::neon_fixed_size<2>>
-    operator-(simd<double, simd_abi::neon_fixed_size<2>> const& lhs,
-              simd<double, simd_abi::neon_fixed_size<2>> const& rhs) {
-  return simd<double, simd_abi::neon_fixed_size<2>>(
-      vsubq_f64(static_cast<float64x2_t>(lhs), static_cast<float64x2_t>(rhs)));
+    Experimental::simd<double, Experimental::simd_abi::neon_fixed_size<2>>
+    ceil(Experimental::simd<
+         double, Experimental::simd_abi::neon_fixed_size<2>> const& a) {
+  return Experimental::simd<double, Experimental::simd_abi::neon_fixed_size<2>>(
+      vrndpq_f64(static_cast<float64x2_t>(a)));
 }
 
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<double, simd_abi::neon_fixed_size<2>>
-    operator-(simd<double, simd_abi::neon_fixed_size<2>> const& a) {
-  return simd<double, simd_abi::neon_fixed_size<2>>(
-      vnegq_f64(static_cast<float64x2_t>(a)));
+    Experimental::simd<double, Experimental::simd_abi::neon_fixed_size<2>>
+    round(Experimental::simd<
+          double, Experimental::simd_abi::neon_fixed_size<2>> const& a) {
+  return Experimental::simd<double, Experimental::simd_abi::neon_fixed_size<2>>(
+      vrndxq_f64(static_cast<float64x2_t>(a)));
 }
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<double, simd_abi::neon_fixed_size<2>> abs(
-    simd<double, simd_abi::neon_fixed_size<2>> const& a) {
-  return simd<double, simd_abi::neon_fixed_size<2>>(
-      vabsq_f64(static_cast<float64x2_t>(a)));
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::neon_fixed_size<2>>
+    trunc(Experimental::simd<
+          double, Experimental::simd_abi::neon_fixed_size<2>> const& a) {
+  return Experimental::simd<double, Experimental::simd_abi::neon_fixed_size<2>>(
+      vrndq_f64(static_cast<float64x2_t>(a)));
 }
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<double, simd_abi::neon_fixed_size<2>> copysign(
-    simd<double, simd_abi::neon_fixed_size<2>> const& a,
-    simd<double, simd_abi::neon_fixed_size<2>> const& b) {
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::neon_fixed_size<2>>
+    copysign(Experimental::simd<
+                 double, Experimental::simd_abi::neon_fixed_size<2>> const& a,
+             Experimental::simd<
+                 double, Experimental::simd_abi::neon_fixed_size<2>> const& b) {
   uint64x2_t const sign_mask = vreinterpretq_u64_f64(vmovq_n_f64(-0.0));
-  return simd<double, simd_abi::neon_fixed_size<2>>(vreinterpretq_f64_u64(
-      vorrq_u64(vreinterpretq_u64_f64(static_cast<float64x2_t>(abs(a))),
-                vandq_u64(sign_mask, vreinterpretq_u64_f64(
-                                         static_cast<float64x2_t>(b))))));
+  return Experimental::simd<double, Experimental::simd_abi::neon_fixed_size<2>>(
+      vreinterpretq_f64_u64(vorrq_u64(
+          vreinterpretq_u64_f64(static_cast<float64x2_t>(abs(a))),
+          vandq_u64(sign_mask,
+                    vreinterpretq_u64_f64(static_cast<float64x2_t>(b))))));
 }
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<double, simd_abi::neon_fixed_size<2>> sqrt(
-    simd<double, simd_abi::neon_fixed_size<2>> const& a) {
-  return simd<double, simd_abi::neon_fixed_size<2>>(
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::neon_fixed_size<2>>
+    sqrt(Experimental::simd<
+         double, Experimental::simd_abi::neon_fixed_size<2>> const& a) {
+  return Experimental::simd<double, Experimental::simd_abi::neon_fixed_size<2>>(
       vsqrtq_f64(static_cast<float64x2_t>(a)));
 }
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<double, simd_abi::neon_fixed_size<2>> fma(
-    simd<double, simd_abi::neon_fixed_size<2>> const& a,
-    simd<double, simd_abi::neon_fixed_size<2>> const& b,
-    simd<double, simd_abi::neon_fixed_size<2>> const& c) {
-  return simd<double, simd_abi::neon_fixed_size<2>>(
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::neon_fixed_size<2>>
+    fma(Experimental::simd<double,
+                           Experimental::simd_abi::neon_fixed_size<2>> const& a,
+        Experimental::simd<double,
+                           Experimental::simd_abi::neon_fixed_size<2>> const& b,
+        Experimental::simd<
+            double, Experimental::simd_abi::neon_fixed_size<2>> const& c) {
+  return Experimental::simd<double, Experimental::simd_abi::neon_fixed_size<2>>(
       vfmaq_f64(static_cast<float64x2_t>(c), static_cast<float64x2_t>(b),
                 static_cast<float64x2_t>(a)));
 }
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<double, simd_abi::neon_fixed_size<2>> max(
-    simd<double, simd_abi::neon_fixed_size<2>> const& a,
-    simd<double, simd_abi::neon_fixed_size<2>> const& b) {
-  return simd<double, simd_abi::neon_fixed_size<2>>(
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::neon_fixed_size<2>>
+    max(Experimental::simd<double,
+                           Experimental::simd_abi::neon_fixed_size<2>> const& a,
+        Experimental::simd<
+            double, Experimental::simd_abi::neon_fixed_size<2>> const& b) {
+  return Experimental::simd<double, Experimental::simd_abi::neon_fixed_size<2>>(
       vmaxq_f64(static_cast<float64x2_t>(a), static_cast<float64x2_t>(b)));
 }
 
-KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-simd<double, simd_abi::neon_fixed_size<2>> min(
-    simd<double, simd_abi::neon_fixed_size<2>> const& a,
-    simd<double, simd_abi::neon_fixed_size<2>> const& b) {
-  return simd<double, simd_abi::neon_fixed_size<2>>(
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<double, Experimental::simd_abi::neon_fixed_size<2>>
+    min(Experimental::simd<double,
+                           Experimental::simd_abi::neon_fixed_size<2>> const& a,
+        Experimental::simd<
+            double, Experimental::simd_abi::neon_fixed_size<2>> const& b) {
+  return Experimental::simd<double, Experimental::simd_abi::neon_fixed_size<2>>(
       vminq_f64(static_cast<float64x2_t>(a), static_cast<float64x2_t>(b)));
 }
 
+namespace Experimental {
+
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
     simd<double, simd_abi::neon_fixed_size<2>>
     condition(simd_mask<double, simd_abi::neon_fixed_size<2>> const& a,
@@ -453,6 +538,240 @@ simd<double, simd_abi::neon_fixed_size<2>> min(
                 static_cast<float64x2_t>(c)));
 }
 
+template <>
+class simd<float, simd_abi::neon_fixed_size<2>> {
+  float32x2_t m_value;
+
+ public:
+  using value_type = float;
+  using abi_type   = simd_abi::neon_fixed_size<2>;
+  using mask_type  = simd_mask<value_type, abi_type>;
+  class reference {
+    float32x2_t& m_value;
+    int m_lane;
+
+   public:
+    KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference(float32x2_t& value_arg,
+                                                    int lane_arg)
+        : m_value(value_arg), m_lane(lane_arg) {}
+    KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference
+    operator=(float value) const {
+      switch (m_lane) {
+        case 0: m_value = vset_lane_f32(value, m_value, 0); break;
+        case 1: m_value = vset_lane_f32(value, m_value, 1); break;
+      }
+      return *this;
+    }
+    KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION operator float() const {
+      switch (m_lane) {
+        case 0: return vget_lane_f32(m_value, 0);
+        case 1: return vget_lane_f32(m_value, 1);
+      }
+      return 0;
+    }
+  };
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd()            = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&)      = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() {
+    return 2;
+  }
+  template <class U, std::enable_if_t<std::is_convertible_v<U, value_type>,
+                                      bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value)
+      : m_value(vmov_n_f32(value_type(value))) {}
+  template <class G,
+            std::enable_if_t<
+                std::is_invocable_r_v<value_type, G,
+                                      std::integral_constant<std::size_t, 0>>,
+                bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(G&& gen) {
+    m_value = vset_lane_f32(gen(std::integral_constant<std::size_t, 0>()),
+                            m_value, 0);
+    m_value = vset_lane_f32(gen(std::integral_constant<std::size_t, 1>()),
+                            m_value, 1);
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd(
+      float32x2_t const& value_in)
+      : m_value(value_in) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) {
+    return reference(m_value, int(i));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type
+  operator[](std::size_t i) const {
+    return reference(const_cast<simd*>(this)->m_value, int(i));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr,
+                                                       element_aligned_tag) {
+    m_value = vld1_f32(ptr);
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(
+      value_type* ptr, element_aligned_tag) const {
+    vst1_f32(ptr, m_value);
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit
+  operator float32x2_t() const {
+    return m_value;
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const
+      noexcept {
+    return simd(vneg_f32(m_value));
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(vmul_f32(lhs.m_value, rhs.m_value));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator/(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(vdiv_f32(lhs.m_value, rhs.m_value));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(vadd_f32(lhs.m_value, rhs.m_value));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(vsub_f32(lhs.m_value, rhs.m_value));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator<(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(vclt_f32(lhs.m_value, rhs.m_value));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator>(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(vcgt_f32(lhs.m_value, rhs.m_value));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator<=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(vcle_f32(lhs.m_value, rhs.m_value));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator>=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(vcge_f32(lhs.m_value, rhs.m_value));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator==(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(vceq_f32(lhs.m_value, rhs.m_value));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator!=(simd const& lhs, simd const& rhs) noexcept {
+    return !(lhs == rhs);
+  }
+};
+
+}  // namespace Experimental
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<2>>
+    abs(Experimental::simd<
+        float, Experimental::simd_abi::neon_fixed_size<2>> const& a) {
+  return Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<2>>(
+      vabs_f32(static_cast<float32x2_t>(a)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<2>>
+    floor(Experimental::simd<
+          float, Experimental::simd_abi::neon_fixed_size<2>> const& a) {
+  return Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<2>>(
+      vrndm_f32(static_cast<float32x2_t>(a)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<2>>
+    ceil(Experimental::simd<
+         float, Experimental::simd_abi::neon_fixed_size<2>> const& a) {
+  return Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<2>>(
+      vrndp_f32(static_cast<float32x2_t>(a)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<2>>
+    round(Experimental::simd<
+          float, Experimental::simd_abi::neon_fixed_size<2>> const& a) {
+  return Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<2>>(
+      vrndx_f32(static_cast<float32x2_t>(a)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<2>>
+    trunc(Experimental::simd<
+          float, Experimental::simd_abi::neon_fixed_size<2>> const& a) {
+  return Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<2>>(
+      vrnd_f32(static_cast<float32x2_t>(a)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    float, Experimental::simd_abi::neon_fixed_size<2>>
+copysign(
+    Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<2>> const&
+        a,
+    Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<2>> const&
+        b) {
+  uint32x2_t const sign_mask = vreinterpret_u32_f32(vmov_n_f32(-0.0));
+  return Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<2>>(
+      vreinterpret_f32_u32(vorr_u32(
+          vreinterpret_u32_f32(static_cast<float32x2_t>(abs(a))),
+          vand_u32(sign_mask,
+                   vreinterpret_u32_f32(static_cast<float32x2_t>(b))))));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<2>>
+    sqrt(Experimental::simd<
+         float, Experimental::simd_abi::neon_fixed_size<2>> const& a) {
+  return Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<2>>(
+      vsqrt_f32(static_cast<float32x2_t>(a)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    float, Experimental::simd_abi::neon_fixed_size<2>>
+fma(Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<2>> const&
+        a,
+    Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<2>> const&
+        b,
+    Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<2>> const&
+        c) {
+  return Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<2>>(
+      vfma_f32(static_cast<float32x2_t>(c), static_cast<float32x2_t>(b),
+               static_cast<float32x2_t>(a)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    float, Experimental::simd_abi::neon_fixed_size<2>>
+max(Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<2>> const&
+        a,
+    Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<2>> const&
+        b) {
+  return Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<2>>(
+      vmax_f32(static_cast<float32x2_t>(a), static_cast<float32x2_t>(b)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    float, Experimental::simd_abi::neon_fixed_size<2>>
+min(Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<2>> const&
+        a,
+    Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<2>> const&
+        b) {
+  return Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<2>>(
+      vmin_f32(static_cast<float32x2_t>(a), static_cast<float32x2_t>(b)));
+}
+
+namespace Experimental {
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<float, simd_abi::neon_fixed_size<2>>
+    condition(simd_mask<float, simd_abi::neon_fixed_size<2>> const& a,
+              simd<float, simd_abi::neon_fixed_size<2>> const& b,
+              simd<float, simd_abi::neon_fixed_size<2>> const& c) {
+  return simd<float, simd_abi::neon_fixed_size<2>>(
+      vbsl_f32(static_cast<uint32x2_t>(a), static_cast<float32x2_t>(b),
+               static_cast<float32x2_t>(c)));
+}
+
 template <>
 class simd<std::int32_t, simd_abi::neon_fixed_size<2>> {
   int32x2_t m_value;
@@ -502,7 +821,8 @@ class simd<std::int32_t, simd_abi::neon_fixed_size<2>> {
                 std::is_invocable_r_v<value_type, G,
                                       std::integral_constant<std::size_t, 0>>,
                 bool> = false>
-  KOKKOS_FORCEINLINE_FUNCTION simd(G&& gen) {
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd(
+      G&& gen) noexcept {
     m_value = vset_lane_s32(gen(std::integral_constant<std::size_t, 0>()),
                             m_value, 0);
     m_value = vset_lane_s32(gen(std::integral_constant<std::size_t, 1>()),
@@ -532,55 +852,116 @@ class simd<std::int32_t, simd_abi::neon_fixed_size<2>> {
       const {
     return m_value;
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator==(simd const& other) const {
-    return mask_type(vceq_s32(m_value, other.m_value));
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const
+      noexcept {
+    return simd(vneg_s32(m_value));
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        vsub_s32(static_cast<int32x2_t>(lhs), static_cast<int32x2_t>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        vadd_s32(static_cast<int32x2_t>(lhs), static_cast<int32x2_t>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator>(simd const& other) const {
-    return mask_type(vcgt_s32(m_value, other.m_value));
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator==(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(
+        vceq_s32(static_cast<int32x2_t>(lhs), static_cast<int32x2_t>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator>(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(
+        vcgt_s32(static_cast<int32x2_t>(lhs), static_cast<int32x2_t>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator<(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(
+        vclt_s32(static_cast<int32x2_t>(lhs), static_cast<int32x2_t>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator<=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(
+        vcle_s32(static_cast<int32x2_t>(lhs), static_cast<int32x2_t>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator<(simd const& other) const {
-    return mask_type(vclt_s32(m_value, other.m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator>=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(
+        vcge_s32(static_cast<int32x2_t>(lhs), static_cast<int32x2_t>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator<=(simd const& other) const {
-    return mask_type(vcle_s32(m_value, other.m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator!=(simd const& lhs, simd const& rhs) noexcept {
+    return !(lhs == rhs);
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator>=(simd const& other) const {
-    return mask_type(vcge_s32(m_value, other.m_value));
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>(
+      simd const& lhs, int rhs) noexcept {
+    return simd(vshl_s32(static_cast<int32x2_t>(lhs),
+                         vneg_s32(vmov_n_s32(std::int32_t(rhs)))));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(vshl_s32(static_cast<int32x2_t>(lhs),
+                         vneg_s32(static_cast<int32x2_t>(rhs))));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator!=(simd const& other) const {
-    return !((*this) == other);
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<(
+      simd const& lhs, int rhs) noexcept {
+    return simd(
+        vshl_s32(static_cast<int32x2_t>(lhs), vmov_n_s32(std::int32_t(rhs))));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        vshl_s32(static_cast<int32x2_t>(lhs), static_cast<int32x2_t>(rhs)));
   }
 };
 
+}  // namespace Experimental
+
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<std::int32_t, simd_abi::neon_fixed_size<2>>
-    operator-(simd<std::int32_t, simd_abi::neon_fixed_size<2>> const& a) {
-  return simd<std::int32_t, simd_abi::neon_fixed_size<2>>(
-      vneg_s32(static_cast<int32x2_t>(a)));
+    Experimental::simd<std::int32_t, Experimental::simd_abi::neon_fixed_size<2>>
+    abs(Experimental::simd<
+        std::int32_t, Experimental::simd_abi::neon_fixed_size<2>> const& a) {
+  return Experimental::simd<std::int32_t,
+                            Experimental::simd_abi::neon_fixed_size<2>>(
+      vabs_s32(static_cast<int32x2_t>(a)));
 }
 
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<std::int32_t, simd_abi::neon_fixed_size<2>>
-    operator-(simd<std::int32_t, simd_abi::neon_fixed_size<2>> const& lhs,
-              simd<std::int32_t, simd_abi::neon_fixed_size<2>> const& rhs) {
-  return simd<std::int32_t, simd_abi::neon_fixed_size<2>>(
-      vsub_s32(static_cast<int32x2_t>(lhs), static_cast<int32x2_t>(rhs)));
+    Experimental::simd<std::int32_t, Experimental::simd_abi::neon_fixed_size<2>>
+    floor(Experimental::simd<
+          std::int32_t, Experimental::simd_abi::neon_fixed_size<2>> const& a) {
+  return a;
 }
 
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<std::int32_t, simd_abi::neon_fixed_size<2>>
-    operator+(simd<std::int32_t, simd_abi::neon_fixed_size<2>> const& lhs,
-              simd<std::int32_t, simd_abi::neon_fixed_size<2>> const& rhs) {
-  return simd<std::int32_t, simd_abi::neon_fixed_size<2>>(
-      vadd_s32(static_cast<int32x2_t>(lhs), static_cast<int32x2_t>(rhs)));
+    Experimental::simd<std::int32_t, Experimental::simd_abi::neon_fixed_size<2>>
+    ceil(Experimental::simd<
+         std::int32_t, Experimental::simd_abi::neon_fixed_size<2>> const& a) {
+  return a;
 }
 
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<std::int32_t, Experimental::simd_abi::neon_fixed_size<2>>
+    round(Experimental::simd<
+          std::int32_t, Experimental::simd_abi::neon_fixed_size<2>> const& a) {
+  return a;
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<std::int32_t, Experimental::simd_abi::neon_fixed_size<2>>
+    trunc(Experimental::simd<
+          std::int32_t, Experimental::simd_abi::neon_fixed_size<2>> const& a) {
+  return a;
+}
+
+namespace Experimental {
+
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
     simd<std::int32_t, simd_abi::neon_fixed_size<2>>
     condition(simd_mask<std::int32_t, simd_abi::neon_fixed_size<2>> const& a,
@@ -640,7 +1021,8 @@ class simd<std::int64_t, simd_abi::neon_fixed_size<2>> {
                 std::is_invocable_r_v<value_type, G,
                                       std::integral_constant<std::size_t, 0>>,
                 bool> = false>
-  KOKKOS_FORCEINLINE_FUNCTION simd(G&& gen) {
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd(
+      G&& gen) noexcept {
     m_value = vsetq_lane_s64(gen(std::integral_constant<std::size_t, 0>()),
                              m_value, 0);
     m_value = vsetq_lane_s64(gen(std::integral_constant<std::size_t, 1>()),
@@ -670,55 +1052,116 @@ class simd<std::int64_t, simd_abi::neon_fixed_size<2>> {
       const {
     return m_value;
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator==(simd const& other) const {
-    return mask_type(vceqq_s64(m_value, other.m_value));
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const
+      noexcept {
+    return simd(vnegq_s64(m_value));
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        vsubq_s64(static_cast<int64x2_t>(lhs), static_cast<int64x2_t>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        vaddq_s64(static_cast<int64x2_t>(lhs), static_cast<int64x2_t>(rhs)));
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator==(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(
+        vceqq_s64(static_cast<int64x2_t>(lhs), static_cast<int64x2_t>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator>(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(
+        vcgtq_s64(static_cast<int64x2_t>(lhs), static_cast<int64x2_t>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator<(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(
+        vcltq_s64(static_cast<int64x2_t>(lhs), static_cast<int64x2_t>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator<=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(
+        vcleq_s64(static_cast<int64x2_t>(lhs), static_cast<int64x2_t>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator>(simd const& other) const {
-    return mask_type(vcgtq_s64(m_value, other.m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator>=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(
+        vcgeq_s64(static_cast<int64x2_t>(lhs), static_cast<int64x2_t>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator<(simd const& other) const {
-    return mask_type(vcltq_s64(m_value, other.m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator!=(simd const& lhs, simd const& rhs) noexcept {
+    return !(lhs == rhs);
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator<=(simd const& other) const {
-    return mask_type(vcleq_s64(m_value, other.m_value));
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>(
+      simd const& lhs, int rhs) noexcept {
+    return simd(vshlq_s64(static_cast<int64x2_t>(lhs),
+                          vnegq_s64(vmovq_n_s64(std::int64_t(rhs)))));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(vshlq_s64(static_cast<int64x2_t>(lhs),
+                          vnegq_s64(static_cast<int64x2_t>(rhs))));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator>=(simd const& other) const {
-    return mask_type(vcgeq_s64(m_value, other.m_value));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<(
+      simd const& lhs, int rhs) noexcept {
+    return simd(
+        vshlq_s64(static_cast<int64x2_t>(lhs), vmovq_n_s64(std::int64_t(rhs))));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator!=(simd const& other) const {
-    return !((*this) == other);
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        vshlq_s64(static_cast<int64x2_t>(lhs), static_cast<int64x2_t>(rhs)));
   }
 };
 
+}  // namespace Experimental
+
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<std::int64_t, simd_abi::neon_fixed_size<2>>
-    operator-(simd<std::int64_t, simd_abi::neon_fixed_size<2>> const& a) {
-  return simd<std::int64_t, simd_abi::neon_fixed_size<2>>(
-      vnegq_s64(static_cast<int64x2_t>(a)));
+    Experimental::simd<std::int64_t, Experimental::simd_abi::neon_fixed_size<2>>
+    abs(Experimental::simd<
+        std::int64_t, Experimental::simd_abi::neon_fixed_size<2>> const& a) {
+  return Experimental::simd<std::int64_t,
+                            Experimental::simd_abi::neon_fixed_size<2>>(
+      vabsq_s64(static_cast<int64x2_t>(a)));
 }
 
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<std::int64_t, simd_abi::neon_fixed_size<2>>
-    operator-(simd<std::int64_t, simd_abi::neon_fixed_size<2>> const& lhs,
-              simd<std::int64_t, simd_abi::neon_fixed_size<2>> const& rhs) {
-  return simd<std::int64_t, simd_abi::neon_fixed_size<2>>(
-      vsubq_s64(static_cast<int64x2_t>(lhs), static_cast<int64x2_t>(rhs)));
+    Experimental::simd<std::int64_t, Experimental::simd_abi::neon_fixed_size<2>>
+    floor(Experimental::simd<
+          std::int64_t, Experimental::simd_abi::neon_fixed_size<2>> const& a) {
+  return a;
 }
 
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<std::int64_t, simd_abi::neon_fixed_size<2>>
-    operator+(simd<std::int64_t, simd_abi::neon_fixed_size<2>> const& lhs,
-              simd<std::int64_t, simd_abi::neon_fixed_size<2>> const& rhs) {
-  return simd<std::int64_t, simd_abi::neon_fixed_size<2>>(
-      vaddq_s64(static_cast<int64x2_t>(lhs), static_cast<int64x2_t>(rhs)));
+    Experimental::simd<std::int64_t, Experimental::simd_abi::neon_fixed_size<2>>
+    ceil(Experimental::simd<
+         std::int64_t, Experimental::simd_abi::neon_fixed_size<2>> const& a) {
+  return a;
 }
 
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<std::int64_t, Experimental::simd_abi::neon_fixed_size<2>>
+    round(Experimental::simd<
+          std::int64_t, Experimental::simd_abi::neon_fixed_size<2>> const& a) {
+  return a;
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    Experimental::simd<std::int64_t, Experimental::simd_abi::neon_fixed_size<2>>
+    trunc(Experimental::simd<
+          std::int64_t, Experimental::simd_abi::neon_fixed_size<2>> const& a) {
+  return a;
+}
+
+namespace Experimental {
+
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
     simd<std::int64_t, simd_abi::neon_fixed_size<2>>
     condition(simd_mask<std::int64_t, simd_abi::neon_fixed_size<2>> const& a,
@@ -778,7 +1221,8 @@ class simd<std::uint64_t, simd_abi::neon_fixed_size<2>> {
                 std::is_invocable_r_v<value_type, G,
                                       std::integral_constant<std::size_t, 0>>,
                 bool> = false>
-  KOKKOS_FORCEINLINE_FUNCTION simd(G&& gen) {
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd(
+      G&& gen) noexcept {
     m_value = vsetq_lane_u64(gen(std::integral_constant<std::size_t, 0>()),
                              m_value, 0);
     m_value = vsetq_lane_u64(gen(std::integral_constant<std::size_t, 1>()),
@@ -798,51 +1242,68 @@ class simd<std::uint64_t, simd_abi::neon_fixed_size<2>> {
   operator[](std::size_t i) const {
     return reference(const_cast<simd*>(this)->m_value, int(i));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd
-  operator&(simd const& other) const {
-    return simd(vandq_u64(m_value, other.m_value));
-  }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd
-  operator|(simd const& other) const {
-    return simd(vorrq_u64(m_value, other.m_value));
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr,
+                                                       element_aligned_tag) {
+    m_value = vld1q_u64(ptr);
   }
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator uint64x2_t()
       const {
     return m_value;
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd
-  operator<<(unsigned int rhs) const {
-    return simd(vshlq_u64(m_value, vmovq_n_s64(std::int64_t(rhs))));
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        vsubq_u64(static_cast<uint64x2_t>(lhs), static_cast<uint64x2_t>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd
-  operator>>(unsigned int rhs) const {
-    return simd(vshlq_u64(m_value, vmovq_n_s64(-std::int64_t(rhs))));
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        vaddq_u64(static_cast<uint64x2_t>(lhs), static_cast<uint64x2_t>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator==(simd const& other) const {
-    return mask_type(vceqq_u64(m_value, other.m_value));
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator&(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        vandq_u64(static_cast<uint64x2_t>(lhs), static_cast<uint64x2_t>(rhs)));
   }
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
-  operator!=(simd const& other) const {
-    return !((*this) == other);
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator|(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(
+        vorrq_u64(static_cast<uint64x2_t>(lhs), static_cast<uint64x2_t>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator==(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(
+        vceqq_u64(static_cast<uint64x2_t>(lhs), static_cast<uint64x2_t>(rhs)));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type
+  operator!=(simd const& lhs, simd const& rhs) noexcept {
+    return !(lhs == rhs);
   }
-};
-
-[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<std::uint64_t, simd_abi::neon_fixed_size<2>>
-    operator-(simd<std::uint64_t, simd_abi::neon_fixed_size<2>> const& lhs,
-              simd<std::uint64_t, simd_abi::neon_fixed_size<2>> const& rhs) {
-  return simd<std::uint64_t, simd_abi::neon_fixed_size<2>>(
-      vsubq_u64(static_cast<uint64x2_t>(lhs), static_cast<uint64x2_t>(rhs)));
-}
 
-[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
-    simd<std::uint64_t, simd_abi::neon_fixed_size<2>>
-    operator+(simd<std::uint64_t, simd_abi::neon_fixed_size<2>> const& lhs,
-              simd<std::uint64_t, simd_abi::neon_fixed_size<2>> const& rhs) {
-  return simd<std::uint64_t, simd_abi::neon_fixed_size<2>>(
-      vaddq_u64(static_cast<uint64x2_t>(lhs), static_cast<uint64x2_t>(rhs)));
-}
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>(
+      simd const& lhs, int rhs) noexcept {
+    return simd(vshlq_u64(static_cast<uint64x2_t>(lhs),
+                          vnegq_s64(vmovq_n_s64(std::int64_t(rhs)))));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(vshlq_u64(
+        static_cast<uint64x2_t>(lhs),
+        vnegq_s64(vreinterpretq_s64_u64(static_cast<uint64x2_t>(rhs)))));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<(
+      simd const& lhs, int rhs) noexcept {
+    return simd(vshlq_u64(static_cast<uint64x2_t>(lhs),
+                          vmovq_n_s64(std::int64_t(rhs))));
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(vshlq_u64(static_cast<uint64x2_t>(lhs),
+                          vreinterpretq_s64_u64(static_cast<uint64x2_t>(rhs))));
+  }
+};
 
 KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
 simd<std::int32_t, simd_abi::neon_fixed_size<2>>::simd(
@@ -855,6 +1316,44 @@ simd<std::int64_t, simd_abi::neon_fixed_size<2>>::simd(
     simd<std::uint64_t, simd_abi::neon_fixed_size<2>> const& other)
     : m_value(vreinterpretq_s64_u64(static_cast<uint64x2_t>(other))) {}
 
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<std::uint64_t, simd_abi::neon_fixed_size<2>>
+    abs(simd<std::uint64_t, simd_abi::neon_fixed_size<2>> const& a) {
+  return a;
+}
+
+}  // namespace Experimental
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    std::uint64_t, Experimental::simd_abi::neon_fixed_size<2>>
+floor(Experimental::simd<std::uint64_t,
+                         Experimental::simd_abi::neon_fixed_size<2>> const& a) {
+  return a;
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    std::uint64_t, Experimental::simd_abi::neon_fixed_size<2>>
+ceil(Experimental::simd<std::uint64_t,
+                        Experimental::simd_abi::neon_fixed_size<2>> const& a) {
+  return a;
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    std::uint64_t, Experimental::simd_abi::neon_fixed_size<2>>
+round(Experimental::simd<std::uint64_t,
+                         Experimental::simd_abi::neon_fixed_size<2>> const& a) {
+  return a;
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd<
+    std::uint64_t, Experimental::simd_abi::neon_fixed_size<2>>
+trunc(Experimental::simd<std::uint64_t,
+                         Experimental::simd_abi::neon_fixed_size<2>> const& a) {
+  return a;
+}
+
+namespace Experimental {
+
 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
     simd<std::uint64_t, simd_abi::neon_fixed_size<2>>
     condition(simd_mask<std::uint64_t, simd_abi::neon_fixed_size<2>> const& a,
@@ -880,14 +1379,7 @@ class const_where_expression<simd_mask<double, simd_abi::neon_fixed_size<2>>,
  public:
   const_where_expression(mask_type const& mask_arg, value_type const& value_arg)
       : m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {}
-  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr mask_type const&
-  mask() const {
-    return m_mask;
-  }
-  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr value_type const&
-  value() const {
-    return m_value;
-  }
+
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
   void copy_to(double* mem, element_aligned_tag) const {
     if (m_mask[0]) mem[0] = m_value[0];
@@ -900,6 +1392,16 @@ class const_where_expression<simd_mask<double, simd_abi::neon_fixed_size<2>>,
     if (m_mask[0]) mem[index[0]] = m_value[0];
     if (m_mask[1]) mem[index[1]] = m_value[1];
   }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const&
+  impl_get_value() const {
+    return m_value;
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const&
+  impl_get_mask() const {
+    return m_mask;
+  }
 };
 
 template <>
@@ -940,6 +1442,84 @@ class where_expression<simd_mask<double, simd_abi::neon_fixed_size<2>>,
   }
 };
 
+template <>
+class const_where_expression<simd_mask<float, simd_abi::neon_fixed_size<2>>,
+                             simd<float, simd_abi::neon_fixed_size<2>>> {
+ public:
+  using abi_type   = simd_abi::neon_fixed_size<2>;
+  using value_type = simd<float, abi_type>;
+  using mask_type  = simd_mask<float, abi_type>;
+
+ protected:
+  value_type& m_value;
+  mask_type const& m_mask;
+
+ public:
+  const_where_expression(mask_type const& mask_arg, value_type const& value_arg)
+      : m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {}
+
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void copy_to(float* mem, element_aligned_tag) const {
+    if (m_mask[0]) mem[0] = m_value[0];
+    if (m_mask[1]) mem[1] = m_value[1];
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void scatter_to(
+      float* mem,
+      simd<std::int32_t, simd_abi::neon_fixed_size<2>> const& index) const {
+    if (m_mask[0]) mem[index[0]] = m_value[0];
+    if (m_mask[1]) mem[index[1]] = m_value[1];
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const&
+  impl_get_value() const {
+    return m_value;
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const&
+  impl_get_mask() const {
+    return m_mask;
+  }
+};
+
+template <>
+class where_expression<simd_mask<float, simd_abi::neon_fixed_size<2>>,
+                       simd<float, simd_abi::neon_fixed_size<2>>>
+    : public const_where_expression<
+          simd_mask<float, simd_abi::neon_fixed_size<2>>,
+          simd<float, simd_abi::neon_fixed_size<2>>> {
+ public:
+  where_expression(
+      simd_mask<float, simd_abi::neon_fixed_size<2>> const& mask_arg,
+      simd<float, simd_abi::neon_fixed_size<2>>& value_arg)
+      : const_where_expression(mask_arg, value_arg) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void copy_from(float const* mem, element_aligned_tag) {
+    if (m_mask[0]) m_value[0] = mem[0];
+    if (m_mask[1]) m_value[1] = mem[1];
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void gather_from(
+      float const* mem,
+      simd<std::int32_t, simd_abi::neon_fixed_size<2>> const& index) {
+    if (m_mask[0]) m_value[0] = mem[index[0]];
+    if (m_mask[1]) m_value[1] = mem[index[1]];
+  }
+  template <class U,
+            std::enable_if_t<std::is_convertible_v<
+                                 U, simd<float, simd_abi::neon_fixed_size<2>>>,
+                             bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(U&& x) {
+    auto const x_as_value_type =
+        static_cast<simd<float, simd_abi::neon_fixed_size<2>>>(
+            std::forward<U>(x));
+    m_value = static_cast<simd<float, simd_abi::neon_fixed_size<2>>>(
+        vbsl_f32(static_cast<uint32x2_t>(m_mask),
+                 static_cast<float32x2_t>(x_as_value_type),
+                 static_cast<float32x2_t>(m_value)));
+  }
+};
+
 template <>
 class const_where_expression<
     simd_mask<std::int32_t, simd_abi::neon_fixed_size<2>>,
@@ -956,19 +1536,29 @@ class const_where_expression<
  public:
   const_where_expression(mask_type const& mask_arg, value_type const& value_arg)
       : m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {}
-  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr mask_type const&
-  mask() const {
-    return m_mask;
-  }
-  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr value_type const&
-  value() const {
-    return m_value;
-  }
+
   KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
   void copy_to(std::int32_t* mem, element_aligned_tag) const {
     if (m_mask[0]) mem[0] = m_value[0];
     if (m_mask[1]) mem[1] = m_value[1];
   }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void scatter_to(
+      std::int32_t* mem,
+      simd<std::int32_t, simd_abi::neon_fixed_size<2>> const& index) const {
+    if (m_mask[0]) mem[index[0]] = m_value[0];
+    if (m_mask[1]) mem[index[1]] = m_value[1];
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const&
+  impl_get_value() const {
+    return m_value;
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const&
+  impl_get_mask() const {
+    return m_mask;
+  }
 };
 
 template <>
@@ -987,6 +1577,187 @@ class where_expression<simd_mask<std::int32_t, simd_abi::neon_fixed_size<2>>,
     if (m_mask[0]) m_value[0] = mem[0];
     if (m_mask[1]) m_value[1] = mem[1];
   }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void gather_from(
+      std::int32_t const* mem,
+      simd<std::int32_t, simd_abi::neon_fixed_size<2>> const& index) {
+    if (m_mask[0]) m_value[0] = mem[index[0]];
+    if (m_mask[1]) m_value[1] = mem[index[1]];
+  }
+  template <
+      class U,
+      std::enable_if_t<
+          std::is_convertible_v<U, simd<int32_t, simd_abi::neon_fixed_size<2>>>,
+          bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(U&& x) {
+    auto const x_as_value_type =
+        static_cast<simd<int32_t, simd_abi::neon_fixed_size<2>>>(
+            std::forward<U>(x));
+    m_value = static_cast<simd<int32_t, simd_abi::neon_fixed_size<2>>>(
+        vbsl_s32(static_cast<uint32x2_t>(m_mask),
+                 static_cast<int32x2_t>(x_as_value_type),
+                 static_cast<int32x2_t>(m_value)));
+  }
+};
+
+template <>
+class const_where_expression<
+    simd_mask<std::int64_t, simd_abi::neon_fixed_size<2>>,
+    simd<std::int64_t, simd_abi::neon_fixed_size<2>>> {
+ public:
+  using abi_type   = simd_abi::neon_fixed_size<2>;
+  using value_type = simd<std::int64_t, abi_type>;
+  using mask_type  = simd_mask<std::int64_t, abi_type>;
+
+ protected:
+  value_type& m_value;
+  mask_type const& m_mask;
+
+ public:
+  const_where_expression(mask_type const& mask_arg, value_type const& value_arg)
+      : m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {}
+
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void copy_to(std::int64_t* mem, element_aligned_tag) const {
+    if (m_mask[0]) mem[0] = m_value[0];
+    if (m_mask[1]) mem[1] = m_value[1];
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void scatter_to(
+      std::int64_t* mem,
+      simd<std::int32_t, simd_abi::neon_fixed_size<2>> const& index) const {
+    if (m_mask[0]) mem[index[0]] = m_value[0];
+    if (m_mask[1]) mem[index[1]] = m_value[1];
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const&
+  impl_get_value() const {
+    return m_value;
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const&
+  impl_get_mask() const {
+    return m_mask;
+  }
+};
+
+template <>
+class where_expression<simd_mask<std::int64_t, simd_abi::neon_fixed_size<2>>,
+                       simd<std::int64_t, simd_abi::neon_fixed_size<2>>>
+    : public const_where_expression<
+          simd_mask<std::int64_t, simd_abi::neon_fixed_size<2>>,
+          simd<std::int64_t, simd_abi::neon_fixed_size<2>>> {
+ public:
+  where_expression(
+      simd_mask<std::int64_t, simd_abi::neon_fixed_size<2>> const& mask_arg,
+      simd<std::int64_t, simd_abi::neon_fixed_size<2>>& value_arg)
+      : const_where_expression(mask_arg, value_arg) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void copy_from(std::int64_t const* mem, element_aligned_tag) {
+    if (m_mask[0]) m_value[0] = mem[0];
+    if (m_mask[1]) m_value[1] = mem[1];
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void gather_from(
+      std::int64_t const* mem,
+      simd<std::int32_t, simd_abi::neon_fixed_size<2>> const& index) {
+    if (m_mask[0]) m_value[0] = mem[index[0]];
+    if (m_mask[1]) m_value[1] = mem[index[1]];
+  }
+  template <
+      class U,
+      std::enable_if_t<std::is_convertible_v<
+                           U, simd<std::int64_t, simd_abi::neon_fixed_size<2>>>,
+                       bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(U&& x) {
+    auto const x_as_value_type =
+        static_cast<simd<std::int64_t, simd_abi::neon_fixed_size<2>>>(
+            std::forward<U>(x));
+    m_value = static_cast<simd<std::int64_t, simd_abi::neon_fixed_size<2>>>(
+        vbslq_s64(static_cast<uint64x2_t>(m_mask),
+                  static_cast<int64x2_t>(x_as_value_type),
+                  static_cast<int64x2_t>(m_value)));
+  }
+};
+
+template <>
+class const_where_expression<
+    simd_mask<std::uint64_t, simd_abi::neon_fixed_size<2>>,
+    simd<std::uint64_t, simd_abi::neon_fixed_size<2>>> {
+ public:
+  using abi_type   = simd_abi::neon_fixed_size<2>;
+  using value_type = simd<std::uint64_t, abi_type>;
+  using mask_type  = simd_mask<std::uint64_t, abi_type>;
+
+ protected:
+  value_type& m_value;
+  mask_type const& m_mask;
+
+ public:
+  const_where_expression(mask_type const& mask_arg, value_type const& value_arg)
+      : m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {}
+
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void copy_to(std::uint64_t* mem, element_aligned_tag) const {
+    if (m_mask[0]) mem[0] = m_value[0];
+    if (m_mask[1]) mem[1] = m_value[1];
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void scatter_to(
+      std::uint64_t* mem,
+      simd<std::int32_t, simd_abi::neon_fixed_size<2>> const& index) const {
+    if (m_mask[0]) mem[index[0]] = m_value[0];
+    if (m_mask[1]) mem[index[1]] = m_value[1];
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const&
+  impl_get_value() const {
+    return m_value;
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const&
+  impl_get_mask() const {
+    return m_mask;
+  }
+};
+
+template <>
+class where_expression<simd_mask<std::uint64_t, simd_abi::neon_fixed_size<2>>,
+                       simd<std::uint64_t, simd_abi::neon_fixed_size<2>>>
+    : public const_where_expression<
+          simd_mask<std::uint64_t, simd_abi::neon_fixed_size<2>>,
+          simd<std::uint64_t, simd_abi::neon_fixed_size<2>>> {
+ public:
+  where_expression(
+      simd_mask<std::uint64_t, simd_abi::neon_fixed_size<2>> const& mask_arg,
+      simd<std::uint64_t, simd_abi::neon_fixed_size<2>>& value_arg)
+      : const_where_expression(mask_arg, value_arg) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void copy_from(std::uint64_t const* mem, element_aligned_tag) {
+    if (m_mask[0]) m_value[0] = mem[0];
+    if (m_mask[1]) m_value[1] = mem[1];
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void gather_from(
+      std::uint64_t const* mem,
+      simd<std::int32_t, simd_abi::neon_fixed_size<2>> const& index) {
+    if (m_mask[0]) m_value[0] = mem[index[0]];
+    if (m_mask[1]) m_value[1] = mem[index[1]];
+  }
+  template <class U,
+            std::enable_if_t<
+                std::is_convertible_v<
+                    U, simd<std::uint64_t, simd_abi::neon_fixed_size<2>>>,
+                bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(U&& x) {
+    auto const x_as_value_type =
+        static_cast<simd<std::uint64_t, simd_abi::neon_fixed_size<2>>>(
+            std::forward<U>(x));
+    m_value = static_cast<simd<std::uint64_t, simd_abi::neon_fixed_size<2>>>(
+        vbslq_u64(static_cast<uint64x2_t>(m_mask),
+                  static_cast<uint64x2_t>(x_as_value_type),
+                  static_cast<uint64x2_t>(m_value)));
+  }
 };
 
 }  // namespace Experimental
diff --git a/packages/kokkos/simd/src/Kokkos_SIMD_Scalar.hpp b/packages/kokkos/simd/src/Kokkos_SIMD_Scalar.hpp
index e878eec296880d8383761e3449494b74e8b216e8..af7cb1e2c61689ba6d3b9df83f642d482e533b34 100644
--- a/packages/kokkos/simd/src/Kokkos_SIMD_Scalar.hpp
+++ b/packages/kokkos/simd/src/Kokkos_SIMD_Scalar.hpp
@@ -23,6 +23,11 @@
 
 #include <Kokkos_SIMD_Common.hpp>
 
+#ifdef KOKKOS_SIMD_COMMON_MATH_HPP
+#error \
+    "Kokkos_SIMD_Scalar.hpp must be included before Kokkos_SIMD_Common_Math.hpp!"
+#endif
+
 namespace Kokkos {
 namespace Experimental {
 
@@ -45,6 +50,13 @@ class simd_mask<T, simd_abi::scalar> {
   KOKKOS_FORCEINLINE_FUNCTION static constexpr std::size_t size() { return 1; }
   KOKKOS_FORCEINLINE_FUNCTION explicit simd_mask(value_type value)
       : m_value(value) {}
+  template <
+      class G,
+      std::enable_if_t<std::is_invocable_r_v<
+                           value_type, G, std::integral_constant<bool, false>>,
+                       bool> = false>
+  KOKKOS_FORCEINLINE_FUNCTION constexpr explicit simd_mask(G&& gen) noexcept
+      : m_value(gen(0)) {}
   template <class U>
   KOKKOS_FORCEINLINE_FUNCTION simd_mask(
       simd_mask<U, simd_abi::scalar> const& other)
@@ -106,113 +118,166 @@ class simd<T, simd_abi::scalar> {
                 std::is_invocable_r_v<value_type, G,
                                       std::integral_constant<std::size_t, 0>>,
                 bool> = false>
-  KOKKOS_FORCEINLINE_FUNCTION simd(G&& gen)
-      : m_value(gen(std::integral_constant<std::size_t, 0>())) {}
-  KOKKOS_FORCEINLINE_FUNCTION simd operator-() const { return simd(-m_value); }
-  KOKKOS_FORCEINLINE_FUNCTION simd operator>>(int rhs) const {
-    return simd(m_value >> rhs);
+  KOKKOS_FORCEINLINE_FUNCTION constexpr explicit simd(G&& gen) noexcept
+      : m_value(gen(0)) {}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr explicit operator T() const {
+    return m_value;
   }
-  KOKKOS_FORCEINLINE_FUNCTION simd
-  operator>>(simd<int, abi_type> const& rhs) const {
-    return simd(m_value >> static_cast<int>(rhs));
+  KOKKOS_FORCEINLINE_FUNCTION void copy_from(T const* ptr,
+                                             element_aligned_tag) {
+    m_value = *ptr;
   }
-  KOKKOS_FORCEINLINE_FUNCTION simd operator<<(int rhs) const {
-    return simd(m_value << rhs);
+  KOKKOS_FORCEINLINE_FUNCTION void copy_to(T* ptr, element_aligned_tag) const {
+    *ptr = m_value;
   }
-  KOKKOS_FORCEINLINE_FUNCTION simd
-  operator<<(simd<int, abi_type> const& rhs) const {
-    return simd(m_value << static_cast<int>(rhs));
+  KOKKOS_FORCEINLINE_FUNCTION reference operator[](std::size_t) {
+    return m_value;
   }
-  KOKKOS_FORCEINLINE_FUNCTION simd operator&(simd const& other) const {
-    return m_value & other.m_value;
+  KOKKOS_FORCEINLINE_FUNCTION value_type operator[](std::size_t) const {
+    return m_value;
   }
-  KOKKOS_FORCEINLINE_FUNCTION simd operator|(simd const& other) const {
-    return m_value | other.m_value;
+  KOKKOS_FORCEINLINE_FUNCTION simd operator-() const noexcept {
+    return simd(-m_value);
   }
-  KOKKOS_FORCEINLINE_FUNCTION constexpr explicit operator T() const {
-    return m_value;
+
+  [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION friend constexpr simd operator*(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(lhs.m_value * rhs.m_value);
   }
-  KOKKOS_FORCEINLINE_FUNCTION mask_type operator<(simd const& other) const {
-    return mask_type(m_value < other.m_value);
+  [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION friend constexpr simd operator/(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(lhs.m_value / rhs.m_value);
   }
-  KOKKOS_FORCEINLINE_FUNCTION mask_type operator>(simd const& other) const {
-    return mask_type(m_value > other.m_value);
+  [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION friend constexpr simd operator+(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(lhs.m_value + rhs.m_value);
   }
-  KOKKOS_FORCEINLINE_FUNCTION mask_type operator<=(simd const& other) const {
-    return mask_type(m_value <= other.m_value);
+  [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION friend constexpr simd operator-(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(lhs.m_value - rhs.m_value);
   }
-  KOKKOS_FORCEINLINE_FUNCTION mask_type operator>=(simd const& other) const {
-    return mask_type(m_value >= other.m_value);
+  [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION friend constexpr simd operator>>(
+      simd const& lhs, int rhs) noexcept {
+    return simd(lhs.m_value >> rhs);
   }
-  KOKKOS_FORCEINLINE_FUNCTION mask_type operator==(simd const& other) const {
-    return mask_type(m_value == other.m_value);
+  [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION friend constexpr simd operator>>(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(lhs.m_value >> rhs.m_value);
   }
-  KOKKOS_FORCEINLINE_FUNCTION mask_type operator!=(simd const& other) const {
-    return mask_type(m_value != other.m_value);
+  [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION friend constexpr simd operator<<(
+      simd const& lhs, int rhs) noexcept {
+    return simd(lhs.m_value << rhs);
   }
-  KOKKOS_FORCEINLINE_FUNCTION void copy_from(T const* ptr,
-                                             element_aligned_tag) {
-    m_value = *ptr;
+  [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION friend constexpr simd operator<<(
+      simd const& lhs, simd const& rhs) noexcept {
+    return simd(lhs.m_value << rhs.m_value);
   }
-  KOKKOS_FORCEINLINE_FUNCTION void copy_to(T* ptr, element_aligned_tag) const {
-    *ptr = m_value;
+  [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION friend constexpr simd operator&(
+      simd const& lhs, simd const& rhs) noexcept {
+    return lhs.m_value & rhs.m_value;
   }
-  KOKKOS_FORCEINLINE_FUNCTION reference operator[](std::size_t) {
-    return m_value;
+  [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION friend constexpr simd operator|(
+      simd const& lhs, simd const& rhs) noexcept {
+    return lhs.m_value | rhs.m_value;
   }
-  KOKKOS_FORCEINLINE_FUNCTION value_type operator[](std::size_t) const {
-    return m_value;
+  [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION friend constexpr mask_type
+  operator<(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(lhs.m_value < rhs.m_value);
+  }
+  [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION friend constexpr mask_type
+  operator>(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(lhs.m_value > rhs.m_value);
+  }
+  [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION friend constexpr mask_type
+  operator<=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(lhs.m_value <= rhs.m_value);
+  }
+  [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION friend constexpr mask_type
+  operator>=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(lhs.m_value >= rhs.m_value);
+  }
+  [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION friend constexpr mask_type
+  operator==(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(lhs.m_value == rhs.m_value);
+  }
+  [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION friend constexpr mask_type
+  operator!=(simd const& lhs, simd const& rhs) noexcept {
+    return mask_type(lhs.m_value != rhs.m_value);
   }
 };
 
-template <class T>
-[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION simd<T, simd_abi::scalar> operator*(
-    simd<T, simd_abi::scalar> const& lhs,
-    simd<T, simd_abi::scalar> const& rhs) {
-  return simd<T, simd_abi::scalar>(static_cast<T>(lhs) * static_cast<T>(rhs));
-}
+}  // namespace Experimental
 
 template <class T>
-[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION simd<T, simd_abi::scalar> operator/(
-    simd<T, simd_abi::scalar> const& lhs,
-    simd<T, simd_abi::scalar> const& rhs) {
-  return simd<T, simd_abi::scalar>(static_cast<T>(lhs) / static_cast<T>(rhs));
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION
+    Experimental::simd<T, Experimental::simd_abi::scalar>
+    abs(Experimental::simd<T, Experimental::simd_abi::scalar> const& a) {
+  if constexpr (std::is_signed_v<T>) {
+    return (a < 0 ? -a : a);
+  }
+  return a;
 }
 
-template <class T>
-[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION simd<T, simd_abi::scalar> operator+(
-    simd<T, simd_abi::scalar> const& lhs,
-    simd<T, simd_abi::scalar> const& rhs) {
-  return simd<T, simd_abi::scalar>(static_cast<T>(lhs) + static_cast<T>(rhs));
-}
+template <typename T>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto floor(
+    Experimental::simd<T, Experimental::simd_abi::scalar> const& a) {
+  using data_type = std::conditional_t<std::is_floating_point_v<T>, T, double>;
+  return Experimental::simd<data_type, Experimental::simd_abi::scalar>(
+      Kokkos::floor(static_cast<data_type>(a[0])));
+};
 
-template <class T>
-[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION simd<T, simd_abi::scalar> operator-(
-    simd<T, simd_abi::scalar> const& lhs,
-    simd<T, simd_abi::scalar> const& rhs) {
-  return simd<T, simd_abi::scalar>(static_cast<T>(lhs) - static_cast<T>(rhs));
-}
+template <typename T>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto ceil(
+    Experimental::simd<T, Experimental::simd_abi::scalar> const& a) {
+  using data_type = std::conditional_t<std::is_floating_point_v<T>, T, double>;
+  return Experimental::simd<data_type, Experimental::simd_abi::scalar>(
+      Kokkos::ceil(static_cast<data_type>(a[0])));
+};
+
+template <typename T>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto round(
+    Experimental::simd<T, Experimental::simd_abi::scalar> const& a) {
+  using data_type = std::conditional_t<std::is_floating_point_v<T>, T, double>;
+  return Experimental::simd<data_type, Experimental::simd_abi::scalar>(
+      Experimental::round_half_to_nearest_even(static_cast<data_type>(a[0])));
+};
+
+template <typename T>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto trunc(
+    Experimental::simd<T, Experimental::simd_abi::scalar> const& a) {
+  using data_type = std::conditional_t<std::is_floating_point_v<T>, T, double>;
+  return Experimental::simd<data_type, Experimental::simd_abi::scalar>(
+      Kokkos::trunc(static_cast<data_type>(a[0])));
+};
 
 template <class T>
-[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION simd<T, simd_abi::scalar> abs(
-    simd<T, simd_abi::scalar> const& a) {
-  return simd<T, simd_abi::scalar>(std::abs(static_cast<T>(a)));
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION
+    Experimental::simd<T, Experimental::simd_abi::scalar>
+    sqrt(Experimental::simd<T, Experimental::simd_abi::scalar> const& a) {
+  return Experimental::simd<T, Experimental::simd_abi::scalar>(
+      std::sqrt(static_cast<T>(a)));
 }
 
 template <class T>
-[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION simd<T, simd_abi::scalar> sqrt(
-    simd<T, simd_abi::scalar> const& a) {
-  return simd<T, simd_abi::scalar>(std::sqrt(static_cast<T>(a)));
+KOKKOS_FORCEINLINE_FUNCTION
+    Experimental::simd<T, Experimental::simd_abi::scalar>
+    fma(Experimental::simd<T, Experimental::simd_abi::scalar> const& x,
+        Experimental::simd<T, Experimental::simd_abi::scalar> const& y,
+        Experimental::simd<T, Experimental::simd_abi::scalar> const& z) {
+  return Experimental::simd<T, Experimental::simd_abi::scalar>(
+      (static_cast<T>(x) * static_cast<T>(y)) + static_cast<T>(z));
 }
 
 template <class T>
-KOKKOS_FORCEINLINE_FUNCTION simd<T, simd_abi::scalar> fma(
-    simd<T, simd_abi::scalar> const& x, simd<T, simd_abi::scalar> const& y,
-    simd<T, simd_abi::scalar> const& z) {
-  return simd<T, simd_abi::scalar>((static_cast<T>(x) * static_cast<T>(y)) +
-                                   static_cast<T>(z));
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION
+    Experimental::simd<T, Experimental::simd_abi::scalar>
+    copysign(Experimental::simd<T, Experimental::simd_abi::scalar> const& a,
+             Experimental::simd<T, Experimental::simd_abi::scalar> const& b) {
+  return std::copysign(static_cast<T>(a), static_cast<T>(b));
 }
 
+namespace Experimental {
+
 template <class T>
 KOKKOS_FORCEINLINE_FUNCTION simd<T, simd_abi::scalar> condition(
     desul::Impl::dont_deduce_this_parameter_t<
@@ -222,12 +287,6 @@ KOKKOS_FORCEINLINE_FUNCTION simd<T, simd_abi::scalar> condition(
                                                         : static_cast<T>(c));
 }
 
-template <class T, class Abi>
-[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION simd<T, Abi> copysign(
-    simd<T, Abi> const& a, simd<T, Abi> const& b) {
-  return std::copysign(static_cast<T>(a), static_cast<T>(b));
-}
-
 template <class T>
 class const_where_expression<simd_mask<T, simd_abi::scalar>,
                              simd<T, simd_abi::scalar>> {
@@ -244,10 +303,7 @@ class const_where_expression<simd_mask<T, simd_abi::scalar>,
   KOKKOS_FORCEINLINE_FUNCTION
   const_where_expression(mask_type const& mask_arg, value_type const& value_arg)
       : m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {}
-  KOKKOS_FORCEINLINE_FUNCTION
-  mask_type const& mask() const { return m_mask; }
-  KOKKOS_FORCEINLINE_FUNCTION
-  value_type const& value() const { return m_value; }
+
   KOKKOS_FORCEINLINE_FUNCTION
   void copy_to(T* mem, element_aligned_tag) const {
     if (static_cast<bool>(m_mask)) *mem = static_cast<T>(m_value);
@@ -258,6 +314,16 @@ class const_where_expression<simd_mask<T, simd_abi::scalar>,
     if (static_cast<bool>(m_mask))
       mem[static_cast<Integral>(index)] = static_cast<T>(m_value);
   }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const&
+  impl_get_value() const {
+    return m_value;
+  }
+
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const&
+  impl_get_mask() const {
+    return m_mask;
+  }
 };
 
 template <class T>
@@ -294,29 +360,70 @@ class where_expression<simd_mask<T, simd_abi::scalar>,
   }
 };
 
+template <class T>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION
+    where_expression<simd_mask<T, Kokkos::Experimental::simd_abi::scalar>,
+                     simd<T, Kokkos::Experimental::simd_abi::scalar>>
+    where(typename simd<
+              T, Kokkos::Experimental::simd_abi::scalar>::mask_type const& mask,
+          simd<T, Kokkos::Experimental::simd_abi::scalar>& value) {
+  return where_expression(mask, value);
+}
+
+template <class T>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION
+    const_where_expression<simd_mask<T, Kokkos::Experimental::simd_abi::scalar>,
+                           simd<T, Kokkos::Experimental::simd_abi::scalar>>
+    where(typename simd<
+              T, Kokkos::Experimental::simd_abi::scalar>::mask_type const& mask,
+          simd<T, Kokkos::Experimental::simd_abi::scalar> const& value) {
+  return const_where_expression(mask, value);
+}
+
+template <class T>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION bool all_of(
+    simd_mask<T, Kokkos::Experimental::simd_abi::scalar> const& a) {
+  return a == simd_mask<T, Kokkos::Experimental::simd_abi::scalar>(true);
+}
+
+template <class T>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION bool any_of(
+    simd_mask<T, Kokkos::Experimental::simd_abi::scalar> const& a) {
+  return a != simd_mask<T, Kokkos::Experimental::simd_abi::scalar>(false);
+}
+
+template <class T>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION bool none_of(
+    simd_mask<T, Kokkos::Experimental::simd_abi::scalar> const& a) {
+  return a == simd_mask<T, Kokkos::Experimental::simd_abi::scalar>(false);
+}
+
 template <class T>
 [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION T
 reduce(const_where_expression<simd_mask<T, simd_abi::scalar>,
                               simd<T, simd_abi::scalar>> const& x,
        T identity_element, std::plus<>) {
-  return static_cast<bool>(x.mask()) ? static_cast<T>(x.value())
-                                     : identity_element;
+  return static_cast<bool>(x.impl_get_mask())
+             ? static_cast<T>(x.impl_get_value())
+             : identity_element;
 }
 
 template <class T>
 [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION T
 hmax(const_where_expression<simd_mask<T, simd_abi::scalar>,
                             simd<T, simd_abi::scalar>> const& x) {
-  return static_cast<bool>(x.mask()) ? static_cast<T>(x.value())
-                                     : Kokkos::reduction_identity<T>::max();
+  return static_cast<bool>(x.impl_get_mask())
+             ? static_cast<T>(x.impl_get_value())
+             : Kokkos::reduction_identity<T>::max();
 }
 
 template <class T>
 [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION T
 hmin(const_where_expression<simd_mask<T, simd_abi::scalar>,
                             simd<T, simd_abi::scalar>> const& x) {
-  return static_cast<bool>(x.mask()) ? static_cast<T>(x.value())
-                                     : Kokkos::reduction_identity<T>::min();
+  return static_cast<bool>(x.impl_get_mask())
+             ? static_cast<T>(x.impl_get_value())
+             : Kokkos::reduction_identity<T>::min();
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/simd/unit_tests/CMakeLists.txt b/packages/kokkos/simd/unit_tests/CMakeLists.txt
index f4de8624187ddf9878259d80a7b413ac8453a881..75d557e8b525835a0fb8596bac4e683e55254102 100644
--- a/packages/kokkos/simd/unit_tests/CMakeLists.txt
+++ b/packages/kokkos/simd/unit_tests/CMakeLists.txt
@@ -1,3 +1,5 @@
+KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/simd/unit_tests/include)
+
 KOKKOS_ADD_EXECUTABLE_AND_TEST(
   UnitTest_SIMD
   SOURCES
diff --git a/packages/kokkos/simd/unit_tests/TestSIMD.cpp b/packages/kokkos/simd/unit_tests/TestSIMD.cpp
index 92c77033b9a84eecdb0c0782ae86619ea1272318..61c076e82466da2f31f2739383a42c7515d30594 100644
--- a/packages/kokkos/simd/unit_tests/TestSIMD.cpp
+++ b/packages/kokkos/simd/unit_tests/TestSIMD.cpp
@@ -14,504 +14,10 @@
 //
 //@HEADER
 
-#include <gtest/gtest.h>
-
-#include <Kokkos_SIMD.hpp>
-
-class gtest_checker {
- public:
-  void truth(bool x) const { EXPECT_TRUE(x); }
-  template <class T>
-  void equality(T const& a, T const& b) const {
-    EXPECT_EQ(a, b);
-  }
-};
-
-class kokkos_checker {
- public:
-  KOKKOS_INLINE_FUNCTION void truth(bool x) const {
-    if (!x) Kokkos::abort("SIMD unit test truth condition failed on device");
-  }
-  template <class T>
-  KOKKOS_INLINE_FUNCTION void equality(T const& a, T const& b) const {
-    if (a != b)
-      Kokkos::abort("SIMD unit test equality condition failed on device");
-  }
-};
-
-template <class T, class Abi>
-inline void host_check_equality(
-    Kokkos::Experimental::simd<T, Abi> const& expected_result,
-    Kokkos::Experimental::simd<T, Abi> const& computed_result,
-    std::size_t nlanes) {
-  gtest_checker checker;
-  for (std::size_t i = 0; i < nlanes; ++i) {
-    checker.equality(expected_result[i], computed_result[i]);
-  }
-  using mask_type = typename Kokkos::Experimental::simd<T, Abi>::mask_type;
-  mask_type mask(false);
-  for (std::size_t i = 0; i < nlanes; ++i) {
-    mask[i] = true;
-  }
-  checker.equality((expected_result == computed_result) && mask, mask);
-}
-
-template <class T, class Abi>
-KOKKOS_INLINE_FUNCTION void device_check_equality(
-    Kokkos::Experimental::simd<T, Abi> const& expected_result,
-    Kokkos::Experimental::simd<T, Abi> const& computed_result,
-    std::size_t nlanes) {
-  kokkos_checker checker;
-  for (std::size_t i = 0; i < nlanes; ++i) {
-    checker.equality(expected_result[i], computed_result[i]);
-  }
-  using mask_type = typename Kokkos::Experimental::simd<T, Abi>::mask_type;
-  mask_type mask(false);
-  for (std::size_t i = 0; i < nlanes; ++i) {
-    mask[i] = true;
-  }
-  checker.equality((expected_result == computed_result) && mask, mask);
-}
-
-class load_element_aligned {
- public:
-  template <class T, class Abi>
-  bool host_load(T const* mem, std::size_t n,
-                 Kokkos::Experimental::simd<T, Abi>& result) const {
-    if (n < result.size()) return false;
-    result.copy_from(mem, Kokkos::Experimental::element_aligned_tag());
-    return true;
-  }
-  template <class T, class Abi>
-  KOKKOS_INLINE_FUNCTION bool device_load(
-      T const* mem, std::size_t n,
-      Kokkos::Experimental::simd<T, Abi>& result) const {
-    if (n < result.size()) return false;
-    result.copy_from(mem, Kokkos::Experimental::element_aligned_tag());
-    return true;
-  }
-};
-
-class load_masked {
- public:
-  template <class T, class Abi>
-  bool host_load(T const* mem, std::size_t n,
-                 Kokkos::Experimental::simd<T, Abi>& result) const {
-    using mask_type = typename Kokkos::Experimental::simd<T, Abi>::mask_type;
-    mask_type mask(false);
-    for (std::size_t i = 0; i < n; ++i) {
-      mask[i] = true;
-    }
-    where(mask, result)
-        .copy_from(mem, Kokkos::Experimental::element_aligned_tag());
-    where(!mask, result) = 0;
-    return true;
-  }
-  template <class T, class Abi>
-  KOKKOS_INLINE_FUNCTION bool device_load(
-      T const* mem, std::size_t n,
-      Kokkos::Experimental::simd<T, Abi>& result) const {
-    using mask_type = typename Kokkos::Experimental::simd<T, Abi>::mask_type;
-    mask_type mask(false);
-    for (std::size_t i = 0; i < n; ++i) {
-      mask[i] = true;
-    }
-    where(mask, result)
-        .copy_from(mem, Kokkos::Experimental::element_aligned_tag());
-    where(!mask, result) = T(0);
-    return true;
-  }
-};
-
-class load_as_scalars {
- public:
-  template <class T, class Abi>
-  bool host_load(T const* mem, std::size_t n,
-                 Kokkos::Experimental::simd<T, Abi>& result) const {
-    for (std::size_t i = 0; i < n; ++i) {
-      result[i] = mem[i];
-    }
-    for (std::size_t i = n; i < result.size(); ++i) {
-      result[i] = T(0);
-    }
-    return true;
-  }
-  template <class T, class Abi>
-  KOKKOS_INLINE_FUNCTION bool device_load(
-      T const* mem, std::size_t n,
-      Kokkos::Experimental::simd<T, Abi>& result) const {
-    for (std::size_t i = 0; i < n; ++i) {
-      result[i] = mem[i];
-    }
-    for (std::size_t i = n; i < result.size(); ++i) {
-      result[i] = T(0);
-    }
-    return true;
-  }
-};
-
-template <class Abi, class Loader, class BinaryOp, class T>
-void host_check_binary_op_one_loader(BinaryOp binary_op, std::size_t n,
-                                     T const* first_args,
-                                     T const* second_args) {
-  Loader loader;
-  using simd_type             = Kokkos::Experimental::simd<T, Abi>;
-  std::size_t constexpr width = simd_type::size();
-  for (std::size_t i = 0; i < n; i += width) {
-    std::size_t const nremaining = n - i;
-    std::size_t const nlanes     = Kokkos::min(nremaining, width);
-    simd_type first_arg;
-    bool const loaded_first_arg =
-        loader.host_load(first_args + i, nlanes, first_arg);
-    simd_type second_arg;
-    bool const loaded_second_arg =
-        loader.host_load(second_args + i, nlanes, second_arg);
-    if (!(loaded_first_arg && loaded_second_arg)) continue;
-    simd_type expected_result;
-    for (std::size_t lane = 0; lane < nlanes; ++lane) {
-      expected_result[lane] =
-          binary_op.on_host(T(first_arg[lane]), T(second_arg[lane]));
-    }
-    simd_type const computed_result = binary_op.on_host(first_arg, second_arg);
-    host_check_equality(expected_result, computed_result, nlanes);
-  }
-}
-
-template <class Abi, class Loader, class BinaryOp, class T>
-KOKKOS_INLINE_FUNCTION void device_check_binary_op_one_loader(
-    BinaryOp binary_op, std::size_t n, T const* first_args,
-    T const* second_args) {
-  Loader loader;
-  using simd_type             = Kokkos::Experimental::simd<T, Abi>;
-  std::size_t constexpr width = simd_type::size();
-  for (std::size_t i = 0; i < n; i += width) {
-    std::size_t const nremaining = n - i;
-    std::size_t const nlanes     = Kokkos::min(nremaining, width);
-    simd_type first_arg;
-    bool const loaded_first_arg =
-        loader.device_load(first_args + i, nlanes, first_arg);
-    simd_type second_arg;
-    bool const loaded_second_arg =
-        loader.device_load(second_args + i, nlanes, second_arg);
-    if (!(loaded_first_arg && loaded_second_arg)) continue;
-    simd_type expected_result;
-    for (std::size_t lane = 0; lane < nlanes; ++lane) {
-      expected_result[lane] =
-          binary_op.on_device(first_arg[lane], second_arg[lane]);
-    }
-    simd_type const computed_result =
-        binary_op.on_device(first_arg, second_arg);
-    device_check_equality(expected_result, computed_result, nlanes);
-  }
-}
-
-template <class Abi, class BinaryOp, class T>
-inline void host_check_binary_op_all_loaders(BinaryOp binary_op, std::size_t n,
-                                             T const* first_args,
-                                             T const* second_args) {
-  host_check_binary_op_one_loader<Abi, load_element_aligned>(
-      binary_op, n, first_args, second_args);
-  host_check_binary_op_one_loader<Abi, load_masked>(binary_op, n, first_args,
-                                                    second_args);
-  host_check_binary_op_one_loader<Abi, load_as_scalars>(
-      binary_op, n, first_args, second_args);
-}
-
-template <class Abi, class BinaryOp, class T>
-KOKKOS_INLINE_FUNCTION void device_check_binary_op_all_loaders(
-    BinaryOp binary_op, std::size_t n, T const* first_args,
-    T const* second_args) {
-  device_check_binary_op_one_loader<Abi, load_element_aligned>(
-      binary_op, n, first_args, second_args);
-  device_check_binary_op_one_loader<Abi, load_masked>(binary_op, n, first_args,
-                                                      second_args);
-  device_check_binary_op_one_loader<Abi, load_as_scalars>(
-      binary_op, n, first_args, second_args);
-}
-
-class plus {
- public:
-  template <class T>
-  auto on_host(T const& a, T const& b) const {
-    return a + b;
-  }
-  template <class T>
-  KOKKOS_INLINE_FUNCTION auto on_device(T const& a, T const& b) const {
-    return a + b;
-  }
-};
-
-class minus {
- public:
-  template <class T>
-  auto on_host(T const& a, T const& b) const {
-    return a - b;
-  }
-  template <class T>
-  KOKKOS_INLINE_FUNCTION auto on_device(T const& a, T const& b) const {
-    return a - b;
-  }
-};
-
-class multiplies {
- public:
-  template <class T>
-  auto on_host(T const& a, T const& b) const {
-    return a * b;
-  }
-  template <class T>
-  KOKKOS_INLINE_FUNCTION auto on_device(T const& a, T const& b) const {
-    return a * b;
-  }
-};
-
-class divides {
- public:
-  template <class T>
-  auto on_host(T const& a, T const& b) const {
-    return a / b;
-  }
-  template <class T>
-  KOKKOS_INLINE_FUNCTION auto on_device(T const& a, T const& b) const {
-    return a / b;
-  }
-};
-
-template <class Abi>
-inline void host_check_math_ops() {
-  std::size_t constexpr n     = 11;
-  double const first_args[n]  = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2};
-  double const second_args[n] = {1, 2, 1, 1, 1, -3, -2, 1, 13, -3, -2};
-  host_check_binary_op_all_loaders<Abi>(plus(), n, first_args, second_args);
-  host_check_binary_op_all_loaders<Abi>(minus(), n, first_args, second_args);
-  host_check_binary_op_all_loaders<Abi>(multiplies(), n, first_args,
-                                        second_args);
-  host_check_binary_op_all_loaders<Abi>(divides(), n, first_args, second_args);
-}
-
-template <class Abi>
-inline void host_check_mask_ops() {
-  using mask_type = Kokkos::Experimental::simd_mask<double, Abi>;
-  EXPECT_FALSE(none_of(mask_type(true)));
-  EXPECT_TRUE(none_of(mask_type(false)));
-  EXPECT_TRUE(all_of(mask_type(true)));
-  EXPECT_FALSE(all_of(mask_type(false)));
-}
-
-template <class Abi>
-inline void host_check_conversions() {
-  {
-    auto a = Kokkos::Experimental::simd<std::uint64_t, Abi>(1);
-    auto b = Kokkos::Experimental::simd<std::int64_t, Abi>(a);
-    EXPECT_TRUE(all_of(b == decltype(b)(1)));
-  }
-  {
-    auto a = Kokkos::Experimental::simd<std::int32_t, Abi>(1);
-    auto b = Kokkos::Experimental::simd<std::uint64_t, Abi>(a);
-    EXPECT_TRUE(all_of(b == decltype(b)(1)));
-  }
-  {
-    auto a = Kokkos::Experimental::simd<std::uint64_t, Abi>(1);
-    auto b = Kokkos::Experimental::simd<std::int32_t, Abi>(a);
-    EXPECT_TRUE(all_of(b == decltype(b)(1)));
-  }
-  {
-    auto a = Kokkos::Experimental::simd_mask<double, Abi>(true);
-    auto b = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(a);
-    EXPECT_TRUE(b == decltype(b)(true));
-  }
-  {
-    auto a = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(true);
-    auto b = Kokkos::Experimental::simd_mask<std::uint64_t, Abi>(a);
-    EXPECT_TRUE(b == decltype(b)(true));
-  }
-  {
-    auto a = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(true);
-    auto b = Kokkos::Experimental::simd_mask<std::int64_t, Abi>(a);
-    EXPECT_TRUE(b == decltype(b)(true));
-  }
-  {
-    auto a = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(true);
-    auto b = Kokkos::Experimental::simd_mask<double, Abi>(a);
-    EXPECT_TRUE(b == decltype(b)(true));
-  }
-}
-
-template <class Abi>
-inline void host_check_shifts() {
-  auto a = Kokkos::Experimental::simd<std::uint64_t, Abi>(8);
-  auto b = a >> 1;
-  EXPECT_TRUE(all_of(b == decltype(b)(4)));
-}
-
-template <class Abi>
-inline void host_check_condition() {
-  auto a = Kokkos::Experimental::condition(
-      Kokkos::Experimental::simd<std::int32_t, Abi>(1) > 0,
-      Kokkos::Experimental::simd<std::uint64_t, Abi>(16),
-      Kokkos::Experimental::simd<std::uint64_t, Abi>(20));
-  EXPECT_TRUE(all_of(a == decltype(a)(16)));
-}
-
-template <class Abi>
-KOKKOS_INLINE_FUNCTION void device_check_math_ops() {
-  std::size_t constexpr n     = 11;
-  double const first_args[n]  = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2};
-  double const second_args[n] = {1, 2, 1, 1, 1, -3, -2, 1, 13, -3, -2};
-  device_check_binary_op_all_loaders<Abi>(plus(), n, first_args, second_args);
-  device_check_binary_op_all_loaders<Abi>(minus(), n, first_args, second_args);
-  device_check_binary_op_all_loaders<Abi>(multiplies(), n, first_args,
-                                          second_args);
-  device_check_binary_op_all_loaders<Abi>(divides(), n, first_args,
-                                          second_args);
-}
-
-template <class Abi>
-KOKKOS_INLINE_FUNCTION void device_check_mask_ops() {
-  using mask_type = Kokkos::Experimental::simd_mask<double, Abi>;
-  kokkos_checker checker;
-  checker.truth(!none_of(mask_type(true)));
-  checker.truth(none_of(mask_type(false)));
-  checker.truth(all_of(mask_type(true)));
-  checker.truth(!all_of(mask_type(false)));
-}
-
-template <class Abi>
-KOKKOS_INLINE_FUNCTION void device_check_conversions() {
-  kokkos_checker checker;
-  {
-    auto a = Kokkos::Experimental::simd<std::uint64_t, Abi>(1);
-    auto b = Kokkos::Experimental::simd<std::int64_t, Abi>(a);
-    checker.truth(all_of(b == decltype(b)(1)));
-  }
-  {
-    auto a = Kokkos::Experimental::simd<std::int32_t, Abi>(1);
-    auto b = Kokkos::Experimental::simd<std::uint64_t, Abi>(a);
-    checker.truth(all_of(b == decltype(b)(1)));
-  }
-  {
-    auto a = Kokkos::Experimental::simd<std::uint64_t, Abi>(1);
-    auto b = Kokkos::Experimental::simd<std::int32_t, Abi>(a);
-    checker.truth(all_of(b == decltype(b)(1)));
-  }
-  {
-    auto a = Kokkos::Experimental::simd_mask<double, Abi>(true);
-    auto b = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(a);
-    checker.truth(b == decltype(b)(true));
-  }
-  {
-    auto a = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(true);
-    auto b = Kokkos::Experimental::simd_mask<std::uint64_t, Abi>(a);
-    checker.truth(b == decltype(b)(true));
-  }
-  {
-    auto a = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(true);
-    auto b = Kokkos::Experimental::simd_mask<std::int64_t, Abi>(a);
-    checker.truth(b == decltype(b)(true));
-  }
-  {
-    auto a = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(true);
-    auto b = Kokkos::Experimental::simd_mask<double, Abi>(a);
-    checker.truth(b == decltype(b)(true));
-  }
-}
-
-template <class Abi>
-KOKKOS_INLINE_FUNCTION void device_check_shifts() {
-  kokkos_checker checker;
-  auto a = Kokkos::Experimental::simd<std::uint64_t, Abi>(8);
-  auto b = a >> 1;
-  checker.truth(all_of(b == decltype(b)(4)));
-}
-
-template <class Abi>
-KOKKOS_INLINE_FUNCTION void device_check_condition() {
-  kokkos_checker checker;
-  auto a = Kokkos::Experimental::condition(
-      Kokkos::Experimental::simd<std::int32_t, Abi>(1) > 0,
-      Kokkos::Experimental::simd<std::uint64_t, Abi>(16),
-      Kokkos::Experimental::simd<std::uint64_t, Abi>(20));
-  checker.truth(all_of(a == decltype(a)(16)));
-}
-
-template <class Abi>
-inline void host_check_abi() {
-  host_check_math_ops<Abi>();
-  host_check_mask_ops<Abi>();
-  host_check_conversions<Abi>();
-  host_check_shifts<Abi>();
-  host_check_condition<Abi>();
-}
-
-template <class Abi>
-KOKKOS_INLINE_FUNCTION void device_check_abi() {
-  device_check_math_ops<Abi>();
-  device_check_mask_ops<Abi>();
-  device_check_conversions<Abi>();
-  device_check_shifts<Abi>();
-  device_check_condition<Abi>();
-}
-
-inline void host_check_abis(Kokkos::Experimental::Impl::abi_set<>) {}
-
-KOKKOS_INLINE_FUNCTION void device_check_abis(
-    Kokkos::Experimental::Impl::abi_set<>) {}
-
-template <class FirstAbi, class... RestAbis>
-inline void host_check_abis(
-    Kokkos::Experimental::Impl::abi_set<FirstAbi, RestAbis...>) {
-  host_check_abi<FirstAbi>();
-  host_check_abis(Kokkos::Experimental::Impl::abi_set<RestAbis...>());
-}
-
-template <class FirstAbi, class... RestAbis>
-KOKKOS_INLINE_FUNCTION void device_check_abis(
-    Kokkos::Experimental::Impl::abi_set<FirstAbi, RestAbis...>) {
-  device_check_abi<FirstAbi>();
-  device_check_abis(Kokkos::Experimental::Impl::abi_set<RestAbis...>());
-}
-
-TEST(simd, host) {
-  host_check_abis(Kokkos::Experimental::Impl::host_abi_set());
-}
-
-class simd_device_functor {
- public:
-  KOKKOS_INLINE_FUNCTION void operator()(int) const {
-    device_check_abis(Kokkos::Experimental::Impl::device_abi_set());
-  }
-};
-
-TEST(simd, device) {
-  Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::IndexType<int>>(0, 1),
-                       simd_device_functor());
-}
-
-TEST(simd, test_size) {
-#if defined(KOKKOS_ARCH_AVX512XEON)
-  constexpr auto width = 8;
-  using Abi = Kokkos::Experimental::simd_abi::avx512_fixed_size<width>;
-  static_assert(width ==
-                Kokkos::Experimental::simd<std::uint32_t, Abi>::size());
-
-#elif defined(KOKKOS_ARCH_AVX2)
-  constexpr auto width = 4;
-  using Abi            = Kokkos::Experimental::simd_abi::avx2_fixed_size<width>;
-
-#elif defined(__ARM_NEON)
-  constexpr auto width = 2;
-  using Abi            = Kokkos::Experimental::simd_abi::neon_fixed_size<width>;
-
-#else
-  constexpr auto width = 1;
-  using Abi            = Kokkos::Experimental::simd_abi::scalar;
-  static_assert(width ==
-                Kokkos::Experimental::simd<std::uint32_t, Abi>::size());
-#endif
-
-  static_assert(width == Kokkos::Experimental::simd<double, Abi>::size());
-  static_assert(width == Kokkos::Experimental::simd<std::int64_t, Abi>::size());
-  static_assert(width ==
-                Kokkos::Experimental::simd<std::uint64_t, Abi>::size());
-  static_assert(width == Kokkos::Experimental::simd<std::int32_t, Abi>::size());
-}
+#include <TestSIMD_MathOps.hpp>
+#include <TestSIMD_MaskOps.hpp>
+#include <TestSIMD_Conversions.hpp>
+#include <TestSIMD_ShiftOps.hpp>
+#include <TestSIMD_Condition.hpp>
+#include <TestSIMD_GeneratorCtors.hpp>
+#include <TestSIMD_WhereExpressions.hpp>
diff --git a/packages/kokkos/simd/unit_tests/include/SIMDTesting_Ops.hpp b/packages/kokkos/simd/unit_tests/include/SIMDTesting_Ops.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..6529f20e66ac416707276bd1479104f8dd30bfc6
--- /dev/null
+++ b/packages/kokkos/simd/unit_tests/include/SIMDTesting_Ops.hpp
@@ -0,0 +1,212 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_SIMD_TESTING_OPS_HPP
+#define KOKKOS_SIMD_TESTING_OPS_HPP
+
+#include <Kokkos_SIMD.hpp>
+
+class plus {
+ public:
+  template <class T>
+  auto on_host(T const& a, T const& b) const {
+    return a + b;
+  }
+  template <class T>
+  KOKKOS_INLINE_FUNCTION auto on_device(T const& a, T const& b) const {
+    return a + b;
+  }
+};
+
+class minus {
+ public:
+  template <class T>
+  auto on_host(T const& a, T const& b) const {
+    return a - b;
+  }
+  template <class T>
+  KOKKOS_INLINE_FUNCTION auto on_device(T const& a, T const& b) const {
+    return a - b;
+  }
+};
+
+class multiplies {
+ public:
+  template <class T>
+  auto on_host(T const& a, T const& b) const {
+    return a * b;
+  }
+  template <class T>
+  KOKKOS_INLINE_FUNCTION auto on_device(T const& a, T const& b) const {
+    return a * b;
+  }
+};
+
+class divides {
+ public:
+  template <class T>
+  auto on_host(T const& a, T const& b) const {
+    return a / b;
+  }
+  template <class T>
+  KOKKOS_INLINE_FUNCTION auto on_device(T const& a, T const& b) const {
+    return a / b;
+  }
+};
+
+class absolutes {
+  template <typename T>
+  static KOKKOS_FUNCTION auto abs_impl(T const& x) {
+    if constexpr (std::is_signed_v<T>) {
+      return Kokkos::abs(x);
+    }
+    return x;
+  }
+
+ public:
+  template <typename T>
+  auto on_host(T const& a) const {
+    if constexpr (std::is_signed_v<typename T::value_type>) {
+#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4)
+      return Kokkos::Experimental::abs(a);
+#else
+      return Kokkos::abs(a);
+#endif
+    }
+    return a;
+  }
+  template <typename T>
+  auto on_host_serial(T const& a) const {
+    return abs_impl(a);
+  }
+  template <typename T>
+  KOKKOS_INLINE_FUNCTION auto on_device(T const& a) const {
+    if constexpr (std::is_signed_v<typename T::value_type>) {
+      return Kokkos::abs(a);
+    }
+    return a;
+  }
+  template <typename T>
+  KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a) const {
+    return abs_impl(a);
+  }
+};
+
+class floors {
+ public:
+  template <typename T>
+  auto on_host(T const& a) const {
+    return Kokkos::floor(a);
+  }
+  template <typename T>
+  auto on_host_serial(T const& a) const {
+    return Kokkos::floor(a);
+  }
+  template <typename T>
+  KOKKOS_INLINE_FUNCTION auto on_device(T const& a) const {
+    return Kokkos::floor(a);
+  }
+  template <typename T>
+  KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a) const {
+    return Kokkos::floor(a);
+  }
+};
+
+class ceils {
+ public:
+  template <typename T>
+  auto on_host(T const& a) const {
+    return Kokkos::ceil(a);
+  }
+  template <typename T>
+  auto on_host_serial(T const& a) const {
+    return Kokkos::ceil(a);
+  }
+  template <typename T>
+  KOKKOS_INLINE_FUNCTION auto on_device(T const& a) const {
+    return Kokkos::ceil(a);
+  }
+  template <typename T>
+  KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a) const {
+    return Kokkos::ceil(a);
+  }
+};
+
+class rounds {
+ public:
+  template <typename T>
+  auto on_host(T const& a) const {
+    return Kokkos::round(a);
+  }
+  template <typename T>
+  auto on_host_serial(T const& a) const {
+    return std::rint(a);
+  }
+  template <typename T>
+  KOKKOS_INLINE_FUNCTION auto on_device(T const& a) const {
+    return Kokkos::round(a);
+  }
+  template <typename T>
+  KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a) const {
+    return Kokkos::Experimental::round_half_to_nearest_even(a);
+  }
+};
+
+class truncates {
+ public:
+  template <typename T>
+  auto on_host(T const& a) const {
+    return Kokkos::trunc(a);
+  }
+  template <typename T>
+  auto on_host_serial(T const& a) const {
+    return Kokkos::trunc(a);
+  }
+  template <typename T>
+  KOKKOS_INLINE_FUNCTION auto on_device(T const& a) const {
+    return Kokkos::trunc(a);
+  }
+  template <typename T>
+  KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a) const {
+    return Kokkos::trunc(a);
+  }
+};
+
+class shift_right {
+ public:
+  template <typename T, typename U>
+  auto on_host(T&& a, U&& b) const {
+    return a >> b;
+  }
+  template <typename T, typename U>
+  KOKKOS_INLINE_FUNCTION auto on_device(T&& a, U&& b) const {
+    return a >> b;
+  }
+};
+
+class shift_left {
+ public:
+  template <typename T, typename U>
+  auto on_host(T&& a, U&& b) const {
+    return a << b;
+  }
+  template <typename T, typename U>
+  KOKKOS_INLINE_FUNCTION auto on_device(T&& a, U&& b) const {
+    return a << b;
+  }
+};
+
+#endif
diff --git a/packages/kokkos/simd/unit_tests/include/SIMDTesting_Utilities.hpp b/packages/kokkos/simd/unit_tests/include/SIMDTesting_Utilities.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ae2ab2c697c5c1173f83b387bebe21b2e0f7b31b
--- /dev/null
+++ b/packages/kokkos/simd/unit_tests/include/SIMDTesting_Utilities.hpp
@@ -0,0 +1,167 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_SIMD_TESTING_UTILITIES_HPP
+#define KOKKOS_SIMD_TESTING_UTILITIES_HPP
+
+#include <gtest/gtest.h>
+#include <Kokkos_SIMD.hpp>
+#include <SIMDTesting_Ops.hpp>
+
+class gtest_checker {
+ public:
+  void truth(bool x) const { EXPECT_TRUE(x); }
+  template <class T>
+  void equality(T const& a, T const& b) const {
+    EXPECT_EQ(a, b);
+  }
+};
+
+class kokkos_checker {
+ public:
+  KOKKOS_INLINE_FUNCTION void truth(bool x) const {
+    if (!x) Kokkos::abort("SIMD unit test truth condition failed on device");
+  }
+  template <class T>
+  KOKKOS_INLINE_FUNCTION void equality(T const& a, T const& b) const {
+    if (a != b)
+      Kokkos::abort("SIMD unit test equality condition failed on device");
+  }
+};
+
+template <class T, class Abi>
+inline void host_check_equality(
+    Kokkos::Experimental::simd<T, Abi> const& expected_result,
+    Kokkos::Experimental::simd<T, Abi> const& computed_result,
+    std::size_t nlanes) {
+  gtest_checker checker;
+  for (std::size_t i = 0; i < nlanes; ++i) {
+    checker.equality(expected_result[i], computed_result[i]);
+  }
+  using mask_type = typename Kokkos::Experimental::simd<T, Abi>::mask_type;
+  mask_type mask(false);
+  for (std::size_t i = 0; i < nlanes; ++i) {
+    mask[i] = true;
+  }
+  checker.equality((expected_result == computed_result) && mask, mask);
+}
+
+template <class T, class Abi>
+KOKKOS_INLINE_FUNCTION void device_check_equality(
+    Kokkos::Experimental::simd<T, Abi> const& expected_result,
+    Kokkos::Experimental::simd<T, Abi> const& computed_result,
+    std::size_t nlanes) {
+  kokkos_checker checker;
+  for (std::size_t i = 0; i < nlanes; ++i) {
+    checker.equality(expected_result[i], computed_result[i]);
+  }
+  using mask_type = typename Kokkos::Experimental::simd<T, Abi>::mask_type;
+  mask_type mask(false);
+  for (std::size_t i = 0; i < nlanes; ++i) {
+    mask[i] = true;
+  }
+  checker.equality((expected_result == computed_result) && mask, mask);
+}
+
+template <typename T, typename Abi>
+KOKKOS_INLINE_FUNCTION void check_equality(
+    Kokkos::Experimental::simd<T, Abi> const& expected_result,
+    Kokkos::Experimental::simd<T, Abi> const& computed_result,
+    std::size_t nlanes) {
+  KOKKOS_IF_ON_HOST(
+      (host_check_equality(expected_result, computed_result, nlanes);))
+  KOKKOS_IF_ON_DEVICE(
+      (device_check_equality(expected_result, computed_result, nlanes);))
+}
+
+class load_element_aligned {
+ public:
+  template <class T, class Abi>
+  bool host_load(T const* mem, std::size_t n,
+                 Kokkos::Experimental::simd<T, Abi>& result) const {
+    if (n < result.size()) return false;
+    result.copy_from(mem, Kokkos::Experimental::element_aligned_tag());
+    return true;
+  }
+  template <class T, class Abi>
+  KOKKOS_INLINE_FUNCTION bool device_load(
+      T const* mem, std::size_t n,
+      Kokkos::Experimental::simd<T, Abi>& result) const {
+    if (n < result.size()) return false;
+    result.copy_from(mem, Kokkos::Experimental::element_aligned_tag());
+    return true;
+  }
+};
+
+class load_masked {
+ public:
+  template <class T, class Abi>
+  bool host_load(T const* mem, std::size_t n,
+                 Kokkos::Experimental::simd<T, Abi>& result) const {
+    using mask_type = typename Kokkos::Experimental::simd<T, Abi>::mask_type;
+    mask_type mask(false);
+    for (std::size_t i = 0; i < n; ++i) {
+      mask[i] = true;
+    }
+    where(mask, result)
+        .copy_from(mem, Kokkos::Experimental::element_aligned_tag());
+    where(!mask, result) = 0;
+    return true;
+  }
+  template <class T, class Abi>
+  KOKKOS_INLINE_FUNCTION bool device_load(
+      T const* mem, std::size_t n,
+      Kokkos::Experimental::simd<T, Abi>& result) const {
+    using mask_type = typename Kokkos::Experimental::simd<T, Abi>::mask_type;
+    mask_type mask(false);
+    for (std::size_t i = 0; i < n; ++i) {
+      mask[i] = true;
+    }
+    where(mask, result)
+        .copy_from(mem, Kokkos::Experimental::element_aligned_tag());
+    where(!mask, result) = T(0);
+    return true;
+  }
+};
+
+class load_as_scalars {
+ public:
+  template <class T, class Abi>
+  bool host_load(T const* mem, std::size_t n,
+                 Kokkos::Experimental::simd<T, Abi>& result) const {
+    for (std::size_t i = 0; i < n; ++i) {
+      result[i] = mem[i];
+    }
+    for (std::size_t i = n; i < result.size(); ++i) {
+      result[i] = T(0);
+    }
+    return true;
+  }
+  template <class T, class Abi>
+  KOKKOS_INLINE_FUNCTION bool device_load(
+      T const* mem, std::size_t n,
+      Kokkos::Experimental::simd<T, Abi>& result) const {
+    for (std::size_t i = 0; i < n; ++i) {
+      result[i] = mem[i];
+    }
+    for (std::size_t i = n; i < result.size(); ++i) {
+      result[i] = T(0);
+    }
+    return true;
+  }
+};
+
+#endif
diff --git a/packages/kokkos/simd/unit_tests/include/TestSIMD_Condition.hpp b/packages/kokkos/simd/unit_tests/include/TestSIMD_Condition.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f8d8cc70fa4ed91e1731d2510d6a8f7e70685358
--- /dev/null
+++ b/packages/kokkos/simd/unit_tests/include/TestSIMD_Condition.hpp
@@ -0,0 +1,105 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_TEST_SIMD_CONDITION_HPP
+#define KOKKOS_TEST_SIMD_CONDITION_HPP
+
+#include <Kokkos_SIMD.hpp>
+#include <SIMDTesting_Utilities.hpp>
+
+template <typename Abi, typename DataType>
+inline void host_check_condition() {
+  using simd_type = typename Kokkos::Experimental::simd<DataType, Abi>;
+  using mask_type = typename simd_type::mask_type;
+
+  auto condition_op = [](mask_type const& mask, simd_type const& a,
+                         simd_type const& b) {
+    return Kokkos::Experimental::condition(mask, a, b);
+  };
+
+  simd_type value_a(16);
+  simd_type value_b(20);
+
+  auto condition_result = condition_op(mask_type(false), value_a, value_b);
+  EXPECT_TRUE(all_of(condition_result == value_b));
+  condition_result = condition_op(mask_type(true), value_a, value_b);
+  EXPECT_TRUE(all_of(condition_result == value_a));
+}
+
+template <typename Abi, typename... DataTypes>
+inline void host_check_condition_all_types(
+    Kokkos::Experimental::Impl::data_types<DataTypes...>) {
+  (host_check_condition<Abi, DataTypes>(), ...);
+}
+
+template <typename... Abis>
+inline void host_check_condition_all_abis(
+    Kokkos::Experimental::Impl::abi_set<Abis...>) {
+  using DataTypes = Kokkos::Experimental::Impl::data_type_set;
+  (host_check_condition_all_types<Abis>(DataTypes()), ...);
+}
+
+template <typename Abi, typename DataType>
+KOKKOS_INLINE_FUNCTION void device_check_condition() {
+  using simd_type = typename Kokkos::Experimental::simd<DataType, Abi>;
+  using mask_type = typename simd_type::mask_type;
+  kokkos_checker checker;
+
+  auto condition_op = [](mask_type const& mask, simd_type const& a,
+                         simd_type const& b) {
+    return Kokkos::Experimental::condition(mask, a, b);
+  };
+
+  simd_type value_a(16);
+  simd_type value_b(20);
+
+  auto condition_result = condition_op(mask_type(false), value_a, value_b);
+  checker.truth(all_of(condition_result == value_b));
+  condition_result = condition_op(mask_type(true), value_a, value_b);
+  checker.truth(all_of(condition_result == value_a));
+}
+
+template <typename Abi, typename... DataTypes>
+KOKKOS_INLINE_FUNCTION void device_check_condition_all_types(
+    Kokkos::Experimental::Impl::data_types<DataTypes...>) {
+  (device_check_condition<Abi, DataTypes>(), ...);
+}
+
+template <typename... Abis>
+KOKKOS_INLINE_FUNCTION void device_check_condition_all_abis(
+    Kokkos::Experimental::Impl::abi_set<Abis...>) {
+  using DataTypes = Kokkos::Experimental::Impl::data_type_set;
+  (device_check_condition_all_types<Abis>(DataTypes()), ...);
+}
+
+class simd_device_condition_functor {
+ public:
+  KOKKOS_INLINE_FUNCTION void operator()(int) const {
+    device_check_condition_all_abis(
+        Kokkos::Experimental::Impl::device_abi_set());
+  }
+};
+
+TEST(simd, host_condition) {
+  host_check_condition_all_abis(Kokkos::Experimental::Impl::host_abi_set());
+}
+
+TEST(simd, device_condition) {
+  Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::IndexType<int>>(0, 1),
+                       simd_device_condition_functor());
+}
+
+#endif
diff --git a/packages/kokkos/simd/unit_tests/include/TestSIMD_Conversions.hpp b/packages/kokkos/simd/unit_tests/include/TestSIMD_Conversions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b98871bbab80921628001b3dc4e6d1284bf4d10d
--- /dev/null
+++ b/packages/kokkos/simd/unit_tests/include/TestSIMD_Conversions.hpp
@@ -0,0 +1,131 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_TEST_SIMD_CONVERSIONS_HPP
+#define KOKKOS_TEST_SIMD_CONVERSIONS_HPP
+
+#include <Kokkos_SIMD.hpp>
+#include <SIMDTesting_Utilities.hpp>
+
+template <typename Abi>
+inline void host_check_conversions() {
+  {
+    auto a = Kokkos::Experimental::simd<std::uint64_t, Abi>(1);
+    auto b = Kokkos::Experimental::simd<std::int64_t, Abi>(a);
+    EXPECT_TRUE(all_of(b == decltype(b)(1)));
+  }
+  {
+    auto a = Kokkos::Experimental::simd<std::int32_t, Abi>(1);
+    auto b = Kokkos::Experimental::simd<std::uint64_t, Abi>(a);
+    EXPECT_TRUE(all_of(b == decltype(b)(1)));
+  }
+  {
+    auto a = Kokkos::Experimental::simd<std::uint64_t, Abi>(1);
+    auto b = Kokkos::Experimental::simd<std::int32_t, Abi>(a);
+    EXPECT_TRUE(all_of(b == decltype(b)(1)));
+  }
+  {
+    auto a = Kokkos::Experimental::simd_mask<double, Abi>(true);
+    auto b = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(a);
+    EXPECT_TRUE(b == decltype(b)(true));
+  }
+  {
+    auto a = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(true);
+    auto b = Kokkos::Experimental::simd_mask<std::uint64_t, Abi>(a);
+    EXPECT_TRUE(b == decltype(b)(true));
+  }
+  {
+    auto a = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(true);
+    auto b = Kokkos::Experimental::simd_mask<std::int64_t, Abi>(a);
+    EXPECT_TRUE(b == decltype(b)(true));
+  }
+  {
+    auto a = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(true);
+    auto b = Kokkos::Experimental::simd_mask<double, Abi>(a);
+    EXPECT_TRUE(b == decltype(b)(true));
+  }
+}
+
+template <typename... Abis>
+inline void host_check_conversions_all_abis(
+    Kokkos::Experimental::Impl::abi_set<Abis...>) {
+  (host_check_conversions<Abis>(), ...);
+}
+
+template <typename Abi>
+KOKKOS_INLINE_FUNCTION void device_check_conversions() {
+  kokkos_checker checker;
+  {
+    auto a = Kokkos::Experimental::simd<std::uint64_t, Abi>(1);
+    auto b = Kokkos::Experimental::simd<std::int64_t, Abi>(a);
+    checker.truth(all_of(b == decltype(b)(1)));
+  }
+  {
+    auto a = Kokkos::Experimental::simd<std::int32_t, Abi>(1);
+    auto b = Kokkos::Experimental::simd<std::uint64_t, Abi>(a);
+    checker.truth(all_of(b == decltype(b)(1)));
+  }
+  {
+    auto a = Kokkos::Experimental::simd<std::uint64_t, Abi>(1);
+    auto b = Kokkos::Experimental::simd<std::int32_t, Abi>(a);
+    checker.truth(all_of(b == decltype(b)(1)));
+  }
+  {
+    auto a = Kokkos::Experimental::simd_mask<double, Abi>(true);
+    auto b = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(a);
+    checker.truth(b == decltype(b)(true));
+  }
+  {
+    auto a = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(true);
+    auto b = Kokkos::Experimental::simd_mask<std::uint64_t, Abi>(a);
+    checker.truth(b == decltype(b)(true));
+  }
+  {
+    auto a = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(true);
+    auto b = Kokkos::Experimental::simd_mask<std::int64_t, Abi>(a);
+    checker.truth(b == decltype(b)(true));
+  }
+  {
+    auto a = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(true);
+    auto b = Kokkos::Experimental::simd_mask<double, Abi>(a);
+    checker.truth(b == decltype(b)(true));
+  }
+}
+
+template <typename... Abis>
+KOKKOS_INLINE_FUNCTION void device_check_conversions_all_abis(
+    Kokkos::Experimental::Impl::abi_set<Abis...>) {
+  (device_check_conversions<Abis>(), ...);
+}
+
+class simd_device_conversions_functor {
+ public:
+  KOKKOS_INLINE_FUNCTION void operator()(int) const {
+    device_check_conversions_all_abis(
+        Kokkos::Experimental::Impl::device_abi_set());
+  }
+};
+
+TEST(simd, host_conversions) {
+  host_check_conversions_all_abis(Kokkos::Experimental::Impl::host_abi_set());
+}
+
+TEST(simd, device_conversions) {
+  Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::IndexType<int>>(0, 1),
+                       simd_device_conversions_functor());
+}
+
+#endif
diff --git a/packages/kokkos/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp b/packages/kokkos/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4feff3a89d2846b93f55b7ef0933d692da757b7c
--- /dev/null
+++ b/packages/kokkos/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp
@@ -0,0 +1,140 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_TEST_SIMD_GENERATOR_CTORS_HPP
+#define KOKKOS_TEST_SIMD_GENERATOR_CTORS_HPP
+
+#include <Kokkos_SIMD.hpp>
+#include <SIMDTesting_Utilities.hpp>
+
+template <typename Abi, typename DataType>
+inline void host_check_gen_ctor() {
+  using simd_type             = Kokkos::Experimental::simd<DataType, Abi>;
+  using mask_type             = typename simd_type::mask_type;
+  constexpr std::size_t lanes = simd_type::size();
+
+  DataType init[lanes];
+  DataType expected[lanes];
+  mask_type init_mask(false);
+
+  for (std::size_t i = 0; i < lanes; ++i) {
+    if (i % 3 == 0) init_mask[i] = true;
+    init[i]     = 7;
+    expected[i] = (init_mask[i]) ? init[i] * 9 : init[i];
+  }
+
+  simd_type rhs;
+  rhs.copy_from(init, Kokkos::Experimental::element_aligned_tag());
+
+  simd_type blend;
+  blend.copy_from(expected, Kokkos::Experimental::element_aligned_tag());
+
+  if constexpr (std::is_same_v<Abi, Kokkos::Experimental::simd_abi::scalar>) {
+    simd_type basic(KOKKOS_LAMBDA(std::size_t i) { return init[i]; });
+    host_check_equality(basic, rhs, lanes);
+
+    simd_type lhs(KOKKOS_LAMBDA(std::size_t i) { return init[i] * 9; });
+    mask_type mask(KOKKOS_LAMBDA(std::size_t i) { return init_mask[i]; });
+    simd_type result(
+        KOKKOS_LAMBDA(std::size_t i) { return (mask[i]) ? lhs[i] : rhs[i]; });
+
+    host_check_equality(blend, result, lanes);
+  } else {
+    simd_type basic([=](std::size_t i) { return init[i]; });
+    host_check_equality(basic, rhs, lanes);
+
+    simd_type lhs([=](std::size_t i) { return init[i] * 9; });
+    mask_type mask([=](std::size_t i) { return init_mask[i]; });
+    simd_type result(
+        [=](std::size_t i) { return (mask[i]) ? lhs[i] : rhs[i]; });
+
+    host_check_equality(blend, result, lanes);
+  }
+}
+
+template <typename Abi, typename... DataTypes>
+inline void host_check_gen_ctors_all_types(
+    Kokkos::Experimental::Impl::data_types<DataTypes...>) {
+  (host_check_gen_ctor<Abi, DataTypes>(), ...);
+}
+
+template <typename... Abis>
+inline void host_check_gen_ctors_all_abis(
+    Kokkos::Experimental::Impl::abi_set<Abis...>) {
+  using DataTypes = Kokkos::Experimental::Impl::data_type_set;
+  (host_check_gen_ctors_all_types<Abis>(DataTypes()), ...);
+}
+
+template <typename Abi, typename DataType>
+KOKKOS_INLINE_FUNCTION void device_check_gen_ctor() {
+  using simd_type             = Kokkos::Experimental::simd<DataType, Abi>;
+  using mask_type             = typename simd_type::mask_type;
+  constexpr std::size_t lanes = simd_type::size();
+
+  DataType init[lanes];
+  DataType expected[lanes];
+  mask_type mask(false);
+
+  for (std::size_t i = 0; i < lanes; ++i) {
+    if (i % 3 == 0) mask[i] = true;
+    init[i]     = 7;
+    expected[i] = (mask[i]) ? init[i] * 9 : init[i];
+  }
+
+  simd_type basic(KOKKOS_LAMBDA(std::size_t i) { return init[i]; });
+  simd_type rhs;
+  rhs.copy_from(init, Kokkos::Experimental::element_aligned_tag());
+  device_check_equality(basic, rhs, lanes);
+
+  simd_type lhs(KOKKOS_LAMBDA(std::size_t i) { return init[i] * 9; });
+  simd_type result(
+      KOKKOS_LAMBDA(std::size_t i) { return (mask[i]) ? lhs[i] : rhs[i]; });
+
+  simd_type blend;
+  blend.copy_from(expected, Kokkos::Experimental::element_aligned_tag());
+  device_check_equality(result, blend, lanes);
+}
+
+template <typename Abi, typename... DataTypes>
+KOKKOS_INLINE_FUNCTION void device_check_gen_ctors_all_types(
+    Kokkos::Experimental::Impl::data_types<DataTypes...>) {
+  (device_check_gen_ctor<Abi, DataTypes>(), ...);
+}
+
+template <typename... Abis>
+KOKKOS_INLINE_FUNCTION void device_check_gen_ctors_all_abis(
+    Kokkos::Experimental::Impl::abi_set<Abis...>) {
+  using DataTypes = Kokkos::Experimental::Impl::data_type_set;
+  (device_check_gen_ctors_all_types<Abis>(DataTypes()), ...);
+}
+
+class simd_device_gen_ctor_functor {
+ public:
+  KOKKOS_INLINE_FUNCTION void operator()(int) const {
+    device_check_gen_ctors_all_abis(
+        Kokkos::Experimental::Impl::device_abi_set());
+  }
+};
+
+TEST(simd, host_gen_ctors) {
+  host_check_gen_ctors_all_abis(Kokkos::Experimental::Impl::host_abi_set());
+}
+
+TEST(simd, device_gen_ctors) {
+  Kokkos::parallel_for(1, simd_device_gen_ctor_functor());
+}
+
+#endif
diff --git a/packages/kokkos/simd/unit_tests/include/TestSIMD_MaskOps.hpp b/packages/kokkos/simd/unit_tests/include/TestSIMD_MaskOps.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a93c52e9a8d56b139668ffa717f73cdff017f3eb
--- /dev/null
+++ b/packages/kokkos/simd/unit_tests/include/TestSIMD_MaskOps.hpp
@@ -0,0 +1,116 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_TEST_SIMD_MASK_OPS_HPP
+#define KOKKOS_TEST_SIMD_MASK_OPS_HPP
+
+#include <Kokkos_SIMD.hpp>
+#include <SIMDTesting_Utilities.hpp>
+
+template <typename Abi, typename DataType>
+inline void host_check_mask_ops() {
+  using mask_type = Kokkos::Experimental::simd_mask<DataType, Abi>;
+
+  EXPECT_FALSE(none_of(mask_type(true)));
+  EXPECT_TRUE(none_of(mask_type(false)));
+  EXPECT_TRUE(all_of(mask_type(true)));
+  EXPECT_FALSE(all_of(mask_type(false)));
+  EXPECT_TRUE(any_of(mask_type(true)));
+  EXPECT_FALSE(any_of(mask_type(false)));
+
+  for (std::size_t i = 0; i < mask_type::size(); ++i) {
+    mask_type test_mask(KOKKOS_LAMBDA(std::size_t j) { return i == j; });
+
+    EXPECT_TRUE(any_of(test_mask));
+    EXPECT_FALSE(none_of(test_mask));
+
+    if constexpr (mask_type::size() > 1) {
+      EXPECT_FALSE(all_of(test_mask));
+    } else {
+      EXPECT_TRUE(all_of(test_mask));
+    }
+  }
+}
+
+template <typename Abi, typename... DataTypes>
+inline void host_check_mask_ops_all_types(
+    Kokkos::Experimental::Impl::data_types<DataTypes...>) {
+  (host_check_mask_ops<Abi, DataTypes>(), ...);
+}
+
+template <typename... Abis>
+inline void host_check_mask_ops_all_abis(
+    Kokkos::Experimental::Impl::abi_set<Abis...>) {
+  using DataTypes = Kokkos::Experimental::Impl::data_type_set;
+  (host_check_mask_ops_all_types<Abis>(DataTypes()), ...);
+}
+
+template <typename Abi, typename DataType>
+KOKKOS_INLINE_FUNCTION void device_check_mask_ops() {
+  using mask_type = Kokkos::Experimental::simd_mask<DataType, Abi>;
+  kokkos_checker checker;
+  checker.truth(!none_of(mask_type(true)));
+  checker.truth(none_of(mask_type(false)));
+  checker.truth(all_of(mask_type(true)));
+  checker.truth(!all_of(mask_type(false)));
+  checker.truth(any_of(mask_type(true)));
+  checker.truth(!any_of(mask_type(false)));
+
+  for (std::size_t i = 0; i < mask_type::size(); ++i) {
+    mask_type test_mask(KOKKOS_LAMBDA(std::size_t j) { return i == j; });
+
+    checker.truth(any_of(test_mask));
+    checker.truth(!none_of(test_mask));
+
+    if constexpr (mask_type::size() > 1) {
+      checker.truth(!all_of(test_mask));
+    } else {
+      checker.truth(all_of(test_mask));
+    }
+  }
+}
+
+template <typename Abi, typename... DataTypes>
+KOKKOS_INLINE_FUNCTION void device_check_mask_ops_all_types(
+    Kokkos::Experimental::Impl::data_types<DataTypes...>) {
+  (device_check_mask_ops<Abi, DataTypes>(), ...);
+}
+
+template <typename... Abis>
+KOKKOS_INLINE_FUNCTION void device_check_mask_ops_all_abis(
+    Kokkos::Experimental::Impl::abi_set<Abis...>) {
+  using DataTypes = Kokkos::Experimental::Impl::data_type_set;
+  (device_check_mask_ops_all_types<Abis>(DataTypes()), ...);
+}
+
+class simd_device_mask_ops_functor {
+ public:
+  KOKKOS_INLINE_FUNCTION void operator()(int) const {
+    device_check_mask_ops_all_abis(
+        Kokkos::Experimental::Impl::device_abi_set());
+  }
+};
+
+TEST(simd, host_mask_ops) {
+  host_check_mask_ops_all_abis(Kokkos::Experimental::Impl::host_abi_set());
+}
+
+TEST(simd, device_mask_ops) {
+  Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::IndexType<int>>(0, 1),
+                       simd_device_mask_ops_functor());
+}
+
+#endif
diff --git a/packages/kokkos/simd/unit_tests/include/TestSIMD_MathOps.hpp b/packages/kokkos/simd/unit_tests/include/TestSIMD_MathOps.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..802e41efe5f234dafaa0e42a9cba049bd5d11056
--- /dev/null
+++ b/packages/kokkos/simd/unit_tests/include/TestSIMD_MathOps.hpp
@@ -0,0 +1,289 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_TEST_SIMD_MATH_OPS_HPP
+#define KOKKOS_TEST_SIMD_MATH_OPS_HPP
+
+#include <Kokkos_SIMD.hpp>
+#include <SIMDTesting_Utilities.hpp>
+
+template <class Abi, class Loader, class BinaryOp, class T>
+void host_check_math_op_one_loader(BinaryOp binary_op, std::size_t n,
+                                   T const* first_args, T const* second_args) {
+  Loader loader;
+  using simd_type             = Kokkos::Experimental::simd<T, Abi>;
+  constexpr std::size_t width = simd_type::size();
+  for (std::size_t i = 0; i < n; i += width) {
+    std::size_t const nremaining = n - i;
+    std::size_t const nlanes     = Kokkos::min(nremaining, width);
+    simd_type first_arg;
+    bool const loaded_first_arg =
+        loader.host_load(first_args + i, nlanes, first_arg);
+    simd_type second_arg;
+    bool const loaded_second_arg =
+        loader.host_load(second_args + i, nlanes, second_arg);
+    if (!(loaded_first_arg && loaded_second_arg)) continue;
+    simd_type expected_result;
+    // gcc 8.4.0 warns if using nlanes as upper bound about first_arg and/or
+    // second_arg being uninitialized
+    for (std::size_t lane = 0; lane < simd_type::size(); ++lane) {
+      if (lane < nlanes)
+        expected_result[lane] =
+            binary_op.on_host(T(first_arg[lane]), T(second_arg[lane]));
+    }
+    simd_type const computed_result = binary_op.on_host(first_arg, second_arg);
+    host_check_equality(expected_result, computed_result, nlanes);
+  }
+}
+
+template <class Abi, class Loader, class UnaryOp, class T>
+void host_check_math_op_one_loader(UnaryOp unary_op, std::size_t n,
+                                   T const* args) {
+  Loader loader;
+  using simd_type             = Kokkos::Experimental::simd<T, Abi>;
+  constexpr std::size_t width = simd_type::size();
+  for (std::size_t i = 0; i < n; i += width) {
+    std::size_t const nremaining = n - i;
+    std::size_t const nlanes     = Kokkos::min(nremaining, width);
+    simd_type arg;
+    bool const loaded_arg = loader.host_load(args + i, nlanes, arg);
+    if (!loaded_arg) continue;
+    auto computed_result = unary_op.on_host(arg);
+
+    decltype(computed_result) expected_result;
+    for (std::size_t lane = 0; lane < simd_type::size(); ++lane) {
+      if (lane < nlanes)
+        expected_result[lane] = unary_op.on_host_serial(T(arg[lane]));
+    }
+    host_check_equality(expected_result, computed_result, nlanes);
+  }
+}
+
+template <class Abi, class Op, class... T>
+inline void host_check_math_op_all_loaders(Op op, std::size_t n,
+                                           T const*... args) {
+  host_check_math_op_one_loader<Abi, load_element_aligned>(op, n, args...);
+  host_check_math_op_one_loader<Abi, load_masked>(op, n, args...);
+  host_check_math_op_one_loader<Abi, load_as_scalars>(op, n, args...);
+}
+
+template <typename Abi, typename DataType, size_t n>
+inline void host_check_all_math_ops(const DataType (&first_args)[n],
+                                    const DataType (&second_args)[n]) {
+  host_check_math_op_all_loaders<Abi>(plus(), n, first_args, second_args);
+  host_check_math_op_all_loaders<Abi>(minus(), n, first_args, second_args);
+  host_check_math_op_all_loaders<Abi>(multiplies(), n, first_args, second_args);
+  host_check_math_op_all_loaders<Abi>(absolutes(), n, first_args);
+
+  host_check_math_op_all_loaders<Abi>(floors(), n, first_args);
+  host_check_math_op_all_loaders<Abi>(ceils(), n, first_args);
+  host_check_math_op_all_loaders<Abi>(rounds(), n, first_args);
+  host_check_math_op_all_loaders<Abi>(truncates(), n, first_args);
+
+  // TODO: Place fallback implementations for all simd integer types
+  if constexpr (std::is_floating_point_v<DataType>) {
+    host_check_math_op_all_loaders<Abi>(divides(), n, first_args, second_args);
+  }
+}
+
+template <typename Abi, typename DataType>
+inline void host_check_abi_size() {
+  using simd_type = Kokkos::Experimental::simd<DataType, Abi>;
+  using mask_type = typename simd_type::mask_type;
+  static_assert(simd_type::size() == mask_type::size());
+}
+
+template <typename Abi, typename DataType>
+inline void host_check_math_ops() {
+  constexpr size_t n = 11;
+
+  host_check_abi_size<Abi, DataType>();
+
+  if constexpr (!std::is_integral_v<DataType>) {
+    DataType const first_args[n]  = {0.1,  0.4,  0.5, 0.7, 1.0, 1.5,
+                                    -2.0, 10.0, 0.0, 1.2, -2.8};
+    DataType const second_args[n] = {1.0,  0.2, 1.1,  1.8,  -0.1, -3.0,
+                                     -2.4, 1.0, 13.0, -3.2, -2.1};
+    host_check_all_math_ops<Abi>(first_args, second_args);
+  } else {
+    if constexpr (std::is_signed_v<DataType>) {
+      DataType const first_args[n]  = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2};
+      DataType const second_args[n] = {1, 2, 1, 1, 1, -3, -2, 1, 13, -3, -2};
+      host_check_all_math_ops<Abi>(first_args, second_args);
+    } else {
+      DataType const first_args[n]  = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2};
+      DataType const second_args[n] = {1, 2, 1, 1, 1, 3, 2, 1, 13, 3, 2};
+      host_check_all_math_ops<Abi>(first_args, second_args);
+    }
+  }
+}
+
+template <typename Abi, typename... DataTypes>
+inline void host_check_math_ops_all_types(
+    Kokkos::Experimental::Impl::data_types<DataTypes...>) {
+  (host_check_math_ops<Abi, DataTypes>(), ...);
+}
+
+template <typename... Abis>
+inline void host_check_math_ops_all_abis(
+    Kokkos::Experimental::Impl::abi_set<Abis...>) {
+  using DataTypes = Kokkos::Experimental::Impl::data_type_set;
+  (host_check_math_ops_all_types<Abis>(DataTypes()), ...);
+}
+
+template <typename Abi, typename Loader, typename BinaryOp, typename T>
+KOKKOS_INLINE_FUNCTION void device_check_math_op_one_loader(
+    BinaryOp binary_op, std::size_t n, T const* first_args,
+    T const* second_args) {
+  Loader loader;
+  using simd_type             = Kokkos::Experimental::simd<T, Abi>;
+  constexpr std::size_t width = simd_type::size();
+  for (std::size_t i = 0; i < n; i += width) {
+    std::size_t const nremaining = n - i;
+    std::size_t const nlanes     = Kokkos::min(nremaining, width);
+    simd_type first_arg;
+    bool const loaded_first_arg =
+        loader.device_load(first_args + i, nlanes, first_arg);
+    simd_type second_arg;
+    bool const loaded_second_arg =
+        loader.device_load(second_args + i, nlanes, second_arg);
+    if (!(loaded_first_arg && loaded_second_arg)) continue;
+    simd_type expected_result;
+    for (std::size_t lane = 0; lane < nlanes; ++lane) {
+      expected_result[lane] =
+          binary_op.on_device(first_arg[lane], second_arg[lane]);
+    }
+    simd_type const computed_result =
+        binary_op.on_device(first_arg, second_arg);
+    device_check_equality(expected_result, computed_result, nlanes);
+  }
+}
+
+template <typename Abi, typename Loader, typename UnaryOp, typename T>
+KOKKOS_INLINE_FUNCTION void device_check_math_op_one_loader(UnaryOp unary_op,
+                                                            std::size_t n,
+                                                            T const* args) {
+  Loader loader;
+  using simd_type             = Kokkos::Experimental::simd<T, Abi>;
+  constexpr std::size_t width = simd_type::size();
+  for (std::size_t i = 0; i < n; i += width) {
+    std::size_t const nremaining = n - i;
+    std::size_t const nlanes     = Kokkos::min(nremaining, width);
+    simd_type arg;
+    bool const loaded_arg = loader.device_load(args + i, nlanes, arg);
+    if (!loaded_arg) continue;
+    auto computed_result = unary_op.on_device(arg);
+
+    decltype(computed_result) expected_result;
+    for (std::size_t lane = 0; lane < nlanes; ++lane) {
+      expected_result[lane] = unary_op.on_device_serial(arg[lane]);
+    }
+    device_check_equality(expected_result, computed_result, nlanes);
+  }
+}
+
+template <typename Abi, typename Op, typename... T>
+KOKKOS_INLINE_FUNCTION void device_check_math_op_all_loaders(Op op,
+                                                             std::size_t n,
+                                                             T const*... args) {
+  device_check_math_op_one_loader<Abi, load_element_aligned>(op, n, args...);
+  device_check_math_op_one_loader<Abi, load_masked>(op, n, args...);
+  device_check_math_op_one_loader<Abi, load_as_scalars>(op, n, args...);
+}
+
+template <typename Abi, typename DataType, size_t n>
+KOKKOS_INLINE_FUNCTION void device_check_all_math_ops(
+    const DataType (&first_args)[n], const DataType (&second_args)[n]) {
+  device_check_math_op_all_loaders<Abi>(plus(), n, first_args, second_args);
+  device_check_math_op_all_loaders<Abi>(minus(), n, first_args, second_args);
+  device_check_math_op_all_loaders<Abi>(multiplies(), n, first_args,
+                                        second_args);
+  device_check_math_op_all_loaders<Abi>(absolutes(), n, first_args);
+
+  device_check_math_op_all_loaders<Abi>(floors(), n, first_args);
+  device_check_math_op_all_loaders<Abi>(ceils(), n, first_args);
+  device_check_math_op_all_loaders<Abi>(rounds(), n, first_args);
+  device_check_math_op_all_loaders<Abi>(truncates(), n, first_args);
+
+  if constexpr (std::is_floating_point_v<DataType>) {
+    device_check_math_op_all_loaders<Abi>(divides(), n, first_args,
+                                          second_args);
+  }
+}
+
+template <typename Abi, typename DataType>
+KOKKOS_INLINE_FUNCTION void device_check_abi_size() {
+  using simd_type = Kokkos::Experimental::simd<DataType, Abi>;
+  using mask_type = typename simd_type::mask_type;
+  static_assert(simd_type::size() == mask_type::size());
+}
+
+template <typename Abi, typename DataType>
+KOKKOS_INLINE_FUNCTION void device_check_math_ops() {
+  constexpr size_t n = 11;
+
+  device_check_abi_size<Abi, DataType>();
+
+  if constexpr (!std::is_integral_v<DataType>) {
+    DataType const first_args[n]  = {0.1,  0.4,  0.5, 0.7, 1.0, 1.5,
+                                    -2.0, 10.0, 0.0, 1.2, -2.8};
+    DataType const second_args[n] = {1.0,  0.2, 1.1,  1.8,  -0.1, -3.0,
+                                     -2.4, 1.0, 13.0, -3.2, -2.1};
+    device_check_all_math_ops<Abi>(first_args, second_args);
+  } else {
+    if constexpr (std::is_signed_v<DataType>) {
+      DataType const first_args[n]  = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2};
+      DataType const second_args[n] = {1, 2, 1, 1, 1, -3, -2, 1, 13, -3, -2};
+      device_check_all_math_ops<Abi>(first_args, second_args);
+    } else {
+      DataType const first_args[n]  = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2};
+      DataType const second_args[n] = {1, 2, 1, 1, 1, 3, 2, 1, 13, 3, 2};
+      device_check_all_math_ops<Abi>(first_args, second_args);
+    }
+  }
+}
+
+template <typename Abi, typename... DataTypes>
+KOKKOS_INLINE_FUNCTION void device_check_math_ops_all_types(
+    Kokkos::Experimental::Impl::data_types<DataTypes...>) {
+  (device_check_math_ops<Abi, DataTypes>(), ...);
+}
+
+template <typename... Abis>
+KOKKOS_INLINE_FUNCTION void device_check_math_ops_all_abis(
+    Kokkos::Experimental::Impl::abi_set<Abis...>) {
+  using DataTypes = Kokkos::Experimental::Impl::data_type_set;
+  (device_check_math_ops_all_types<Abis>(DataTypes()), ...);
+}
+
+class simd_device_math_ops_functor {
+ public:
+  KOKKOS_INLINE_FUNCTION void operator()(int) const {
+    device_check_math_ops_all_abis(
+        Kokkos::Experimental::Impl::device_abi_set());
+  }
+};
+
+TEST(simd, host_math_ops) {
+  host_check_math_ops_all_abis(Kokkos::Experimental::Impl::host_abi_set());
+}
+
+TEST(simd, device_math_ops) {
+  Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::IndexType<int>>(0, 1),
+                       simd_device_math_ops_functor());
+}
+
+#endif
diff --git a/packages/kokkos/simd/unit_tests/include/TestSIMD_ShiftOps.hpp b/packages/kokkos/simd/unit_tests/include/TestSIMD_ShiftOps.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f6fdcb920ed22ed89397cc35eaea03c1cb05b7f1
--- /dev/null
+++ b/packages/kokkos/simd/unit_tests/include/TestSIMD_ShiftOps.hpp
@@ -0,0 +1,280 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_TEST_SIMD_SHIFT_OPS_HPP
+#define KOKKOS_TEST_SIMD_SHIFT_OPS_HPP
+
+#include <Kokkos_SIMD.hpp>
+#include <SIMDTesting_Utilities.hpp>
+
+template <typename Abi, typename Loader, typename ShiftOp, typename DataType>
+inline void host_check_shift_on_one_loader(ShiftOp shift_op,
+                                           DataType test_vals[],
+                                           DataType shift_by[], std::size_t n) {
+  using simd_type             = Kokkos::Experimental::simd<DataType, Abi>;
+  constexpr std::size_t width = simd_type::size();
+  Loader loader;
+
+  for (std::size_t i = 0; i < n; ++i) {
+    simd_type simd_vals;
+    bool const loaded_arg = loader.host_load(test_vals, width, simd_vals);
+    if (!loaded_arg) {
+      continue;
+    }
+
+    simd_type expected_result;
+
+    for (std::size_t lane = 0; lane < width; ++lane) {
+      DataType value = simd_vals[lane];
+      expected_result[lane] =
+          shift_op.on_host(value, static_cast<int>(shift_by[i]));
+      EXPECT_EQ(value, value);
+    }
+
+    simd_type const computed_result =
+        shift_op.on_host(simd_vals, static_cast<int>(shift_by[i]));
+    host_check_equality(expected_result, computed_result, width);
+  }
+}
+
+template <typename Abi, typename Loader, typename ShiftOp, typename DataType>
+inline void host_check_shift_by_lanes_on_one_loader(
+    ShiftOp shift_op, DataType test_vals[],
+    Kokkos::Experimental::simd<DataType, Abi>& shift_by) {
+  using simd_type             = Kokkos::Experimental::simd<DataType, Abi>;
+  constexpr std::size_t width = simd_type::size();
+  Loader loader;
+
+  simd_type simd_vals;
+  bool const loaded_arg = loader.host_load(test_vals, width, simd_vals);
+  ASSERT_TRUE(loaded_arg);
+
+  simd_type expected_result;
+
+  for (std::size_t lane = 0; lane < width; ++lane) {
+    DataType value = simd_vals[lane];
+    expected_result[lane] =
+        shift_op.on_host(value, static_cast<int>(shift_by[lane]));
+    EXPECT_EQ(value, value);
+  }
+  simd_type const computed_result = shift_op.on_host(simd_vals, shift_by);
+  host_check_equality(expected_result, computed_result, width);
+}
+
+template <typename Abi, typename ShiftOp, typename DataType>
+inline void host_check_shift_op_all_loaders(ShiftOp shift_op,
+                                            DataType test_vals[],
+                                            DataType shift_by[],
+                                            std::size_t n) {
+  host_check_shift_on_one_loader<Abi, load_element_aligned>(shift_op, test_vals,
+                                                            shift_by, n);
+  host_check_shift_on_one_loader<Abi, load_masked>(shift_op, test_vals,
+                                                   shift_by, n);
+  host_check_shift_on_one_loader<Abi, load_as_scalars>(shift_op, test_vals,
+                                                       shift_by, n);
+
+  Kokkos::Experimental::simd<DataType, Abi> shift_by_lanes;
+  shift_by_lanes.copy_from(shift_by,
+                           Kokkos::Experimental::element_aligned_tag());
+
+  host_check_shift_by_lanes_on_one_loader<Abi, load_element_aligned>(
+      shift_op, test_vals, shift_by_lanes);
+  host_check_shift_by_lanes_on_one_loader<Abi, load_masked>(shift_op, test_vals,
+                                                            shift_by_lanes);
+  host_check_shift_by_lanes_on_one_loader<Abi, load_as_scalars>(
+      shift_op, test_vals, shift_by_lanes);
+}
+
+template <typename Abi, typename DataType>
+inline void host_check_shift_ops() {
+  if constexpr (std::is_integral_v<DataType>) {
+    using simd_type                 = Kokkos::Experimental::simd<DataType, Abi>;
+    constexpr std::size_t width     = simd_type::size();
+    constexpr std::size_t num_cases = 8;
+
+    DataType max = std::numeric_limits<DataType>::max();
+
+    DataType shift_by[num_cases] = {
+        0, 1, 3, width / 2, width / 2 + 1, width - 1, width, width + 1};
+    DataType test_vals[width];
+    for (std::size_t i = 0; i < width; ++i) {
+      DataType inc = max / width;
+      test_vals[i] = i * inc + 1;
+    }
+
+    host_check_shift_op_all_loaders<Abi>(shift_right(), test_vals, shift_by,
+                                         num_cases);
+    host_check_shift_op_all_loaders<Abi>(shift_left(), test_vals, shift_by,
+                                         num_cases);
+
+    if constexpr (std::is_signed_v<DataType>) {
+      for (std::size_t i = 0; i < width; ++i) test_vals[i] *= -1;
+      host_check_shift_op_all_loaders<Abi>(shift_right(), test_vals, shift_by,
+                                           num_cases);
+      host_check_shift_op_all_loaders<Abi>(shift_left(), test_vals, shift_by,
+                                           num_cases);
+    }
+  }
+}
+
+template <typename Abi, typename... DataTypes>
+inline void host_check_shift_ops_all_types(
+    Kokkos::Experimental::Impl::data_types<DataTypes...>) {
+  (host_check_shift_ops<Abi, DataTypes>(), ...);
+}
+
+template <typename... Abis>
+inline void host_check_shift_ops_all_abis(
+    Kokkos::Experimental::Impl::abi_set<Abis...>) {
+  using DataTypes = Kokkos::Experimental::Impl::data_type_set;
+  (host_check_shift_ops_all_types<Abis>(DataTypes()), ...);
+}
+
+template <typename Abi, typename Loader, typename ShiftOp, typename DataType>
+KOKKOS_INLINE_FUNCTION void device_check_shift_on_one_loader(
+    ShiftOp shift_op, DataType test_vals[], DataType shift_by[],
+    std::size_t n) {
+  using simd_type             = Kokkos::Experimental::simd<DataType, Abi>;
+  constexpr std::size_t width = simd_type::size();
+  Loader loader;
+
+  for (std::size_t i = 0; i < n; ++i) {
+    simd_type simd_vals;
+    bool const loaded_arg = loader.device_load(test_vals, width, simd_vals);
+    if (!loaded_arg) {
+      continue;
+    }
+
+    simd_type expected_result;
+
+    for (std::size_t lane = 0; lane < width; ++lane) {
+      expected_result[lane] = shift_op.on_device(DataType(simd_vals[lane]),
+                                                 static_cast<int>(shift_by[i]));
+    }
+
+    simd_type const computed_result =
+        shift_op.on_device(simd_vals, static_cast<int>(shift_by[i]));
+    device_check_equality(expected_result, computed_result, width);
+  }
+}
+
+template <typename Abi, typename Loader, typename ShiftOp, typename DataType>
+KOKKOS_INLINE_FUNCTION void device_check_shift_by_lanes_on_one_loader(
+    ShiftOp shift_op, DataType test_vals[],
+    Kokkos::Experimental::simd<DataType, Abi>& shift_by) {
+  using simd_type             = Kokkos::Experimental::simd<DataType, Abi>;
+  constexpr std::size_t width = simd_type::size();
+  Loader loader;
+  simd_type simd_vals;
+  loader.device_load(test_vals, width, simd_vals);
+
+  simd_type expected_result;
+
+  for (std::size_t lane = 0; lane < width; ++lane) {
+    expected_result[lane] = shift_op.on_device(
+        DataType(simd_vals[lane]), static_cast<int>(shift_by[lane]));
+  }
+  simd_type const computed_result = shift_op.on_device(simd_vals, shift_by);
+  device_check_equality(expected_result, computed_result, width);
+}
+
+template <typename Abi, typename ShiftOp, typename DataType>
+KOKKOS_INLINE_FUNCTION void device_check_shift_op_all_loaders(
+    ShiftOp shift_op, DataType test_vals[], DataType shift_by[],
+    std::size_t n) {
+  device_check_shift_on_one_loader<Abi, load_element_aligned>(
+      shift_op, test_vals, shift_by, n);
+  device_check_shift_on_one_loader<Abi, load_masked>(shift_op, test_vals,
+                                                     shift_by, n);
+  device_check_shift_on_one_loader<Abi, load_as_scalars>(shift_op, test_vals,
+                                                         shift_by, n);
+
+  Kokkos::Experimental::simd<DataType, Abi> shift_by_lanes;
+  shift_by_lanes.copy_from(shift_by,
+                           Kokkos::Experimental::element_aligned_tag());
+
+  device_check_shift_by_lanes_on_one_loader<Abi, load_element_aligned>(
+      shift_op, test_vals, shift_by_lanes);
+  device_check_shift_by_lanes_on_one_loader<Abi, load_masked>(
+      shift_op, test_vals, shift_by_lanes);
+  device_check_shift_by_lanes_on_one_loader<Abi, load_as_scalars>(
+      shift_op, test_vals, shift_by_lanes);
+}
+
+template <typename Abi, typename DataType>
+KOKKOS_INLINE_FUNCTION void device_check_shift_ops() {
+  if constexpr (std::is_integral_v<DataType>) {
+    using simd_type                 = Kokkos::Experimental::simd<DataType, Abi>;
+    constexpr std::size_t width     = simd_type::size();
+    constexpr std::size_t num_cases = 8;
+
+    DataType max = Kokkos::reduction_identity<DataType>::max();
+
+    DataType shift_by[num_cases] = {
+        0, 1, 3, width / 2, width / 2 + 1, width - 1, width, width + 1};
+    DataType test_vals[width];
+
+    for (std::size_t i = 0; i < width; ++i) {
+      DataType inc = max / width;
+      test_vals[i] = i * inc + 1;
+    }
+
+    device_check_shift_op_all_loaders<Abi>(shift_right(), test_vals, shift_by,
+                                           num_cases);
+    device_check_shift_op_all_loaders<Abi>(shift_left(), test_vals, shift_by,
+                                           num_cases);
+
+    if constexpr (std::is_signed_v<DataType>) {
+      for (std::size_t i = 0; i < width; ++i) test_vals[i] *= -1;
+      device_check_shift_op_all_loaders<Abi>(shift_right(), test_vals, shift_by,
+                                             num_cases);
+      device_check_shift_op_all_loaders<Abi>(shift_left(), test_vals, shift_by,
+                                             num_cases);
+    }
+  }
+}
+
+template <typename Abi, typename... DataTypes>
+KOKKOS_INLINE_FUNCTION void device_check_shift_ops_all_types(
+    Kokkos::Experimental::Impl::data_types<DataTypes...>) {
+  (device_check_shift_ops<Abi, DataTypes>(), ...);
+}
+
+template <typename... Abis>
+KOKKOS_INLINE_FUNCTION void device_check_shift_ops_all_abis(
+    Kokkos::Experimental::Impl::abi_set<Abis...>) {
+  using DataTypes = Kokkos::Experimental::Impl::data_type_set;
+  (device_check_shift_ops_all_types<Abis>(DataTypes()), ...);
+}
+
+class simd_device_shift_ops_functor {
+ public:
+  KOKKOS_INLINE_FUNCTION void operator()(int) const {
+    device_check_shift_ops_all_abis(
+        Kokkos::Experimental::Impl::device_abi_set());
+  }
+};
+
+TEST(simd, host_shift_ops) {
+  host_check_shift_ops_all_abis(Kokkos::Experimental::Impl::host_abi_set());
+}
+
+TEST(simd, device_shift_ops) {
+  Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::IndexType<int>>(0, 1),
+                       simd_device_shift_ops_functor());
+}
+
+#endif
diff --git a/packages/kokkos/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp b/packages/kokkos/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..129f2b0d5c90c89e8b8c485f23acf8dc7385ee82
--- /dev/null
+++ b/packages/kokkos/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp
@@ -0,0 +1,195 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_TEST_SIMD_WHERE_EXPRESSIONS_HPP
+#define KOKKOS_TEST_SIMD_WHERE_EXPRESSIONS_HPP
+
+#include <Kokkos_SIMD.hpp>
+#include <SIMDTesting_Utilities.hpp>
+
+template <typename Abi, typename DataType>
+inline void host_check_where_expr_scatter_to() {
+  using simd_type  = Kokkos::Experimental::simd<DataType, Abi>;
+  using index_type = Kokkos::Experimental::simd<std::int32_t, Abi>;
+  using mask_type  = typename simd_type::mask_type;
+
+  std::size_t nlanes = simd_type::size();
+  DataType init[]    = {11, 13, 17, 19, 23, 29, 31, 37};
+  simd_type src;
+  src.copy_from(init, Kokkos::Experimental::element_aligned_tag());
+
+  for (std::size_t idx = 0; idx < nlanes; ++idx) {
+    mask_type mask(true);
+    mask[idx] = false;
+
+    DataType dst[8] = {0};
+    index_type index;
+    simd_type expected_result;
+    for (std::size_t i = 0; i < nlanes; ++i) {
+      dst[i]             = (2 + (i * 2));
+      index[i]           = i;
+      expected_result[i] = (mask[i]) ? src[index[i]] : dst[i];
+    }
+    where(mask, src).scatter_to(dst, index);
+
+    simd_type dst_simd;
+    dst_simd.copy_from(dst, Kokkos::Experimental::element_aligned_tag());
+
+    host_check_equality(expected_result, dst_simd, nlanes);
+  }
+}
+
+template <typename Abi, typename DataType>
+inline void host_check_where_expr_gather_from() {
+  using simd_type  = Kokkos::Experimental::simd<DataType, Abi>;
+  using index_type = Kokkos::Experimental::simd<std::int32_t, Abi>;
+  using mask_type  = typename simd_type::mask_type;
+
+  std::size_t nlanes = simd_type::size();
+  DataType src[]     = {11, 13, 17, 19, 23, 29, 31, 37};
+
+  for (std::size_t idx = 0; idx < nlanes; ++idx) {
+    mask_type mask(true);
+    mask[idx] = false;
+
+    simd_type dst;
+    index_type index;
+    simd_type expected_result;
+    for (std::size_t i = 0; i < nlanes; ++i) {
+      dst[i]             = (2 + (i * 2));
+      index[i]           = i;
+      expected_result[i] = (mask[i]) ? src[index[i]] : dst[i];
+    }
+    where(mask, dst).gather_from(src, index);
+
+    host_check_equality(expected_result, dst, nlanes);
+  }
+}
+
+template <class Abi, typename DataType>
+inline void host_check_where_expr() {
+  host_check_where_expr_scatter_to<Abi, DataType>();
+  host_check_where_expr_gather_from<Abi, DataType>();
+}
+
+template <typename Abi, typename... DataTypes>
+inline void host_check_where_expr_all_types(
+    Kokkos::Experimental::Impl::data_types<DataTypes...>) {
+  (host_check_where_expr<Abi, DataTypes>(), ...);
+}
+
+template <typename... Abis>
+inline void host_check_where_expr_all_abis(
+    Kokkos::Experimental::Impl::abi_set<Abis...>) {
+  using DataTypes = Kokkos::Experimental::Impl::data_type_set;
+  (host_check_where_expr_all_types<Abis>(DataTypes()), ...);
+}
+
+template <typename Abi, typename DataType>
+KOKKOS_INLINE_FUNCTION void device_check_where_expr_scatter_to() {
+  using simd_type  = Kokkos::Experimental::simd<DataType, Abi>;
+  using index_type = Kokkos::Experimental::simd<std::int32_t, Abi>;
+  using mask_type  = typename simd_type::mask_type;
+
+  std::size_t nlanes = simd_type::size();
+  DataType init[]    = {11, 13, 17, 19, 23, 29, 31, 37};
+  simd_type src;
+  src.copy_from(init, Kokkos::Experimental::element_aligned_tag());
+
+  for (std::size_t idx = 0; idx < nlanes; ++idx) {
+    mask_type mask(true);
+    mask[idx] = false;
+
+    DataType dst[8] = {0};
+    index_type index;
+    simd_type expected_result;
+    for (std::size_t i = 0; i < nlanes; ++i) {
+      dst[i]             = (2 + (i * 2));
+      index[i]           = i;
+      expected_result[i] = (mask[i]) ? src[index[i]] : dst[i];
+    }
+    where(mask, src).scatter_to(dst, index);
+
+    simd_type dst_simd;
+    dst_simd.copy_from(dst, Kokkos::Experimental::element_aligned_tag());
+
+    device_check_equality(expected_result, dst_simd, nlanes);
+  }
+}
+
+template <typename Abi, typename DataType>
+KOKKOS_INLINE_FUNCTION void device_check_where_expr_gather_from() {
+  using simd_type  = Kokkos::Experimental::simd<DataType, Abi>;
+  using index_type = Kokkos::Experimental::simd<std::int32_t, Abi>;
+  using mask_type  = typename simd_type::mask_type;
+
+  std::size_t nlanes = simd_type::size();
+  DataType src[]     = {11, 13, 17, 19, 23, 29, 31, 37};
+
+  for (std::size_t idx = 0; idx < nlanes; ++idx) {
+    mask_type mask(true);
+    mask[idx] = false;
+
+    simd_type dst;
+    index_type index;
+    simd_type expected_result;
+    for (std::size_t i = 0; i < nlanes; ++i) {
+      dst[i]             = (2 + (i * 2));
+      index[i]           = i;
+      expected_result[i] = (mask[i]) ? src[index[i]] : dst[i];
+    }
+    where(mask, dst).gather_from(src, index);
+
+    device_check_equality(expected_result, dst, nlanes);
+  }
+}
+
+template <class Abi, typename DataType>
+KOKKOS_INLINE_FUNCTION void device_check_where_expr() {
+  device_check_where_expr_scatter_to<Abi, DataType>();
+  device_check_where_expr_gather_from<Abi, DataType>();
+}
+
+template <typename Abi, typename... DataTypes>
+KOKKOS_INLINE_FUNCTION void device_check_where_expr_all_types(
+    Kokkos::Experimental::Impl::data_types<DataTypes...>) {
+  (device_check_where_expr<Abi, DataTypes>(), ...);
+}
+
+template <typename... Abis>
+KOKKOS_INLINE_FUNCTION void device_check_where_expr_all_abis(
+    Kokkos::Experimental::Impl::abi_set<Abis...>) {
+  using DataTypes = Kokkos::Experimental::Impl::data_type_set;
+  (device_check_where_expr_all_types<Abis>(DataTypes()), ...);
+}
+
+class simd_device_where_expr_functor {
+ public:
+  KOKKOS_INLINE_FUNCTION void operator()(int) const {
+    device_check_where_expr_all_abis(
+        Kokkos::Experimental::Impl::device_abi_set());
+  }
+};
+
+TEST(simd, host_where_expressions) {
+  host_check_where_expr_all_abis(Kokkos::Experimental::Impl::host_abi_set());
+}
+
+TEST(simd, device_where_expressions) {
+  Kokkos::parallel_for(1, simd_device_where_expr_functor());
+}
+
+#endif
diff --git a/packages/kokkos/tpls/desul/Config.hpp.cmake.in b/packages/kokkos/tpls/desul/Config.hpp.cmake.in
index 40ab5c1c6cb1f4a042b945202434594a8f810181..a7bc738191e781e4f73f5cb86c727d2112b228f0 100644
--- a/packages/kokkos/tpls/desul/Config.hpp.cmake.in
+++ b/packages/kokkos/tpls/desul/Config.hpp.cmake.in
@@ -10,7 +10,9 @@ SPDX-License-Identifier: (BSD-3-Clause)
 #define DESUL_ATOMICS_CONFIG_HPP_
 
 #cmakedefine DESUL_ATOMICS_ENABLE_CUDA
+#cmakedefine DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION
 #cmakedefine DESUL_ATOMICS_ENABLE_HIP
+#cmakedefine DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION
 #cmakedefine DESUL_ATOMICS_ENABLE_SYCL
 #cmakedefine DESUL_ATOMICS_ENABLE_OPENMP
 
diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp
index c8449d495dccf072c00ba65c87d0f09711748923..082fc132de53caecd5735a3b9d617edee5feb463 100644
--- a/packages/kokkos/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp
@@ -22,77 +22,54 @@ SPDX-License-Identifier: (BSD-3-Clause)
 namespace desul {
 namespace Impl {
 
-#ifdef __clang__
-namespace sycl_sync_and_atomics = ::sycl::ext::oneapi;
-#else
-namespace sycl_sync_and_atomics = ::sycl;
-#endif
-
-template <bool extended_namespace>
-using sycl_memory_order = std::conditional_t<extended_namespace,
-                                             sycl_sync_and_atomics::memory_order,
-                                             sycl::memory_order>;
-template <bool extended_namespace>
-using sycl_memory_scope = std::conditional_t<extended_namespace,
-                                             sycl_sync_and_atomics::memory_scope,
-                                             sycl::memory_scope>;
-
 //<editor-fold desc="SYCL memory order">
 // The default memory order for sycl::atomic_ref
 // can be seq_cst, acq_rel, or relaxed according to the
 // "SYCL 2020 Specification (revision 6)", see
 // https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:atomic-references.
 // Thus, we map MemoryOrderAcquire and MemoryOrderRelease to acq_rel.
-template <class MemoryOrder, bool extended_namespace = true>
+template <class MemoryOrder>
 struct SYCLMemoryOrder;
 
-template <bool extended_namespace>
-struct SYCLMemoryOrder<MemoryOrderSeqCst, extended_namespace> {
-  static constexpr sycl_memory_order<extended_namespace> value =
-      sycl_memory_order<extended_namespace>::seq_cst;
+template <>
+struct SYCLMemoryOrder<MemoryOrderSeqCst> {
+  static constexpr sycl::memory_order value = sycl::memory_order::seq_cst;
 };
-template <bool extended_namespace>
-struct SYCLMemoryOrder<MemoryOrderAcquire, extended_namespace> {
-  static constexpr sycl_memory_order<extended_namespace> value =
-      sycl_memory_order<extended_namespace>::acq_rel;
+template <>
+struct SYCLMemoryOrder<MemoryOrderAcquire> {
+  static constexpr sycl::memory_order value = sycl::memory_order::acq_rel;
 };
-template <bool extended_namespace>
-struct SYCLMemoryOrder<MemoryOrderRelease, extended_namespace> {
-  static constexpr sycl_memory_order<extended_namespace> value =
-      sycl_memory_order<extended_namespace>::acq_rel;
+template <>
+struct SYCLMemoryOrder<MemoryOrderRelease> {
+  static constexpr sycl::memory_order value = sycl::memory_order::acq_rel;
 };
-template <bool extended_namespace>
-struct SYCLMemoryOrder<MemoryOrderAcqRel, extended_namespace> {
-  static constexpr sycl_memory_order<extended_namespace> value =
-      sycl_memory_order<extended_namespace>::acq_rel;
+template <>
+struct SYCLMemoryOrder<MemoryOrderAcqRel> {
+  static constexpr sycl::memory_order value = sycl::memory_order::acq_rel;
 };
-template <bool extended_namespace>
-struct SYCLMemoryOrder<MemoryOrderRelaxed, extended_namespace> {
-  static constexpr sycl_memory_order<extended_namespace> value =
-      sycl_memory_order<extended_namespace>::relaxed;
+template <>
+struct SYCLMemoryOrder<MemoryOrderRelaxed> {
+  static constexpr sycl::memory_order value = sycl::memory_order::relaxed;
 };
 //</editor-fold>
 
 //<editor-fold desc="SYCL memory scope">
-template <class MemoryScope, bool extended_namespace = true>
+template <class MemoryScope>
 struct SYCLMemoryScope;
 
-template <bool extended_namespace>
-struct SYCLMemoryScope<MemoryScopeCore, extended_namespace> {
-  static constexpr sycl_memory_scope<extended_namespace> value =
-      sycl_memory_scope<extended_namespace>::work_group;
+template <>
+struct SYCLMemoryScope<MemoryScopeCore> {
+  static constexpr sycl::memory_scope value = sycl::memory_scope::work_group;
 };
 
-template <bool extended_namespace>
-struct SYCLMemoryScope<MemoryScopeDevice, extended_namespace> {
-  static constexpr sycl_memory_scope<extended_namespace> value =
-      sycl_memory_scope<extended_namespace>::device;
+template <>
+struct SYCLMemoryScope<MemoryScopeDevice> {
+  static constexpr sycl::memory_scope value = sycl::memory_scope::device;
 };
 
-template <bool extended_namespace>
-struct SYCLMemoryScope<MemoryScopeSystem, extended_namespace> {
-  static constexpr sycl_memory_scope<extended_namespace> value =
-      sycl_memory_scope<extended_namespace>::system;
+template <>
+struct SYCLMemoryScope<MemoryScopeSystem> {
+  static constexpr sycl::memory_scope value = sycl::memory_scope::system;
 };
 //</editor-fold>
 
@@ -111,6 +88,16 @@ using sycl_atomic_ref = sycl::atomic_ref<T,
                                          sycl::access::address_space::generic_space>;
 #endif
 
+// FIXME_SYCL Use SYCL_EXT_ONEAPI_DEVICE_GLOBAL when available instead
+#ifdef DESUL_SYCL_DEVICE_GLOBAL_SUPPORTED
+// FIXME_SYCL The compiler forces us to use device_image_scope. Drop this when possible.
+template <class T>
+using sycl_device_global = sycl::ext::oneapi::experimental::device_global<
+    T,
+    decltype(sycl::ext::oneapi::experimental::properties(
+        sycl::ext::oneapi::experimental::device_image_scope))>;
+#endif
+
 }  // namespace Impl
 }  // namespace desul
 
diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_SYCL.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_SYCL.hpp
index 34e36bc4e4bfc5720061bbc024cd7b480cb4a6bf..43b4fb56f9cafbd0722684ad04a9909dbf1fd618 100644
--- a/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_SYCL.hpp
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_SYCL.hpp
@@ -11,6 +11,7 @@ SPDX-License-Identifier: (BSD-3-Clause)
 
 #include <desul/atomics/Adapt_SYCL.hpp>
 #include <desul/atomics/Common.hpp>
+#include <desul/atomics/Lock_Array_SYCL.hpp>
 #include <desul/atomics/Thread_Fence_SYCL.hpp>
 
 // FIXME_SYCL SYCL2020 dictates that <sycl/sycl.hpp> is the header to include
@@ -78,16 +79,62 @@ std::enable_if_t<sizeof(T) == 8, T> device_atomic_exchange(T* const dest,
 template <class T, class MemoryOrder, class MemoryScope>
 std::enable_if_t<(sizeof(T) != 8) && (sizeof(T) != 4), T>
 device_atomic_compare_exchange(
-    T* const /*dest*/, T compare, T /*value*/, MemoryOrder, MemoryScope) {
-  assert(false);  // FIXME_SYCL not implemented
-  return compare;
+    T* const dest, T compare, T value, MemoryOrder, MemoryScope scope) {
+  // This is a way to avoid deadlock in a subgroup
+  T return_val;
+  int done = 0;
+  auto sg = sycl::ext::oneapi::experimental::this_sub_group();
+  using sycl::ext::oneapi::group_ballot;
+  using sycl::ext::oneapi::sub_group_mask;
+  sub_group_mask active = group_ballot(sg, 1);
+  sub_group_mask done_active = group_ballot(sg, 0);
+  while (active != done_active) {
+    if (!done) {
+      if (lock_address_sycl((void*)dest, scope)) {
+        if (std::is_same<MemoryOrder, MemoryOrderSeqCst>::value)
+          atomic_thread_fence(MemoryOrderRelease(), scope);
+        atomic_thread_fence(MemoryOrderAcquire(), scope);
+        return_val = *dest;
+        if (return_val == compare) {
+          *dest = value;
+          device_atomic_thread_fence(MemoryOrderRelease(), scope);
+        }
+        unlock_address_sycl((void*)dest, scope);
+        done = 1;
+      }
+    }
+    done_active = group_ballot(sg, done);
+  }
+  return return_val;
 }
 
 template <class T, class MemoryOrder, class MemoryScope>
 std::enable_if_t<(sizeof(T) != 8) && (sizeof(T) != 4), T> device_atomic_exchange(
-    T* const /*dest*/, T value, MemoryOrder, MemoryScope) {
-  assert(false);  // FIXME_SYCL not implemented
-  return value;
+    T* const dest, T value, MemoryOrder, MemoryScope scope) {
+  // This is a way to avoid deadlock in a subgroup
+  T return_val;
+  int done = 0;
+  auto sg = sycl::ext::oneapi::experimental::this_sub_group();
+  using sycl::ext::oneapi::group_ballot;
+  using sycl::ext::oneapi::sub_group_mask;
+  sub_group_mask active = group_ballot(sg, 1);
+  sub_group_mask done_active = group_ballot(sg, 0);
+  while (active != done_active) {
+    if (!done) {
+      if (lock_address_sycl((void*)dest, scope)) {
+        if (std::is_same<MemoryOrder, MemoryOrderSeqCst>::value)
+          atomic_thread_fence(MemoryOrderRelease(), scope);
+        device_atomic_thread_fence(MemoryOrderAcquire(), scope);
+        return_val = *dest;
+        *dest = value;
+        device_atomic_thread_fence(MemoryOrderRelease(), scope);
+        unlock_address_sycl((void*)dest, scope);
+        done = 1;
+      }
+    }
+    done_active = group_ballot(sg, done);
+  }
+  return return_val;
 }
 
 }  // namespace Impl
diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_CUDA.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_CUDA.hpp
index 5c662bfc58ba63702568efa19da928efa9823161..69ed8bcb9fd8a46fd609e6eb42de0dc4a8fbac44 100644
--- a/packages/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_CUDA.hpp
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_CUDA.hpp
@@ -63,7 +63,7 @@ inline __device__ unsigned long long device_atomic_fetch_inc(unsigned long long*
 
 inline __device__                int device_atomic_fetch_dec(               int* ptr,                         MemoryOrderRelaxed, MemoryScopeDevice) { return atomicSub(ptr,  1  ); }
 inline __device__       unsigned int device_atomic_fetch_dec(      unsigned int* ptr,                         MemoryOrderRelaxed, MemoryScopeDevice) { return atomicSub(ptr,  1u ); }
-inline __device__ unsigned long long device_atomic_fetch_dec(unsigned long long* ptr,                         MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, -1  ); }
+inline __device__ unsigned long long device_atomic_fetch_dec(unsigned long long* ptr,                         MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, -1ull);}
 
 inline __device__       unsigned int device_atomic_fetch_inc_mod(  unsigned int* ptr,       unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicInc(ptr,  val); }
 inline __device__       unsigned int device_atomic_fetch_dec_mod(  unsigned int* ptr,       unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicDec(ptr,  val); }
diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array.hpp
index a5af4c48c2eb0a652548c53c9bc4a49f791a952a..33e26b37b97d9fd8bfd719715ddc4d0a5122c2bb 100644
--- a/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array.hpp
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array.hpp
@@ -17,6 +17,9 @@ SPDX-License-Identifier: (BSD-3-Clause)
 #ifdef DESUL_HAVE_HIP_ATOMICS
 #include <desul/atomics/Lock_Array_HIP.hpp>
 #endif
+#ifdef DESUL_HAVE_SYCL_ATOMICS
+#include <desul/atomics/Lock_Array_SYCL.hpp>
+#endif
 
 namespace desul {
 namespace Impl {
@@ -67,7 +70,7 @@ inline void ensure_lock_arrays_on_device() {
 #endif
 
 #ifdef DESUL_HAVE_HIP_ATOMICS
-  DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE();
+  ensure_hip_lock_arrays_on_device();
 #endif
 }
 
diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array_CUDA.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array_CUDA.hpp
index e0e4e129acc9f18e27393c82b2be6ab1154cc059..ebfb8172e5682fee6142c85c1fe9395eb4928ad5 100644
--- a/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array_CUDA.hpp
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array_CUDA.hpp
@@ -9,13 +9,11 @@ SPDX-License-Identifier: (BSD-3-Clause)
 #ifndef DESUL_ATOMICS_LOCK_ARRAY_CUDA_HPP_
 #define DESUL_ATOMICS_LOCK_ARRAY_CUDA_HPP_
 
+#include <cstdint>
+
 #include "desul/atomics/Common.hpp"
 #include "desul/atomics/Macros.hpp"
 
-#ifdef DESUL_HAVE_CUDA_ATOMICS
-
-#include <cstdint>
-
 namespace desul {
 namespace Impl {
 
@@ -42,14 +40,6 @@ void init_lock_arrays_cuda();
 template <typename /*AlwaysInt*/ = int>
 void finalize_lock_arrays_cuda();
 
-}  // namespace Impl
-}  // namespace desul
-
-#if defined(__CUDACC__)
-
-namespace desul {
-namespace Impl {
-
 /// \brief This global variable in CUDA space is what kernels use
 ///        to get access to the lock arrays.
 ///
@@ -69,17 +59,15 @@ namespace Impl {
 /// variable based on the Host global variable prior to running any kernels
 /// that will use it.
 /// That is the purpose of the ensure_cuda_lock_arrays_on_device function.
-__device__
-#ifdef __CUDACC_RDC__
-    __constant__ extern
+#ifdef DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION
+extern
 #endif
-    int32_t* CUDA_SPACE_ATOMIC_LOCKS_DEVICE;
+    __device__ __constant__ int32_t* CUDA_SPACE_ATOMIC_LOCKS_DEVICE;
 
-__device__
-#ifdef __CUDACC_RDC__
-    __constant__ extern
+#ifdef DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION
+extern
 #endif
-    int32_t* CUDA_SPACE_ATOMIC_LOCKS_NODE;
+    __device__ __constant__ int32_t* CUDA_SPACE_ATOMIC_LOCKS_NODE;
 
 #define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
 
@@ -120,45 +108,31 @@ __device__ inline void unlock_address_cuda(void* ptr, desul::MemoryScopeNode) {
   atomicExch(&desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE[offset], 0);
 }
 
-}  // namespace Impl
-}  // namespace desul
-
-// Make lock_array_copied an explicit translation unit scope thingy
-namespace desul {
-namespace Impl {
-namespace {
-static int lock_array_copied = 0;
-inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
-}  // namespace
-
-#ifdef __CUDACC_RDC__
+#ifdef DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION
 inline
 #else
 inline static
 #endif
     void
     copy_cuda_lock_arrays_to_device() {
-  if (lock_array_copied == 0) {
+  static bool once = []() {
     cudaMemcpyToSymbol(CUDA_SPACE_ATOMIC_LOCKS_DEVICE,
                        &CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h,
                        sizeof(int32_t*));
     cudaMemcpyToSymbol(CUDA_SPACE_ATOMIC_LOCKS_NODE,
                        &CUDA_SPACE_ATOMIC_LOCKS_NODE_h,
                        sizeof(int32_t*));
-  }
-  lock_array_copied = 1;
+    return true;
+  }();
+  (void)once;
 }
 
 }  // namespace Impl
 }  // namespace desul
 
-#endif /* defined( __CUDACC__ ) */
-
-#endif /* defined( DESUL_HAVE_CUDA_ATOMICS ) */
-
 namespace desul {
 
-#if defined(__CUDACC_RDC__) || (!defined(__CUDACC__))
+#ifdef DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION
 inline void ensure_cuda_lock_arrays_on_device() {}
 #else
 static inline void ensure_cuda_lock_arrays_on_device() {
diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp
index 1ab9544eb4ed71299d9dd4fb72cbed6f514370e8..beca3e9e4031e951c54415e270d1b914d18ba1ad 100644
--- a/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp
@@ -35,7 +35,7 @@ extern int32_t* HIP_SPACE_ATOMIC_LOCKS_NODE_h;
 template <typename /*AlwaysInt*/ = int>
 void init_lock_arrays_hip();
 
-/// \brief After this call, the g_host_cuda_lock_arrays variable has
+/// \brief After this call, the g_host_hip_lock_arrays variable has
 ///        all null pointers, and all array memory has been freed.
 ///
 /// This call is idempotent.
@@ -43,12 +43,6 @@ void init_lock_arrays_hip();
 ///   snapshotted version while also linking against pure Desul
 template <typename /*AlwaysInt*/ = int>
 void finalize_lock_arrays_hip();
-}  // namespace Impl
-}  // namespace desul
-
-#ifdef __HIPCC__
-namespace desul {
-namespace Impl {
 
 /**
  * \brief This global variable in HIP space is what kernels use to get access
@@ -64,22 +58,20 @@ namespace Impl {
  * be created in every translation unit that sees this header file (we make this
  * clear by marking it static, meaning no other translation unit can link to
  * it). Since the Kokkos_HIP_Locks.cpp translation unit cannot initialize the
- * instances in other translation units, we must update this CUDA global
+ * instances in other translation units, we must update this HIP global
  * variable based on the Host global variable prior to running any kernels that
  * will use it.  That is the purpose of the
- * KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE macro.
+ * ensure_hip_lock_arrays_on_device function.
  */
-__device__
-#ifdef DESUL_HIP_RDC
-    __constant__ extern
+#ifdef DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION
+extern
 #endif
-    int32_t* HIP_SPACE_ATOMIC_LOCKS_DEVICE;
+    __device__ __constant__ int32_t* HIP_SPACE_ATOMIC_LOCKS_DEVICE;
 
-__device__
-#ifdef DESUL_HIP_RDC
-    __constant__ extern
+#ifdef DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION
+extern
 #endif
-    int32_t* HIP_SPACE_ATOMIC_LOCKS_NODE;
+    __device__ __constant__ int32_t* HIP_SPACE_ATOMIC_LOCKS_NODE;
 
 #define HIP_SPACE_ATOMIC_MASK 0x1FFFF
 
@@ -122,42 +114,35 @@ __device__ inline void unlock_address_hip(void* ptr, desul::MemoryScopeNode) {
   offset = offset & HIP_SPACE_ATOMIC_MASK;
   atomicExch(&desul::Impl::HIP_SPACE_ATOMIC_LOCKS_NODE[offset], 0);
 }
-#endif
-}  // namespace Impl
-}  // namespace desul
 
-// Make lock_array_copied an explicit translation unit scope thing
-namespace desul {
-namespace Impl {
-namespace {
-static int lock_array_copied = 0;
-inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
-}  // namespace
+#ifdef DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION
+inline
+#else
+inline static
+#endif
+    void
+    copy_hip_lock_arrays_to_device() {
+  static bool once = []() {
+    (void)hipMemcpyToSymbol(HIP_SYMBOL(HIP_SPACE_ATOMIC_LOCKS_DEVICE),
+                            &HIP_SPACE_ATOMIC_LOCKS_DEVICE_h,
+                            sizeof(int32_t*));
+    (void)hipMemcpyToSymbol(HIP_SYMBOL(HIP_SPACE_ATOMIC_LOCKS_NODE),
+                            &HIP_SPACE_ATOMIC_LOCKS_NODE_h,
+                            sizeof(int32_t*));
+    return true;
+  }();
+  (void)once;
+}
 }  // namespace Impl
-}  // namespace desul
 
-/* It is critical that this code be a macro, so that it will
-   capture the right address for g_device_hip_lock_arrays!
-   putting this in an inline function will NOT do the right thing! */
-#define DESUL_IMPL_COPY_HIP_LOCK_ARRAYS_TO_DEVICE()                                   \
-  {                                                                                   \
-    if (::desul::Impl::lock_array_copied == 0) {                                      \
-      (void)hipMemcpyToSymbol(                                                        \
-          HIP_SYMBOL(::desul::Impl::HIP_SPACE_ATOMIC_LOCKS_DEVICE),                   \
-          &::desul::Impl::HIP_SPACE_ATOMIC_LOCKS_DEVICE_h,                            \
-          sizeof(int32_t*));                                                          \
-      (void)hipMemcpyToSymbol(HIP_SYMBOL(::desul::Impl::HIP_SPACE_ATOMIC_LOCKS_NODE), \
-                              &::desul::Impl::HIP_SPACE_ATOMIC_LOCKS_NODE_h,          \
-                              sizeof(int32_t*));                                      \
-    }                                                                                 \
-    ::desul::Impl::lock_array_copied = 1;                                             \
-  }
-
-#if defined(DESUL_HIP_RDC) || (!defined(__HIPCC__))
-#define DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE()
+#ifdef DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION
+inline void ensure_hip_lock_arrays_on_device() {}
 #else
-#define DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE() \
-  DESUL_IMPL_COPY_HIP_LOCK_ARRAYS_TO_DEVICE()
+static inline void ensure_hip_lock_arrays_on_device() {
+  Impl::copy_hip_lock_arrays_to_device();
+}
 #endif
 
+}  // namespace desul
+
 #endif
diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array_SYCL.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array_SYCL.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8216f9a797c94b45be905fb79a53a68342c73f68
--- /dev/null
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array_SYCL.hpp
@@ -0,0 +1,161 @@
+/*
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_LOCK_ARRAY_SYCL_HPP_
+#define DESUL_ATOMICS_LOCK_ARRAY_SYCL_HPP_
+
+#include <cstdint>
+
+#include "desul/atomics/Adapt_SYCL.hpp"
+#include "desul/atomics/Common.hpp"
+#include "desul/atomics/Macros.hpp"
+
+// FIXME_SYCL
+#if __has_include(<sycl/sycl.hpp>)
+#include <sycl/sycl.hpp>
+#else
+#include <CL/sycl.hpp>
+#endif
+
+namespace desul {
+namespace Impl {
+
+// FIXME_SYCL Use SYCL_EXT_ONEAPI_DEVICE_GLOBAL when available instead
+#ifdef DESUL_SYCL_DEVICE_GLOBAL_SUPPORTED
+
+/**
+ * \brief This global variable in Host space is the central definition of these
+ * arrays.
+ */
+extern int32_t* SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h;
+extern int32_t* SYCL_SPACE_ATOMIC_LOCKS_NODE_h;
+
+/// \brief After this call, the lock arrays used in [un]lock_address_sycl
+///        are initialized and ready to be used.
+///
+/// This call is idempotent.
+/// The function is templated to make it a weak symbol to deal with Kokkos/RAJA
+///   snapshotted version while also linking against pure Desul
+template <typename /*AlwaysInt*/ = int>
+void init_lock_arrays_sycl(sycl::queue q);
+
+/// \brief After this call, the lock arrays used in [un]lock_address_sycl
+///        are freed and can't be used anymore.
+///
+/// This call is idempotent.
+/// The function is templated to make it a weak symbol to deal with Kokkos/RAJA
+///   snapshotted version while also linking against pure Desul
+template <typename /*AlwaysInt*/ = int>
+void finalize_lock_arrays_sycl(sycl::queue q);
+
+/**
+ * \brief This global variable in SYCL space is what kernels use to get access
+ * to the lock arrays.
+ *
+ * There is only one single instance of this global variable for the entire
+ * executable, whose definition will be in Kokkos_SYCL_Locks.cpp (and whose
+ * declaration here must be extern). This one instance will be initialized
+ * by initialize_host_sycl_lock_arrays and need not be modified afterwards.
+ */
+SYCL_EXTERNAL extern sycl_device_global<int32_t*> SYCL_SPACE_ATOMIC_LOCKS_DEVICE;
+
+SYCL_EXTERNAL extern sycl_device_global<int32_t*> SYCL_SPACE_ATOMIC_LOCKS_NODE;
+
+#define SYCL_SPACE_ATOMIC_MASK 0x1FFFF
+
+/// \brief Acquire a lock for the address
+///
+/// This function tries to acquire the lock for the hash value derived
+/// from the provided ptr. If the lock is successfully acquired the
+/// function returns true. Otherwise it returns false.
+inline bool lock_address_sycl(void* ptr, MemoryScopeDevice) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & SYCL_SPACE_ATOMIC_MASK;
+  sycl::atomic_ref<int32_t,
+                   sycl::memory_order::relaxed,
+                   sycl::memory_scope::device,
+                   sycl::access::address_space::global_space>
+      lock_device_ref(SYCL_SPACE_ATOMIC_LOCKS_DEVICE[offset]);
+  return (0 == lock_device_ref.exchange(1));
+}
+
+inline bool lock_address_sycl(void* ptr, MemoryScopeNode) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & SYCL_SPACE_ATOMIC_MASK;
+  sycl::atomic_ref<int32_t,
+                   sycl::memory_order::relaxed,
+                   sycl::memory_scope::system,
+                   sycl::access::address_space::global_space>
+      lock_node_ref(SYCL_SPACE_ATOMIC_LOCKS_NODE[offset]);
+  return (0 == lock_node_ref.exchange(1));
+}
+
+/**
+ * \brief Release lock for the address
+ *
+ * This function releases the lock for the hash value derived from the provided
+ * ptr. This function should only be called after previously successfully
+ * acquiring a lock with lock_address.
+ */
+inline void unlock_address_sycl(void* ptr, MemoryScopeDevice) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & SYCL_SPACE_ATOMIC_MASK;
+  sycl::atomic_ref<int32_t,
+                   sycl::memory_order::relaxed,
+                   sycl::memory_scope::device,
+                   sycl::access::address_space::global_space>
+      lock_device_ref(SYCL_SPACE_ATOMIC_LOCKS_DEVICE[offset]);
+  lock_device_ref.exchange(0);
+}
+
+inline void unlock_address_sycl(void* ptr, MemoryScopeNode) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & SYCL_SPACE_ATOMIC_MASK;
+  sycl::atomic_ref<int32_t,
+                   sycl::memory_order::relaxed,
+                   sycl::memory_scope::system,
+                   sycl::access::address_space::global_space>
+      lock_node_ref(SYCL_SPACE_ATOMIC_LOCKS_NODE[offset]);
+  lock_node_ref.exchange(0);
+}
+
+#else  // not supported
+
+template <typename /*AlwaysInt*/ = int>
+void init_lock_arrays_sycl(sycl::queue) {
+  assert(false);
+}
+
+template <typename /*AlwaysInt*/ = int>
+void finalize_lock_arrays_sycl(sycl::queue) {
+  assert(false);
+}
+
+inline bool lock_address_sycl(void*, MemoryScopeDevice) {
+  assert(false);
+  // return true so that the CAS loops don't hang.
+  return true;
+}
+
+inline bool lock_address_sycl(void*, MemoryScopeNode) {
+  assert(false);
+  // return true so that the CAS loops don't hang.
+  return true;
+}
+
+inline void unlock_address_sycl(void*, MemoryScopeDevice) { assert(false); }
+
+inline void unlock_address_sycl(void*, MemoryScopeNode) { assert(false); }
+#endif
+}  // namespace Impl
+}  // namespace desul
+#endif
diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op.hpp
index e7c36673e282e5f8543ea3876eb8b649893d4e92..cb97f4a906db19f16ca66ad9efba96b2b0908351 100644
--- a/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op.hpp
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op.hpp
@@ -18,7 +18,7 @@ SPDX-License-Identifier: (BSD-3-Clause)
 #include <desul/atomics/Lock_Based_Fetch_Op_HIP.hpp>
 #endif
 #ifdef DESUL_HAVE_SYCL_ATOMICS
-#include <desul/atomics/Lock_Based_Fetch_Op_Unimplemented.hpp>
+#include <desul/atomics/Lock_Based_Fetch_Op_SYCL.hpp>
 #endif
 
 #include <desul/atomics/Lock_Based_Fetch_Op_Host.hpp>
diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_SYCL.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_SYCL.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8774a6e96eb5f41443a0c09295c48d3944623924
--- /dev/null
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_SYCL.hpp
@@ -0,0 +1,94 @@
+/*
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_LOCK_BASED_FETCH_OP_SYCL_HPP_
+#define DESUL_ATOMICS_LOCK_BASED_FETCH_OP_SYCL_HPP_
+
+#include <desul/atomics/Common.hpp>
+#include <desul/atomics/Lock_Array_SYCL.hpp>
+#include <desul/atomics/Thread_Fence_SYCL.hpp>
+#include <type_traits>
+
+namespace desul {
+namespace Impl {
+
+template <class Oper,
+          class T,
+          class MemoryOrder,
+          class MemoryScope,
+          // equivalent to:
+          //   requires !atomic_always_lock_free(sizeof(T))
+          std::enable_if_t<!atomic_always_lock_free(sizeof(T)), int> = 0>
+T device_atomic_fetch_oper(const Oper& op,
+                           T* const dest,
+                           dont_deduce_this_parameter_t<const T> val,
+                           MemoryOrder /*order*/,
+                           MemoryScope scope) {
+  // This is a way to avoid deadlock in a subgroup
+  T return_val;
+  int done = 0;
+  auto sg = sycl::ext::oneapi::experimental::this_sub_group();
+  using sycl::ext::oneapi::group_ballot;
+  using sycl::ext::oneapi::sub_group_mask;
+  sub_group_mask active = group_ballot(sg, 1);
+  sub_group_mask done_active = group_ballot(sg, 0);
+  while (active != done_active) {
+    if (!done) {
+      if (lock_address_sycl((void*)dest, scope)) {
+        device_atomic_thread_fence(MemoryOrderAcquire(), scope);
+        return_val = *dest;
+        *dest = op.apply(return_val, val);
+        device_atomic_thread_fence(MemoryOrderRelease(), scope);
+        unlock_address_sycl((void*)dest, scope);
+        done = 1;
+      }
+    }
+    done_active = group_ballot(sg, done);
+  }
+  return return_val;
+}
+
+template <class Oper,
+          class T,
+          class MemoryOrder,
+          class MemoryScope,
+          // equivalent to:
+          //   requires !atomic_always_lock_free(sizeof(T))
+          std::enable_if_t<!atomic_always_lock_free(sizeof(T)), int> = 0>
+T device_atomic_oper_fetch(const Oper& op,
+                           T* const dest,
+                           dont_deduce_this_parameter_t<const T> val,
+                           MemoryOrder /*order*/,
+                           MemoryScope scope) {
+  // This is a way to avoid deadlock in a subgroup
+  T return_val;
+  int done = 0;
+  auto sg = sycl::ext::oneapi::experimental::this_sub_group();
+  using sycl::ext::oneapi::group_ballot;
+  using sycl::ext::oneapi::sub_group_mask;
+  sub_group_mask active = group_ballot(sg, 1);
+  sub_group_mask done_active = group_ballot(sg, 0);
+  while (active != done_active) {
+    if (!done) {
+      if (lock_address_sycl((void*)dest, scope)) {
+        device_atomic_thread_fence(MemoryOrderAcquire(), scope);
+        return_val = op.apply(*dest, val);
+        *dest = return_val;
+        device_atomic_thread_fence(MemoryOrderRelease(), scope);
+        unlock_address_sycl((void*)dest, scope);
+        done = 1;
+      }
+    }
+    done_active = group_ballot(sg, done);
+  }
+  return return_val;
+}
+}  // namespace Impl
+}  // namespace desul
+
+#endif
diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_Unimplemented.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_Unimplemented.hpp
deleted file mode 100644
index b9f9fac5359ea9b2dce5cfaa1dbd81beb6f11352..0000000000000000000000000000000000000000
--- a/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_Unimplemented.hpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
-Copyright (c) 2019, Lawrence Livermore National Security, LLC
-and DESUL project contributors. See the COPYRIGHT file for details.
-Source: https://github.com/desul/desul
-
-SPDX-License-Identifier: (BSD-3-Clause)
-*/
-
-#ifndef DESUL_ATOMICS_LOCK_BASED_FETCH_OP_UNIMPLEMENTED_HPP_
-#define DESUL_ATOMICS_LOCK_BASED_FETCH_OP_UNIMPLEMENTED_HPP_
-
-#include <cassert>
-#include <desul/atomics/Common.hpp>
-#include <type_traits>
-
-namespace desul {
-namespace Impl {
-
-template <class Oper,
-          class T,
-          class MemoryOrder,
-          class MemoryScope,
-          // equivalent to:
-          //   requires !atomic_always_lock_free(sizeof(T))
-          std::enable_if_t<!atomic_always_lock_free(sizeof(T)), int> = 0>
-DESUL_INLINE_FUNCTION T
-device_atomic_fetch_oper(const Oper& /*op*/,
-                         T* const /*dest*/,
-                         dont_deduce_this_parameter_t<const T> val,
-                         MemoryOrder /*order*/,
-                         MemoryScope /*scope*/) {
-  assert(false);
-  return val;  // FIXME not implemented
-}
-
-template <class Oper,
-          class T,
-          class MemoryOrder,
-          class MemoryScope,
-          // equivalent to:
-          //   requires !atomic_always_lock_free(sizeof(T))
-          std::enable_if_t<!atomic_always_lock_free(sizeof(T)), int> = 0>
-DESUL_INLINE_FUNCTION T
-device_atomic_oper_fetch(const Oper& /*op*/,
-                         T* const /*dest*/,
-                         dont_deduce_this_parameter_t<const T> val,
-                         MemoryOrder /*order*/,
-                         MemoryScope /*scope*/) {
-  assert(false);
-  return val;  // FIXME not implemented
-}
-}  // namespace Impl
-}  // namespace desul
-
-#endif
diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Macros.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Macros.hpp
index 992fb9fa66bf32a863e8ac4fbd4aaf690d13e69d..3a14b93d323033d82f7a260a4a6c2a233d9dccdc 100644
--- a/packages/kokkos/tpls/desul/include/desul/atomics/Macros.hpp
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/Macros.hpp
@@ -11,6 +11,34 @@ SPDX-License-Identifier: (BSD-3-Clause)
 
 #include <desul/atomics/Config.hpp>
 
+// Intercept incompatible relocatable device code mode which leads to ODR violations
+#ifdef DESUL_ATOMICS_ENABLE_CUDA
+#if (defined(__clang__) && defined(__CUDA__) && defined(__CLANG_RDC__)) || \
+    defined(__CUDACC_RDC__)
+#define DESUL_IMPL_CUDA_RDC
+#endif
+
+#if (defined(DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION) &&  \
+     !defined(DESUL_IMPL_CUDA_RDC)) ||                            \
+    (!defined(DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION) && \
+     defined(DESUL_IMPL_CUDA_RDC))
+#error Relocatable device code mode incompatible with desul atomics configuration
+#endif
+
+#ifdef DESUL_IMPL_CUDA_RDC
+#undef DESUL_IMPL_CUDA_RDC
+#endif
+#endif
+
+#ifdef DESUL_ATOMICS_ENABLE_HIP
+#if (defined(DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION) &&  \
+     !defined(__CLANG_RDC__)) ||                                 \
+    (!defined(DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION) && \
+     defined(__CLANG_RDC__))
+#error Relocatable device code mode incompatible with desul atomics configuration
+#endif
+#endif
+
 // Macros
 
 #if defined(DESUL_ATOMICS_ENABLE_CUDA) && defined(__CUDACC__)
@@ -39,12 +67,6 @@ SPDX-License-Identifier: (BSD-3-Clause)
 #define DESUL_HAVE_MSVC_ATOMICS
 #endif
 
-#if (defined(DESUL_ATOMICS_ENABLE_CUDA) && defined(__CUDA_ARCH__)) ||         \
-    (defined(DESUL_ATOMICS_ENABLE_HIP) && defined(__HIP_DEVICE_COMPILE__)) || \
-    (defined(DESUL_ATOMICS_ENABLE_SYCL) && defined(__SYCL_DEVICE_ONLY__))
-#define DESUL_HAVE_GPU_LIKE_PROGRESS
-#endif
-
 #if defined(DESUL_HAVE_CUDA_ATOMICS) || defined(DESUL_HAVE_HIP_ATOMICS)
 #define DESUL_FORCEINLINE_FUNCTION inline __host__ __device__
 #define DESUL_INLINE_FUNCTION inline __host__ __device__
@@ -59,10 +81,6 @@ SPDX-License-Identifier: (BSD-3-Clause)
 #define DESUL_IMPL_DEVICE_FUNCTION
 #endif
 
-#if !defined(DESUL_HAVE_GPU_LIKE_PROGRESS)
-#define DESUL_HAVE_FORWARD_PROGRESS
-#endif
-
 #define DESUL_IMPL_STRIP_PARENS(X) DESUL_IMPL_ESC(DESUL_IMPL_ISH X)
 #define DESUL_IMPL_ISH(...) DESUL_IMPL_ISH __VA_ARGS__
 #define DESUL_IMPL_ESC(...) DESUL_IMPL_ESC_(__VA_ARGS__)
diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Thread_Fence_OpenMP.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Thread_Fence_OpenMP.hpp
index 72fc7a5e7c61a64d9ff6be22e41081eedaa32c30..402f1a7bdea16aa6d8685d7779a48787b26a4ae8 100644
--- a/packages/kokkos/tpls/desul/include/desul/atomics/Thread_Fence_OpenMP.hpp
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/Thread_Fence_OpenMP.hpp
@@ -16,7 +16,9 @@ SPDX-License-Identifier: (BSD-3-Clause)
 namespace desul {
 namespace Impl {
 
-#if _OPENMP > 201800
+// NVHPC compiler only supports the basic flush construct without the
+// memory-order-clause.
+#if _OPENMP > 201800 && !defined(__NVCOMPILER)
 
 // There is no seq_cst flush in OpenMP, isn't it the same anyway for fence?
 inline void host_atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeCore) {
diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Thread_Fence_SYCL.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Thread_Fence_SYCL.hpp
index 769e92abfebbe00089eb5104b7065cd922132b87..7cf3e8d5c0bf11b624e189d1a2bb74a7fc4a260b 100644
--- a/packages/kokkos/tpls/desul/include/desul/atomics/Thread_Fence_SYCL.hpp
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/Thread_Fence_SYCL.hpp
@@ -25,8 +25,8 @@ namespace Impl {
 
 template <class MemoryOrder, class MemoryScope>
 void device_atomic_thread_fence(MemoryOrder, MemoryScope) {
-  sycl::atomic_fence(SYCLMemoryOrder<MemoryOrder, /*extended namespace*/ false>::value,
-                     SYCLMemoryScope<MemoryScope, /*extended namespace*/ false>::value);
+  sycl::atomic_fence(SYCLMemoryOrder<MemoryOrder>::value,
+                     SYCLMemoryScope<MemoryScope>::value);
 }
 
 }  // namespace Impl
diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/cuda/CUDA_asm.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/CUDA_asm.hpp
index 96e0dfa269b363afc53beab3a1dc1776a57a0a2f..9471862a6be397d711b67ed73e3e6aafadc45925 100644
--- a/packages/kokkos/tpls/desul/include/desul/atomics/cuda/CUDA_asm.hpp
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/CUDA_asm.hpp
@@ -2,10 +2,15 @@
 namespace desul {
 namespace Impl {
 // Choose the variant of atomics we are using later
+// The __isGlobal intrinsic was only introduced in CUDA 11.2
+// It also stopped working in NVC++ 23.1 - it works in 22.11
+// this is a bug in NVHPC, not treating CUDA intrinsics correctly
+// FIXME_NVHPC
 #if !defined(DESUL_IMPL_ATOMIC_CUDA_PTX_PREDICATE) && \
     !defined(DESUL_IMPL_ATOMIC_CUDA_PTX_ISGLOBAL)
-#if (__CUDACC_VER_MAJOR__ > 11) || \
-    ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ > 1))
+#if ((__CUDACC_VER_MAJOR__ > 11) ||                                   \
+     ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ > 1))) && \
+    (!defined(__NVCOMPILER) || __NVCOMPILER_MAJOR__ < 23)
 #define DESUL_IMPL_ATOMIC_CUDA_PTX_ISGLOBAL
 #else
 #define DESUL_IMPL_ATOMIC_CUDA_PTX_PREDICATE
diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_forceglobal b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_forceglobal
deleted file mode 100644
index b235163820782a664189470f01dd7df095f117e5..0000000000000000000000000000000000000000
--- a/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_forceglobal
+++ /dev/null
@@ -1,153 +0,0 @@
-
-// Inline PTX: h u16 , r u32,  l u64, f f32, d f64
-// Ops:
-
-// binary operations
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND() \
-template<class ctype> \
-inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
-  uint32_t asm_result = 0u; \
-  asm volatile("atom.and.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
-  return reinterpret_cast<ctype&>(asm_result); \
-} \
-template<class ctype> \
-inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
-  uint64_t asm_result = 0u; \
-  asm volatile("atom.and.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
-  return reinterpret_cast<ctype&>(asm_result); \
-}
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR() \
-template<class ctype> \
-inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
-  uint32_t asm_result = 0u; \
-  asm volatile("atom.or.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
-  return reinterpret_cast<ctype&>(asm_result); \
-} \
-template<class ctype> \
-inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
-  uint64_t asm_result = 0u; \
-  asm volatile("atom.or.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
-  return reinterpret_cast<ctype&>(asm_result); \
-}
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR() \
-template<class ctype> \
-inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
-  uint32_t asm_result = 0u; \
-  asm volatile("atom.xor.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
-  return reinterpret_cast<ctype&>(asm_result); \
-} \
-template<class ctype> \
-inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
-  uint64_t asm_result = 0u; \
-  asm volatile("atom.xor.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
-  return reinterpret_cast<ctype&>(asm_result); \
-}
-
-// Fetch atomics
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-inline __device__ ctype atomic_fetch_add(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  ctype result=0; \
-  asm volatile("atom.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
-  return result; \
-}
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-inline __device__ ctype atomic_fetch_sub(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  ctype result=0; \
-  ctype neg_value = -value; \
-  asm volatile("atom.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(neg_value) : "memory"); \
-  return result; \
-}
-
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-inline __device__ ctype atomic_fetch_min(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  ctype result=0; \
-  asm volatile("atom.min.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
-  return result; \
-}
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-inline __device__ ctype atomic_fetch_max(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  ctype result=0; \
-  asm volatile("atom.max.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
-  return result; \
-}
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-inline __device__ ctype atomic_fetch_inc(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  ctype result = 0; \
-  ctype limit = desul::Impl::numeric_limits_max<ctype>::value; \
-  asm volatile("atom.inc.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
-  return result; \
-} \
-inline __device__ ctype atomic_fetch_inc_mod(ctype* dest, ctype limit, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  ctype result = 0; \
-  asm volatile("atom.inc.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
-  return result; \
-}
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-inline __device__ ctype atomic_fetch_dec(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  ctype result = 0; \
-  ctype limit = desul::Impl::numeric_limits_max<ctype>::value; \
-  asm volatile("atom.dec.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
-  return result; \
-} \
-inline __device__ ctype atomic_fetch_dec_mod(ctype* dest, ctype limit, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  ctype result = 0; \
-  asm volatile("atom.dec.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
-  return result; \
-}
-
-// Group ops for integer ctypes
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP() \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND() \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR() \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR()
-
-
-// Instantiate Functions
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(float,".f32","f","=f")
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(float,".f32","f","=f")
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(double,".f64","d","=d")
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(double,".f64","d","=d")
-
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint32_t,".u32","r","=r")
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint64_t,".u64","l","=l")
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int32_t,".s32","r","=r")
-//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int64_t,".s64","l","=l")
-
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(uint32_t,".u32","r","=r")
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(uint32_t,".u32","r","=r")
-
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP()
-
-#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD
-#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN
-#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX
-#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC
-#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC
-#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND
-
diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_generic b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_generic
deleted file mode 100644
index 0484d109c3db39440267b06ecd7736bb7b35b2fe..0000000000000000000000000000000000000000
--- a/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_generic
+++ /dev/null
@@ -1,151 +0,0 @@
-
-// Inline PTX: h u16 , r u32,  l u64, f f32, d f64
-// Ops: 
-
-// binary operations
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND() \
-template<class ctype> \
-inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
-  uint32_t asm_result = 0u; \
-  asm volatile("atom.and" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
-  return reinterpret_cast<ctype&>(asm_result); \
-} \
-template<class ctype> \
-inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
-  uint64_t asm_result = 0u; \
-  asm volatile("atom.and" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
-  return reinterpret_cast<ctype&>(asm_result); \
-}
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR() \
-template<class ctype> \
-inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
-  uint32_t asm_result = 0u; \
-  asm volatile("atom.or" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
-  return reinterpret_cast<ctype&>(asm_result); \
-} \
-template<class ctype> \
-inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
-  uint64_t asm_result = 0u; \
-  asm volatile("atom.or" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
-  return reinterpret_cast<ctype&>(asm_result); \
-}
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR() \
-template<class ctype> \
-inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
-  uint32_t asm_result = 0u; \
-  asm volatile("atom.xor" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
-  return reinterpret_cast<ctype&>(asm_result); \
-} \
-template<class ctype> \
-inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
-  uint64_t asm_result = 0u; \
-  asm volatile("atom.xor" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
-  return reinterpret_cast<ctype&>(asm_result); \
-}
-
-// Fetch atomics
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-inline __device__ ctype atomic_fetch_add(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  ctype result=0; \
-  asm volatile("atom.add" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
-  return result; \
-}
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-inline __device__ ctype atomic_fetch_sub(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  ctype result=0; \
-  ctype neg_value = -value; \
-  asm volatile("atom.add" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(neg_value) : "memory"); \
-  return result; \
-}
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-inline __device__ ctype atomic_fetch_min(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  ctype result=0; \
-  asm volatile("atom.min" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
-  return result; \
-}
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-inline __device__ ctype atomic_fetch_max(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  ctype result=0; \
-  asm volatile("atom.max" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
-  return result; \
-}
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-inline __device__ ctype atomic_fetch_inc(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  ctype result = 0; \
-  ctype limit = desul::Impl::numeric_limits_max<ctype>::value; \
-  asm volatile("atom.inc" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
-  return result; \
-} \
-inline __device__ ctype atomic_fetch_inc_mod(ctype* dest, ctype limit, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  ctype result = 0; \
-  asm volatile("atom.inc" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
-  return result; \
-}
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-inline __device__ ctype atomic_fetch_dec(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  ctype result = 0; \
-  ctype limit = desul::Impl::numeric_limits_max<ctype>::value; \
-  asm volatile("atom.dec" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
-  return result; \
-} \
-inline __device__ ctype atomic_fetch_dec_mod(ctype* dest, ctype limit, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  ctype result = 0; \
-  asm volatile("atom.dec" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
-  return result; \
-}
-// Group ops for integer ctypes
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP() \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND() \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR() \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR()
-
-
-// Instantiate Functions
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(float,".f32","f","=f")
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(float,".f32","f","=f")
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(double,".f64","d","=d")
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(double,".f64","d","=d")
-
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint32_t,".u32","r","=r")
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint64_t,".u64","l","=l")
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int32_t,".s32","r","=r")
-//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int64_t,".s64","l","=l")
-
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(uint32_t,".u32","r","=r")
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(uint32_t,".u32","r","=r")
-
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP()
-
-#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD
-#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN
-#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX
-#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC
-#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC
-#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND
-
diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_isglobal b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_isglobal
index ef5798f21138e936cd63d8593fc9c245f1d75d92..dd359405b5e69cc7eb3ce5045cb9763e54bed2c8 100644
--- a/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_isglobal
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_isglobal
@@ -164,39 +164,56 @@ inline __device__ ctype device_atomic_fetch_dec_mod(ctype* dest, ctype limit, __
   return result; \
 }
 
-// Group ops for integer ctypes
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
-
 #define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP() \
 __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND() \
 __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR() \
 __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR()
 
-
 // Instantiate Functions
+
+// General comments:
+//  - float/double only support add
+//  - inc/dec only supported with uint32_t
+//  - int64_t does not support add
+
+// floating point types
 __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(float,".f32","f","=f")
 __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(float,".f32","f","=f")
 __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(double,".f64","d","=d")
 __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(double,".f64","d","=d")
 
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint32_t,".u32","r","=r")
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint64_t,".u64","l","=l")
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int32_t,".s32","r","=r")
-//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int64_t,".s64","l","=l")
-
+// uint32_t
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(uint32_t,".u32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(uint32_t,".u32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(uint32_t,".u32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(uint32_t,".u32","r","=r")
 __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(uint32_t,".u32","r","=r")
 __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(uint32_t,".u32","r","=r")
 
+// uint64_t
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(uint64_t,".u64","l","=l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(uint64_t,".u64","l","=l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(uint64_t,".u64","l","=l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(uint64_t,".u64","l","=l")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(uint64_t,".u64","l","=l")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(uint64_t,".u64","l","=l")
+
+// int32_t
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(int32_t,".s32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(int32_t,".s32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(int32_t,".s32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(int32_t,".s32","r","=r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(int32_t,".s32","r","=r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(int32_t,".s32","r","=r")
+
+// int64_t note: add/sub is using unsigned register
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(int64_t,".u64","l","=l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(int64_t,".u64","l","=l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(int64_t,".s64","l","=l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(int64_t,".s64","l","=l")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(int64_t,".s64","l","=l")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(int64_t,".s64","l","=l")
+
 __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP()
 
 #undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD
@@ -205,4 +222,4 @@ __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP()
 #undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC
 #undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC
 #undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND
-
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP
diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_predicate b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_predicate
index c80efc5e7cf043d06a7f82fc3643f3566c5bd6e6..2c0fe9e132cad53d576776bc6f3742145e8a2bd4 100644
--- a/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_predicate
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_predicate
@@ -206,19 +206,6 @@ inline __device__ ctype device_atomic_fetch_dec_mod(ctype* dest, ctype limit, __
   return result; \
 }
 
-// Group ops for integer ctypes
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
-
 #define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP() \
 __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND() \
 __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR() \
@@ -226,19 +213,50 @@ __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR()
 
 
 // Instantiate Functions
+
+// General comments:
+//  - float/double only support add
+//  - inc/dec only supported with uint32_t
+//  - int64_t does not support add
+
+// floating point types
 __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(float,".f32","f","=f")
 __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(float,".f32","f","=f")
 __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(double,".f64","d","=d")
 __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(double,".f64","d","=d")
 
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint32_t,".u32","r","=r")
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint64_t,".u64","l","=l")
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int32_t,".s32","r","=r")
-//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int64_t,".s64","l","=l")
-
+// uint32_t
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(uint32_t,".u32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(uint32_t,".u32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(uint32_t,".u32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(uint32_t,".u32","r","=r")
 __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(uint32_t,".u32","r","=r")
 __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(uint32_t,".u32","r","=r")
 
+// uint64_t
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(uint64_t,".u64","l","=l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(uint64_t,".u64","l","=l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(uint64_t,".u64","l","=l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(uint64_t,".u64","l","=l")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(uint64_t,".u64","l","=l")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(uint64_t,".u64","l","=l")
+
+// int32_t
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(int32_t,".s32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(int32_t,".s32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(int32_t,".s32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(int32_t,".s32","r","=r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(int32_t,".s32","r","=r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(int32_t,".s32","r","=r")
+
+// int64_t note: add/sub is using unsigned register
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(int64_t,".u64","l","=l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(int64_t,".u64","l","=l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(int64_t,".s64","l","=l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(int64_t,".s64","l","=l")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(int64_t,".s64","l","=l")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(int64_t,".s64","l","=l")
+
 __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP()
 
 #undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD
diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_forceglobal b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_forceglobal
deleted file mode 100644
index 3767b2ab4980c0d811357a6d0e1a912de5bba500..0000000000000000000000000000000000000000
--- a/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_forceglobal
+++ /dev/null
@@ -1,64 +0,0 @@
-
-// Inline PTX: h u16 , r u32,  l u64, f f32, d f64
-// Ops:
-
-// Non Returning Atomic Operations
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
-inline __device__ void atomic_add(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  asm volatile("red.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
-}
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
-inline __device__ void atomic_sub(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  type neg_value = -value; \
-  asm volatile("red.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(neg_value) : "memory"); \
-}
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
-inline __device__ void atomic_min(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  asm volatile("red.min.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
-}
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \
-inline __device__ void atomic_max(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  asm volatile("red.max.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
-}
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \
-inline __device__ void atomic_inc(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  type limit = desul::Impl::numeric_limits_max<type>::value; \
-  asm volatile("red.inc.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \
-}
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type) \
-inline __device__ void atomic_dec(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  type limit = desul::Impl::numeric_limits_max<type>::value; \
-  asm volatile("red.dec.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \
-}
-
-// Group ops for integer types
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type)
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type)
-
-// Instantiate Functions
-__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(float,".f32","f")
-__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(float,".f32","f")
-__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(double,".f64","d")
-__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(double,".f64","d")
-
-__DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(uint32_t,".u32","r")
-
-__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".u64","l")
-__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int32_t,".s32","r")
-//__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".s64","l")
diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_generic b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_generic
deleted file mode 100644
index 5de36a3e0a87b967fff5d9a936644c5e8a566051..0000000000000000000000000000000000000000
--- a/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_generic
+++ /dev/null
@@ -1,64 +0,0 @@
-
-// Inline PTX: h u16 , r u32,  l u64, f f32, d f64
-// Ops:
-
-// Non Returning Atomic Operations
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
-inline __device__ void atomic_add(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  asm volatile("red.add" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
-}
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
-inline __device__ void atomic_sub(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  type neg_value = -value; \
-  asm volatile("red.add" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(neg_value) : "memory"); \
-}
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
-inline __device__ void atomic_min(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  asm volatile("red.min" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
-}
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \
-inline __device__ void atomic_max(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  asm volatile("red.max" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
-}
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \
-inline __device__ void atomic_inc(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  type limit = desul::Impl::numeric_limits_max<type>::value; \
-  asm volatile("red.inc" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \
-}
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type) \
-inline __device__ void atomic_dec(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
-  type limit = desul::Impl::numeric_limits_max<type>::value; \
-  asm volatile("red.dec" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \
-}
-
-// Group ops for integer types
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type)
-
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type)
-
-// Instantiate Functions
-__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(float,".f32","f")
-__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(float,".f32","f")
-__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(double,".f64","d")
-__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(double,".f64","d")
-
-__DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(uint32_t,".u32","r")
-
-__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".u64","l")
-__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int32_t,".s32","r")
-//__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".s64","l")
diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_isglobal b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_isglobal
index 7bc6d4d9d19d76636988368f9153d83e2930531d..6b6e3593fddf60b6365add362f5aa62c73574b5b 100644
--- a/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_isglobal
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_isglobal
@@ -60,29 +60,48 @@ inline __device__ void device_atomic_dec(type* dest, __DESUL_IMPL_CUDA_ASM_MEMOR
   } \
 }
 
-// Group ops for integer types
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type)
+// Instantiate Functions
 
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type)
+// General comments:
+//  - float/double only support add
+//  - inc/dec only supported with uint32_t
+//  - int64_t does not support add
 
-// Instantiate Functions
+// floating point types
 __DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(float,".f32","f")
 __DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(float,".f32","f")
 __DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(double,".f64","d")
 __DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(double,".f64","d")
 
-__DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(uint32_t,".u32","r")
+// uint32_t
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(uint32_t,".u32","r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(uint32_t,".u32","r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(uint32_t,".u32","r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(uint32_t,".u32","r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(uint32_t,".u32","r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(uint32_t,".u32","r")
+
+// uint64_t
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(uint64_t,".u64","l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(uint64_t,".u64","l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(uint64_t,".u64","l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(uint64_t,".u64","l")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(uint64_t,".u64","l")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(uint64_t,".u64","l")
+
+// int32_t
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(int32_t,".s32","r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(int32_t,".s32","r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(int32_t,".s32","r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(int32_t,".s32","r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(int32_t,".s32","r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(int32_t,".s32","r")
+
+// int64_t note: add/sub is using unsigned register
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(int64_t,".u64","l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(int64_t,".u64","l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(int64_t,".s64","l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(int64_t,".s64","l")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(int64_t,".s64","l")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(int64_t,".s64","l")
 
-__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".u64","l")
-__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int32_t,".s32","r")
-//__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".s64","l")
diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_predicate b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_predicate
index 4ae8e46266e0f4b72a384db76b7f258314bc2838..b9569faf1bd0315d5a3290fb6cf9355b9e278b92 100644
--- a/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_predicate
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_predicate
@@ -78,29 +78,48 @@ inline __device__ void device_atomic_dec(type* dest, __DESUL_IMPL_CUDA_ASM_MEMOR
     :: "l"(dest),reg_type(limit) : "memory"); \
 }
 
-// Group ops for integer types
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type)
+// Instantiate Functions
 
-#define __DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \
-__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type)
+// General comments:
+//  - float/double only support add
+//  - inc/dec only supported with uint32_t
+//  - int64_t does not support add
 
-// Instantiate Functions
+// floating point types
 __DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(float,".f32","f")
 __DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(float,".f32","f")
 __DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(double,".f64","d")
 __DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(double,".f64","d")
 
-__DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(uint32_t,".u32","r")
+// uint32_t
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(uint32_t,".u32","r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(uint32_t,".u32","r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(uint32_t,".u32","r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(uint32_t,".u32","r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(uint32_t,".u32","r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(uint32_t,".u32","r")
+
+// uint64_t
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(uint64_t,".u64","l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(uint64_t,".u64","l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(uint64_t,".u64","l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(uint64_t,".u64","l")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(uint64_t,".u64","l")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(uint64_t,".u64","l")
+
+// int32_t
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(int32_t,".s32","r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(int32_t,".s32","r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(int32_t,".s32","r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(int32_t,".s32","r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(int32_t,".s32","r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(int32_t,".s32","r")
+
+// int64_t note: add/sub are using unsigned register!
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(int64_t,".u64","l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(int64_t,".u64","l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(int64_t,".s64","l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(int64_t,".s64","l")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(int64_t,".s64","l")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(int64_t,".s64","l")
 
-__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".u64","l")
-__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int32_t,".s32","r")
-//__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".s64","l")
diff --git a/packages/kokkos/tpls/desul/src/Lock_Array_CUDA.cpp b/packages/kokkos/tpls/desul/src/Lock_Array_CUDA.cpp
index 19944b378e2c47090dbe3ce28913017a3f308933..155f33653ef25acae1f94fae5939be824cff5d5c 100644
--- a/packages/kokkos/tpls/desul/src/Lock_Array_CUDA.cpp
+++ b/packages/kokkos/tpls/desul/src/Lock_Array_CUDA.cpp
@@ -11,8 +11,7 @@ SPDX-License-Identifier: (BSD-3-Clause)
 #include <sstream>
 #include <string>
 
-#ifdef DESUL_HAVE_CUDA_ATOMICS
-#ifdef __CUDACC_RDC__
+#ifdef DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION
 namespace desul {
 namespace Impl {
 __device__ __constant__ int32_t* CUDA_SPACE_ATOMIC_LOCKS_DEVICE = nullptr;
@@ -84,7 +83,7 @@ void finalize_lock_arrays_cuda() {
   cudaFreeHost(CUDA_SPACE_ATOMIC_LOCKS_NODE_h);
   CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr;
   CUDA_SPACE_ATOMIC_LOCKS_NODE_h = nullptr;
-#ifdef __CUDACC_RDC__
+#ifdef DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION
   copy_cuda_lock_arrays_to_device();
 #endif
 }
@@ -96,4 +95,3 @@ template void finalize_lock_arrays_cuda<int>();
 }  // namespace Impl
 
 }  // namespace desul
-#endif
diff --git a/packages/kokkos/tpls/desul/src/Lock_Array_HIP.cpp b/packages/kokkos/tpls/desul/src/Lock_Array_HIP.cpp
index 5ccc6f7d54a1721af83c44b0246eea3fbc962c9f..465b2eb25a50d3924093ec046e2f67076a65e7fc 100644
--- a/packages/kokkos/tpls/desul/src/Lock_Array_HIP.cpp
+++ b/packages/kokkos/tpls/desul/src/Lock_Array_HIP.cpp
@@ -11,8 +11,7 @@ SPDX-License-Identifier: (BSD-3-Clause)
 #include <sstream>
 #include <string>
 
-#ifdef DESUL_HAVE_HIP_ATOMICS
-#ifdef DESUL_HIP_RDC
+#ifdef DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION
 namespace desul {
 namespace Impl {
 __device__ __constant__ int32_t* HIP_SPACE_ATOMIC_LOCKS_DEVICE = nullptr;
@@ -70,7 +69,7 @@ void init_lock_arrays_hip() {
                             "init_lock_arrays_hip: hipMallocHost host locks");
 
   auto error_sync1 = hipDeviceSynchronize();
-  DESUL_IMPL_COPY_HIP_LOCK_ARRAYS_TO_DEVICE();
+  copy_hip_lock_arrays_to_device();
   check_error_and_throw_hip(error_sync1, "init_lock_arrays_hip: post malloc");
 
   init_lock_arrays_hip_kernel<<<(HIP_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256>>>();
@@ -88,8 +87,8 @@ void finalize_lock_arrays_hip() {
   check_error_and_throw_hip(error_free2, "finalize_lock_arrays_hip: free host locks");
   HIP_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr;
   HIP_SPACE_ATOMIC_LOCKS_NODE_h = nullptr;
-#ifdef DESUL_HIP_RDC
-  DESUL_IMPL_COPY_HIP_LOCK_ARRAYS_TO_DEVICE();
+#ifdef DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION
+  copy_hip_lock_arrays_to_device();
 #endif
 }
 
@@ -99,4 +98,3 @@ template void finalize_lock_arrays_hip<int>();
 }  // namespace Impl
 
 }  // namespace desul
-#endif
diff --git a/packages/kokkos/tpls/desul/src/Lock_Array_SYCL.cpp b/packages/kokkos/tpls/desul/src/Lock_Array_SYCL.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9e84c60e41a541df54b4aa31eef9a039815f6705
--- /dev/null
+++ b/packages/kokkos/tpls/desul/src/Lock_Array_SYCL.cpp
@@ -0,0 +1,69 @@
+/*
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+// FIXME_SYCL Use SYCL_EXT_ONEAPI_DEVICE_GLOBAL when available instead
+#ifdef DESUL_SYCL_DEVICE_GLOBAL_SUPPORTED
+
+#include <cinttypes>
+#include <desul/atomics/Lock_Array_SYCL.hpp>
+
+namespace desul::Impl {
+
+SYCL_EXTERNAL
+sycl_device_global<int32_t*> SYCL_SPACE_ATOMIC_LOCKS_DEVICE;
+SYCL_EXTERNAL
+sycl_device_global<int32_t*> SYCL_SPACE_ATOMIC_LOCKS_NODE;
+
+int32_t* SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr;
+int32_t* SYCL_SPACE_ATOMIC_LOCKS_NODE_h = nullptr;
+
+template <>
+void init_lock_arrays_sycl<int>(sycl::queue q) {
+  if (SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h != nullptr) return;
+
+  SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h =
+      sycl::malloc_device<int32_t>(SYCL_SPACE_ATOMIC_MASK + 1, q);
+  SYCL_SPACE_ATOMIC_LOCKS_NODE_h =
+      sycl::malloc_host<int32_t>(SYCL_SPACE_ATOMIC_MASK + 1, q);
+
+  // FIXME_SYCL Once supported, the following should be replaced by
+  // q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_DEVICE,
+  //          &SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h,
+  //          sizeof(int32_t*));
+  // q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_NODE,
+  //          &SYCL_SPACE_ATOMIC_LOCKS_NODE_h,
+  //          sizeof(int32_t*));
+  auto device_ptr = SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h;
+  auto node_ptr = SYCL_SPACE_ATOMIC_LOCKS_NODE_h;
+  q.single_task([=] {
+    SYCL_SPACE_ATOMIC_LOCKS_DEVICE.get() = device_ptr;
+    SYCL_SPACE_ATOMIC_LOCKS_NODE.get() = node_ptr;
+  });
+
+  q.memset(SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h,
+           0,
+           sizeof(int32_t) * (SYCL_SPACE_ATOMIC_MASK + 1));
+  q.memset(SYCL_SPACE_ATOMIC_LOCKS_NODE_h,
+           0,
+           sizeof(int32_t) * (SYCL_SPACE_ATOMIC_MASK + 1));
+
+  q.wait_and_throw();
+}
+
+template <>
+void finalize_lock_arrays_sycl<int>(sycl::queue q) {
+  if (SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h == nullptr) return;
+
+  sycl::free(SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h, q);
+  sycl::free(SYCL_SPACE_ATOMIC_LOCKS_NODE_h, q);
+  SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr;
+  SYCL_SPACE_ATOMIC_LOCKS_NODE_h = nullptr;
+}
+
+} // namespace desul::Impl
+#endif
diff --git a/packages/kokkos/tpls/gtest/gtest/gtest-all.cc b/packages/kokkos/tpls/gtest/gtest/gtest-all.cc
index 9f340e0140520dbfca0b0f31a61b1c1f4c5e7e25..4a6c2e787a642a3a536a979cffe96e095436b5e5 100644
--- a/packages/kokkos/tpls/gtest/gtest/gtest-all.cc
+++ b/packages/kokkos/tpls/gtest/gtest/gtest-all.cc
@@ -11957,7 +11957,7 @@ static const char* GetCharWidthPrefix(unsigned char) {
   return "";
 }
 
-#ifdef __cpp_char8_t
+#ifdef __cpp_lib_char8_t
 static const char* GetCharWidthPrefix(char8_t) {
   return "u8";
 }
@@ -11981,7 +11981,7 @@ static CharFormat PrintAsStringLiteralTo(char c, ostream* os) {
   return PrintAsStringLiteralTo(ToChar32(c), os);
 }
 
-#ifdef __cpp_char8_t
+#ifdef __cpp_lib_char8_t
 static CharFormat PrintAsStringLiteralTo(char8_t c, ostream* os) {
   return PrintAsStringLiteralTo(ToChar32(c), os);
 }
@@ -12103,7 +12103,7 @@ void UniversalPrintArray(const char* begin, size_t len, ostream* os) {
   UniversalPrintCharArray(begin, len, os);
 }
 
-#ifdef __cpp_char8_t
+#ifdef __cpp_lib_char8_t
 // Prints a (const) char8_t array of 'len' elements, starting at address
 // 'begin'.
 void UniversalPrintArray(const char8_t* begin, size_t len, ostream* os) {
@@ -12146,7 +12146,7 @@ void PrintCStringTo(const Char* s, ostream* os) {
 
 void PrintTo(const char* s, ostream* os) { PrintCStringTo(s, os); }
 
-#ifdef __cpp_char8_t
+#ifdef __cpp_lib_char8_t
 void PrintTo(const char8_t* s, ostream* os) { PrintCStringTo(s, os); }
 #endif
 
@@ -12240,7 +12240,7 @@ void PrintStringTo(const ::std::string& s, ostream* os) {
   }
 }
 
-#ifdef __cpp_char8_t
+#ifdef __cpp_lib_char8_t
 void PrintU8StringTo(const ::std::u8string& s, ostream* os) {
   PrintCharsAsStringTo(s.data(), s.size(), os);
 }
diff --git a/packages/kokkos/tpls/gtest/gtest/gtest.h b/packages/kokkos/tpls/gtest/gtest/gtest.h
index e7490573ac5fbf630111d2ac9d63ad1906f6e2a9..c17c9ab3fc2295eded01956cd7820a33a1accb2a 100644
--- a/packages/kokkos/tpls/gtest/gtest/gtest.h
+++ b/packages/kokkos/tpls/gtest/gtest/gtest.h
@@ -2189,7 +2189,7 @@ inline bool IsUpper(char ch) {
 inline bool IsXDigit(char ch) {
   return isxdigit(static_cast<unsigned char>(ch)) != 0;
 }
-#ifdef __cpp_char8_t
+#ifdef __cpp_lib_char8_t
 inline bool IsXDigit(char8_t ch) {
   return isxdigit(static_cast<unsigned char>(ch)) != 0;
 }
@@ -5417,7 +5417,7 @@ GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char);
 GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char);
 GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t);
 GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t);
-#ifdef __cpp_char8_t
+#ifdef __cpp_lib_char8_t
 GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char8_t);
 GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char8_t);
 #endif
@@ -5442,7 +5442,7 @@ GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char32_t);
 
 GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string);
 GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::std::string);
-#ifdef __cpp_char8_t
+#ifdef __cpp_lib_char8_t
 GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char8_t, ::std::u8string);
 GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char8_t, ::std::u8string);
 #endif
@@ -5530,7 +5530,7 @@ GTEST_API_ void PrintTo(char32_t c, ::std::ostream* os);
 inline void PrintTo(char16_t c, ::std::ostream* os) {
   PrintTo(ImplicitCast_<char32_t>(c), os);
 }
-#ifdef __cpp_char8_t
+#ifdef __cpp_lib_char8_t
 inline void PrintTo(char8_t c, ::std::ostream* os) {
   PrintTo(ImplicitCast_<char32_t>(c), os);
 }
@@ -5556,7 +5556,7 @@ inline void PrintTo(const unsigned char* s, ::std::ostream* os) {
 inline void PrintTo(unsigned char* s, ::std::ostream* os) {
   PrintTo(ImplicitCast_<const void*>(s), os);
 }
-#ifdef __cpp_char8_t
+#ifdef __cpp_lib_char8_t
 // Overloads for u8 strings.
 GTEST_API_ void PrintTo(const char8_t* s, ::std::ostream* os);
 inline void PrintTo(char8_t* s, ::std::ostream* os) {
@@ -5608,7 +5608,7 @@ inline void PrintTo(const ::std::string& s, ::std::ostream* os) {
 }
 
 // Overloads for ::std::u8string
-#ifdef __cpp_char8_t
+#ifdef __cpp_lib_char8_t
 GTEST_API_ void PrintU8StringTo(const ::std::u8string& s, ::std::ostream* os);
 inline void PrintTo(const ::std::u8string& s, ::std::ostream* os) {
   PrintU8StringTo(s, os);
@@ -5862,7 +5862,7 @@ void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) {
 GTEST_API_ void UniversalPrintArray(
     const char* begin, size_t len, ::std::ostream* os);
 
-#ifdef __cpp_char8_t
+#ifdef __cpp_lib_char8_t
 // This overload prints a (const) char8_t array compactly.
 GTEST_API_ void UniversalPrintArray(const char8_t* begin, size_t len,
                                     ::std::ostream* os);
@@ -5951,7 +5951,7 @@ template <>
 class UniversalTersePrinter<char*> : public UniversalTersePrinter<const char*> {
 };
 
-#ifdef __cpp_char8_t
+#ifdef __cpp_lib_char8_t
 template <>
 class UniversalTersePrinter<const char8_t*> {
  public:
diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/compressed_pair.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/compressed_pair.hpp
index 163ec5fa56a418b6e7d7f94ffd74780e9e038805..ab1561bd47fa57f31004f3c8e56e361eb55c4c76 100644
--- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/compressed_pair.hpp
+++ b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/compressed_pair.hpp
@@ -1,46 +1,18 @@
-/*
 //@HEADER
 // ************************************************************************
 //
-//                        Kokkos v. 2.0
-//              Copyright (2019) Sandia Corporation
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
 //
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// Under the terms of Contract DE-NA0003525 with NTESS,
 // the U.S. Government retains certain rights in this software.
 //
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
 //@HEADER
-*/
-
 #pragma once
 
 #include "macros.hpp"
@@ -50,8 +22,7 @@
 #  include "no_unique_address.hpp"
 #endif
 
-namespace std {
-namespace experimental {
+namespace MDSPAN_IMPL_STANDARD_NAMESPACE {
 namespace detail {
 
 // For no unique address emulation, this is the case taken when neither are empty.
@@ -93,7 +64,7 @@ template <class _T, class _U, class _Enable = void> struct __compressed_pair {
 template <class _T, class _U>
 struct __compressed_pair<
     _T, _U,
-    enable_if_t<_MDSPAN_TRAIT(is_empty, _T) && !_MDSPAN_TRAIT(is_empty, _U)>>
+    std::enable_if_t<_MDSPAN_TRAIT(std::is_empty, _T) && !_MDSPAN_TRAIT(std::is_empty, _U)>>
     : private _T {
   _U __u_val;
   MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T &__first() noexcept {
@@ -130,7 +101,7 @@ struct __compressed_pair<
 template <class _T, class _U>
 struct __compressed_pair<
     _T, _U,
-    enable_if_t<!_MDSPAN_TRAIT(is_empty, _T) && _MDSPAN_TRAIT(is_empty, _U)>>
+    std::enable_if_t<!_MDSPAN_TRAIT(std::is_empty, _T) && _MDSPAN_TRAIT(std::is_empty, _U)>>
     : private _U {
   _T __t_val;
   MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T &__first() noexcept { return __t_val; }
@@ -168,7 +139,7 @@ struct __compressed_pair<
 template <class _T, class _U>
 struct __compressed_pair<
     _T, _U,
-    enable_if_t<_MDSPAN_TRAIT(is_empty, _T) && _MDSPAN_TRAIT(is_empty, _U)>>
+    std::enable_if_t<_MDSPAN_TRAIT(std::is_empty, _T) && _MDSPAN_TRAIT(std::is_empty, _U)>>
     // We need to use the __no_unique_address_emulation wrapper here to avoid
     // base class ambiguities.
 #ifdef _MDSPAN_COMPILER_MSVC
@@ -221,5 +192,4 @@ struct __compressed_pair<
 #endif // !defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
 
 } // end namespace detail
-} // end namespace experimental
-} // end namespace std
+} // end namespace MDSPAN_IMPL_STANDARD_NAMESPACE
diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/config.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/config.hpp
index 63cb63b9e903f05f439eecf5c89eb18b9a0a9983..d35e201cebd2bd8d0b1b99e6409e618a440c7a68 100644
--- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/config.hpp
+++ b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/config.hpp
@@ -1,46 +1,18 @@
-/*
 //@HEADER
 // ************************************************************************
 //
-//                        Kokkos v. 2.0
-//              Copyright (2019) Sandia Corporation
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
 //
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// Under the terms of Contract DE-NA0003525 with NTESS,
 // the U.S. Government retains certain rights in this software.
 //
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
 //@HEADER
-*/
-
 #pragma once
 
 #ifndef __has_include
@@ -110,6 +82,12 @@ static_assert(_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_14, "mdspan requires C++14 or
 #  endif
 #endif
 
+#ifndef _MDSPAN_HAS_SYCL
+#  if defined(SYCL_LANGUAGE_VERSION)
+#    define _MDSPAN_HAS_SYCL SYCL_LANGUAGE_VERSION
+#  endif
+#endif
+
 #ifndef __has_cpp_attribute
 #  define __has_cpp_attribute(x) 0
 #endif
@@ -143,11 +121,15 @@ static_assert(_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_14, "mdspan requires C++14 or
 #  define _MDSPAN_NO_UNIQUE_ADDRESS
 #endif
 
+// AMDs HIP compiler seems to have issues with concepts
+// it pretends concepts exist, but doesn't ship <concept>
+#ifndef __HIPCC__
 #ifndef _MDSPAN_USE_CONCEPTS
 #  if defined(__cpp_concepts) && __cpp_concepts >= 201507L
 #    define _MDSPAN_USE_CONCEPTS 1
 #  endif
 #endif
+#endif
 
 #ifndef _MDSPAN_USE_FOLD_EXPRESSIONS
 #  if (defined(__cpp_fold_expressions) && __cpp_fold_expressions >= 201603L) \
@@ -216,23 +198,13 @@ static_assert(_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_14, "mdspan requires C++14 or
 #endif
 
 #ifndef _MDSPAN_USE_CLASS_TEMPLATE_ARGUMENT_DEDUCTION
-// GCC 10's CTAD seems sufficiently broken to prevent its use.
-#  if (defined(_MDSPAN_COMPILER_CLANG) || !defined(__GNUC__) || __GNUC__ >= 11) \
-      && ((defined(__cpp_deduction_guides) && __cpp_deduction_guides >= 201703) \
-         || (!defined(__cpp_deduction_guides) && MDSPAN_HAS_CXX_17))
+#  if (!defined(__NVCC__) || (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 7)) && \
+      ((defined(__cpp_deduction_guides) && __cpp_deduction_guides >= 201703) || \
+       (!defined(__cpp_deduction_guides) && MDSPAN_HAS_CXX_17))
 #    define _MDSPAN_USE_CLASS_TEMPLATE_ARGUMENT_DEDUCTION 1
 #  endif
 #endif
 
-#ifndef _MDSPAN_USE_ALIAS_TEMPLATE_ARGUMENT_DEDUCTION
-// GCC 10's CTAD seems sufficiently broken to prevent its use.
-#  if (defined(_MDSPAN_COMPILER_CLANG) || !defined(__GNUC__) || __GNUC__ >= 11) \
-      && ((defined(__cpp_deduction_guides) && __cpp_deduction_guides >= 201907) \
-          || (!defined(__cpp_deduction_guides) && MDSPAN_HAS_CXX_20))
-#    define _MDSPAN_USE_ALIAS_TEMPLATE_ARGUMENT_DEDUCTION 1
-#  endif
-#endif
-
 #ifndef _MDSPAN_USE_STANDARD_TRAIT_ALIASES
 #  if (defined(__cpp_lib_transformation_trait_aliases) && __cpp_lib_transformation_trait_aliases >= 201304) \
           || (!defined(__cpp_lib_transformation_trait_aliases) && MDSPAN_HAS_CXX_14)
diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/default_accessor.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/default_accessor.hpp
index d26a7e6dad270b0ebadea63b1323f55ec850e855..ea0f537b2fe191ace9c6f5271d5b7331e172a4fd 100644
--- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/default_accessor.hpp
+++ b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/default_accessor.hpp
@@ -1,54 +1,25 @@
-/*
 //@HEADER
 // ************************************************************************
 //
-//                        Kokkos v. 2.0
-//              Copyright (2019) Sandia Corporation
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
 //
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// Under the terms of Contract DE-NA0003525 with NTESS,
 // the U.S. Government retains certain rights in this software.
 //
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
 //@HEADER
-*/
-
 #pragma once
 
 #include "macros.hpp"
 
 #include <cstddef> // size_t
 
-namespace std {
-namespace experimental {
+namespace MDSPAN_IMPL_STANDARD_NAMESPACE {
 
 template <class ElementType>
 struct default_accessor {
@@ -63,7 +34,7 @@ struct default_accessor {
   MDSPAN_TEMPLATE_REQUIRES(
     class OtherElementType,
     /* requires */ (
-      _MDSPAN_TRAIT(is_convertible, OtherElementType(*)[], element_type(*)[])
+      _MDSPAN_TRAIT(std::is_convertible, OtherElementType(*)[], element_type(*)[])
     )
   )
   MDSPAN_INLINE_FUNCTION
@@ -82,5 +53,4 @@ struct default_accessor {
 
 };
 
-} // end namespace experimental
-} // end namespace std
+} // end namespace MDSPAN_IMPL_STANDARD_NAMESPACE
diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/dynamic_extent.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/dynamic_extent.hpp
index 206540292d7e783be48ac46502d070013bac6880..2e29da13d6adfd1107fe7ea3ff022bfcf376f189 100644
--- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/dynamic_extent.hpp
+++ b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/dynamic_extent.hpp
@@ -1,73 +1,35 @@
-/*
 //@HEADER
 // ************************************************************************
 //
-//                        Kokkos v. 2.0
-//              Copyright (2019) Sandia Corporation
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
 //
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// Under the terms of Contract DE-NA0003525 with NTESS,
 // the U.S. Government retains certain rights in this software.
 //
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
 //@HEADER
-*/
-
 #pragma once
 
 #include "macros.hpp"
 
+#if defined(__cpp_lib_span)
+#include <span>
+#endif
+
 #include <cstddef>  // size_t
 #include <limits>   // numeric_limits
 
-namespace std {
-namespace experimental {
-
+namespace MDSPAN_IMPL_STANDARD_NAMESPACE {
+#if defined(__cpp_lib_span)
+using std::dynamic_extent;
+#else
 _MDSPAN_INLINE_VARIABLE constexpr auto dynamic_extent = std::numeric_limits<size_t>::max();
-
-namespace detail {
-
-template <class>
-constexpr auto __make_dynamic_extent() {
-  return dynamic_extent;
-}
-
-template <size_t>
-constexpr auto __make_dynamic_extent_integral() {
-  return dynamic_extent;
-}
-
-} // end namespace detail
-
-} // end namespace experimental
-} // namespace std
+#endif
+} // namespace MDSPAN_IMPL_STANDARD_NAMESPACE
 
 //==============================================================================================================
diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp
index ce562ed7e2f16b6a7fa6b860d2a94129fe39c0e8..0dd31c4cd0aacb38b1fff605f6101059195e2d90 100644
--- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp
+++ b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp
@@ -1,543 +1,594 @@
-/*
 //@HEADER
 // ************************************************************************
 //
-//                        Kokkos v. 2.0
-//              Copyright (2019) Sandia Corporation
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
 //
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// Under the terms of Contract DE-NA0003525 with NTESS,
 // the U.S. Government retains certain rights in this software.
 //
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
 //@HEADER
-*/
 
 #pragma once
+#include "dynamic_extent.hpp"
 
-#include "macros.hpp"
-#include "static_array.hpp"
-#include "standard_layout_static_array.hpp"
-#include "trait_backports.hpp" // integer_sequence, etc.
-
-#if !defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-#  include "no_unique_address.hpp"
+#ifdef __cpp_lib_span
+#include <span>
 #endif
-
 #include <array>
-#include <cstddef>
 
-namespace std {
-namespace experimental {
+#include <cinttypes>
 
+namespace MDSPAN_IMPL_STANDARD_NAMESPACE {
 namespace detail {
 
-template<size_t ... Extents>
-struct _count_dynamic_extents;
+// Function used to check compatibility of extents in converting constructor
+// can't be a private member function for some reason.
+template <size_t... Extents, size_t... OtherExtents>
+static constexpr std::integral_constant<bool, false> __check_compatible_extents(
+    std::integral_constant<bool, false>,
+    std::integer_sequence<size_t, Extents...>,
+    std::integer_sequence<size_t, OtherExtents...>) noexcept {
+  return {};
+}
+
+// This helper prevents ICE's on MSVC.
+template <size_t Lhs, size_t Rhs>
+struct __compare_extent_compatible : std::integral_constant<bool,
+     Lhs == dynamic_extent ||
+     Rhs == dynamic_extent ||
+     Lhs == Rhs>
+{};
 
-template<size_t E, size_t ... Extents>
-struct _count_dynamic_extents<E,Extents...> {
-  static constexpr size_t val = (E==dynamic_extent?1:0) + _count_dynamic_extents<Extents...>::val;
+template <size_t... Extents, size_t... OtherExtents>
+static constexpr std::integral_constant<
+    bool, _MDSPAN_FOLD_AND(__compare_extent_compatible<Extents, OtherExtents>::value)>
+__check_compatible_extents(
+    std::integral_constant<bool, true>,
+    std::integer_sequence<size_t, Extents...>,
+    std::integer_sequence<size_t, OtherExtents...>) noexcept {
+  return {};
+}
+
+// ------------------------------------------------------------------
+// ------------ static_array ----------------------------------------
+// ------------------------------------------------------------------
+
+// array like class which provides an array of static values with get
+// function and operator [].
+
+// Implementation of Static Array with recursive implementation of get.
+template <size_t R, class T, T... Extents> struct static_array_impl;
+
+template <size_t R, class T, T FirstExt, T... Extents>
+struct static_array_impl<R, T, FirstExt, Extents...> {
+  MDSPAN_INLINE_FUNCTION
+  constexpr static T get(size_t r) {
+    if (r == R)
+      return FirstExt;
+    else
+      return static_array_impl<R + 1, T, Extents...>::get(r);
+  }
+  template <size_t r> MDSPAN_INLINE_FUNCTION constexpr static T get() {
+#if MDSPAN_HAS_CXX_17
+    if constexpr (r == R)
+      return FirstExt;
+    else
+      return static_array_impl<R + 1, T, Extents...>::template get<r>();
+#else
+    get(r);
+#endif
+  }
 };
 
-template<>
-struct _count_dynamic_extents<> {
-  static constexpr size_t val = 0;
+// End the recursion
+template <size_t R, class T, T FirstExt>
+struct static_array_impl<R, T, FirstExt> {
+  MDSPAN_INLINE_FUNCTION
+  constexpr static T get(size_t) { return FirstExt; }
+  template <size_t> MDSPAN_INLINE_FUNCTION constexpr static T get() {
+    return FirstExt;
+  }
 };
 
-template <size_t... Extents, size_t... OtherExtents>
-static constexpr std::false_type _check_compatible_extents(
-  std::false_type, std::integer_sequence<size_t, Extents...>, std::integer_sequence<size_t, OtherExtents...>
-) noexcept { return { }; }
+// Don't start recursion if size 0
+template <class T> struct static_array_impl<0, T> {
+  MDSPAN_INLINE_FUNCTION
+  constexpr static T get(size_t) { return T(); }
+  template <size_t> MDSPAN_INLINE_FUNCTION constexpr static T get() {
+    return T();
+  }
+};
 
-template <size_t... Extents, size_t... OtherExtents>
-static std::integral_constant<
-  bool,
-  _MDSPAN_FOLD_AND(
-    (
-      Extents == dynamic_extent
-        || OtherExtents == dynamic_extent
-        || Extents == OtherExtents
-    ) /* && ... */
-  )
->
-_check_compatible_extents(
-  std::true_type, std::integer_sequence<size_t, Extents...>, std::integer_sequence<size_t, OtherExtents...>
-) noexcept { return { }; }
-
-struct __extents_tag { };
+// Static array, provides get<r>(), get(r) and operator[r]
+template <class T, T... Values> struct static_array:
+  public static_array_impl<0, T, Values...>  {
 
-} // end namespace detail
-
-template <class ThisIndexType, size_t... Extents>
-class extents
-#if !defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-  : private detail::__no_unique_address_emulation<
-      detail::__partially_static_sizes_tagged<detail::__extents_tag, ThisIndexType , size_t, Extents...>>
-#endif
-{
 public:
+  using value_type = T;
 
-  using rank_type = size_t;
-  using index_type = ThisIndexType;
-  using size_type = make_unsigned_t<index_type>;
+  MDSPAN_INLINE_FUNCTION
+  constexpr static size_t size() { return sizeof...(Values); }
+};
 
-// internal typedefs which for technical reasons are public
-  using __storage_t = detail::__partially_static_sizes_tagged<detail::__extents_tag, index_type, size_t, Extents...>;
 
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-  _MDSPAN_NO_UNIQUE_ADDRESS __storage_t __storage_;
-#else
-  using __base_t = detail::__no_unique_address_emulation<__storage_t>;
-#endif
+// ------------------------------------------------------------------
+// ------------ index_sequence_scan ---------------------------------
+// ------------------------------------------------------------------
 
-// private members dealing with the way we internally store dynamic extents
- private:
+// index_sequence_scan takes compile time values and provides get(r)
+// and get<r>() which return the sum of the first r-1 values.
 
-  MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14
-  __storage_t& __storage() noexcept {
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-    return __storage_;
-#else
-    return this->__base_t::__ref();
-#endif
+// Recursive implementation for get
+template <size_t R, size_t... Values> struct index_sequence_scan_impl;
+
+template <size_t R, size_t FirstVal, size_t... Values>
+struct index_sequence_scan_impl<R, FirstVal, Values...> {
+  MDSPAN_INLINE_FUNCTION
+  constexpr static size_t get(size_t r) {
+    if (r > R)
+      return FirstVal + index_sequence_scan_impl<R + 1, Values...>::get(r);
+    else
+      return 0;
+  }
+};
+
+template <size_t R, size_t FirstVal>
+struct index_sequence_scan_impl<R, FirstVal> {
+#if defined(__NVCC__) || defined(__NVCOMPILER)
+  // NVCC warns about pointless comparison with 0 for R==0 and r being const
+  // evaluatable and also 0.
+  MDSPAN_INLINE_FUNCTION
+  constexpr static size_t get(size_t r) {
+    return static_cast<int64_t>(R) > static_cast<int64_t>(r) ? FirstVal : 0;
   }
-  MDSPAN_FORCE_INLINE_FUNCTION
-  constexpr __storage_t const& __storage() const noexcept {
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-    return __storage_;
 #else
-    return this->__base_t::__ref();
+  MDSPAN_INLINE_FUNCTION
+  constexpr static size_t get(size_t r) { return R > r ? FirstVal : 0; }
 #endif
-  }
+};
+template <> struct index_sequence_scan_impl<0> {
+  MDSPAN_INLINE_FUNCTION
+  constexpr static size_t get(size_t) { return 0; }
+};
 
-  template <size_t... Idxs>
-  MDSPAN_FORCE_INLINE_FUNCTION
-  static constexpr
-  std::size_t _static_extent_impl(size_t n, std::integer_sequence<size_t, Idxs...>) noexcept {
-    return _MDSPAN_FOLD_PLUS_RIGHT(((Idxs == n) ? Extents : 0), /* + ... + */ 0);
-  }
+// ------------------------------------------------------------------
+// ------------ possibly_empty_array  -------------------------------
+// ------------------------------------------------------------------
 
-  template <class, size_t...>
-  friend class extents;
+// array like class which provides get function and operator [], and
+// has a specialization for the size 0 case.
+// This is needed to make the maybe_static_array be truly empty, for
+// all static values.
 
-  template <class OtherIndexType, size_t... OtherExtents, size_t... Idxs>
+template <class T, size_t N> struct possibly_empty_array {
+  T vals[N];
   MDSPAN_INLINE_FUNCTION
-  constexpr bool _eq_impl(std::experimental::extents<OtherIndexType, OtherExtents...>, false_type, index_sequence<Idxs...>) const noexcept { return false; }
-  template <class OtherIndexType, size_t... OtherExtents, size_t... Idxs>
+  constexpr T &operator[](size_t r) { return vals[r]; }
   MDSPAN_INLINE_FUNCTION
-  constexpr bool _eq_impl(
-    std::experimental::extents<OtherIndexType, OtherExtents...> other,
-    true_type, index_sequence<Idxs...>
-  ) const noexcept {
-    return _MDSPAN_FOLD_AND(
-      (__storage().template __get_n<Idxs>() == other.__storage().template __get_n<Idxs>()) /* && ... */
-    );
-  }
+  constexpr const T &operator[](size_t r) const { return vals[r]; }
+};
 
-  template <class OtherIndexType, size_t... OtherExtents, size_t... Idxs>
+template <class T> struct possibly_empty_array<T, 0> {
   MDSPAN_INLINE_FUNCTION
-  constexpr bool _not_eq_impl(std::experimental::extents<OtherIndexType, OtherExtents...>, false_type, index_sequence<Idxs...>) const noexcept { return true; }
-  template <class OtherIndexType, size_t... OtherExtents, size_t... Idxs>
+  constexpr T operator[](size_t) { return T(); }
   MDSPAN_INLINE_FUNCTION
-  constexpr bool _not_eq_impl(
-    std::experimental::extents<OtherIndexType, OtherExtents...> other,
-    true_type, index_sequence<Idxs...>
-  ) const noexcept {
-    return _MDSPAN_FOLD_OR(
-      (__storage().template __get_n<Idxs>() != other.__storage().template __get_n<Idxs>()) /* || ... */
-    );
-  }
+  constexpr const T operator[](size_t) const { return T(); }
+};
 
-#if !defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-  MDSPAN_INLINE_FUNCTION constexpr explicit
-  extents(__base_t&& __b) noexcept
-    : __base_t(::std::move(__b))
-  { }
-#endif
+// ------------------------------------------------------------------
+// ------------ maybe_static_array ----------------------------------
+// ------------------------------------------------------------------
+
+// array like class which has a mix of static and runtime values but
+// only stores the runtime values.
+// The type of the static and the runtime values can be different.
+// The position of a dynamic value is indicated through a tag value.
+template <class TDynamic, class TStatic, TStatic dyn_tag, TStatic... Values>
+struct maybe_static_array {
+
+  static_assert(std::is_convertible<TStatic, TDynamic>::value, "maybe_static_array: TStatic must be convertible to TDynamic");
+  static_assert(std::is_convertible<TDynamic, TStatic>::value, "maybe_static_array: TDynamic must be convertible to TStatic");
+
+private:
+  // Static values member
+  using static_vals_t = static_array<TStatic, Values...>;
+  constexpr static size_t m_size = sizeof...(Values);
+  constexpr static size_t m_size_dynamic =
+      _MDSPAN_FOLD_PLUS_RIGHT((Values == dyn_tag), 0);
+
+  // Dynamic values member
+  _MDSPAN_NO_UNIQUE_ADDRESS possibly_empty_array<TDynamic, m_size_dynamic>
+      m_dyn_vals;
+
+  // static mapping of indices to the position in the dynamic values array
+  using dyn_map_t = index_sequence_scan_impl<0, static_cast<size_t>(Values == dyn_tag)...>;
+public:
 
+  // two types for static and dynamic values
+  using value_type = TDynamic;
+  using static_value_type = TStatic;
+  // tag value indicating dynamic value
+  constexpr static static_value_type tag_value = dyn_tag;
 
-// public interface:
-public:
-  /* Defined above for use in the private code
-  using rank_type = size_t;
-  using index_type = ThisIndexType;
-  */
+  constexpr maybe_static_array() = default;
 
+  // constructor for all static values
+  // TODO: add precondition check?
+  MDSPAN_TEMPLATE_REQUIRES(class... Vals,
+                           /* requires */ ((m_size_dynamic == 0) &&
+                                           (sizeof...(Vals) > 0)))
   MDSPAN_INLINE_FUNCTION
-  static constexpr rank_type rank() noexcept { return sizeof...(Extents); }
-  MDSPAN_INLINE_FUNCTION
-  static constexpr rank_type rank_dynamic() noexcept { return _MDSPAN_FOLD_PLUS_RIGHT((rank_type(Extents == dynamic_extent)), /* + ... + */ 0); }
+  constexpr maybe_static_array(Vals...) : m_dyn_vals{} {}
 
-  //--------------------------------------------------------------------------------
-  // Constructors, Destructors, and Assignment
+  // constructors from dynamic values only
+  MDSPAN_TEMPLATE_REQUIRES(class... DynVals,
+                           /* requires */ (sizeof...(DynVals) ==
+                                               m_size_dynamic &&
+                                           m_size_dynamic > 0))
+  MDSPAN_INLINE_FUNCTION
+  constexpr maybe_static_array(DynVals... vals)
+      : m_dyn_vals{static_cast<TDynamic>(vals)...} {}
 
-  // Default constructor
-  MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr extents() noexcept = default;
 
-  // Converting constructor
-  MDSPAN_TEMPLATE_REQUIRES(
-    class OtherIndexType, size_t... OtherExtents,
-    /* requires */ (
-      /* multi-stage check to protect from invalid pack expansion when sizes don't match? */
-      decltype(detail::_check_compatible_extents(
-        std::integral_constant<bool, sizeof...(Extents) == sizeof...(OtherExtents)>{},
-        std::integer_sequence<size_t, Extents...>{},
-        std::integer_sequence<size_t, OtherExtents...>{}
-      ))::value
-    )
-  )
-  MDSPAN_INLINE_FUNCTION
-  MDSPAN_CONDITIONAL_EXPLICIT(
-    (((Extents != dynamic_extent) && (OtherExtents == dynamic_extent)) || ...) ||
-    (std::numeric_limits<index_type>::max() < std::numeric_limits<OtherIndexType>::max()))
-  constexpr extents(const extents<OtherIndexType, OtherExtents...>& __other)
-    noexcept
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-    : __storage_{
-#else
-    : __base_t(__base_t{__storage_t{
-#endif
-        __other.__storage().__enable_psa_conversion()
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-      }
-#else
-      }})
-#endif
-  {
-    /* TODO: precondition check
-     * other.extent(r) equals Er for each r for which Er is a static extent, and
-     * either
-     *   - sizeof...(OtherExtents) is zero, or
-     *   - other.extent(r) is a representable value of type index_type for all rank index r of other
-     */
+  MDSPAN_TEMPLATE_REQUIRES(class T, size_t N,
+                           /* requires */ (N == m_size_dynamic && N > 0))
+  MDSPAN_INLINE_FUNCTION
+  constexpr maybe_static_array(const std::array<T, N> &vals) {
+    for (size_t r = 0; r < N; r++)
+      m_dyn_vals[r] = static_cast<TDynamic>(vals[r]);
   }
 
-#ifdef __NVCC__
-    MDSPAN_TEMPLATE_REQUIRES(
-    class... Integral,
-    /* requires */ (
-      // TODO: check whether the other version works with newest NVCC, doesn't with 11.4
-      // NVCC seems to pick up rank_dynamic from the wrong extents type???
-      _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(is_convertible, Integral, index_type) /* && ... */) &&
-      _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(is_nothrow_constructible, index_type, Integral) /* && ... */) &&
-      // NVCC chokes on the fold thingy here so wrote the workaround
-      ((sizeof...(Integral) == detail::_count_dynamic_extents<Extents...>::val) ||
-       (sizeof...(Integral) == sizeof...(Extents)))
-      )
-    )
-#else
-    MDSPAN_TEMPLATE_REQUIRES(
-    class... Integral,
-    /* requires */ (
-       _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(is_convertible, Integral, index_type) /* && ... */) &&
-       _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(is_nothrow_constructible, index_type, Integral) /* && ... */) &&
-       ((sizeof...(Integral) == rank_dynamic()) || (sizeof...(Integral) == rank()))
-      )
-    )
-#endif
+  MDSPAN_TEMPLATE_REQUIRES(class T, size_t N,
+                           /* requires */ (N == m_size_dynamic && N == 0))
   MDSPAN_INLINE_FUNCTION
-  explicit constexpr extents(Integral... exts) noexcept
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-    : __storage_{
-#else
-    : __base_t(__base_t{typename __base_t::__stored_type{
+  constexpr maybe_static_array(const std::array<T, N> &) : m_dyn_vals{} {}
+
+#ifdef __cpp_lib_span
+  MDSPAN_TEMPLATE_REQUIRES(class T, size_t N,
+                           /* requires */ (N == m_size_dynamic))
+  MDSPAN_INLINE_FUNCTION
+  constexpr maybe_static_array(const std::span<T, N> &vals) {
+    for (size_t r = 0; r < N; r++)
+      m_dyn_vals[r] = static_cast<TDynamic>(vals[r]);
+  }
 #endif
-      std::conditional_t<sizeof...(Integral)==rank_dynamic(),
-        detail::__construct_psa_from_dynamic_exts_values_tag_t,
-        detail::__construct_psa_from_all_exts_values_tag_t>(),
-        static_cast<index_type>(exts)...
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
+
+  // constructors from all values
+  MDSPAN_TEMPLATE_REQUIRES(class... DynVals,
+                           /* requires */ (sizeof...(DynVals) !=
+                                               m_size_dynamic &&
+                                           m_size_dynamic > 0))
+  MDSPAN_INLINE_FUNCTION
+  constexpr maybe_static_array(DynVals... vals)
+    : m_dyn_vals{} {
+    static_assert((sizeof...(DynVals) == m_size), "Invalid number of values.");
+    TDynamic values[m_size]{static_cast<TDynamic>(vals)...};
+    for (size_t r = 0; r < m_size; r++) {
+      TStatic static_val = static_vals_t::get(r);
+      if (static_val == dyn_tag) {
+        m_dyn_vals[dyn_map_t::get(r)] = values[r];
+      }
+// Precondition check
+#ifdef _MDSPAN_DEBUG
+      else {
+        assert(values[r] == static_cast<TDynamic>(static_val));
       }
-#else
-      }})
 #endif
-  {
-    /* TODO: precondition check
-     * If sizeof...(IndexTypes) != rank_dynamic() is true, exts_arr[r] equals Er for each r for which Er is a static extent, and
-     * either
-     *   - sizeof...(exts) == 0 is true, or
-     *   - each element of exts is nonnegative and is a representable value of type index_type.
-     */
+    }
   }
 
-    // TODO: check whether this works with newest NVCC, doesn't with 11.4
-#ifdef __NVCC__
-  // NVCC seems to pick up rank_dynamic from the wrong extents type???
-  // NVCC chokes on the fold thingy here so wrote the workaround
   MDSPAN_TEMPLATE_REQUIRES(
-    class IndexType, size_t N,
-    /* requires */ (
-      _MDSPAN_TRAIT(is_convertible, IndexType, index_type) &&
-      _MDSPAN_TRAIT(is_nothrow_constructible, index_type, IndexType) &&
-      ((N == detail::_count_dynamic_extents<Extents...>::val) ||
-       (N == sizeof...(Extents)))
-    )
-  )
-#else
-    MDSPAN_TEMPLATE_REQUIRES(
-        class IndexType, size_t N,
-        /* requires */ (
-          _MDSPAN_TRAIT(is_convertible, IndexType, index_type) &&
-          _MDSPAN_TRAIT(is_nothrow_constructible, index_type, IndexType) &&
-          (N == rank() || N == rank_dynamic())
-    )
-  )
-#endif
-  MDSPAN_CONDITIONAL_EXPLICIT(N != rank_dynamic())
+      class T, size_t N,
+      /* requires */ (N != m_size_dynamic && m_size_dynamic > 0))
   MDSPAN_INLINE_FUNCTION
-  constexpr
-  extents(std::array<IndexType, N> const& exts) noexcept
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-    : __storage_{
-#else
-    : __base_t(__base_t{typename __base_t::__stored_type{
+  constexpr maybe_static_array(const std::array<T, N> &vals) {
+    static_assert((N == m_size), "Invalid number of values.");
+// Precondition check
+#ifdef _MDSPAN_DEBUG
+    assert(N == m_size);
 #endif
-      std::conditional_t<N==rank_dynamic(),
-        detail::__construct_psa_from_dynamic_exts_array_tag_t<0>,
-        detail::__construct_psa_from_all_exts_array_tag_t>(),
-      std::array<IndexType,N>{exts}
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
+    for (size_t r = 0; r < m_size; r++) {
+      TStatic static_val = static_vals_t::get(r);
+      if (static_val == dyn_tag) {
+        m_dyn_vals[dyn_map_t::get(r)] = static_cast<TDynamic>(vals[r]);
+      }
+// Precondition check
+#ifdef _MDSPAN_DEBUG
+      else {
+        assert(static_cast<TDynamic>(vals[r]) ==
+               static_cast<TDynamic>(static_val));
       }
-#else
-      }})
 #endif
-  {
-    /* TODO: precondition check
-     * If N != rank_dynamic() is true, exts[r] equals Er for each r for which Er is a static extent, and
-     * either
-     *   - N is zero, or
-     *   - exts[r] is nonnegative and is a representable value of type index_type for all rank index r
-     */
+    }
   }
 
 #ifdef __cpp_lib_span
-  // TODO: check whether the below works with newest NVCC, doesn't with 11.4
-#ifdef __NVCC__
-  // NVCC seems to pick up rank_dynamic from the wrong extents type???
-  // NVCC chokes on the fold thingy here so wrote the workaround
   MDSPAN_TEMPLATE_REQUIRES(
-    class IndexType, size_t N,
-    /* requires */ (
-      _MDSPAN_TRAIT(is_convertible, IndexType, index_type) &&
-      _MDSPAN_TRAIT(is_nothrow_constructible, index_type, IndexType) &&
-      ((N == detail::_count_dynamic_extents<Extents...>::val) ||
-       (N == sizeof...(Extents)))
-    )
-  )
-#else
-    MDSPAN_TEMPLATE_REQUIRES(
-        class IndexType, size_t N,
-        /* requires */ (
-          _MDSPAN_TRAIT(is_convertible, IndexType, index_type) &&
-          _MDSPAN_TRAIT(is_nothrow_constructible, index_type, IndexType) &&
-          (N == rank() || N == rank_dynamic())
-    )
-  )
-#endif
-  MDSPAN_CONDITIONAL_EXPLICIT(N != rank_dynamic())
+      class T, size_t N,
+      /* requires */ (N != m_size_dynamic && m_size_dynamic > 0))
   MDSPAN_INLINE_FUNCTION
-  constexpr
-  extents(std::span<IndexType, N> exts) noexcept
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-    : __storage_{
-#else
-    : __base_t(__base_t{typename __base_t::__stored_type{
+  constexpr maybe_static_array(const std::span<T, N> &vals) {
+    static_assert((N == m_size) || (m_size == dynamic_extent));
+#ifdef _MDSPAN_DEBUG
+    assert(N == m_size);
 #endif
-      std::conditional_t<N==rank_dynamic(),
-        detail::__construct_psa_from_dynamic_exts_array_tag_t<0>,
-        detail::__construct_psa_from_all_exts_array_tag_t>(),
-      exts
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
+    for (size_t r = 0; r < m_size; r++) {
+      TStatic static_val = static_vals_t::get(r);
+      if (static_val == dyn_tag) {
+        m_dyn_vals[dyn_map_t::get(r)] = static_cast<TDynamic>(vals[r]);
+      }
+#ifdef _MDSPAN_DEBUG
+      else {
+        assert(static_cast<TDynamic>(vals[r]) ==
+               static_cast<TDynamic>(static_val));
       }
-#else
-      }})
 #endif
-  {
-    /* TODO: precondition check
-     * If N != rank_dynamic() is true, exts[r] equals Er for each r for which Er is a static extent, and
-     * either
-     *   - N is zero, or
-     *   - exts[r] is nonnegative and is a representable value of type index_type for all rank index r
-     */
+    }
   }
 #endif
 
-  // Need this constructor for some submdspan implementation stuff
-  // for the layout_stride case where I use an extents object for strides
+  // access functions
   MDSPAN_INLINE_FUNCTION
-  constexpr explicit
-  extents(__storage_t const& sto ) noexcept
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-    : __storage_{
-#else
-    : __base_t(__base_t{
-#endif
-        sto
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-      }
-#else
-      })
-#endif
-  { }
-
-  //--------------------------------------------------------------------------------
+  constexpr static TStatic static_value(size_t r) { return static_vals_t::get(r); }
 
   MDSPAN_INLINE_FUNCTION
-  static constexpr
-  size_t static_extent(size_t n) noexcept {
-    // Can't do assert here since that breaks true constexpr ness
-    // assert(n<rank());
-    return _static_extent_impl(n, std::make_integer_sequence<size_t, sizeof...(Extents)>{});
+  constexpr TDynamic value(size_t r) const {
+    TStatic static_val = static_vals_t::get(r);
+    return static_val == dyn_tag ? m_dyn_vals[dyn_map_t::get(r)]
+                                        : static_cast<TDynamic>(static_val);
   }
+  MDSPAN_INLINE_FUNCTION
+  constexpr TDynamic operator[](size_t r) const { return value(r); }
+
 
+  // observers
   MDSPAN_INLINE_FUNCTION
-  constexpr
-  index_type extent(size_t n) const noexcept {
-    // Can't do assert here since that breaks true constexpr ness
-    // assert(n<rank());
-    return __storage().__get(n);
-  }
+  constexpr static size_t size() { return m_size; }
+  MDSPAN_INLINE_FUNCTION
+  constexpr static size_t size_dynamic() { return m_size_dynamic; }
+};
+
+} // namespace detail
+} // namespace MDSPAN_IMPL_STANDARD_NAMESPACE
+
+namespace MDSPAN_IMPL_STANDARD_NAMESPACE {
+
+// ------------------------------------------------------------------
+// ------------ extents ---------------------------------------------
+// ------------------------------------------------------------------
+
+// Class to describe the extents of a multi dimensional array.
+// Used by mdspan, mdarray and layout mappings.
+// See ISO C++ standard [mdspan.extents]
+
+template <class IndexType, size_t... Extents> class extents {
+public:
+  // typedefs for integral types used
+  using index_type = IndexType;
+  using size_type = std::make_unsigned_t<index_type>;
+  using rank_type = size_t;
+
+  static_assert(std::is_integral<index_type>::value && !std::is_same<index_type, bool>::value,
+                MDSPAN_IMPL_STANDARD_NAMESPACE_STRING "::extents::index_type must be a signed or unsigned integer type");
+private:
+  constexpr static rank_type m_rank = sizeof...(Extents);
+  constexpr static rank_type m_rank_dynamic =
+      _MDSPAN_FOLD_PLUS_RIGHT((Extents == dynamic_extent), /* + ... + */ 0);
 
-  //--------------------------------------------------------------------------------
+  // internal storage type using maybe_static_array
+  using vals_t =
+      detail::maybe_static_array<IndexType, size_t, dynamic_extent, Extents...>;
+  _MDSPAN_NO_UNIQUE_ADDRESS vals_t m_vals;
 
-  template<class OtherIndexType, size_t... RHS>
+public:
+  // [mdspan.extents.obs], observers of multidimensional index space
   MDSPAN_INLINE_FUNCTION
-  friend constexpr bool operator==(extents const& lhs, extents<OtherIndexType, RHS...> const& rhs) noexcept {
-    return lhs._eq_impl(
-      rhs, std::integral_constant<bool, (sizeof...(RHS) == rank())>{},
-      make_index_sequence<sizeof...(RHS)>{}
-    );
-  }
+  constexpr static rank_type rank() noexcept { return m_rank; }
+  MDSPAN_INLINE_FUNCTION
+  constexpr static rank_type rank_dynamic() noexcept { return m_rank_dynamic; }
 
-#if !(MDSPAN_HAS_CXX_20)
-  template<class OtherIndexType, size_t... RHS>
   MDSPAN_INLINE_FUNCTION
-  friend constexpr bool operator!=(extents const& lhs, extents<OtherIndexType, RHS...> const& rhs) noexcept {
-    return lhs._not_eq_impl(
-      rhs, std::integral_constant<bool, (sizeof...(RHS) == rank())>{},
-      make_index_sequence<sizeof...(RHS)>{}
-    );
+  constexpr index_type extent(rank_type r) const noexcept { return m_vals.value(r); }
+  MDSPAN_INLINE_FUNCTION
+  constexpr static size_t static_extent(rank_type r) noexcept {
+    return vals_t::static_value(r);
   }
-#endif
 
-  // End of public interface
+  // [mdspan.extents.cons], constructors
+  MDSPAN_INLINE_FUNCTION_DEFAULTED
+  constexpr extents() noexcept = default;
 
-public:  // (but not really)
+  // Construction from just dynamic or all values.
+  // Precondition check is deferred to maybe_static_array constructor
+  MDSPAN_TEMPLATE_REQUIRES(
+      class... OtherIndexTypes,
+      /* requires */ (
+          _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, OtherIndexTypes,
+                                         index_type) /* && ... */) &&
+          _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_nothrow_constructible, index_type,
+                                         OtherIndexTypes) /* && ... */) &&
+          (sizeof...(OtherIndexTypes) == m_rank ||
+           sizeof...(OtherIndexTypes) == m_rank_dynamic)))
+  MDSPAN_INLINE_FUNCTION
+  constexpr explicit extents(OtherIndexTypes... dynvals) noexcept
+      : m_vals(static_cast<index_type>(dynvals)...) {}
 
-  MDSPAN_INLINE_FUNCTION static constexpr
-  extents __make_extents_impl(detail::__partially_static_sizes<index_type, size_t,Extents...>&& __bs) noexcept {
-    // This effectively amounts to a sideways cast that can be done in a constexpr
-    // context, but we have to do it to handle the case where the extents and the
-    // strides could accidentally end up with the same types in their hierarchies
-    // somehow (which would cause layout_stride::mapping to not be standard_layout)
-    return extents(
-#if !defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-      __base_t{
-#endif
-        ::std::move(__bs.template __with_tag<detail::__extents_tag>())
-#if !defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-      }
+  MDSPAN_TEMPLATE_REQUIRES(
+      class OtherIndexType, size_t N,
+      /* requires */
+      (
+          _MDSPAN_TRAIT(std::is_convertible, OtherIndexType, index_type) &&
+          _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type,
+              OtherIndexType) &&
+          (N == m_rank || N == m_rank_dynamic)))
+  MDSPAN_INLINE_FUNCTION
+  MDSPAN_CONDITIONAL_EXPLICIT(N != m_rank_dynamic)
+  constexpr extents(const std::array<OtherIndexType, N> &exts) noexcept
+      : m_vals(std::move(exts)) {}
+
+#ifdef __cpp_lib_span
+  MDSPAN_TEMPLATE_REQUIRES(
+      class OtherIndexType, size_t N,
+      /* requires */
+      (_MDSPAN_TRAIT(std::is_convertible, OtherIndexType, index_type) &&
+       _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, OtherIndexType) &&
+       (N == m_rank || N == m_rank_dynamic)))
+  MDSPAN_INLINE_FUNCTION
+  MDSPAN_CONDITIONAL_EXPLICIT(N != m_rank_dynamic)
+  constexpr extents(const std::span<OtherIndexType, N> &exts) noexcept
+      : m_vals(std::move(exts)) {}
 #endif
-    );
+
+private:
+  // Function to construct extents storage from other extents.
+  // With C++ 17 the first two variants could be collapsed using if constexpr
+  // in which case you don't need all the requires clauses.
+  // in C++ 14 mode that doesn't work due to infinite recursion
+  MDSPAN_TEMPLATE_REQUIRES(
+      size_t DynCount, size_t R, class OtherExtents, class... DynamicValues,
+      /* requires */ ((R < m_rank) && (static_extent(R) == dynamic_extent)))
+  MDSPAN_INLINE_FUNCTION
+  vals_t __construct_vals_from_extents(std::integral_constant<size_t, DynCount>,
+                                       std::integral_constant<size_t, R>,
+                                       const OtherExtents &exts,
+                                       DynamicValues... dynamic_values) noexcept {
+    return __construct_vals_from_extents(
+        std::integral_constant<size_t, DynCount + 1>(),
+        std::integral_constant<size_t, R + 1>(), exts, dynamic_values...,
+        exts.extent(R));
   }
 
-  template <size_t N>
-  MDSPAN_FORCE_INLINE_FUNCTION
-  constexpr
-  index_type __extent() const noexcept {
-    return __storage().template __get_n<N>();
+  MDSPAN_TEMPLATE_REQUIRES(
+      size_t DynCount, size_t R, class OtherExtents, class... DynamicValues,
+      /* requires */ ((R < m_rank) && (static_extent(R) != dynamic_extent)))
+  MDSPAN_INLINE_FUNCTION
+  vals_t __construct_vals_from_extents(std::integral_constant<size_t, DynCount>,
+                                       std::integral_constant<size_t, R>,
+                                       const OtherExtents &exts,
+                                       DynamicValues... dynamic_values) noexcept {
+    return __construct_vals_from_extents(
+        std::integral_constant<size_t, DynCount>(),
+        std::integral_constant<size_t, R + 1>(), exts, dynamic_values...);
+  }
+
+  MDSPAN_TEMPLATE_REQUIRES(
+      size_t DynCount, size_t R, class OtherExtents, class... DynamicValues,
+      /* requires */ ((R == m_rank) && (DynCount == m_rank_dynamic)))
+  MDSPAN_INLINE_FUNCTION
+  vals_t __construct_vals_from_extents(std::integral_constant<size_t, DynCount>,
+                                       std::integral_constant<size_t, R>,
+                                       const OtherExtents &,
+                                       DynamicValues... dynamic_values) noexcept {
+    return vals_t{static_cast<index_type>(dynamic_values)...};
   }
 
-  template <size_t N, size_t Default=dynamic_extent>
+public:
+
+  // Converting constructor from other extents specializations
+  MDSPAN_TEMPLATE_REQUIRES(
+      class OtherIndexType, size_t... OtherExtents,
+      /* requires */
+      (
+          /* multi-stage check to protect from invalid pack expansion when sizes
+             don't match? */
+          decltype(detail::__check_compatible_extents(
+              std::integral_constant<bool, sizeof...(Extents) ==
+                                               sizeof...(OtherExtents)>{},
+              std::integer_sequence<size_t, Extents...>{},
+              std::integer_sequence<size_t, OtherExtents...>{}))::value))
   MDSPAN_INLINE_FUNCTION
-  static constexpr
-  index_type __static_extent() noexcept {
-    return __storage_t::template __get_static_n<N, Default>();
+  MDSPAN_CONDITIONAL_EXPLICIT((((Extents != dynamic_extent) &&
+                                (OtherExtents == dynamic_extent)) ||
+                               ...) ||
+                              (std::numeric_limits<index_type>::max() <
+                               std::numeric_limits<OtherIndexType>::max()))
+  constexpr extents(const extents<OtherIndexType, OtherExtents...> &other) noexcept
+      : m_vals(__construct_vals_from_extents(
+            std::integral_constant<size_t, 0>(),
+            std::integral_constant<size_t, 0>(), other)) {}
+
+  // Comparison operator
+  template <class OtherIndexType, size_t... OtherExtents>
+  MDSPAN_INLINE_FUNCTION friend constexpr bool
+  operator==(const extents &lhs,
+             const extents<OtherIndexType, OtherExtents...> &rhs) noexcept {
+    bool value = true;
+    for (size_type r = 0; r < m_rank; r++)
+      value &= rhs.extent(r) == lhs.extent(r);
+    return value;
   }
 
+#if !(MDSPAN_HAS_CXX_20)
+  template <class OtherIndexType, size_t... OtherExtents>
+  MDSPAN_INLINE_FUNCTION friend constexpr bool
+  operator!=(extents const &lhs,
+             extents<OtherIndexType, OtherExtents...> const &rhs) noexcept {
+    return !(lhs == rhs);
+  }
+#endif
 };
 
+// Recursive helper classes to implement dextents alias for extents
 namespace detail {
 
-template <class IndexType, size_t Rank, class Extents = ::std::experimental::extents<IndexType>>
+template <class IndexType, size_t Rank,
+          class Extents = ::MDSPAN_IMPL_STANDARD_NAMESPACE::extents<IndexType>>
 struct __make_dextents;
 
 template <class IndexType, size_t Rank, size_t... ExtentsPack>
-struct __make_dextents<IndexType, Rank, ::std::experimental::extents<IndexType, ExtentsPack...>> {
-  using type = typename __make_dextents<IndexType, Rank - 1,
-    ::std::experimental::extents<IndexType, ::std::experimental::dynamic_extent, ExtentsPack...>>::type;
+struct __make_dextents<
+    IndexType, Rank, ::MDSPAN_IMPL_STANDARD_NAMESPACE::extents<IndexType, ExtentsPack...>>
+{
+  using type = typename __make_dextents<
+      IndexType, Rank - 1,
+      ::MDSPAN_IMPL_STANDARD_NAMESPACE::extents<IndexType,
+                                                ::MDSPAN_IMPL_STANDARD_NAMESPACE::dynamic_extent,
+                                                ExtentsPack...>>::type;
 };
 
 template <class IndexType, size_t... ExtentsPack>
-struct __make_dextents<IndexType, 0, ::std::experimental::extents<IndexType, ExtentsPack...>> {
-  using type = ::std::experimental::extents<IndexType, ExtentsPack...>;
+struct __make_dextents<
+    IndexType, 0, ::MDSPAN_IMPL_STANDARD_NAMESPACE::extents<IndexType, ExtentsPack...>>
+{
+  using type = ::MDSPAN_IMPL_STANDARD_NAMESPACE::extents<IndexType, ExtentsPack...>;
 };
 
 } // end namespace detail
 
+// [mdspan.extents.dextents], alias template
 template <class IndexType, size_t Rank>
 using dextents = typename detail::__make_dextents<IndexType, Rank>::type;
 
+// Deduction guide for extents
 #if defined(_MDSPAN_USE_CLASS_TEMPLATE_ARGUMENT_DEDUCTION)
 template <class... IndexTypes>
 extents(IndexTypes...)
-  -> extents<size_t, detail::__make_dynamic_extent<IndexTypes>()...>;
+    -> extents<size_t,
+               size_t((IndexTypes(), ::MDSPAN_IMPL_STANDARD_NAMESPACE::dynamic_extent))...>;
 #endif
 
+// Helper type traits for identifying a class as extents.
 namespace detail {
 
-template <class T>
-struct __is_extents : ::std::false_type {};
+template <class T> struct __is_extents : ::std::false_type {};
 
 template <class IndexType, size_t... ExtentsPack>
-struct __is_extents<::std::experimental::extents<IndexType, ExtentsPack...>> : ::std::true_type {};
+struct __is_extents<::MDSPAN_IMPL_STANDARD_NAMESPACE::extents<IndexType, ExtentsPack...>>
+    : ::std::true_type {};
 
 template <class T>
-static constexpr bool __is_extents_v = __is_extents<T>::value;
-
-
-template <typename Extents>
-struct __extents_to_partially_static_sizes;
-
-template <class IndexType, size_t... ExtentsPack>
-struct __extents_to_partially_static_sizes<::std::experimental::extents<IndexType, ExtentsPack...>> {
-  using type = detail::__partially_static_sizes<
-          typename ::std::experimental::extents<IndexType, ExtentsPack...>::index_type, size_t, 
-          ExtentsPack...>;
-};
-
-template <typename Extents>
-using __extents_to_partially_static_sizes_t = typename __extents_to_partially_static_sizes<Extents>::type;
+#if MDSPAN_HAS_CXX_17
+inline
+#else
+static
+#endif
+constexpr bool __is_extents_v = __is_extents<T>::value;
 
-} // end namespace detail
-} // end namespace experimental
-} // end namespace std
+} // namespace detail
+} // namespace MDSPAN_IMPL_STANDARD_NAMESPACE
diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/full_extent_t.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/full_extent_t.hpp
index e5ede0f9d594be0433e94b97cb716dcbb7779614..bd4b5c6a8baa31a21281b1528ae2f57a264a7d53 100644
--- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/full_extent_t.hpp
+++ b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/full_extent_t.hpp
@@ -1,56 +1,26 @@
-/*
 //@HEADER
 // ************************************************************************
 //
-//                        Kokkos v. 2.0
-//              Copyright (2019) Sandia Corporation
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
 //
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// Under the terms of Contract DE-NA0003525 with NTESS,
 // the U.S. Government retains certain rights in this software.
 //
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
 //@HEADER
-*/
-
 #pragma once
 
 #include "macros.hpp"
 
-namespace std {
-namespace experimental {
+namespace MDSPAN_IMPL_STANDARD_NAMESPACE {
 
 struct full_extent_t { explicit full_extent_t() = default; };
 
 _MDSPAN_INLINE_VARIABLE constexpr auto full_extent = full_extent_t{ };
 
-} // end namespace experimental
-} // namespace std
+} // namespace MDSPAN_IMPL_STANDARD_NAMESPACE
diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp
index 8cd2e14fb8d7039727ee7a1f87e5a29cdd875136..af44494a98d85bbb377a06e286a55b2b88d30414 100644
--- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp
+++ b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp
@@ -1,54 +1,25 @@
-/*
 //@HEADER
 // ************************************************************************
 //
-//                        Kokkos v. 2.0
-//              Copyright (2019) Sandia Corporation
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
 //
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// Under the terms of Contract DE-NA0003525 with NTESS,
 // the U.S. Government retains certain rights in this software.
 //
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
 //@HEADER
-*/
-
 #pragma once
 
 #include "macros.hpp"
 #include "trait_backports.hpp"
 #include "extents.hpp"
 
-namespace std {
-namespace experimental {
+namespace MDSPAN_IMPL_STANDARD_NAMESPACE {
 
 //==============================================================================
 
@@ -62,7 +33,8 @@ class layout_left::mapping {
     using layout_type = layout_left;
   private:
 
-    static_assert(detail::__is_extents_v<extents_type>, "std::experimental::layout_left::mapping must be instantiated with a specialization of std::experimental::extents.");
+    static_assert(detail::__is_extents_v<extents_type>,
+                  MDSPAN_IMPL_STANDARD_NAMESPACE_STRING "::layout_left::mapping must be instantiated with a specialization of " MDSPAN_IMPL_STANDARD_NAMESPACE_STRING "::extents.");
 
     template <class>
     friend class mapping;
@@ -72,18 +44,21 @@ class layout_left::mapping {
     struct __rank_count {};
 
     template <size_t r, size_t Rank, class I, class... Indices>
+    _MDSPAN_HOST_DEVICE
     constexpr index_type __compute_offset(
       __rank_count<r,Rank>, const I& i, Indices... idx) const {
       return __compute_offset(__rank_count<r+1,Rank>(), idx...) *
-                 __extents.template __extent<r>() + i;
+                 __extents.extent(r) + i;
     }
 
     template<class I>
+    _MDSPAN_HOST_DEVICE
     constexpr index_type __compute_offset(
       __rank_count<extents_type::rank()-1,extents_type::rank()>, const I& i) const {
       return i;
     }
 
+    _MDSPAN_HOST_DEVICE
     constexpr index_type __compute_offset(__rank_count<0,0>) const { return 0; }
 
   public:
@@ -93,6 +68,7 @@ class layout_left::mapping {
     MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping() noexcept = default;
     MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping(mapping const&) noexcept = default;
 
+    _MDSPAN_HOST_DEVICE
     constexpr mapping(extents_type const& __exts) noexcept
       :__extents(__exts)
     { }
@@ -100,10 +76,10 @@ class layout_left::mapping {
     MDSPAN_TEMPLATE_REQUIRES(
       class OtherExtents,
       /* requires */ (
-        _MDSPAN_TRAIT(is_constructible, extents_type, OtherExtents)
+        _MDSPAN_TRAIT(std::is_constructible, extents_type, OtherExtents)
       )
     )
-    MDSPAN_CONDITIONAL_EXPLICIT((!is_convertible<OtherExtents, extents_type>::value)) // needs two () due to comma
+    MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible<OtherExtents, extents_type>::value)) // needs two () due to comma
     MDSPAN_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14
     mapping(mapping<OtherExtents> const& other) noexcept // NOLINT(google-explicit-constructor)
       :__extents(other.extents())
@@ -117,11 +93,11 @@ class layout_left::mapping {
     MDSPAN_TEMPLATE_REQUIRES(
       class OtherExtents,
       /* requires */ (
-        _MDSPAN_TRAIT(is_constructible, extents_type, OtherExtents) &&
+        _MDSPAN_TRAIT(std::is_constructible, extents_type, OtherExtents) &&
         (extents_type::rank() <= 1)
       )
     )
-    MDSPAN_CONDITIONAL_EXPLICIT((!is_convertible<OtherExtents, extents_type>::value)) // needs two () due to comma
+    MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible<OtherExtents, extents_type>::value)) // needs two () due to comma
     MDSPAN_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14
     mapping(layout_right::mapping<OtherExtents> const& other) noexcept // NOLINT(google-explicit-constructor)
       :__extents(other.extents())
@@ -135,23 +111,25 @@ class layout_left::mapping {
     MDSPAN_TEMPLATE_REQUIRES(
       class OtherExtents,
       /* requires */ (
-        _MDSPAN_TRAIT(is_constructible, extents_type, OtherExtents)
+        _MDSPAN_TRAIT(std::is_constructible, extents_type, OtherExtents)
       )
     )
     MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 0))
     MDSPAN_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14
-    mapping(layout_stride::mapping<OtherExtents> const& other) // NOLINT(google-explicit-constructor)
+    mapping(layout_stride::mapping<OtherExtents> const& other) noexcept // NOLINT(google-explicit-constructor)
       :__extents(other.extents())
     {
        /*
         * TODO: check precondition
         * other.required_span_size() is a representable value of type index_type
         */
-       #ifndef __CUDA_ARCH__
-       size_t stride = 1;
+       #if !defined(_MDSPAN_HAS_CUDA) && !defined(_MDSPAN_HAS_HIP) && !defined(NDEBUG)
+       index_type stride = 1;
        for(rank_type r=0; r<__extents.rank(); r++) {
-         if(stride != other.stride(r))
+         if(stride != static_cast<index_type>(other.stride(r))) {
+           // Note this throw will lead to a terminate if triggered since this function is marked noexcept
            throw std::runtime_error("Assigning layout_stride to layout_left with invalid strides.");
+         }
          stride *= __extents.extent(r);
        }
        #endif
@@ -178,13 +156,14 @@ class layout_left::mapping {
       /* requires */ (
         (sizeof...(Indices) == extents_type::rank()) &&
         _MDSPAN_FOLD_AND(
-           (_MDSPAN_TRAIT(is_convertible, Indices, index_type) &&
-            _MDSPAN_TRAIT(is_nothrow_constructible, index_type, Indices))
+           (_MDSPAN_TRAIT(std::is_convertible, Indices, index_type) &&
+            _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, Indices))
         )
       )
     )
+    _MDSPAN_HOST_DEVICE
     constexpr index_type operator()(Indices... idxs) const noexcept {
-      return __compute_offset(__rank_count<0, extents_type::rank()>(), idxs...);
+      return __compute_offset(__rank_count<0, extents_type::rank()>(), static_cast<index_type>(idxs)...);
     }
 
 
@@ -198,7 +177,11 @@ class layout_left::mapping {
     MDSPAN_INLINE_FUNCTION constexpr bool is_strided() const noexcept { return true; }
 
     MDSPAN_INLINE_FUNCTION
-    constexpr index_type stride(rank_type i) const noexcept {
+    constexpr index_type stride(rank_type i) const noexcept
+#if MDSPAN_HAS_CXX_20
+      requires ( Extents::rank() > 0 )
+#endif
+    {
       index_type value = 1;
       for(rank_type r=0; r<i; r++) value*=__extents.extent(r);
       return value;
@@ -211,7 +194,7 @@ class layout_left::mapping {
     }
 
     // In C++ 20 the not equal exists if equal is found
-#if MDSPAN_HAS_CXX_20
+#if !(MDSPAN_HAS_CXX_20)
     template<class OtherExtents>
     MDSPAN_INLINE_FUNCTION
     friend constexpr bool operator!=(mapping const& lhs, mapping<OtherExtents> const& rhs) noexcept {
@@ -221,12 +204,12 @@ class layout_left::mapping {
 
     // Not really public, but currently needed to implement fully constexpr useable submdspan:
     template<size_t N, class SizeType, size_t ... E, size_t ... Idx>
-    constexpr index_type __get_stride(std::experimental::extents<SizeType, E...>,integer_sequence<size_t, Idx...>) const {
+    constexpr index_type __get_stride(MDSPAN_IMPL_STANDARD_NAMESPACE::extents<SizeType, E...>,std::integer_sequence<size_t, Idx...>) const {
       return _MDSPAN_FOLD_TIMES_RIGHT((Idx<N? __extents.template __extent<Idx>():1),1);
     }
     template<size_t N>
     constexpr index_type __stride() const noexcept {
-      return __get_stride<N>(__extents, make_index_sequence<extents_type::rank()>());
+      return __get_stride<N>(__extents, std::make_index_sequence<extents_type::rank()>());
     }
 
 private:
@@ -235,6 +218,5 @@ private:
 };
 
 
-} // end namespace experimental
-} // end namespace std
+} // end namespace MDSPAN_IMPL_STANDARD_NAMESPACE
 
diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp
index 118f3632c009d19eb0ef763329478c12a1444d8c..a0586484202efda912bab37d94df2999165fb079 100644
--- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp
+++ b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp
@@ -1,46 +1,18 @@
-/*
 //@HEADER
 // ************************************************************************
 //
-//                        Kokkos v. 2.0
-//              Copyright (2019) Sandia Corporation
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
 //
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// Under the terms of Contract DE-NA0003525 with NTESS,
 // the U.S. Government retains certain rights in this software.
 //
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
 //@HEADER
-*/
-
 #pragma once
 
 #include "macros.hpp"
@@ -49,8 +21,7 @@
 #include <stdexcept>
 #include "layout_stride.hpp"
 
-namespace std {
-namespace experimental {
+namespace MDSPAN_IMPL_STANDARD_NAMESPACE {
 
 //==============================================================================
 template <class Extents>
@@ -63,7 +34,8 @@ class layout_right::mapping {
     using layout_type = layout_right;
   private:
 
-    static_assert(detail::__is_extents_v<extents_type>, "std::experimental::layout_right::mapping must be instantiated with a specialization of std::experimental::extents.");
+    static_assert(detail::__is_extents_v<extents_type>,
+                  MDSPAN_IMPL_STANDARD_NAMESPACE_STRING "::layout_right::mapping must be instantiated with a specialization of " MDSPAN_IMPL_STANDARD_NAMESPACE_STRING "::extents.");
 
     template <class>
     friend class mapping;
@@ -73,21 +45,25 @@ class layout_right::mapping {
     struct __rank_count {};
 
     template <size_t r, size_t Rank, class I, class... Indices>
+    _MDSPAN_HOST_DEVICE
     constexpr index_type __compute_offset(
       index_type offset, __rank_count<r,Rank>, const I& i, Indices... idx) const {
-      return __compute_offset(offset * __extents.template __extent<r>() + i,__rank_count<r+1,Rank>(),  idx...);
+      return __compute_offset(offset * __extents.extent(r) + i,__rank_count<r+1,Rank>(),  idx...);
     }
 
     template<class I, class ... Indices>
+    _MDSPAN_HOST_DEVICE
     constexpr index_type __compute_offset(
       __rank_count<0,extents_type::rank()>, const I& i, Indices... idx) const {
       return __compute_offset(i,__rank_count<1,extents_type::rank()>(),idx...);
     }
 
+    _MDSPAN_HOST_DEVICE
     constexpr index_type __compute_offset(size_t offset, __rank_count<extents_type::rank(), extents_type::rank()>) const {
       return static_cast<index_type>(offset);
     }
 
+    _MDSPAN_HOST_DEVICE
     constexpr index_type __compute_offset(__rank_count<0,0>) const { return 0; }
 
   public:
@@ -97,6 +73,7 @@ class layout_right::mapping {
     MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping() noexcept = default;
     MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping(mapping const&) noexcept = default;
 
+    _MDSPAN_HOST_DEVICE
     constexpr mapping(extents_type const& __exts) noexcept
       :__extents(__exts)
     { }
@@ -104,10 +81,10 @@ class layout_right::mapping {
     MDSPAN_TEMPLATE_REQUIRES(
       class OtherExtents,
       /* requires */ (
-        _MDSPAN_TRAIT(is_constructible, extents_type, OtherExtents)
+        _MDSPAN_TRAIT(std::is_constructible, extents_type, OtherExtents)
       )
     )
-    MDSPAN_CONDITIONAL_EXPLICIT((!is_convertible<OtherExtents, extents_type>::value)) // needs two () due to comma
+    MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible<OtherExtents, extents_type>::value)) // needs two () due to comma
     MDSPAN_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14
     mapping(mapping<OtherExtents> const& other) noexcept // NOLINT(google-explicit-constructor)
       :__extents(other.extents())
@@ -121,11 +98,11 @@ class layout_right::mapping {
     MDSPAN_TEMPLATE_REQUIRES(
       class OtherExtents,
       /* requires */ (
-        _MDSPAN_TRAIT(is_constructible, extents_type, OtherExtents) &&
+        _MDSPAN_TRAIT(std::is_constructible, extents_type, OtherExtents) &&
         (extents_type::rank() <= 1)
       )
     )
-    MDSPAN_CONDITIONAL_EXPLICIT((!is_convertible<OtherExtents, extents_type>::value)) // needs two () due to comma
+    MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible<OtherExtents, extents_type>::value)) // needs two () due to comma
     MDSPAN_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14
     mapping(layout_left::mapping<OtherExtents> const& other) noexcept // NOLINT(google-explicit-constructor)
       :__extents(other.extents())
@@ -139,23 +116,25 @@ class layout_right::mapping {
     MDSPAN_TEMPLATE_REQUIRES(
       class OtherExtents,
       /* requires */ (
-        _MDSPAN_TRAIT(is_constructible, extents_type, OtherExtents)
+        _MDSPAN_TRAIT(std::is_constructible, extents_type, OtherExtents)
       )
     )
     MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 0))
     MDSPAN_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14
-    mapping(layout_stride::mapping<OtherExtents> const& other) // NOLINT(google-explicit-constructor)
+    mapping(layout_stride::mapping<OtherExtents> const& other) noexcept // NOLINT(google-explicit-constructor)
       :__extents(other.extents())
     {
        /*
         * TODO: check precondition
         * other.required_span_size() is a representable value of type index_type
         */
-       #ifndef __CUDA_ARCH__
-       size_t stride = 1;
+       #if !defined(_MDSPAN_HAS_CUDA) && !defined(_MDSPAN_HAS_HIP) && !defined(NDEBUG)
+       index_type stride = 1;
        for(rank_type r=__extents.rank(); r>0; r--) {
-         if(stride != other.stride(r-1))
+         if(stride != static_cast<index_type>(other.stride(r-1))) {
+           // Note this throw will lead to a terminate if triggered since this function is marked noexcept
            throw std::runtime_error("Assigning layout_stride to layout_right with invalid strides.");
+         }
          stride *= __extents.extent(r-1);
        }
        #endif
@@ -182,13 +161,14 @@ class layout_right::mapping {
       /* requires */ (
         (sizeof...(Indices) == extents_type::rank()) &&
         _MDSPAN_FOLD_AND(
-           (_MDSPAN_TRAIT(is_convertible, Indices, index_type) &&
-            _MDSPAN_TRAIT(is_nothrow_constructible, index_type, Indices))
+           (_MDSPAN_TRAIT(std::is_convertible, Indices, index_type) &&
+            _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, Indices))
         )
       )
     )
+    _MDSPAN_HOST_DEVICE
     constexpr index_type operator()(Indices... idxs) const noexcept {
-      return __compute_offset(__rank_count<0, extents_type::rank()>(), idxs...);
+      return __compute_offset(__rank_count<0, extents_type::rank()>(), static_cast<index_type>(idxs)...);
     }
 
     MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() noexcept { return true; }
@@ -199,7 +179,11 @@ class layout_right::mapping {
     MDSPAN_INLINE_FUNCTION constexpr bool is_strided() const noexcept { return true; }
 
     MDSPAN_INLINE_FUNCTION
-    constexpr index_type stride(rank_type i) const noexcept {
+    constexpr index_type stride(rank_type i) const noexcept
+#if MDSPAN_HAS_CXX_20
+      requires ( Extents::rank() > 0 )
+#endif
+    {
       index_type value = 1;
       for(rank_type r=extents_type::rank()-1; r>i; r--) value*=__extents.extent(r);
       return value;
@@ -212,7 +196,7 @@ class layout_right::mapping {
     }
 
     // In C++ 20 the not equal exists if equal is found
-#if MDSPAN_HAS_CXX_20
+#if !(MDSPAN_HAS_CXX_20)
     template<class OtherExtents>
     MDSPAN_INLINE_FUNCTION
     friend constexpr bool operator!=(mapping const& lhs, mapping<OtherExtents> const& rhs) noexcept {
@@ -222,12 +206,12 @@ class layout_right::mapping {
 
     // Not really public, but currently needed to implement fully constexpr useable submdspan:
     template<size_t N, class SizeType, size_t ... E, size_t ... Idx>
-    constexpr index_type __get_stride(std::experimental::extents<SizeType, E...>,integer_sequence<size_t, Idx...>) const {
+    constexpr index_type __get_stride(MDSPAN_IMPL_STANDARD_NAMESPACE::extents<SizeType, E...>,std::integer_sequence<size_t, Idx...>) const {
       return _MDSPAN_FOLD_TIMES_RIGHT((Idx>N? __extents.template __extent<Idx>():1),1);
     }
     template<size_t N>
     constexpr index_type __stride() const noexcept {
-      return __get_stride<N>(__extents, make_index_sequence<extents_type::rank()>());
+      return __get_stride<N>(__extents, std::make_index_sequence<extents_type::rank()>());
     }
 
 private:
@@ -235,6 +219,5 @@ private:
 
 };
 
-} // end namespace experimental
-} // end namespace std
+} // end namespace MDSPAN_IMPL_STANDARD_NAMESPACE
 
diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp
index c04c0c45aee8433f1a734be4fcc5215e414101ea..030a494529b60043f962d7b1c4348bd4243ee33f 100644
--- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp
+++ b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp
@@ -1,50 +1,21 @@
-/*
 //@HEADER
 // ************************************************************************
 //
-//                        Kokkos v. 2.0
-//              Copyright (2019) Sandia Corporation
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
 //
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// Under the terms of Contract DE-NA0003525 with NTESS,
 // the U.S. Government retains certain rights in this software.
 //
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
 //@HEADER
-*/
-
 #pragma once
 
 #include "macros.hpp"
-#include "static_array.hpp"
 #include "extents.hpp"
 #include "trait_backports.hpp"
 #include "compressed_pair.hpp"
@@ -56,12 +27,14 @@
 #include <algorithm>
 #include <numeric>
 #include <array>
-#if  _MDSPAN_USE_CONCEPTS && MDSPAN_HAS_CXX_20
-#include<concepts>
+#ifdef __cpp_lib_span
+#include <span>
+#endif
+#if defined(_MDSPAN_USE_CONCEPTS) && MDSPAN_HAS_CXX_20 && defined(__cpp_lib_concepts)
+#  include <concepts>
 #endif
 
-namespace std {
-namespace experimental {
+namespace MDSPAN_IMPL_STANDARD_NAMESPACE {
 
 struct layout_left {
   template<class Extents>
@@ -75,18 +48,35 @@ struct layout_right {
 namespace detail {
   template<class Layout, class Mapping>
   constexpr bool __is_mapping_of =
-    is_same<typename Layout::template mapping<typename Mapping::extents_type>, Mapping>::value;
+    std::is_same<typename Layout::template mapping<typename Mapping::extents_type>, Mapping>::value;
+
+#if defined(_MDSPAN_USE_CONCEPTS) && MDSPAN_HAS_CXX_20
+#  if !defined(__cpp_lib_concepts)
+  namespace internal {
+  namespace detail {
+  template <typename _Tp, typename _Up>
+  concept __same_as = std::is_same_v<_Tp, _Up>;
+  } // namespace detail
+  template <class T, class U>
+  concept __same_as = detail::__same_as<T, U> && detail::__same_as<U, T>;
+  } // namespace internal
+#  endif
 
-#if  _MDSPAN_USE_CONCEPTS && MDSPAN_HAS_CXX_20
   template<class M>
   concept __layout_mapping_alike = requires {
     requires __is_extents<typename M::extents_type>::value;
-    { M::is_always_strided() } -> same_as<bool>;
-    { M::is_always_exhaustive() } -> same_as<bool>;
-    { M::is_always_unique() } -> same_as<bool>;
-    bool_constant<M::is_always_strided()>::value;
-    bool_constant<M::is_always_exhaustive()>::value;
-    bool_constant<M::is_always_unique()>::value;
+#if defined(__cpp_lib_concepts)
+    { M::is_always_strided() } -> std::same_as<bool>;
+    { M::is_always_exhaustive() } -> std::same_as<bool>;
+    { M::is_always_unique() } -> std::same_as<bool>;
+#else
+    { M::is_always_strided() } -> internal::__same_as<bool>;
+    { M::is_always_exhaustive() } -> internal::__same_as<bool>;
+    { M::is_always_unique() } -> internal::__same_as<bool>;
+#endif
+    std::bool_constant<M::is_always_strided()>::value;
+    std::bool_constant<M::is_always_exhaustive()>::value;
+    std::bool_constant<M::is_always_unique()>::value;
   };
 #endif
 } // namespace detail
@@ -111,14 +101,15 @@ struct layout_stride {
     using layout_type = layout_stride;
 
     // This could be a `requires`, but I think it's better and clearer as a `static_assert`.
-    static_assert(detail::__is_extents_v<Extents>, "std::experimental::layout_stride::mapping must be instantiated with a specialization of std::experimental::extents.");
+    static_assert(detail::__is_extents_v<Extents>,
+                  MDSPAN_IMPL_STANDARD_NAMESPACE_STRING "::layout_stride::mapping must be instantiated with a specialization of " MDSPAN_IMPL_STANDARD_NAMESPACE_STRING "::extents.");
 
 
   private:
 
     //----------------------------------------------------------------------------
 
-    using __strides_storage_t = array<index_type, extents_type::rank()>;//::std::experimental::dextents<index_type, extents_type::rank()>;
+    using __strides_storage_t = std::array<index_type, extents_type::rank()>;
     using __member_pair_t = detail::__compressed_pair<extents_type, __strides_storage_t>;
 
 #if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
@@ -144,6 +135,12 @@ struct layout_stride {
 #endif
     }
 
+    template<class SizeType, size_t ... Ep, size_t ... Idx>
+    _MDSPAN_HOST_DEVICE
+    constexpr index_type __get_size(::MDSPAN_IMPL_STANDARD_NAMESPACE::extents<SizeType, Ep...>,std::integer_sequence<size_t, Idx...>) const {
+      return _MDSPAN_FOLD_TIMES_RIGHT( static_cast<index_type>(extents().extent(Idx)), 1 );
+    }
+
     //----------------------------------------------------------------------------
 
     template <class>
@@ -156,17 +153,19 @@ struct layout_stride {
     struct __deduction_workaround;
 
     template <size_t... Idxs>
-    struct __deduction_workaround<index_sequence<Idxs...>>
+    struct __deduction_workaround<std::index_sequence<Idxs...>>
     {
       template <class OtherExtents>
       MDSPAN_INLINE_FUNCTION
       static constexpr bool _eq_impl(mapping const& self, mapping<OtherExtents> const& other) noexcept {
-        return _MDSPAN_FOLD_AND((self.stride(Idxs) == other.stride(Idxs)) /* && ... */);
+        return    _MDSPAN_FOLD_AND((self.stride(Idxs) == other.stride(Idxs)) /* && ... */)
+               && _MDSPAN_FOLD_AND((self.extents().extent(Idxs) == other.extents().extent(Idxs)) /* || ... */);
       }
       template <class OtherExtents>
       MDSPAN_INLINE_FUNCTION
       static constexpr bool _not_eq_impl(mapping const& self, mapping<OtherExtents> const& other) noexcept {
-        return _MDSPAN_FOLD_OR((self.stride(Idxs) != other.stride(Idxs)) /* || ... */);
+        return    _MDSPAN_FOLD_OR((self.stride(Idxs) != other.stride(Idxs)) /* || ... */)
+               || _MDSPAN_FOLD_OR((self.extents().extent(Idxs) != other.extents().extent(Idxs)) /* || ... */);
       }
 
       template <class... Integral>
@@ -194,16 +193,17 @@ struct layout_stride {
 
       template<class IntegralType>
       MDSPAN_INLINE_FUNCTION
-      static constexpr const __strides_storage_t fill_strides(const array<IntegralType,extents_type::rank()>& s) {
+      static constexpr const __strides_storage_t fill_strides(const std::array<IntegralType,extents_type::rank()>& s) {
         return __strides_storage_t{static_cast<index_type>(s[Idxs])...};
       }
 
+#ifdef __cpp_lib_span
+      template<class IntegralType>
       MDSPAN_INLINE_FUNCTION
-      static constexpr const __strides_storage_t fill_strides(
-        detail::__extents_to_partially_static_sizes_t<
-          ::std::experimental::dextents<index_type, extents_type::rank()>>&& s) {
-        return __strides_storage_t{static_cast<index_type>(s.template __get_n<Idxs>())...};
+      static constexpr const __strides_storage_t fill_strides(const std::span<IntegralType,extents_type::rank()>& s) {
+        return __strides_storage_t{static_cast<index_type>(s[Idxs])...};
       }
+#endif
 
       template<size_t K>
       MDSPAN_INLINE_FUNCTION
@@ -216,7 +216,7 @@ struct layout_stride {
     };
 
     // Can't use defaulted parameter in the __deduction_workaround template because of a bug in MSVC warning C4348.
-    using __impl = __deduction_workaround<make_index_sequence<Extents::rank()>>;
+    using __impl = __deduction_workaround<std::make_index_sequence<Extents::rank()>>;
 
 
     //----------------------------------------------------------------------------
@@ -229,31 +229,6 @@ struct layout_stride {
     mapping(__base_t&& __b) : __base_t(::std::move(__b)) {}
 #endif
 
-  public: // but not really
-    MDSPAN_INLINE_FUNCTION
-    static constexpr mapping
-    __make_mapping(
-      detail::__extents_to_partially_static_sizes_t<Extents>&& __exts,
-      detail::__extents_to_partially_static_sizes_t<
-        ::std::experimental::dextents<index_type, Extents::rank()>>&& __strs
-    ) noexcept {
-      // call the private constructor we created for this purpose
-      return mapping(
-#if !defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-        __base_t{
-#endif
-          __member_pair_t(
-            extents_type::__make_extents_impl(::std::move(__exts)),
-            __strides_storage_t{__impl::fill_strides(::std::move(__strs))}
-          )
-#if !defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-        }
-#endif
-      );
-    }
-    //----------------------------------------------------------------------------
-
-
   public:
 
     //--------------------------------------------------------------------------------
@@ -265,16 +240,16 @@ struct layout_stride {
       class IntegralTypes,
       /* requires */ (
         // MSVC 19.32 does not like using index_type here, requires the typename Extents::index_type
-        // error C2641: cannot deduce template arguments for 'std::experimental::layout_stride::mapping'
-        _MDSPAN_TRAIT(is_convertible, const remove_const_t<IntegralTypes>&, typename Extents::index_type) &&
-        _MDSPAN_TRAIT(is_nothrow_constructible, typename Extents::index_type, const remove_const_t<IntegralTypes>&)
+        // error C2641: cannot deduce template arguments for 'MDSPAN_IMPL_STANDARD_NAMESPACE::layout_stride::mapping'
+        _MDSPAN_TRAIT(std::is_convertible, const std::remove_const_t<IntegralTypes>&, typename Extents::index_type) &&
+        _MDSPAN_TRAIT(std::is_nothrow_constructible, typename Extents::index_type, const std::remove_const_t<IntegralTypes>&)
       )
     )
     MDSPAN_INLINE_FUNCTION
     constexpr
     mapping(
       extents_type const& e,
-      array<IntegralTypes, extents_type::rank()> const& s
+      std::array<IntegralTypes, extents_type::rank()> const& s
     ) noexcept
 #if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
       : __members{
@@ -303,16 +278,16 @@ struct layout_stride {
       class IntegralTypes,
       /* requires */ (
         // MSVC 19.32 does not like using index_type here, requires the typename Extents::index_type
-        // error C2641: cannot deduce template arguments for 'std::experimental::layout_stride::mapping'
-        _MDSPAN_TRAIT(is_convertible, const remove_const_t<IntegralTypes>&, typename Extents::index_type) &&
-        _MDSPAN_TRAIT(is_nothrow_constructible, typename Extents::index_type, const remove_const_t<IntegralTypes>&)
+        // error C2641: cannot deduce template arguments for 'MDSPAN_IMPL_STANDARD_NAMESPACE::layout_stride::mapping'
+        _MDSPAN_TRAIT(std::is_convertible, const std::remove_const_t<IntegralTypes>&, typename Extents::index_type) &&
+        _MDSPAN_TRAIT(std::is_nothrow_constructible, typename Extents::index_type, const std::remove_const_t<IntegralTypes>&)
       )
     )
     MDSPAN_INLINE_FUNCTION
     constexpr
     mapping(
       extents_type const& e,
-      span<IntegralTypes, extents_type::rank()> const& s
+      std::span<IntegralTypes, extents_type::rank()> const& s
     ) noexcept
 #if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
       : __members{
@@ -337,11 +312,11 @@ struct layout_stride {
     }
 #endif // __cpp_lib_span
 
-#if !(_MDSPAN_USE_CONCEPTS && MDSPAN_HAS_CXX_20)
+#if !(defined(_MDSPAN_USE_CONCEPTS) && MDSPAN_HAS_CXX_20)
     MDSPAN_TEMPLATE_REQUIRES(
       class StridedLayoutMapping,
       /* requires */ (
-        _MDSPAN_TRAIT(is_constructible, extents_type, typename StridedLayoutMapping::extents_type) &&
+        _MDSPAN_TRAIT(std::is_constructible, extents_type, typename StridedLayoutMapping::extents_type) &&
         detail::__is_mapping_of<typename StridedLayoutMapping::layout_type, StridedLayoutMapping> &&
         StridedLayoutMapping::is_always_unique() &&
         StridedLayoutMapping::is_always_strided()
@@ -351,13 +326,13 @@ struct layout_stride {
     template<class StridedLayoutMapping>
     requires(
          detail::__layout_mapping_alike<StridedLayoutMapping> &&
-         _MDSPAN_TRAIT(is_constructible, extents_type, typename StridedLayoutMapping::extents_type) &&
+         _MDSPAN_TRAIT(std::is_constructible, extents_type, typename StridedLayoutMapping::extents_type) &&
          StridedLayoutMapping::is_always_unique() &&
          StridedLayoutMapping::is_always_strided()
     )
 #endif
     MDSPAN_CONDITIONAL_EXPLICIT(
-      (!is_convertible<typename StridedLayoutMapping::extents_type, extents_type>::value) &&
+      (!std::is_convertible<typename StridedLayoutMapping::extents_type, extents_type>::value) &&
       (detail::__is_mapping_of<layout_left, StridedLayoutMapping> ||
        detail::__is_mapping_of<layout_right, StridedLayoutMapping> ||
        detail::__is_mapping_of<layout_stride, StridedLayoutMapping>)
@@ -398,7 +373,7 @@ struct layout_stride {
     };
 
     MDSPAN_INLINE_FUNCTION
-    constexpr array< index_type, extents_type::rank() > strides() const noexcept {
+    constexpr std::array< index_type, extents_type::rank() > strides() const noexcept {
       return __strides_storage();
     }
 
@@ -408,7 +383,7 @@ struct layout_stride {
       for(unsigned r = 0; r < extents_type::rank(); r++) {
         // Return early if any of the extents are zero
         if(extents().extent(r)==0) return 0;
-        span_size = std::max(span_size, static_cast<index_type>(extents().extent(r) * __strides_storage()[r]));
+        span_size += ( static_cast<index_type>(extents().extent(r) - 1 ) * __strides_storage()[r]);
       }
       return span_size;
     }
@@ -418,13 +393,13 @@ struct layout_stride {
       class... Indices,
       /* requires */ (
         sizeof...(Indices) == Extents::rank() &&
-        _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(is_convertible, Indices, index_type) /*&& ...*/ ) &&
-        _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(is_nothrow_constructible, index_type, Indices) /*&& ...*/)
+        _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, Indices, index_type) /*&& ...*/ ) &&
+        _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, Indices) /*&& ...*/)
       )
     )
     MDSPAN_FORCE_INLINE_FUNCTION
-    constexpr size_t operator()(Indices... idxs) const noexcept {
-      return __impl::_call_op_impl(*this, static_cast<index_type>(idxs)...);
+    constexpr index_type operator()(Indices... idxs) const noexcept {
+      return static_cast<index_type>(__impl::_call_op_impl(*this, static_cast<index_type>(idxs)...));
     }
 
     MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() noexcept { return true; }
@@ -435,53 +410,21 @@ struct layout_stride {
 
     MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { return true; }
     MDSPAN_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 bool is_exhaustive() const noexcept {
-// TODO @testing test layout_stride is_exhaustive()
-// FIXME CUDA
-#ifdef __CUDA_ARCH__
-      return false;
-#else
-      auto rem = array<size_t, Extents::rank()>{ };
-      std::iota(rem.begin(), rem.end(), size_t(0));
-      auto next_idx_iter = std::find_if(
-        rem.begin(), rem.end(),
-        [&](size_t i) { return this->stride(i) == 1;  }
-      );
-      if(next_idx_iter != rem.end()) {
-        size_t prev_stride_times_prev_extent =
-          this->extents().extent(*next_idx_iter) * this->stride(*next_idx_iter);
-        // "remove" the index
-        constexpr auto removed_index_sentinel = static_cast<size_t>(-1);
-        *next_idx_iter = removed_index_sentinel;
-        size_t found_count = 1;
-        while (found_count != Extents::rank()) {
-          next_idx_iter = std::find_if(
-            rem.begin(), rem.end(),
-            [&](size_t i) {
-              return i != removed_index_sentinel
-                && static_cast<size_t>(this->extents().extent(i)) == prev_stride_times_prev_extent;
-            }
-          );
-          if (next_idx_iter != rem.end()) {
-            // "remove" the index
-            *next_idx_iter = removed_index_sentinel;
-            ++found_count;
-            prev_stride_times_prev_extent = stride(*next_idx_iter) * this->extents().extent(*next_idx_iter);
-          } else { break; }
-        }
-        return found_count == Extents::rank();
-      }
-      return false;
-#endif
+      return required_span_size() == __get_size(extents(), std::make_index_sequence<extents_type::rank()>());
     }
     MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { return true; }
 
 
     MDSPAN_INLINE_FUNCTION
-    constexpr index_type stride(rank_type r) const noexcept {
+    constexpr index_type stride(rank_type r) const noexcept
+#if MDSPAN_HAS_CXX_20
+      requires ( Extents::rank() > 0 )
+#endif
+    {
       return __strides_storage()[r];
     }
 
-#if !(_MDSPAN_USE_CONCEPTS && MDSPAN_HAS_CXX_20)
+#if !(defined(_MDSPAN_USE_CONCEPTS) && MDSPAN_HAS_CXX_20)
     MDSPAN_TEMPLATE_REQUIRES(
       class StridedLayoutMapping,
       /* requires */ (
@@ -549,5 +492,4 @@ struct layout_stride {
   };
 };
 
-} // end namespace experimental
-} // end namespace std
+} // end namespace MDSPAN_IMPL_STANDARD_NAMESPACE
diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/macros.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/macros.hpp
index 848dcf91a74a0859de18debae545b59333840428..3eeb39755c8aed3690892ac4cb2b2cb9c9935c96 100644
--- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/macros.hpp
+++ b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/macros.hpp
@@ -1,46 +1,18 @@
-/*
 //@HEADER
 // ************************************************************************
 //
-//                        Kokkos v. 2.0
-//              Copyright (2019) Sandia Corporation
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
 //
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// Under the terms of Contract DE-NA0003525 with NTESS,
 // the U.S. Government retains certain rights in this software.
 //
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
 //@HEADER
-*/
-
 
 #pragma once
 
@@ -68,6 +40,16 @@
 #  define MDSPAN_INLINE_FUNCTION inline _MDSPAN_HOST_DEVICE
 #endif
 
+#ifndef MDSPAN_FUNCTION
+#  define MDSPAN_FUNCTION _MDSPAN_HOST_DEVICE
+#endif
+
+#ifdef _MDSPAN_HAS_HIP
+#  define MDSPAN_DEDUCTION_GUIDE _MDSPAN_HOST_DEVICE
+#else
+#  define MDSPAN_DEDUCTION_GUIDE
+#endif
+
 // In CUDA defaulted functions do not need host device markup
 #ifndef MDSPAN_INLINE_FUNCTION_DEFAULTED
 #  define MDSPAN_INLINE_FUNCTION_DEFAULTED
@@ -116,6 +98,9 @@
 #define MDSPAN_PP_REMOVE_PARENS_IMPL(...) __VA_ARGS__
 #define MDSPAN_PP_REMOVE_PARENS(...) MDSPAN_PP_REMOVE_PARENS_IMPL __VA_ARGS__
 
+#define MDSPAN_IMPL_STANDARD_NAMESPACE_STRING MDSPAN_PP_STRINGIFY(MDSPAN_IMPL_STANDARD_NAMESPACE)
+#define MDSPAN_IMPL_PROPOSED_NAMESPACE_STRING MDSPAN_PP_STRINGIFY(MDSPAN_IMPL_STANDARD_NAMESPACE) "::" MDSPAN_PP_STRINGIFY(MDSPAN_IMPL_PROPOSED_NAMESPACE)
+
 // </editor-fold> end Preprocessor helpers }}}1
 //==============================================================================
 
@@ -139,8 +124,7 @@
      /**/
 #endif
 
-
-#if defined(_MDSPAN_COMPILER_MSVC)
+#if defined(_MDSPAN_COMPILER_MSVC) && (!defined(_MSVC_TRADITIONAL) || _MSVC_TRADITIONAL)
 #  define MDSPAN_TEMPLATE_REQUIRES(...) \
       MDSPAN_PP_CAT( \
         MDSPAN_PP_CAT(MDSPAN_TEMPLATE_REQUIRES_, MDSPAN_PP_COUNT(__VA_ARGS__))\
@@ -294,7 +278,7 @@ struct __mdspan_enable_fold_comma { };
 #  define _MDSPAN_FOLD_COMMA(...) ((__VA_ARGS__), ...)
 #else
 
-namespace std {
+namespace MDSPAN_IMPL_STANDARD_NAMESPACE {
 
 namespace __fold_compatibility_impl {
 
@@ -597,15 +581,15 @@ struct __bools;
 
 } // __fold_compatibility_impl
 
-} // end namespace std
+} // end namespace MDSPAN_IMPL_STANDARD_NAMESPACE
 
-#  define _MDSPAN_FOLD_AND(...) std::__fold_compatibility_impl::__fold_right_and_impl((__VA_ARGS__)...)
-#  define _MDSPAN_FOLD_OR(...) std::__fold_compatibility_impl::__fold_right_or_impl((__VA_ARGS__)...)
-#  define _MDSPAN_FOLD_ASSIGN_LEFT(INIT, ...) std::__fold_compatibility_impl::__fold_left_assign_impl(INIT, (__VA_ARGS__)...)
-#  define _MDSPAN_FOLD_ASSIGN_RIGHT(PACK, ...) std::__fold_compatibility_impl::__fold_right_assign_impl((PACK)..., __VA_ARGS__)
-#  define _MDSPAN_FOLD_TIMES_RIGHT(PACK, ...) std::__fold_compatibility_impl::__fold_right_times_impl((PACK)..., __VA_ARGS__)
-#  define _MDSPAN_FOLD_PLUS_RIGHT(PACK, ...) std::__fold_compatibility_impl::__fold_right_plus_impl((PACK)..., __VA_ARGS__)
-#  define _MDSPAN_FOLD_COMMA(...) std::__fold_compatibility_impl::__fold_comma_impl((__VA_ARGS__)...)
+#  define _MDSPAN_FOLD_AND(...) MDSPAN_IMPL_STANDARD_NAMESPACE::__fold_compatibility_impl::__fold_right_and_impl((__VA_ARGS__)...)
+#  define _MDSPAN_FOLD_OR(...) MDSPAN_IMPL_STANDARD_NAMESPACE::__fold_compatibility_impl::__fold_right_or_impl((__VA_ARGS__)...)
+#  define _MDSPAN_FOLD_ASSIGN_LEFT(INIT, ...) MDSPAN_IMPL_STANDARD_NAMESPACE::__fold_compatibility_impl::__fold_left_assign_impl(INIT, (__VA_ARGS__)...)
+#  define _MDSPAN_FOLD_ASSIGN_RIGHT(PACK, ...) MDSPAN_IMPL_STANDARD_NAMESPACE::__fold_compatibility_impl::__fold_right_assign_impl((PACK)..., __VA_ARGS__)
+#  define _MDSPAN_FOLD_TIMES_RIGHT(PACK, ...) MDSPAN_IMPL_STANDARD_NAMESPACE::__fold_compatibility_impl::__fold_right_times_impl((PACK)..., __VA_ARGS__)
+#  define _MDSPAN_FOLD_PLUS_RIGHT(PACK, ...) MDSPAN_IMPL_STANDARD_NAMESPACE::__fold_compatibility_impl::__fold_right_plus_impl((PACK)..., __VA_ARGS__)
+#  define _MDSPAN_FOLD_COMMA(...) MDSPAN_IMPL_STANDARD_NAMESPACE::__fold_compatibility_impl::__fold_comma_impl((__VA_ARGS__)...)
 
 #  define _MDSPAN_FOLD_AND_TEMPLATE(...) \
   _MDSPAN_TRAIT(std::is_same, __fold_compatibility_impl::__bools<(__VA_ARGS__)..., true>, __fold_compatibility_impl::__bools<true, (__VA_ARGS__)...>)
diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/maybe_static_value.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/maybe_static_value.hpp
deleted file mode 100644
index 7abb1d5d65f7ee73d1cb949d0b7c0ce3665603c7..0000000000000000000000000000000000000000
--- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/maybe_static_value.hpp
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 2.0
-//              Copyright (2019) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include "macros.hpp"
-
-#include "dynamic_extent.hpp"
-
-#if !defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-#  include "no_unique_address.hpp"
-#endif
-
-// This is only needed for the non-standard-layout version of partially
-// static array.
-// Needs to be after the includes above to work with the single header generator
-#if !_MDSPAN_PRESERVE_STANDARD_LAYOUT
-namespace std {
-namespace experimental {
-
-//==============================================================================
-
-namespace detail {
-
-// static case
-template <class _dynamic_t, class static_t, _static_t __v,
-          _static_t __is_dynamic_sentinal = dynamic_extent,
-          size_t __array_entry_index = 0>
-struct __maybe_static_value {
-  static constexpr _static_t __static_value = __v;
-  MDSPAN_FORCE_INLINE_FUNCTION constexpr _dynamic_t __value() const noexcept {
-    return static_cast<_dynamic_t>(__v);
-  }
-  template <class _U>
-  MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14
-  __mdspan_enable_fold_comma
-  __set_value(_U&& /*__rhs*/) noexcept {
-    // Should we assert that the value matches the static value here?
-    return {};
-  }
-
-  //--------------------------------------------------------------------------
-
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  constexpr __maybe_static_value() noexcept = default;
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  constexpr __maybe_static_value(__maybe_static_value const&) noexcept = default;
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  constexpr __maybe_static_value(__maybe_static_value&&) noexcept = default;
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  _MDSPAN_CONSTEXPR_14_DEFAULTED __maybe_static_value& operator=(__maybe_static_value const&) noexcept = default;
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  _MDSPAN_CONSTEXPR_14_DEFAULTED __maybe_static_value& operator=(__maybe_static_value&&) noexcept = default;
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  ~__maybe_static_value() noexcept = default;
-
-  MDSPAN_INLINE_FUNCTION
-  constexpr explicit __maybe_static_value(_dynamic_t const&) noexcept {
-    // Should we assert that the value matches the static value here?
-  }
-
-  //--------------------------------------------------------------------------
-
-};
-
-// dynamic case
-template <class _dynamic_t, class _static_t, _static_t __is_dynamic_sentinal, size_t __array_entry_index>
-struct __maybe_static_value<_dynamic_t, _static_t, __is_dynamic_sentinal, __is_dynamic_sentinal,
-                            __array_entry_index>
-#if !defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-    : __no_unique_address_emulation<_T>
-#endif
-{
-  static constexpr _static_t __static_value = __is_dynamic_sentinal;
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-  _MDSPAN_NO_UNIQUE_ADDRESS _dynamic_t __v = {};
-  MDSPAN_FORCE_INLINE_FUNCTION constexpr _dynamic_t __value() const noexcept {
-    return __v;
-  }
-  MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _dynamic_t &__ref() noexcept {
-    return __v;
-  }
-  template <class _U>
-  MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14
-  __mdspan_enable_fold_comma
-  __set_value(_U&& __rhs) noexcept {
-    __v = (_U &&)rhs;
-    return {};
-  }
-#else
-  MDSPAN_FORCE_INLINE_FUNCTION constexpr _dynamic_t __value() const noexcept {
-    return this->__no_unique_address_emulation<_dynamic_t>::__ref();
-  }
-  MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _dynamic_t &__ref() noexcept {
-    return this->__no_unique_address_emulation<_dynamic_t>::__ref();
-  }
-  template <class _U>
-  MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14
-  __mdspan_enable_fold_comma
-  __set_value(_U&& __rhs) noexcept {
-    this->__no_unique_address_emulation<_dynamic_t>::__ref() = (_U &&)__rhs;
-    return {};
-  }
-#endif
-};
-
-} // namespace detail
-
-//==============================================================================
-
-} // end namespace experimental
-} // end namespace std
-
-#endif // !_MDSPAN_PRESERVE_STANDARD_LAYOUT
diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp
index 374444c0535038f37dce9f829243bc0e6ac78bf6..6febe30021501dbbb2d46656a2877b96c8898cd1 100644
--- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp
+++ b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp
@@ -1,46 +1,18 @@
-/*
 //@HEADER
 // ************************************************************************
 //
-//                        Kokkos v. 2.0
-//              Copyright (2019) Sandia Corporation
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
 //
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// Under the terms of Contract DE-NA0003525 with NTESS,
 // the U.S. Government retains certain rights in this software.
 //
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
 //@HEADER
-*/
-
 
 #pragma once
 
@@ -50,9 +22,7 @@
 #include "trait_backports.hpp"
 #include "compressed_pair.hpp"
 
-namespace std {
-namespace experimental {
-
+namespace MDSPAN_IMPL_STANDARD_NAMESPACE {
 template <
   class ElementType,
   class Extents,
@@ -62,26 +32,27 @@ template <
 class mdspan
 {
 private:
-  static_assert(detail::__is_extents_v<Extents>, "std::experimental::mdspan's Extents template parameter must be a specialization of std::experimental::extents.");
+  static_assert(detail::__is_extents_v<Extents>,
+                MDSPAN_IMPL_STANDARD_NAMESPACE_STRING "::mdspan's Extents template parameter must be a specialization of " MDSPAN_IMPL_STANDARD_NAMESPACE_STRING "::extents.");
 
   // Workaround for non-deducibility of the index sequence template parameter if it's given at the top level
   template <class>
   struct __deduction_workaround;
 
   template <size_t... Idxs>
-  struct __deduction_workaround<index_sequence<Idxs...>>
+  struct __deduction_workaround<std::index_sequence<Idxs...>>
   {
     MDSPAN_FORCE_INLINE_FUNCTION static constexpr
     size_t __size(mdspan const& __self) noexcept {
-      return _MDSPAN_FOLD_TIMES_RIGHT((__self.__mapping_ref().extents().template __extent<Idxs>()), /* * ... * */ size_t(1));
+      return _MDSPAN_FOLD_TIMES_RIGHT((__self.__mapping_ref().extents().extent(Idxs)), /* * ... * */ size_t(1));
     }
     MDSPAN_FORCE_INLINE_FUNCTION static constexpr
     bool __empty(mdspan const& __self) noexcept {
-      return (__self.rank()>0) && _MDSPAN_FOLD_OR((__self.__mapping_ref().extents().template __extent<Idxs>()==index_type(0)));
+      return (__self.rank()>0) && _MDSPAN_FOLD_OR((__self.__mapping_ref().extents().extent(Idxs)==index_type(0)));
     }
     template <class ReferenceType, class SizeType, size_t N>
     MDSPAN_FORCE_INLINE_FUNCTION static constexpr
-    ReferenceType __callop(mdspan const& __self, const array<SizeType, N>& indices) noexcept {
+    ReferenceType __callop(mdspan const& __self, const std::array<SizeType, N>& indices) noexcept {
       return __self.__accessor_ref().access(__self.__ptr_ref(), __self.__mapping_ref()(indices[Idxs]...));
     }
   };
@@ -96,7 +67,7 @@ public:
   using accessor_type = AccessorPolicy;
   using mapping_type = typename layout_type::template mapping<extents_type>;
   using element_type = ElementType;
-  using value_type = remove_cv_t<element_type>;
+  using value_type = std::remove_cv_t<element_type>;
   using index_type = typename extents_type::index_type;
   using size_type = typename extents_type::size_type;
   using rank_type = typename extents_type::rank_type;
@@ -111,7 +82,7 @@ public:
 private:
 
   // Can't use defaulted parameter in the __deduction_workaround template because of a bug in MSVC warning C4348.
-  using __impl = __deduction_workaround<make_index_sequence<extents_type::rank()>>;
+  using __impl = __deduction_workaround<std::make_index_sequence<extents_type::rank()>>;
 
   using __map_acc_pair_t = detail::__compressed_pair<mapping_type, accessor_type>;
 
@@ -125,10 +96,11 @@ public:
 #else
   MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mdspan()
     requires(
-       (rank_dynamic() > 0) &&
-       _MDSPAN_TRAIT(is_default_constructible, data_handle_type) &&
-       _MDSPAN_TRAIT(is_default_constructible, mapping_type) &&
-       _MDSPAN_TRAIT(is_default_constructible, accessor_type)
+       // nvhpc has a bug where using just rank_dynamic() here doesn't work ...
+       (extents_type::rank_dynamic() > 0) &&
+       _MDSPAN_TRAIT(std::is_default_constructible, data_handle_type) &&
+       _MDSPAN_TRAIT(std::is_default_constructible, mapping_type) &&
+       _MDSPAN_TRAIT(std::is_default_constructible, accessor_type)
      ) = default;
 #endif
   MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mdspan(const mdspan&) = default;
@@ -137,11 +109,11 @@ public:
   MDSPAN_TEMPLATE_REQUIRES(
     class... SizeTypes,
     /* requires */ (
-      _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(is_convertible, SizeTypes, index_type) /* && ... */) &&
-      _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(is_nothrow_constructible, index_type, SizeTypes) /* && ... */) &&
+      _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, SizeTypes, index_type) /* && ... */) &&
+      _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeTypes) /* && ... */) &&
       ((sizeof...(SizeTypes) == rank()) || (sizeof...(SizeTypes) == rank_dynamic())) &&
-      _MDSPAN_TRAIT(is_constructible, mapping_type, extents_type) &&
-      _MDSPAN_TRAIT(is_default_constructible, accessor_type)
+      _MDSPAN_TRAIT(std::is_constructible, mapping_type, extents_type) &&
+      _MDSPAN_TRAIT(std::is_default_constructible, accessor_type)
     )
   )
   MDSPAN_INLINE_FUNCTION
@@ -153,16 +125,16 @@ public:
   MDSPAN_TEMPLATE_REQUIRES(
     class SizeType, size_t N,
     /* requires */ (
-      _MDSPAN_TRAIT(is_convertible, SizeType, index_type) &&
-      _MDSPAN_TRAIT(is_nothrow_constructible, index_type, SizeType) &&
+      _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) &&
+      _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) &&
       ((N == rank()) || (N == rank_dynamic())) &&
-      _MDSPAN_TRAIT(is_constructible, mapping_type, extents_type) &&
-      _MDSPAN_TRAIT(is_default_constructible, accessor_type)
+      _MDSPAN_TRAIT(std::is_constructible, mapping_type, extents_type) &&
+      _MDSPAN_TRAIT(std::is_default_constructible, accessor_type)
     )
   )
   MDSPAN_CONDITIONAL_EXPLICIT(N != rank_dynamic())
   MDSPAN_INLINE_FUNCTION
-  constexpr mdspan(data_handle_type p, const array<SizeType, N>& dynamic_extents)
+  constexpr mdspan(data_handle_type p, const std::array<SizeType, N>& dynamic_extents)
     : __members(std::move(p), __map_acc_pair_t(mapping_type(extents_type(dynamic_extents)), accessor_type()))
   { }
 
@@ -170,16 +142,16 @@ public:
   MDSPAN_TEMPLATE_REQUIRES(
     class SizeType, size_t N,
     /* requires */ (
-      _MDSPAN_TRAIT(is_convertible, SizeType, index_type) &&
-      _MDSPAN_TRAIT(is_nothrow_constructible, index_type, SizeType) &&
+      _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) &&
+      _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) &&
       ((N == rank()) || (N == rank_dynamic())) &&
-      _MDSPAN_TRAIT(is_constructible, mapping_type, extents_type) &&
-      _MDSPAN_TRAIT(is_default_constructible, accessor_type)
+      _MDSPAN_TRAIT(std::is_constructible, mapping_type, extents_type) &&
+      _MDSPAN_TRAIT(std::is_default_constructible, accessor_type)
     )
   )
   MDSPAN_CONDITIONAL_EXPLICIT(N != rank_dynamic())
   MDSPAN_INLINE_FUNCTION
-  constexpr mdspan(data_handle_type p, span<SizeType, N> dynamic_extents)
+  constexpr mdspan(data_handle_type p, std::span<SizeType, N> dynamic_extents)
     : __members(std::move(p), __map_acc_pair_t(mapping_type(extents_type(as_const(dynamic_extents))), accessor_type()))
   { }
 #endif
@@ -187,15 +159,15 @@ public:
   MDSPAN_FUNCTION_REQUIRES(
     (MDSPAN_INLINE_FUNCTION constexpr),
     mdspan, (data_handle_type p, const extents_type& exts), ,
-    /* requires */ (_MDSPAN_TRAIT(is_default_constructible, accessor_type) &&
-                    _MDSPAN_TRAIT(is_constructible, mapping_type, extents_type))
+    /* requires */ (_MDSPAN_TRAIT(std::is_default_constructible, accessor_type) &&
+                    _MDSPAN_TRAIT(std::is_constructible, mapping_type, extents_type))
   ) : __members(std::move(p), __map_acc_pair_t(mapping_type(exts), accessor_type()))
   { }
 
   MDSPAN_FUNCTION_REQUIRES(
     (MDSPAN_INLINE_FUNCTION constexpr),
     mdspan, (data_handle_type p, const mapping_type& m), ,
-    /* requires */ (_MDSPAN_TRAIT(is_default_constructible, accessor_type))
+    /* requires */ (_MDSPAN_TRAIT(std::is_default_constructible, accessor_type))
   ) : __members(std::move(p), __map_acc_pair_t(m, accessor_type()))
   { }
 
@@ -207,16 +179,16 @@ public:
   MDSPAN_TEMPLATE_REQUIRES(
     class OtherElementType, class OtherExtents, class OtherLayoutPolicy, class OtherAccessor,
     /* requires */ (
-      _MDSPAN_TRAIT(is_constructible, mapping_type, typename OtherLayoutPolicy::template mapping<OtherExtents>) &&
-      _MDSPAN_TRAIT(is_constructible, accessor_type, OtherAccessor)
+      _MDSPAN_TRAIT(std::is_constructible, mapping_type, typename OtherLayoutPolicy::template mapping<OtherExtents>) &&
+      _MDSPAN_TRAIT(std::is_constructible, accessor_type, OtherAccessor)
     )
   )
   MDSPAN_INLINE_FUNCTION
   constexpr mdspan(const mdspan<OtherElementType, OtherExtents, OtherLayoutPolicy, OtherAccessor>& other)
     : __members(other.__ptr_ref(), __map_acc_pair_t(other.__mapping_ref(), other.__accessor_ref()))
   {
-      static_assert(_MDSPAN_TRAIT(is_constructible, data_handle_type, typename OtherAccessor::data_handle_type),"Incompatible data_handle_type for mdspan construction");
-      static_assert(_MDSPAN_TRAIT(is_constructible, extents_type, OtherExtents),"Incompatible extents for mdspan construction");
+      static_assert(_MDSPAN_TRAIT(std::is_constructible, data_handle_type, typename OtherAccessor::data_handle_type),"Incompatible data_handle_type for mdspan construction");
+      static_assert(_MDSPAN_TRAIT(std::is_constructible, extents_type, OtherExtents),"Incompatible extents for mdspan construction");
       /*
        * TODO: Check precondition
        * For each rank index r of extents_type, static_extent(r) == dynamic_extent || static_extent(r) == other.extent(r) is true.
@@ -239,27 +211,27 @@ public:
   MDSPAN_TEMPLATE_REQUIRES(
     class... SizeTypes,
     /* requires */ (
-      _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(is_convertible, SizeTypes, index_type) /* && ... */) &&
-      _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(is_nothrow_constructible, index_type, SizeTypes) /* && ... */) &&
+      _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, SizeTypes, index_type) /* && ... */) &&
+      _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeTypes) /* && ... */) &&
       (rank() == sizeof...(SizeTypes))
     )
   )
   MDSPAN_FORCE_INLINE_FUNCTION
   constexpr reference operator[](SizeTypes... indices) const
   {
-    return __accessor_ref().access(__ptr_ref(), __mapping_ref()(index_type(indices)...));
+    return __accessor_ref().access(__ptr_ref(), __mapping_ref()(static_cast<index_type>(std::move(indices))...));
   }
   #endif
 
   MDSPAN_TEMPLATE_REQUIRES(
     class SizeType,
     /* requires */ (
-      _MDSPAN_TRAIT(is_convertible, SizeType, index_type) &&
-      _MDSPAN_TRAIT(is_nothrow_constructible, index_type, SizeType)
+      _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) &&
+      _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType)
     )
   )
   MDSPAN_FORCE_INLINE_FUNCTION
-  constexpr reference operator[](const array<SizeType, rank()>& indices) const
+  constexpr reference operator[](const std::array< SizeType, rank()>& indices) const
   {
     return __impl::template __callop<reference>(*this, indices);
   }
@@ -268,12 +240,12 @@ public:
   MDSPAN_TEMPLATE_REQUIRES(
     class SizeType,
     /* requires */ (
-      _MDSPAN_TRAIT(is_convertible, SizeType, index_type) &&
-      _MDSPAN_TRAIT(is_nothrow_constructible, index_type, SizeType)
+      _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) &&
+      _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType)
     )
   )
   MDSPAN_FORCE_INLINE_FUNCTION
-  constexpr reference operator[](span<SizeType, rank()> indices) const
+  constexpr reference operator[](std::span<SizeType, rank()> indices) const
   {
     return __impl::template __callop<reference>(*this, indices);
   }
@@ -283,15 +255,15 @@ public:
   MDSPAN_TEMPLATE_REQUIRES(
     class Index,
     /* requires */ (
-      _MDSPAN_TRAIT(is_convertible, Index, index_type) &&
-      _MDSPAN_TRAIT(is_nothrow_constructible, index_type, Index) &&
+      _MDSPAN_TRAIT(std::is_convertible, Index, index_type) &&
+      _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, Index) &&
       extents_type::rank() == 1
     )
   )
   MDSPAN_FORCE_INLINE_FUNCTION
   constexpr reference operator[](Index idx) const
   {
-    return __accessor_ref().access(__ptr_ref(), __mapping_ref()(index_type(idx)));
+    return __accessor_ref().access(__ptr_ref(), __mapping_ref()(static_cast<index_type>(std::move(idx))));
   }
   #endif
 
@@ -299,26 +271,26 @@ public:
   MDSPAN_TEMPLATE_REQUIRES(
     class... SizeTypes,
     /* requires */ (
-      _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(is_convertible, SizeTypes, index_type) /* && ... */) &&
-      _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(is_nothrow_constructible, index_type, SizeTypes) /* && ... */) &&
+      _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, SizeTypes, index_type) /* && ... */) &&
+      _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeTypes) /* && ... */) &&
       extents_type::rank() == sizeof...(SizeTypes)
     )
   )
   MDSPAN_FORCE_INLINE_FUNCTION
   constexpr reference operator()(SizeTypes... indices) const
   {
-    return __accessor_ref().access(__ptr_ref(), __mapping_ref()(indices...));
+    return __accessor_ref().access(__ptr_ref(), __mapping_ref()(static_cast<index_type>(std::move(indices))...));
   }
 
   MDSPAN_TEMPLATE_REQUIRES(
     class SizeType,
     /* requires */ (
-      _MDSPAN_TRAIT(is_convertible, SizeType, index_type) &&
-      _MDSPAN_TRAIT(is_nothrow_constructible, index_type, SizeType)
+      _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) &&
+      _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType)
     )
   )
   MDSPAN_FORCE_INLINE_FUNCTION
-  constexpr reference operator()(const array<SizeType, rank()>& indices) const
+  constexpr reference operator()(const std::array<SizeType, rank()>& indices) const
   {
     return __impl::template __callop<reference>(*this, indices);
   }
@@ -327,12 +299,12 @@ public:
   MDSPAN_TEMPLATE_REQUIRES(
     class SizeType,
     /* requires */ (
-      _MDSPAN_TRAIT(is_convertible, SizeType, index_type) &&
-      _MDSPAN_TRAIT(is_nothrow_constructible, index_type, SizeType)
+      _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) &&
+      _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType)
     )
   )
   MDSPAN_FORCE_INLINE_FUNCTION
-  constexpr reference operator()(span<SizeType, rank()> indices) const
+  constexpr reference operator()(std::span<SizeType, rank()> indices) const
   {
     return __impl::template __callop<reference>(*this, indices);
   }
@@ -349,9 +321,17 @@ public:
 
   MDSPAN_INLINE_FUNCTION
   friend constexpr void swap(mdspan& x, mdspan& y) noexcept {
+    // can't call the std::swap inside on HIP
+    #if !defined(_MDSPAN_HAS_HIP) && !defined(_MDSPAN_HAS_CUDA)
+    using std::swap;
     swap(x.__ptr_ref(), y.__ptr_ref());
     swap(x.__mapping_ref(), y.__mapping_ref());
     swap(x.__accessor_ref(), y.__accessor_ref());
+    #else
+    mdspan tmp = y;
+    y = x;
+    x = tmp;
+    #endif
   }
 
   //--------------------------------------------------------------------------------
@@ -394,51 +374,48 @@ private:
 #if defined(_MDSPAN_USE_CLASS_TEMPLATE_ARGUMENT_DEDUCTION)
 MDSPAN_TEMPLATE_REQUIRES(
   class ElementType, class... SizeTypes,
-  /* requires */ _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(is_integral, SizeTypes) /* && ... */) &&
+  /* requires */ _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_integral, SizeTypes) /* && ... */) &&
   (sizeof...(SizeTypes) > 0)
 )
-explicit mdspan(ElementType*, SizeTypes...)
-  -> mdspan<ElementType, ::std::experimental::dextents<size_t, sizeof...(SizeTypes)>>;
+MDSPAN_DEDUCTION_GUIDE explicit mdspan(ElementType*, SizeTypes...)
+  -> mdspan<ElementType, ::MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<size_t, sizeof...(SizeTypes)>>;
 
 MDSPAN_TEMPLATE_REQUIRES(
   class Pointer,
-  (_MDSPAN_TRAIT(is_pointer, std::remove_reference_t<Pointer>))
+  (_MDSPAN_TRAIT(std::is_pointer, std::remove_reference_t<Pointer>))
 )
-mdspan(Pointer&&) -> mdspan<std::remove_pointer_t<std::remove_reference_t<Pointer>>, extents<size_t>>;
+MDSPAN_DEDUCTION_GUIDE mdspan(Pointer&&) -> mdspan<std::remove_pointer_t<std::remove_reference_t<Pointer>>, extents<size_t>>;
 
 MDSPAN_TEMPLATE_REQUIRES(
   class CArray,
-  (_MDSPAN_TRAIT(is_array, CArray) && (rank_v<CArray> == 1))
+  (_MDSPAN_TRAIT(std::is_array, CArray) && (std::rank_v<CArray> == 1))
 )
-mdspan(CArray&) -> mdspan<std::remove_all_extents_t<CArray>, extents<size_t, ::std::extent_v<CArray,0>>>;
+MDSPAN_DEDUCTION_GUIDE mdspan(CArray&) -> mdspan<std::remove_all_extents_t<CArray>, extents<size_t, ::std::extent_v<CArray,0>>>;
 
 template <class ElementType, class SizeType, size_t N>
-mdspan(ElementType*, const ::std::array<SizeType, N>&)
-  -> mdspan<ElementType, ::std::experimental::dextents<size_t, N>>;
+MDSPAN_DEDUCTION_GUIDE mdspan(ElementType*, const ::std::array<SizeType, N>&)
+  -> mdspan<ElementType, ::MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<size_t, N>>;
 
 #ifdef __cpp_lib_span
 template <class ElementType, class SizeType, size_t N>
-mdspan(ElementType*, ::std::span<SizeType, N>)
-  -> mdspan<ElementType, ::std::experimental::dextents<size_t, N>>;
+MDSPAN_DEDUCTION_GUIDE mdspan(ElementType*, ::std::span<SizeType, N>)
+  -> mdspan<ElementType, ::MDSPAN_IMPL_STANDARD_NAMESPACE::dextents<size_t, N>>;
 #endif
 
 // This one is necessary because all the constructors take `data_handle_type`s, not
 // `ElementType*`s, and `data_handle_type` is taken from `accessor_type::data_handle_type`, which
 // seems to throw off automatic deduction guides.
 template <class ElementType, class SizeType, size_t... ExtentsPack>
-mdspan(ElementType*, const extents<SizeType, ExtentsPack...>&)
-  -> mdspan<ElementType, ::std::experimental::extents<SizeType, ExtentsPack...>>;
+MDSPAN_DEDUCTION_GUIDE mdspan(ElementType*, const extents<SizeType, ExtentsPack...>&)
+  -> mdspan<ElementType, ::MDSPAN_IMPL_STANDARD_NAMESPACE::extents<SizeType, ExtentsPack...>>;
 
 template <class ElementType, class MappingType>
-mdspan(ElementType*, const MappingType&)
+MDSPAN_DEDUCTION_GUIDE mdspan(ElementType*, const MappingType&)
   -> mdspan<ElementType, typename MappingType::extents_type, typename MappingType::layout_type>;
 
 template <class MappingType, class AccessorType>
-mdspan(const typename AccessorType::data_handle_type, const MappingType&, const AccessorType&)
+MDSPAN_DEDUCTION_GUIDE mdspan(const typename AccessorType::data_handle_type, const MappingType&, const AccessorType&)
   -> mdspan<typename AccessorType::element_type, typename MappingType::extents_type, typename MappingType::layout_type, AccessorType>;
 #endif
 
-
-
-} // end namespace experimental
-} // end namespace std
+} // end namespace MDSPAN_IMPL_STANDARD_NAMESPACE
diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/no_unique_address.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/no_unique_address.hpp
index 904dd40a7590155e745c2549381e59f63c63a8af..36e64ee24dbb7166ac5da069bbbfbc4347026993 100644
--- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/no_unique_address.hpp
+++ b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/no_unique_address.hpp
@@ -1,53 +1,24 @@
-/*
 //@HEADER
 // ************************************************************************
 //
-//                        Kokkos v. 2.0
-//              Copyright (2019) Sandia Corporation
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
 //
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// Under the terms of Contract DE-NA0003525 with NTESS,
 // the U.S. Government retains certain rights in this software.
 //
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
 //@HEADER
-*/
-
 #pragma once
 
 #include "macros.hpp"
 #include "trait_backports.hpp"
 
-namespace std {
-namespace experimental {
+namespace MDSPAN_IMPL_STANDARD_NAMESPACE {
 namespace detail {
 
 //==============================================================================
@@ -70,17 +41,17 @@ struct __no_unique_address_emulation {
 template <class _T, size_t _Disambiguator>
 struct __no_unique_address_emulation<
     _T, _Disambiguator,
-    enable_if_t<_MDSPAN_TRAIT(is_empty, _T) &&
+    std::enable_if_t<_MDSPAN_TRAIT(std::is_empty, _T) &&
                 // If the type isn't trivially destructible, its destructor
                 // won't be called at the right time, so don't use this
                 // specialization
-                _MDSPAN_TRAIT(is_trivially_destructible, _T)>> : 
+                _MDSPAN_TRAIT(std::is_trivially_destructible, _T)>> :
 #ifdef _MDSPAN_COMPILER_MSVC
     // MSVC doesn't allow you to access public static member functions of a type
     // when you *happen* to privately inherit from that type.
     protected
 #else
-    // But we still want this to be private if possible so that we don't accidentally 
+    // But we still want this to be private if possible so that we don't accidentally
     // access members of _T directly rather than calling __ref() first, which wouldn't
     // work if _T happens to be stateful and thus we're using the unspecialized definition
     // of __no_unique_address_emulation above.
@@ -123,5 +94,4 @@ struct __no_unique_address_emulation<
 //==============================================================================
 
 } // end namespace detail
-} // end namespace experimental
-} // end namespace std
+} // end namespace MDSPAN_IMPL_STANDARD_NAMESPACE
diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/standard_layout_static_array.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/standard_layout_static_array.hpp
deleted file mode 100644
index 1c543b08051b57ba45fb1e8abe71e7875db366b1..0000000000000000000000000000000000000000
--- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/standard_layout_static_array.hpp
+++ /dev/null
@@ -1,685 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 2.0
-//              Copyright (2019) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#pragma once
-
-#include "macros.hpp"
-#include "dynamic_extent.hpp"
-#include "trait_backports.hpp" // enable_if
-#include "compressed_pair.hpp"
-
-#if !defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-#  include "no_unique_address.hpp"
-#endif
-
-#include <array>
-#ifdef __cpp_lib_span
-#include <span>
-#endif
-#include <utility> // integer_sequence
-#include <cstddef>
-
-namespace std {
-namespace experimental {
-namespace detail {
-
-//==============================================================================
-
-_MDSPAN_INLINE_VARIABLE constexpr struct
-    __construct_psa_from_dynamic_exts_values_tag_t {
-} __construct_psa_from_dynamic_exts_values_tag = {};
-
-_MDSPAN_INLINE_VARIABLE constexpr struct
-    __construct_psa_from_all_exts_values_tag_t {
-} __construct_psa_from_all_exts_values_tag = {};
-
-struct __construct_psa_from_all_exts_array_tag_t {};
-template <size_t _N = 0> struct __construct_psa_from_dynamic_exts_array_tag_t {};
-
-//==============================================================================
-
-template <size_t _I, class _T> using __repeated_with_idxs = _T;
-
-//==============================================================================
-
-#if _MDSPAN_PRESERVE_STANDARD_LAYOUT
-
-/**
- *  PSA = "partially static array"
- *
- * @tparam _T
- * @tparam _ValsSeq
- * @tparam __sentinal
- */
-template <class _Tag, class _T, class _static_t, class _ValsSeq, _static_t __sentinal = static_cast<_static_t>(dynamic_extent),
-          class _IdxsSeq = make_index_sequence<_ValsSeq::size()>>
-struct __standard_layout_psa;
-
-//==============================================================================
-// Static case
-template <class _Tag, class _T, class _static_t, _static_t __value, _static_t... __values_or_sentinals,
-          _static_t __sentinal, size_t _Idx, size_t... _Idxs>
-struct __standard_layout_psa<
-    _Tag, _T, _static_t, integer_sequence<_static_t, __value, __values_or_sentinals...>,
-    __sentinal, integer_sequence<size_t, _Idx, _Idxs...>>
-#if !defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-    : private __no_unique_address_emulation<__standard_layout_psa<
-          _Tag, _T, _static_t, integer_sequence<_static_t, __values_or_sentinals...>, __sentinal,
-          integer_sequence<size_t, _Idxs...>>>
-#endif
-{
-
-  //--------------------------------------------------------------------------
-
-  using __next_t =
-      __standard_layout_psa<_Tag, _T, _static_t,
-                            integer_sequence<_static_t, __values_or_sentinals...>,
-                            __sentinal, integer_sequence<size_t, _Idxs...>>;
-
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-  _MDSPAN_NO_UNIQUE_ADDRESS __next_t __next_;
-#else
-  using __base_t = __no_unique_address_emulation<__next_t>;
-#endif
-
-  MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 __next_t &__next() noexcept {
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-    return __next_;
-#else
-    return this->__base_t::__ref();
-#endif
-  }
-  MDSPAN_FORCE_INLINE_FUNCTION constexpr __next_t const &__next() const noexcept {
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-    return __next_;
-#else
-    return this->__base_t::__ref();
-#endif
-  }
-
-  static constexpr auto __size = sizeof...(_Idxs) + 1;
-  static constexpr auto __size_dynamic = __next_t::__size_dynamic;
-
-  //--------------------------------------------------------------------------
-
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  constexpr __standard_layout_psa() noexcept = default;
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  constexpr __standard_layout_psa(__standard_layout_psa const &) noexcept =
-      default;
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  constexpr __standard_layout_psa(__standard_layout_psa &&) noexcept = default;
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  _MDSPAN_CONSTEXPR_14_DEFAULTED __standard_layout_psa &
-  operator=(__standard_layout_psa const &) noexcept = default;
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  _MDSPAN_CONSTEXPR_14_DEFAULTED __standard_layout_psa &
-  operator=(__standard_layout_psa &&) noexcept = default;
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  ~__standard_layout_psa() noexcept = default;
-
-  //--------------------------------------------------------------------------
-
-  MDSPAN_INLINE_FUNCTION
-  constexpr __standard_layout_psa(
-      __construct_psa_from_all_exts_values_tag_t, _T const & /*__val*/,
-      __repeated_with_idxs<_Idxs, _T> const &... __vals) noexcept
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-      : __next_{
-#else
-      : __base_t(__base_t{__next_t(
-#endif
-          __construct_psa_from_all_exts_values_tag, __vals...
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-        }
-#else
-        )})
-#endif
-  { }
-
-  template <class... _Ts>
-  MDSPAN_INLINE_FUNCTION constexpr __standard_layout_psa(
-      __construct_psa_from_dynamic_exts_values_tag_t,
-      _Ts const &... __vals) noexcept
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-      : __next_{
-#else
-      : __base_t(__base_t{__next_t(
-#endif
-          __construct_psa_from_dynamic_exts_values_tag, __vals...
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-        }
-#else
-        )})
-#endif
-  { }
-
-  template <class _U, size_t _N>
-  MDSPAN_INLINE_FUNCTION constexpr explicit __standard_layout_psa(
-      array<_U, _N> const &__vals) noexcept
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-      : __next_{
-#else
-      : __base_t(__base_t{__next_t(
-#endif
-          __vals
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-        }
-#else
-        )})
-#endif
-  { }
-
-  template <class _U, size_t _NStatic>
-  MDSPAN_INLINE_FUNCTION constexpr explicit __standard_layout_psa(
-      __construct_psa_from_all_exts_array_tag_t const & __tag,
-      array<_U, _NStatic> const &__vals) noexcept
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-      : __next_{
-#else
-      : __base_t(__base_t{__next_t(
-#endif
-          __tag, __vals
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-        }
-#else
-        )})
-#endif
-  { }
-
-  template <class _U, size_t _IDynamic, size_t _NDynamic>
-  MDSPAN_INLINE_FUNCTION constexpr explicit __standard_layout_psa(
-      __construct_psa_from_dynamic_exts_array_tag_t<_IDynamic> __tag,
-      array<_U, _NDynamic> const &__vals) noexcept
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-      : __next_{
-#else
-      : __base_t(__base_t{__next_t(
-#endif
-          __tag, __vals
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-        }
-#else
-        )})
-#endif
-  { }
-
-#ifdef __cpp_lib_span
-  template <class _U, size_t _N>
-  MDSPAN_INLINE_FUNCTION constexpr explicit __standard_layout_psa(
-      span<_U, _N> const &__vals) noexcept
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-      : __next_{
-#else
-      : __base_t(__base_t{__next_t(
-#endif
-          __vals
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-        }
-#else
-        )})
-#endif
-  { }
-
-  template <class _U, size_t _NStatic>
-  MDSPAN_INLINE_FUNCTION constexpr explicit __standard_layout_psa(
-      __construct_psa_from_all_exts_array_tag_t const & __tag,
-      span<_U, _NStatic> const &__vals) noexcept
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-      : __next_{
-#else
-      : __base_t(__base_t{__next_t(
-#endif
-          __tag, __vals
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-        }
-#else
-        )})
-#endif
-  { }
-
-  template <class _U, size_t _IDynamic, size_t _NDynamic>
-  MDSPAN_INLINE_FUNCTION constexpr explicit __standard_layout_psa(
-      __construct_psa_from_dynamic_exts_array_tag_t<_IDynamic> __tag,
-      span<_U, _NDynamic> const &__vals) noexcept
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-      : __next_{
-#else
-      : __base_t(__base_t{__next_t(
-#endif
-          __tag, __vals
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-        }
-#else
-        )})
-#endif
-  { }
-#endif
-
-  template <class _UTag, class _U, class _static_U, class _UValsSeq, _static_U __u_sentinal,
-            class _IdxsSeq>
-  MDSPAN_INLINE_FUNCTION constexpr __standard_layout_psa(
-      __standard_layout_psa<_UTag, _U, _static_U, _UValsSeq, __u_sentinal, _IdxsSeq> const
-          &__rhs) noexcept
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-      : __next_{
-#else
-      : __base_t(__base_t{__next_t(
-#endif
-          __rhs.__next()
-#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS)
-        }
-#else
-        )})
-#endif
-  { }
-
-  //--------------------------------------------------------------------------
-
-  // See https://godbolt.org/z/_KSDNX for a summary-by-example of why this is
-  // necessary. We're using inheritance here instead of an alias template
-  // because we have to deduce __values_or_sentinals in several places, and
-  // alias templates don't permit that in this context.
-  MDSPAN_FORCE_INLINE_FUNCTION
-  constexpr __standard_layout_psa const &__enable_psa_conversion() const
-      noexcept {
-    return *this;
-  }
-
-  template <size_t _I, enable_if_t<_I != _Idx, int> = 0>
-  MDSPAN_FORCE_INLINE_FUNCTION constexpr _T __get_n() const noexcept {
-    return __next().template __get_n<_I>();
-  }
-  template <size_t _I, enable_if_t<_I == _Idx, int> = 1>
-  MDSPAN_FORCE_INLINE_FUNCTION constexpr _T __get_n() const noexcept {
-    return __value;
-  }
-  template <size_t _I, enable_if_t<_I != _Idx, int> = 0>
-  MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 void
-  __set_n(_T const &__rhs) noexcept {
-    __next().__set_value(__rhs);
-  }
-  template <size_t _I, enable_if_t<_I == _Idx, int> = 1>
-  MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 void
-  __set_n(_T const &) noexcept {
-    // Don't assert here because that would break constexpr. This better
-    // not change anything, though
-  }
-  template <size_t _I, enable_if_t<_I == _Idx, _static_t> = __sentinal>
-  MDSPAN_FORCE_INLINE_FUNCTION static constexpr _static_t __get_static_n() noexcept {
-    return __value;
-  }
-  template <size_t _I, enable_if_t<_I != _Idx, _static_t> __default = __sentinal>
-  MDSPAN_FORCE_INLINE_FUNCTION static constexpr _static_t __get_static_n() noexcept {
-    return __next_t::template __get_static_n<_I, __default>();
-  }
-  MDSPAN_FORCE_INLINE_FUNCTION constexpr _T __get(size_t __n) const noexcept {
-    return __value * (_T(_Idx == __n)) + __next().__get(__n);
-  }
-
-  //--------------------------------------------------------------------------
-};
-
-//==============================================================================
-
-// Dynamic case, __next_t may or may not be empty
-template <class _Tag, class _T, class _static_t, _static_t __sentinal, _static_t... __values_or_sentinals,
-          size_t _Idx, size_t... _Idxs>
-struct __standard_layout_psa<
-    _Tag, _T, _static_t, integer_sequence<_static_t, __sentinal, __values_or_sentinals...>,
-    __sentinal, integer_sequence<size_t, _Idx, _Idxs...>> {
-  //--------------------------------------------------------------------------
-
-  using __next_t =
-      __standard_layout_psa<_Tag, _T, _static_t,
-                            integer_sequence<_static_t, __values_or_sentinals...>,
-                            __sentinal, integer_sequence<size_t, _Idxs...>>;
-
-  using __value_pair_t = __compressed_pair<_T, __next_t>;
-  __value_pair_t __value_pair;
-  MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 __next_t &__next() noexcept {
-    return __value_pair.__second();
-  }
-  MDSPAN_FORCE_INLINE_FUNCTION constexpr __next_t const &__next() const noexcept {
-    return __value_pair.__second();
-  }
-
-  static constexpr auto __size = sizeof...(_Idxs) + 1;
-  static constexpr auto __size_dynamic = 1 + __next_t::__size_dynamic;
-
-  //--------------------------------------------------------------------------
-
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  constexpr __standard_layout_psa() noexcept = default;
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  constexpr __standard_layout_psa(__standard_layout_psa const &) noexcept =
-      default;
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  constexpr __standard_layout_psa(__standard_layout_psa &&) noexcept = default;
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  _MDSPAN_CONSTEXPR_14_DEFAULTED __standard_layout_psa &
-  operator=(__standard_layout_psa const &) noexcept = default;
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  _MDSPAN_CONSTEXPR_14_DEFAULTED __standard_layout_psa &
-  operator=(__standard_layout_psa &&) noexcept = default;
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  ~__standard_layout_psa() noexcept = default;
-
-  //--------------------------------------------------------------------------
-
-  MDSPAN_INLINE_FUNCTION
-  constexpr __standard_layout_psa(
-      __construct_psa_from_all_exts_values_tag_t, _T const &__val,
-      __repeated_with_idxs<_Idxs, _T> const &... __vals) noexcept
-      : __value_pair(__val,
-                     __next_t(__construct_psa_from_all_exts_values_tag,
-                              __vals...)) {}
-
-  template <class... _Ts>
-  MDSPAN_INLINE_FUNCTION constexpr __standard_layout_psa(
-      __construct_psa_from_dynamic_exts_values_tag_t, _T const &__val,
-      _Ts const &... __vals) noexcept
-      : __value_pair(__val,
-                     __next_t(__construct_psa_from_dynamic_exts_values_tag,
-                              __vals...)) {}
-
-  template <class _U, size_t _N>
-  MDSPAN_INLINE_FUNCTION constexpr explicit __standard_layout_psa(
-      array<_U, _N> const &__vals) noexcept
-      : __value_pair(::std::get<_Idx>(__vals), __vals) {}
-
-  template <class _U, size_t _NStatic>
-  MDSPAN_INLINE_FUNCTION constexpr explicit __standard_layout_psa(
-      __construct_psa_from_all_exts_array_tag_t __tag,
-      array<_U, _NStatic> const &__vals) noexcept
-      : __value_pair(
-            ::std::get<_Idx>(__vals),
-            __next_t(__tag,
-                     __vals)) {}
-
-  template <class _U, size_t _IDynamic, size_t _NDynamic>
-  MDSPAN_INLINE_FUNCTION constexpr explicit __standard_layout_psa(
-      __construct_psa_from_dynamic_exts_array_tag_t<_IDynamic>,
-      array<_U, _NDynamic> const &__vals) noexcept
-      : __value_pair(
-            ::std::get<_IDynamic>(__vals),
-            __next_t(__construct_psa_from_dynamic_exts_array_tag_t<_IDynamic + 1>{},
-                     __vals)) {}
-
-#ifdef __cpp_lib_span
-  template <class _U, size_t _N>
-  MDSPAN_INLINE_FUNCTION constexpr explicit __standard_layout_psa(
-      span<_U, _N> const &__vals) noexcept
-      : __value_pair(__vals[_Idx], __vals) {}
-
-  template <class _U, size_t _NStatic>
-  MDSPAN_INLINE_FUNCTION constexpr explicit __standard_layout_psa(
-      __construct_psa_from_all_exts_array_tag_t __tag,
-      span<_U, _NStatic> const &__vals) noexcept
-      : __value_pair(
-            __vals[_Idx],
-            __next_t(__tag,
-                     __vals)) {}
-
-  template <class _U, size_t _IDynamic, size_t _NDynamic>
-  MDSPAN_INLINE_FUNCTION constexpr explicit __standard_layout_psa(
-      __construct_psa_from_dynamic_exts_array_tag_t<_IDynamic>,
-      span<_U, _NDynamic> const &__vals) noexcept
-      : __value_pair(
-            __vals[_IDynamic],
-            __next_t(__construct_psa_from_dynamic_exts_array_tag_t<_IDynamic + 1>{},
-                     __vals)) {}
-#endif
-
-  template <class _UTag, class _U, class _static_U, class _UValsSeq, _static_U __u_sentinal,
-            class _UIdxsSeq>
-  MDSPAN_INLINE_FUNCTION constexpr __standard_layout_psa(
-      __standard_layout_psa<_UTag, _U, _static_U, _UValsSeq, __u_sentinal, _UIdxsSeq> const
-          &__rhs) noexcept
-      : __value_pair(__rhs.template __get_n<_Idx>(), __rhs.__next()) {}
-
-  //--------------------------------------------------------------------------
-
-  // See comment in the previous partial specialization for why this is
-  // necessary.  Or just trust me that it's messy.
-  MDSPAN_FORCE_INLINE_FUNCTION
-  constexpr __standard_layout_psa const &__enable_psa_conversion() const
-      noexcept {
-    return *this;
-  }
-
-  template <size_t _I, enable_if_t<_I != _Idx, int> = 0>
-  MDSPAN_FORCE_INLINE_FUNCTION constexpr _T __get_n() const noexcept {
-    return __next().template __get_n<_I>();
-  }
-  template <size_t _I, enable_if_t<_I == _Idx, int> = 1>
-  MDSPAN_FORCE_INLINE_FUNCTION constexpr _T __get_n() const noexcept {
-    return __value_pair.__first();
-  }
-  template <size_t _I, enable_if_t<_I != _Idx, int> = 0>
-  MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 void
-  __set_n(_T const &__rhs) noexcept {
-    __next().__set_value(__rhs);
-  }
-  template <size_t _I, enable_if_t<_I == _Idx, int> = 1>
-  MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 void
-  __set_n(_T const &__rhs) noexcept {
-    __value_pair.__first() = __rhs;
-  }
-  template <size_t _I, enable_if_t<_I == _Idx, _static_t> __default = __sentinal>
-  MDSPAN_FORCE_INLINE_FUNCTION static constexpr _static_t __get_static_n() noexcept {
-    return __default;
-  }
-  template <size_t _I, enable_if_t<_I != _Idx, _static_t> __default = __sentinal>
-  MDSPAN_FORCE_INLINE_FUNCTION static constexpr _static_t __get_static_n() noexcept {
-    return __next_t::template __get_static_n<_I, __default>();
-  }
-  MDSPAN_FORCE_INLINE_FUNCTION constexpr _T __get(size_t __n) const noexcept {
-    return __value_pair.__first() * (_T(_Idx == __n)) + __next().__get(__n);
-  }
-
-  //--------------------------------------------------------------------------
-};
-
-// empty/terminal case
-template <class _Tag, class _T, class _static_t, _static_t __sentinal>
-struct __standard_layout_psa<_Tag, _T, _static_t, integer_sequence<_static_t>, __sentinal,
-                             integer_sequence<size_t>> {
-  //--------------------------------------------------------------------------
-
-  static constexpr auto __size = 0;
-  static constexpr auto __size_dynamic = 0;
-
-  //--------------------------------------------------------------------------
-
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  constexpr __standard_layout_psa() noexcept
-#if defined(__clang__) || defined(_MDSPAN_DEFAULTED_CONSTRUCTORS_INHERITANCE_WORKAROUND)
-  // As far as I can tell, there appears to be a bug in clang that's causing
-  // this to be non-constexpr when it's defaulted.
-  { }
-#else
-   = default;
-#endif
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  constexpr __standard_layout_psa(__standard_layout_psa const &) noexcept =
-      default;
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  constexpr __standard_layout_psa(__standard_layout_psa &&) noexcept = default;
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  _MDSPAN_CONSTEXPR_14_DEFAULTED __standard_layout_psa &
-  operator=(__standard_layout_psa const &) noexcept = default;
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  _MDSPAN_CONSTEXPR_14_DEFAULTED __standard_layout_psa &
-  operator=(__standard_layout_psa &&) noexcept = default;
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  ~__standard_layout_psa() noexcept = default;
-
-  MDSPAN_INLINE_FUNCTION
-  constexpr __standard_layout_psa(
-      __construct_psa_from_all_exts_values_tag_t) noexcept {}
-
-  template <class... _Ts>
-  MDSPAN_INLINE_FUNCTION constexpr __standard_layout_psa(
-      __construct_psa_from_dynamic_exts_values_tag_t) noexcept {}
-
-  template <class _U, size_t _N>
-  MDSPAN_INLINE_FUNCTION constexpr explicit __standard_layout_psa(
-      array<_U, _N> const &) noexcept {}
-
-  template <class _U, size_t _NStatic>
-  MDSPAN_INLINE_FUNCTION constexpr explicit __standard_layout_psa(
-      __construct_psa_from_all_exts_array_tag_t,
-      array<_U, _NStatic> const &) noexcept {}
-
-  template <class _U, size_t _IDynamic, size_t _NDynamic>
-  MDSPAN_INLINE_FUNCTION constexpr explicit __standard_layout_psa(
-      __construct_psa_from_dynamic_exts_array_tag_t<_IDynamic>,
-      array<_U, _NDynamic> const &) noexcept {}
-
-#ifdef __cpp_lib_span
-  template <class _U, size_t _N>
-  MDSPAN_INLINE_FUNCTION constexpr explicit __standard_layout_psa(
-      span<_U, _N> const &) noexcept {}
-
-  template <class _U, size_t _NStatic>
-  MDSPAN_INLINE_FUNCTION constexpr explicit __standard_layout_psa(
-      __construct_psa_from_all_exts_array_tag_t,
-      span<_U, _NStatic> const &) noexcept {}
-
-  template <class _U, size_t _IDynamic, size_t _NDynamic>
-  MDSPAN_INLINE_FUNCTION constexpr explicit __standard_layout_psa(
-      __construct_psa_from_dynamic_exts_array_tag_t<_IDynamic>,
-      span<_U, _NDynamic> const &) noexcept {}
-#endif
-
-  template <class _UTag, class _U, class _static_U, class _UValsSeq, _static_U __u_sentinal,
-            class _UIdxsSeq>
-  MDSPAN_INLINE_FUNCTION constexpr __standard_layout_psa(
-      __standard_layout_psa<_UTag, _U, _static_U, _UValsSeq, __u_sentinal, _UIdxsSeq> const&) noexcept {}
-
-  // See comment in the previous partial specialization for why this is
-  // necessary.  Or just trust me that it's messy.
-  MDSPAN_FORCE_INLINE_FUNCTION
-  constexpr __standard_layout_psa const &__enable_psa_conversion() const
-      noexcept {
-    return *this;
-  }
-
-  MDSPAN_FORCE_INLINE_FUNCTION constexpr _T __get(size_t /*n*/) const noexcept {
-    return 0;
-  }
-};
-
-// Same thing, but with a disambiguator so that same-base issues doesn't cause
-// a loss of standard-layout-ness.
-template <class _Tag, class T, class _static_t, _static_t... __values_or_sentinals>
-struct __partially_static_sizes_tagged
-    : __standard_layout_psa<
-          _Tag, T, _static_t,
-          integer_sequence<_static_t, __values_or_sentinals...>> {
-  using __tag_t = _Tag;
-  using __psa_impl_t = __standard_layout_psa<
-      _Tag, T, _static_t, integer_sequence<_static_t, __values_or_sentinals...>>;
-  using __psa_impl_t::__psa_impl_t;
-#ifdef _MDSPAN_DEFAULTED_CONSTRUCTORS_INHERITANCE_WORKAROUND
-  MDSPAN_INLINE_FUNCTION
-#endif
-  constexpr __partially_static_sizes_tagged() noexcept
-#ifdef _MDSPAN_DEFAULTED_CONSTRUCTORS_INHERITANCE_WORKAROUND
-    : __psa_impl_t() { }
-#else
-    = default;
-#endif
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  constexpr __partially_static_sizes_tagged(
-      __partially_static_sizes_tagged const &) noexcept = default;
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  constexpr __partially_static_sizes_tagged(
-      __partially_static_sizes_tagged &&) noexcept = default;
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  _MDSPAN_CONSTEXPR_14_DEFAULTED __partially_static_sizes_tagged &
-  operator=(__partially_static_sizes_tagged const &) noexcept = default;
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  _MDSPAN_CONSTEXPR_14_DEFAULTED __partially_static_sizes_tagged &
-  operator=(__partially_static_sizes_tagged &&) noexcept = default;
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  ~__partially_static_sizes_tagged() noexcept = default;
-
-  template <class _UTag>
-  MDSPAN_FORCE_INLINE_FUNCTION constexpr explicit __partially_static_sizes_tagged(
-    __partially_static_sizes_tagged<_UTag, T, _static_t, __values_or_sentinals...> const& __vals
-  ) noexcept : __psa_impl_t(__vals.__enable_psa_conversion()) { }
-};
-
-struct __no_tag {};
-template <class T, class _static_t, _static_t... __values_or_sentinals>
-struct __partially_static_sizes
-    : __partially_static_sizes_tagged<__no_tag, T, _static_t, __values_or_sentinals...> {
-private:
-  using __base_t =
-      __partially_static_sizes_tagged<__no_tag, T, _static_t, __values_or_sentinals...>;
-  template <class _UTag>
-  MDSPAN_FORCE_INLINE_FUNCTION constexpr __partially_static_sizes(
-    __partially_static_sizes_tagged<_UTag, T, _static_t, __values_or_sentinals...>&& __vals
-  ) noexcept : __base_t(::std::move(__vals)) { }
-public:
-  using __base_t::__base_t;
-
-#ifdef _MDSPAN_DEFAULTED_CONSTRUCTORS_INHERITANCE_WORKAROUND
-  MDSPAN_INLINE_FUNCTION
-  constexpr __partially_static_sizes() noexcept : __base_t() { }
-#endif
-  template <class _UTag>
-  MDSPAN_FORCE_INLINE_FUNCTION constexpr __partially_static_sizes_tagged<
-      _UTag, T, _static_t, __values_or_sentinals...>
-  __with_tag() const noexcept {
-    return __partially_static_sizes_tagged<_UTag, T, _static_t, __values_or_sentinals...>(*this);
-  }
-};
-
-#endif // _MDSPAN_PRESERVE_STATIC_LAYOUT
-
-} // end namespace detail
-} // end namespace experimental
-} // end namespace std
diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/static_array.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/static_array.hpp
deleted file mode 100644
index 548bd8c8a75074e39d93f2a2ad38bbc1509198a6..0000000000000000000000000000000000000000
--- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/static_array.hpp
+++ /dev/null
@@ -1,286 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 2.0
-//              Copyright (2019) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#pragma once
-
-#include "macros.hpp"
-
-#include "dynamic_extent.hpp"
-#include "trait_backports.hpp"
-#include "maybe_static_value.hpp"
-#include "standard_layout_static_array.hpp"
-#include "type_list.hpp"
-
-// Needs to be after the includes above to work with the single header generator
-#if !_MDSPAN_PRESERVE_STANDARD_LAYOUT
-#include <cstddef> // size_t
-#include <utility> // integer_sequence
-#include <array>
-
-namespace std {
-namespace experimental {
-namespace detail {
-
-//==============================================================================
-
-template <class _T, _T _Val, bool _Mask> struct __mask_element {};
-
-template <class _T, _T... _Result>
-struct __mask_sequence_assign_op {
-  template <_T _V>
-  __mask_sequence_assign_op<_T, _Result..., _V>
-  operator=(__mask_element<_T, _V, true>&&);
-  template <_T _V>
-  __mask_sequence_assign_op<_T, _Result...>
-  operator=(__mask_element<_T, _V, false>&&);
-  using __result = integer_sequence<_T, _Result...>;
-};
-
-template <class _Seq, class _Mask>
-struct __mask_sequence;
-
-template <class _T, _T... _Vals, bool... _Masks>
-struct __mask_sequence<integer_sequence<_T, _Vals...>, integer_sequence<bool, _Masks...>>
-{
-  using type = typename decltype(
-    _MDSPAN_FOLD_ASSIGN_LEFT(
-      __mask_sequence_assign_op<_T>{}, /* = ... = */ __mask_element<_T, _Vals, _Masks>{}
-    )
-  )::__result;
-};
-
-//==============================================================================
-
-template <class _T, class _static_t, class _Vals, _static_t __sentinal,
-          class _Idxs, class _IdxsDynamic, class _IdxsDynamicIdxs>
-class __partially_static_array_impl;
-
-template <
-  class _T, class _static_t,
-  _static_t... __values_or_sentinals, _static_t __sentinal,
-  size_t... _Idxs,
-  size_t... _IdxsDynamic,
-  size_t... _IdxsDynamicIdxs
->
-class __partially_static_array_impl<
-  _T,
-  _static_t,
-  integer_sequence<_static_t, __values_or_sentinals...>,
-  __sentinal,
-  integer_sequence<size_t, _Idxs...>,
-  integer_sequence<size_t, _IdxsDynamic...>,
-  integer_sequence<size_t, _IdxsDynamicIdxs...>
->
-    : private __maybe_static_value<_T, _static_t, __values_or_sentinals, __sentinal,
-                                   _Idxs>... {
-private:
-
-  template <size_t _N>
-  using __base_n = typename __type_at<_N,
-    __type_list<__maybe_static_value<_T, _static_t, __values_or_sentinals, __sentinal, _Idxs>...>
-  >::type;
-
-public:
-
-  static constexpr auto __size = sizeof...(_Idxs);
-  static constexpr auto __size_dynamic =
-    _MDSPAN_FOLD_PLUS_RIGHT(static_cast<int>((__values_or_sentinals == __sentinal)), /* + ... + */ 0);
-
-  //--------------------------------------------------------------------------
-
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  constexpr __partially_static_array_impl() = default;
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  constexpr __partially_static_array_impl(
-      __partially_static_array_impl const &) noexcept = default;
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  constexpr __partially_static_array_impl(
-      __partially_static_array_impl &&) noexcept = default;
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  _MDSPAN_CONSTEXPR_14_DEFAULTED __partially_static_array_impl &
-  operator=(__partially_static_array_impl const &) noexcept = default;
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  _MDSPAN_CONSTEXPR_14_DEFAULTED __partially_static_array_impl &
-  operator=(__partially_static_array_impl &&) noexcept = default;
-  MDSPAN_INLINE_FUNCTION_DEFAULTED
-  ~__partially_static_array_impl() noexcept = default;
-
-  MDSPAN_INLINE_FUNCTION
-  constexpr __partially_static_array_impl(
-      __construct_psa_from_all_exts_values_tag_t,
-      __repeated_with_idxs<_Idxs, _T> const &... __vals) noexcept
-      : __base_n<_Idxs>(__base_n<_Idxs>{{__vals}})... {}
-
-  MDSPAN_INLINE_FUNCTION
-  constexpr __partially_static_array_impl(
-      __construct_psa_from_dynamic_exts_values_tag_t,
-      __repeated_with_idxs<_IdxsDynamicIdxs, _T> const &... __vals) noexcept
-      : __base_n<_IdxsDynamic>(__base_n<_IdxsDynamic>{{__vals}})... {}
-
-  MDSPAN_INLINE_FUNCTION constexpr explicit __partially_static_array_impl(
-    array<_T, sizeof...(_Idxs)> const& __vals) noexcept
-    : __partially_static_array_impl(
-        __construct_psa_from_all_exts_values_tag,
-        ::std::get<_Idxs>(__vals)...) {}
-
-  // clang-format off
-  MDSPAN_FUNCTION_REQUIRES(
-    (MDSPAN_INLINE_FUNCTION constexpr explicit),
-    __partially_static_array_impl,
-    (array<_T, __size_dynamic> const &__vals), noexcept,
-    /* requires */
-      (sizeof...(_Idxs) != __size_dynamic)
-  ): __partially_static_array_impl(
-       __construct_psa_from_dynamic_exts_values_tag,
-       ::std::get<_IdxsDynamicIdxs>(__vals)...) {}
-  // clang-format on
-
-  template <class _U, class _static_u, class _UValsSeq, _static_u __u_sentinal, class _UIdxsSeq,
-            class _UIdxsDynamicSeq, class _UIdxsDynamicIdxsSeq>
-  MDSPAN_INLINE_FUNCTION constexpr __partially_static_array_impl(
-    __partially_static_array_impl<
-      _U, _static_u, _UValsSeq, __u_sentinal, _UIdxsSeq,
-     _UIdxsDynamicSeq, _UIdxsDynamicIdxsSeq> const &__rhs) noexcept
-    : __partially_static_array_impl(
-        __construct_psa_from_all_exts_values_tag,
-        __rhs.template __get_n<_Idxs>()...) {}
-
-  //--------------------------------------------------------------------------
-
-  // See comment in the previous partial specialization for why this is
-  // necessary.  Or just trust me that it's messy.
-  MDSPAN_FORCE_INLINE_FUNCTION
-  constexpr __partially_static_array_impl const &__enable_psa_conversion() const
-  noexcept {
-      return *this;
-  }
-
-  template <size_t _I>
-  MDSPAN_FORCE_INLINE_FUNCTION constexpr _T __get_n() const noexcept {
-    return static_cast<__base_n<_I> const*>(this)->__value();
-  }
-
-  template <class _U, size_t _I>
-  MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 void __set_n(_U&& __rhs) noexcept {
-    static_cast<__base_n<_I>*>(this)->__set_value((_U&&)__rhs);
-  }
-
-  template <size_t _I, _static_t __default = __sentinal>
-  MDSPAN_FORCE_INLINE_FUNCTION static constexpr _static_t
-  __get_static_n() noexcept {
-    return __base_n<_I>::__static_value == __sentinal ?
-      __default : __base_n<_I>::__static_value;
-  }
-
-  MDSPAN_FORCE_INLINE_FUNCTION constexpr _T
-  __get(size_t __n) const noexcept {
-    return _MDSPAN_FOLD_PLUS_RIGHT(
-      (_T(_Idxs == __n) * __get_n<_Idxs>()), /* + ... + */ _T(0)
-    );
-  }
-
-};
-
-//==============================================================================
-
-template <class _T, class _static_t, class _ValSeq, _static_t __sentinal, class _Idxs = make_index_sequence<_ValSeq::size()>>
-struct __partially_static_array_impl_maker;
-
-template <
-  class _T, class _static_t,  _static_t... _Vals, _static_t __sentinal, size_t... _Idxs
->
-struct __partially_static_array_impl_maker<
-  _T, _static_t, integer_sequence<_static_t, _Vals...>, __sentinal, integer_sequence<size_t, _Idxs...>
->
-{
-  using __dynamic_idxs = typename __mask_sequence<
-    integer_sequence<size_t, _Idxs...>,
-    integer_sequence<bool, (_Vals == __sentinal)...>
-  >::type;
-  using __impl_base =
-    __partially_static_array_impl<_T, _static_t,
-      integer_sequence<_static_t, _Vals...>,
-      __sentinal, integer_sequence<size_t, _Idxs...>,
-      __dynamic_idxs,
-      make_index_sequence<__dynamic_idxs::size()>
-    >;
-};
-
-template <class _T, class _static_t, class _ValsSeq, _static_t __sentinal = dynamic_extent>
-class __partially_static_array_with_sentinal
-  : public __partially_static_array_impl_maker<_T, _static_t, _ValsSeq, __sentinal>::__impl_base
-{
-private:
-  using __base_t = typename __partially_static_array_impl_maker<_T, _static_t, _ValsSeq, __sentinal>::__impl_base;
-public:
-  using __base_t::__base_t;
-};
-
-//==============================================================================
-
-template <class T, class _static_t, _static_t... __values_or_sentinals>
-struct __partially_static_sizes :
-  __partially_static_array_with_sentinal<
-    T, _static_t, ::std::integer_sequence<_static_t, __values_or_sentinals...>>
-{
-private:
-  using __base_t = __partially_static_array_with_sentinal<
-    T, _static_t, ::std::integer_sequence<_static_t, __values_or_sentinals...>>;
-public:
-  using __base_t::__base_t;
-  template <class _UTag>
-  MDSPAN_FORCE_INLINE_FUNCTION constexpr __partially_static_sizes<T, _static_t, __values_or_sentinals...>
-  __with_tag() const noexcept {
-    return *this;
-  }
-};
-
-// Tags are needed for the standard layout version, but not here
-template <class T, class _static_t, _static_t... __values_or_sentinals>
-using __partially_static_sizes_tagged = __partially_static_sizes<T, _static_t, __values_or_sentinals...>;
-
-} // end namespace detail
-} // end namespace experimental
-} // end namespace std
-
-#endif // !_MDSPAN_PRESERVE_STANDARD_LAYOUT
diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/submdspan.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/submdspan.hpp
deleted file mode 100644
index dde1ab15bf39f8190a326a9dadc4acfe968db0dc..0000000000000000000000000000000000000000
--- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/submdspan.hpp
+++ /dev/null
@@ -1,586 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 2.0
-//              Copyright (2019) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-
-#pragma once
-
-#include "mdspan.hpp"
-#include "full_extent_t.hpp"
-#include "dynamic_extent.hpp"
-#include "layout_left.hpp"
-#include "layout_right.hpp"
-#include "layout_stride.hpp"
-#include "macros.hpp"
-#include "trait_backports.hpp"
-
-#include <tuple> // std::apply
-#include <utility> // std::pair
-
-namespace std {
-namespace experimental {
-
-namespace detail {
-
-template <size_t OldExtent, size_t OldStaticStride, class T>
-struct __slice_wrap {
-  T slice;
-  size_t old_extent;
-  size_t old_stride;
-};
-
-//--------------------------------------------------------------------------------
-
-template <size_t OldExtent, size_t OldStaticStride>
-MDSPAN_INLINE_FUNCTION constexpr
-__slice_wrap<OldExtent, OldStaticStride, size_t>
-__wrap_slice(size_t val, size_t ext, size_t stride) { return { val, ext, stride }; }
-
-template <size_t OldExtent, size_t OldStaticStride, class IntegerType, IntegerType Value0>
-MDSPAN_INLINE_FUNCTION constexpr
-__slice_wrap<OldExtent, OldStaticStride, std::integral_constant<IntegerType, Value0>>
-__wrap_slice(size_t val, size_t ext, std::integral_constant<IntegerType, Value0> stride)
-{
-#if MDSPAN_HAS_CXX_17
-  if constexpr (std::is_signed_v<IntegerType>) {
-    static_assert(Value0 >= IntegerType(0), "Invalid slice specifier");
-  }
-#endif // MDSPAN_HAS_CXX_17
-
-  return { val, ext, stride };
-}
-
-template <size_t OldExtent, size_t OldStaticStride>
-MDSPAN_INLINE_FUNCTION constexpr
-__slice_wrap<OldExtent, OldStaticStride, full_extent_t>
-__wrap_slice(full_extent_t val, size_t ext, size_t stride) { return { val, ext, stride }; }
-
-// TODO generalize this to anything that works with std::get<0> and std::get<1>
-template <size_t OldExtent, size_t OldStaticStride>
-MDSPAN_INLINE_FUNCTION constexpr
-__slice_wrap<OldExtent, OldStaticStride, std::tuple<size_t, size_t>>
-__wrap_slice(std::tuple<size_t, size_t> const& val, size_t ext, size_t stride)
-{
-  return { val, ext, stride };
-}
-
-template <size_t OldExtent, size_t OldStaticStride,
-	  class IntegerType0, IntegerType0 Value0,
-	  class IntegerType1, IntegerType1 Value1>
-MDSPAN_INLINE_FUNCTION constexpr
-  __slice_wrap<OldExtent, OldStaticStride,
-               std::tuple<std::integral_constant<IntegerType0, Value0>,
-                          std::integral_constant<IntegerType1, Value1>>>
-__wrap_slice(std::tuple<std::integral_constant<IntegerType0, Value0>, std::integral_constant<IntegerType1, Value1>> const& val, size_t ext, size_t stride)
-{
-  static_assert(Value1 >= Value0, "Invalid slice tuple");
-  return { val, ext, stride };
-}
-
-//--------------------------------------------------------------------------------
-
-
-// a layout right remains a layout right if it is indexed by 0 or more scalars,
-// then optionally a pair and finally 0 or more all
-template <
-  // what we encountered until now preserves the layout right
-  bool result=true,
-  // we only encountered 0 or more scalars, no pair or all
-  bool encountered_only_scalar=true
->
-struct preserve_layout_right_analysis : integral_constant<bool, result> {
-  using layout_type_if_preserved = layout_right;
-  using encounter_pair = preserve_layout_right_analysis<
-    // if we encounter a pair, the layout remains a layout right only if it was one before
-    // and that only scalars were encountered until now
-    result && encountered_only_scalar,
-    // if we encounter a pair, we didn't encounter scalars only
-    false
-  >;
-  using encounter_all = preserve_layout_right_analysis<
-    // if we encounter a all, the layout remains a layout right if it was one before
-    result,
-    // if we encounter a all, we didn't encounter scalars only
-    false
-  >;
-  using encounter_scalar = preserve_layout_right_analysis<
-    // if we encounter a scalar, the layout remains a layout right only if it was one before
-    // and that only scalars were encountered until now
-    result && encountered_only_scalar,
-    // if we encounter a scalar, the fact that we encountered scalars only doesn't change
-    encountered_only_scalar
-  >;
-};
-
-// a layout left remains a layout left if it is indexed by 0 or more all,
-// then optionally a pair and finally 0 or more scalars
-template <
-  bool result=true,
-  bool encountered_only_all=true
->
-struct preserve_layout_left_analysis : integral_constant<bool, result> {
-  using layout_type_if_preserved = layout_left;
-  using encounter_pair = preserve_layout_left_analysis<
-    // if we encounter a pair, the layout remains a layout left only if it was one before
-    // and that only all were encountered until now
-    result && encountered_only_all,
-    // if we encounter a pair, we didn't encounter all only
-    false
-  >;
-  using encounter_all = preserve_layout_left_analysis<
-    // if we encounter a all, the layout remains a layout left only if it was one before
-    // and that only all were encountered until now
-    result && encountered_only_all,
-    // if we encounter a all, the fact that we encountered scalars all doesn't change
-    encountered_only_all
-  >;
-  using encounter_scalar = preserve_layout_left_analysis<
-    // if we encounter a scalar, the layout remains a layout left if it was one before
-    result,
-    // if we encounter a scalar, we didn't encounter scalars only
-    false
-  >;
-};
-
-struct ignore_layout_preservation : std::integral_constant<bool, false> {
-  using layout_type_if_preserved = void;
-  using encounter_pair = ignore_layout_preservation;
-  using encounter_all = ignore_layout_preservation;
-  using encounter_scalar = ignore_layout_preservation;
-};
-
-template <class Layout>
-struct preserve_layout_analysis
-  : ignore_layout_preservation { };
-template <>
-struct preserve_layout_analysis<layout_right>
-  : preserve_layout_right_analysis<> { };
-template <>
-struct preserve_layout_analysis<layout_left>
-  : preserve_layout_left_analysis<> { };
-
-//--------------------------------------------------------------------------------
-
-template <
-  class _IndexT,
-  class _PreserveLayoutAnalysis,
-  class _OffsetsArray=__partially_static_sizes<_IndexT, size_t>,
-  class _ExtsArray=__partially_static_sizes<_IndexT, size_t>,
-  class _StridesArray=__partially_static_sizes<_IndexT, size_t>,
-  class = make_index_sequence<_OffsetsArray::__size>,
-  class = make_index_sequence<_ExtsArray::__size>,
-  class = make_index_sequence<_StridesArray::__size>
->
-struct __assign_op_slice_handler;
-
-/* clang-format: off */
-template <
-  class _IndexT,
-  class _PreserveLayoutAnalysis,
-  size_t... _Offsets,
-  size_t... _Exts,
-  size_t... _Strides,
-  size_t... _OffsetIdxs,
-  size_t... _ExtIdxs,
-  size_t... _StrideIdxs>
-struct __assign_op_slice_handler<
-  _IndexT,
-  _PreserveLayoutAnalysis,
-  __partially_static_sizes<_IndexT, size_t, _Offsets...>,
-  __partially_static_sizes<_IndexT, size_t, _Exts...>,
-  __partially_static_sizes<_IndexT, size_t, _Strides...>,
-  integer_sequence<size_t, _OffsetIdxs...>,
-  integer_sequence<size_t, _ExtIdxs...>,
-  integer_sequence<size_t, _StrideIdxs...>>
-{
-  // TODO remove this for better compiler performance
-  static_assert(
-    _MDSPAN_FOLD_AND((_Strides == dynamic_extent || _Strides > 0) /* && ... */),
-    " "
-  );
-  static_assert(
-    _MDSPAN_FOLD_AND((_Offsets == dynamic_extent || _Offsets >= 0) /* && ... */),
-    " "
-  );
-
-  using __offsets_storage_t = __partially_static_sizes<_IndexT, size_t, _Offsets...>;
-  using __extents_storage_t = __partially_static_sizes<_IndexT, size_t, _Exts...>;
-  using __strides_storage_t = __partially_static_sizes<_IndexT, size_t, _Strides...>;
-  __offsets_storage_t __offsets;
-  __extents_storage_t __exts;
-  __strides_storage_t __strides;
-
-#ifdef __INTEL_COMPILER
-#if __INTEL_COMPILER <= 1800
-  MDSPAN_INLINE_FUNCTION constexpr __assign_op_slice_handler(__assign_op_slice_handler&& __other) noexcept
-    : __offsets(::std::move(__other.__offsets)), __exts(::std::move(__other.__exts)), __strides(::std::move(__other.__strides))
-  { }
-  MDSPAN_INLINE_FUNCTION constexpr __assign_op_slice_handler(
-    __offsets_storage_t&& __o,
-    __extents_storage_t&& __e,
-    __strides_storage_t&& __s
-  ) noexcept
-    : __offsets(::std::move(__o)), __exts(::std::move(__e)), __strides(::std::move(__s))
-  { }
-#endif
-#endif
-
-// Don't define this unless we need it; they have a cost to compile
-#ifndef _MDSPAN_USE_RETURN_TYPE_DEDUCTION
-  using __extents_type = ::std::experimental::extents<_IndexT, _Exts...>;
-#endif
-
-  // For size_t slice, skip the extent and stride, but add an offset corresponding to the value
-  template <size_t _OldStaticExtent, size_t _OldStaticStride>
-  MDSPAN_FORCE_INLINE_FUNCTION // NOLINT (misc-unconventional-assign-operator)
-  _MDSPAN_CONSTEXPR_14 auto
-  operator=(__slice_wrap<_OldStaticExtent, _OldStaticStride, size_t>&& __slice) noexcept
-    -> __assign_op_slice_handler<
-         _IndexT,
-         typename _PreserveLayoutAnalysis::encounter_scalar,
-         __partially_static_sizes<_IndexT, size_t, _Offsets..., dynamic_extent>,
-         __partially_static_sizes<_IndexT, size_t, _Exts...>,
-         __partially_static_sizes<_IndexT, size_t, _Strides...>/* intentional space here to work around ICC bug*/> {
-    return {
-      __partially_static_sizes<_IndexT, size_t, _Offsets..., dynamic_extent>(
-        __construct_psa_from_all_exts_values_tag,
-        __offsets.template __get_n<_OffsetIdxs>()..., __slice.slice),
-      ::std::move(__exts),
-      ::std::move(__strides)
-    };
-  }
-
-  // Treat integral_constant slice like size_t slice, but with a compile-time offset.
-  // The result's extents_type can't take advantage of that,
-  // but it might help for specialized layouts.
-  template <size_t _OldStaticExtent, size_t _OldStaticStride, class IntegerType, IntegerType Value0>
-  MDSPAN_FORCE_INLINE_FUNCTION // NOLINT (misc-unconventional-assign-operator)
-  _MDSPAN_CONSTEXPR_14 auto
-  operator=(__slice_wrap<_OldStaticExtent, _OldStaticStride, std::integral_constant<IntegerType, Value0>>&&) noexcept
-    -> __assign_op_slice_handler<
-         _IndexT,
-         typename _PreserveLayoutAnalysis::encounter_scalar,
-         __partially_static_sizes<_IndexT, size_t, _Offsets..., Value0>,
-         __partially_static_sizes<_IndexT, size_t, _Exts...>,
-         __partially_static_sizes<_IndexT, size_t, _Strides...>/* intentional space here to work around ICC bug*/> {
-#if MDSPAN_HAS_CXX_17
-    if constexpr (std::is_signed_v<IntegerType>) {
-      static_assert(Value0 >= IntegerType(0), "Invalid slice specifier");
-    }
-#endif // MDSPAN_HAS_CXX_17
-    return {
-      __partially_static_sizes<_IndexT, size_t, _Offsets..., Value0>(
-        __construct_psa_from_all_exts_values_tag,
-        __offsets.template __get_n<_OffsetIdxs>()..., size_t(Value0)),
-      ::std::move(__exts),
-      ::std::move(__strides)
-    };
-  }
-
-  // For a std::full_extent, offset 0 and old extent
-  template <size_t _OldStaticExtent, size_t _OldStaticStride>
-  MDSPAN_FORCE_INLINE_FUNCTION // NOLINT (misc-unconventional-assign-operator)
-  _MDSPAN_CONSTEXPR_14 auto
-  operator=(__slice_wrap<_OldStaticExtent, _OldStaticStride, full_extent_t>&& __slice) noexcept
-    -> __assign_op_slice_handler<
-         _IndexT,
-         typename _PreserveLayoutAnalysis::encounter_all,
-         __partially_static_sizes<_IndexT, size_t, _Offsets..., 0>,
-         __partially_static_sizes<_IndexT, size_t, _Exts..., _OldStaticExtent>,
-         __partially_static_sizes<_IndexT, size_t, _Strides..., _OldStaticStride>/* intentional space here to work around ICC bug*/> {
-    return {
-      __partially_static_sizes<_IndexT, size_t, _Offsets..., 0>(
-        __construct_psa_from_all_exts_values_tag,
-        __offsets.template __get_n<_OffsetIdxs>()..., size_t(0)),
-      __partially_static_sizes<_IndexT, size_t, _Exts..., _OldStaticExtent>(
-        __construct_psa_from_all_exts_values_tag,
-        __exts.template __get_n<_ExtIdxs>()..., __slice.old_extent),
-      __partially_static_sizes<_IndexT, size_t, _Strides..., _OldStaticStride>(
-        __construct_psa_from_all_exts_values_tag,
-        __strides.template __get_n<_StrideIdxs>()..., __slice.old_stride)
-    };
-  }
-
-  // For a std::tuple, add an offset and add a new dynamic extent (strides still preserved)
-  template <size_t _OldStaticExtent, size_t _OldStaticStride>
-  MDSPAN_FORCE_INLINE_FUNCTION // NOLINT (misc-unconventional-assign-operator)
-  _MDSPAN_CONSTEXPR_14 auto
-  operator=(__slice_wrap<_OldStaticExtent, _OldStaticStride, tuple<size_t, size_t>>&& __slice) noexcept
-    -> __assign_op_slice_handler<
-         _IndexT,
-         typename _PreserveLayoutAnalysis::encounter_pair,
-         __partially_static_sizes<_IndexT, size_t, _Offsets..., dynamic_extent>,
-         __partially_static_sizes<_IndexT, size_t, _Exts..., dynamic_extent>,
-         __partially_static_sizes<_IndexT, size_t, _Strides..., _OldStaticStride>/* intentional space here to work around ICC bug*/> {
-    return {
-      __partially_static_sizes<_IndexT, size_t, _Offsets..., dynamic_extent>(
-        __construct_psa_from_all_exts_values_tag,
-        __offsets.template __get_n<_OffsetIdxs>()..., ::std::get<0>(__slice.slice)),
-      __partially_static_sizes<_IndexT, size_t, _Exts..., dynamic_extent>(
-        __construct_psa_from_all_exts_values_tag,
-        __exts.template __get_n<_ExtIdxs>()..., ::std::get<1>(__slice.slice) - ::std::get<0>(__slice.slice)),
-      __partially_static_sizes<_IndexT, size_t, _Strides..., _OldStaticStride>(
-        __construct_psa_from_all_exts_values_tag,
-        __strides.template __get_n<_StrideIdxs>()..., __slice.old_stride)
-    };
-  }
-
-  // For a std::tuple of two std::integral_constant, do something like
-  // we did above for a tuple of two size_t, but make sure the
-  // result's extents type make the values compile-time constants.
-  template <size_t _OldStaticExtent, size_t _OldStaticStride,
-	    class IntegerType0, IntegerType0 Value0,
-	    class IntegerType1, IntegerType1 Value1>
-  MDSPAN_FORCE_INLINE_FUNCTION // NOLINT (misc-unconventional-assign-operator)
-  _MDSPAN_CONSTEXPR_14 auto
-  operator=(__slice_wrap<_OldStaticExtent, _OldStaticStride, tuple<std::integral_constant<IntegerType0, Value0>, std::integral_constant<IntegerType1, Value1>>>&& __slice) noexcept
-    -> __assign_op_slice_handler<
-         _IndexT,
-         typename _PreserveLayoutAnalysis::encounter_pair,
-         __partially_static_sizes<_IndexT, size_t, _Offsets..., size_t(Value0)>,
-         __partially_static_sizes<_IndexT, size_t, _Exts..., size_t(Value1 - Value0)>,
-         __partially_static_sizes<_IndexT, size_t, _Strides..., _OldStaticStride>/* intentional space here to work around ICC bug*/> {
-    static_assert(Value1 >= Value0, "Invalid slice specifier");
-    return {
-      // We're still turning the template parameters Value0 and Value1
-      // into (constexpr) run-time values here.
-      __partially_static_sizes<_IndexT, size_t, _Offsets..., size_t(Value0) > (
-        __construct_psa_from_all_exts_values_tag,
-        __offsets.template __get_n<_OffsetIdxs>()..., Value0),
-      __partially_static_sizes<_IndexT, size_t, _Exts..., size_t(Value1 - Value0) > (
-        __construct_psa_from_all_exts_values_tag,
-        __exts.template __get_n<_ExtIdxs>()..., Value1 - Value0),
-      __partially_static_sizes<_IndexT, size_t, _Strides..., _OldStaticStride>(
-        __construct_psa_from_all_exts_values_tag,
-        __strides.template __get_n<_StrideIdxs>()..., __slice.old_stride)
-    };
-  }
-
-   // TODO defer instantiation of this?
-  using layout_type = typename conditional<
-    _PreserveLayoutAnalysis::value,
-    typename _PreserveLayoutAnalysis::layout_type_if_preserved,
-    layout_stride
-  >::type;
-
-  // TODO noexcept specification
-  template <class NewLayout>
-  MDSPAN_INLINE_FUNCTION
-  _MDSPAN_DEDUCE_RETURN_TYPE_SINGLE_LINE(
-    (
-      _MDSPAN_CONSTEXPR_14 /* auto */
-      _make_layout_mapping_impl(NewLayout) noexcept
-    ),
-    (
-      /* not layout stride, so don't pass dynamic_strides */
-      /* return */ typename NewLayout::template mapping<::std::experimental::extents<_IndexT, _Exts...>>(
-        experimental::extents<_IndexT, _Exts...>::__make_extents_impl(::std::move(__exts))
-      ) /* ; */
-    )
-  )
-
-  MDSPAN_INLINE_FUNCTION
-  _MDSPAN_DEDUCE_RETURN_TYPE_SINGLE_LINE(
-    (
-      _MDSPAN_CONSTEXPR_14 /* auto */
-      _make_layout_mapping_impl(layout_stride) noexcept
-    ),
-    (
-      /* return */ layout_stride::template mapping<::std::experimental::extents<_IndexT, _Exts...>>
-        ::__make_mapping(::std::move(__exts), ::std::move(__strides)) /* ; */
-    )
-  )
-
-  template <class OldLayoutMapping> // mostly for deferred instantiation, but maybe we'll use this in the future
-  MDSPAN_INLINE_FUNCTION
-  _MDSPAN_DEDUCE_RETURN_TYPE_SINGLE_LINE(
-    (
-      _MDSPAN_CONSTEXPR_14 /* auto */
-      make_layout_mapping(OldLayoutMapping const&) noexcept
-    ),
-    (
-      /* return */ this->_make_layout_mapping_impl(layout_type{}) /* ; */
-    )
-  )
-};
-
-//==============================================================================
-
-#if _MDSPAN_USE_RETURN_TYPE_DEDUCTION
-// Forking this because the C++11 version will be *completely* unreadable
-template <class ET, class ST, size_t... Exts, class LP, class AP, class... SliceSpecs, size_t... Idxs>
-MDSPAN_INLINE_FUNCTION
-constexpr auto _submdspan_impl(
-  integer_sequence<size_t, Idxs...>,
-  mdspan<ET, std::experimental::extents<ST, Exts...>, LP, AP> const& src,
-  SliceSpecs&&... slices
-) noexcept
-{
-  using _IndexT = ST;
-  auto _handled =
-    _MDSPAN_FOLD_ASSIGN_LEFT(
-      (
-        detail::__assign_op_slice_handler<
-          _IndexT,
-          detail::preserve_layout_analysis<LP>
-        >{
-          __partially_static_sizes<_IndexT, size_t>{},
-          __partially_static_sizes<_IndexT, size_t>{},
-          __partially_static_sizes<_IndexT, size_t>{}
-        }
-      ),
-        /* = ... = */
-      detail::__wrap_slice<
-        Exts, dynamic_extent
-      >(
-        slices, src.extents().template __extent<Idxs>(),
-        src.mapping().stride(Idxs)
-      )
-    );
-
-  size_t offset_size = src.mapping()(_handled.__offsets.template __get_n<Idxs>()...);
-  auto offset_ptr = src.accessor().offset(src.data_handle(), offset_size);
-  auto map = _handled.make_layout_mapping(src.mapping());
-  auto acc_pol = typename AP::offset_policy(src.accessor());
-  return mdspan<
-    ET, remove_const_t<remove_reference_t<decltype(map.extents())>>,
-        typename decltype(_handled)::layout_type, remove_const_t<remove_reference_t<decltype(acc_pol)>>
-  >(
-    std::move(offset_ptr), std::move(map), std::move(acc_pol)
-  );
-}
-#else
-
-template <class ET, class AP, class Src, class Handled, size_t... Idxs>
-auto _submdspan_impl_helper(Src&& src, Handled&& h, std::integer_sequence<size_t, Idxs...>)
-  -> mdspan<
-       ET, typename Handled::__extents_type, typename Handled::layout_type, typename AP::offset_policy
-     >
-{
-  return {
-    src.accessor().offset(src.data_handle(), src.mapping()(h.__offsets.template __get_n<Idxs>()...)),
-    h.make_layout_mapping(src.mapping()),
-    typename AP::offset_policy(src.accessor())
-  };
-}
-
-template <class ET, class ST, size_t... Exts, class LP, class AP, class... SliceSpecs, size_t... Idxs>
-MDSPAN_INLINE_FUNCTION
-_MDSPAN_DEDUCE_RETURN_TYPE_SINGLE_LINE(
-  (
-    constexpr /* auto */ _submdspan_impl(
-      std::integer_sequence<size_t, Idxs...> seq,
-      mdspan<ET, std::experimental::extents<ST, Exts...>, LP, AP> const& src,
-      SliceSpecs&&... slices
-    ) noexcept
-  ),
-  (
-    /* return */ _submdspan_impl_helper<ET, AP>(
-      src,
-      _MDSPAN_FOLD_ASSIGN_LEFT(
-        (
-          detail::__assign_op_slice_handler<
-            size_t,
-            detail::preserve_layout_analysis<LP>
-          >{
-            __partially_static_sizes<ST, size_t>{},
-            __partially_static_sizes<ST, size_t>{},
-            __partially_static_sizes<ST, size_t>{}
-          }
-        ),
-        /* = ... = */
-        detail::__wrap_slice<
-          Exts, dynamic_extent
-        >(
-          slices, src.extents().template __extent<Idxs>(), src.mapping().stride(Idxs)
-        )
-      ),
-      seq
-    ) /* ; */
-  )
-)
-
-#endif
-
-template <class T> struct _is_layout_stride : std::false_type { };
-template<>
-struct _is_layout_stride<
-  layout_stride
-> : std::true_type
-{ };
-
-} // namespace detail
-
-//==============================================================================
-
-MDSPAN_TEMPLATE_REQUIRES(
-  class ET, class EXT, class LP, class AP, class... SliceSpecs,
-  /* requires */ (
-    (
-      _MDSPAN_TRAIT(is_same, LP, layout_left)
-        || _MDSPAN_TRAIT(is_same, LP, layout_right)
-        || detail::_is_layout_stride<LP>::value
-    ) &&
-    _MDSPAN_FOLD_AND((
-      _MDSPAN_TRAIT(is_convertible, SliceSpecs, size_t)
-        || _MDSPAN_TRAIT(is_convertible, SliceSpecs, tuple<size_t, size_t>)
-        || _MDSPAN_TRAIT(is_convertible, SliceSpecs, full_extent_t)
-    ) /* && ... */) &&
-    sizeof...(SliceSpecs) == EXT::rank()
-  )
-)
-MDSPAN_INLINE_FUNCTION
-_MDSPAN_DEDUCE_RETURN_TYPE_SINGLE_LINE(
-  (
-    constexpr submdspan(
-      mdspan<ET, EXT, LP, AP> const& src, SliceSpecs... slices
-    ) noexcept
-  ),
-  (
-    /* return */
-      detail::_submdspan_impl(std::make_index_sequence<sizeof...(SliceSpecs)>{}, src, slices...) /*;*/
-  )
-)
-/* clang-format: on */
-
-} // end namespace experimental
-} // namespace std
diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/trait_backports.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/trait_backports.hpp
index a94e1a1e031d8248965493c95eba798123aad3e0..4933dd9934ea0a967ff46b5afc303a1431df6748 100644
--- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/trait_backports.hpp
+++ b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/trait_backports.hpp
@@ -1,46 +1,18 @@
-/*
 //@HEADER
 // ************************************************************************
 //
-//                        Kokkos v. 2.0
-//              Copyright (2019) Sandia Corporation
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
 //
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// Under the terms of Contract DE-NA0003525 with NTESS,
 // the U.S. Government retains certain rights in this software.
 //
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
 //@HEADER
-*/
-
 #ifndef MDSPAN_INCLUDE_EXPERIMENTAL_BITS_TRAIT_BACKPORTS_HPP_
 #define MDSPAN_INCLUDE_EXPERIMENTAL_BITS_TRAIT_BACKPORTS_HPP_
 
@@ -56,7 +28,7 @@
 #ifdef _MDSPAN_NEEDS_TRAIT_VARIABLE_TEMPLATE_BACKPORTS
 
 #if _MDSPAN_USE_VARIABLE_TEMPLATES
-namespace std {
+namespace MDSPAN_IMPL_STANDARD_NAMESPACE {
 
 #define _MDSPAN_BACKPORT_TRAIT(TRAIT) \
   template <class... Args> _MDSPAN_INLINE_VARIABLE constexpr auto TRAIT##_v = TRAIT<Args...>::value;
@@ -72,7 +44,7 @@ _MDSPAN_BACKPORT_TRAIT(is_void)
 
 #undef _MDSPAN_BACKPORT_TRAIT
 
-} // end namespace std
+} // end namespace MDSPAN_IMPL_STANDARD_NAMESPACE
 
 #endif // _MDSPAN_USE_VARIABLE_TEMPLATES
 
@@ -86,16 +58,16 @@ _MDSPAN_BACKPORT_TRAIT(is_void)
 
 #if !defined(_MDSPAN_USE_INTEGER_SEQUENCE) || !_MDSPAN_USE_INTEGER_SEQUENCE
 
-namespace std {
+namespace MDSPAN_IMPL_STANDARD_NAMESPACE {
 
 template <class T, T... Vals>
 struct integer_sequence {
-  static constexpr std::size_t size() noexcept { return sizeof...(Vals); }
+  static constexpr size_t size() noexcept { return sizeof...(Vals); }
   using value_type = T;
 };
 
-template <std::size_t... Vals>
-using index_sequence = std::integer_sequence<std::size_t, Vals...>;
+template <size_t... Vals>
+using index_sequence = std::integer_sequence<size_t, Vals...>;
 
 namespace __detail {
 
@@ -119,13 +91,13 @@ struct __make_int_seq_impl<
 template <class T, T N>
 using make_integer_sequence = typename __detail::__make_int_seq_impl<T, N, 0, integer_sequence<T>>::type;
 
-template <std::size_t N>
+template <size_t N>
 using make_index_sequence = typename __detail::__make_int_seq_impl<size_t, N, 0, integer_sequence<size_t>>::type;
 
 template <class... T>
 using index_sequence_for = make_index_sequence<sizeof...(T)>;
 
-} // end namespace std
+} // end namespace MDSPAN_IMPL_STANDARD_NAMESPACE
 
 #endif
 
@@ -137,7 +109,7 @@ using index_sequence_for = make_index_sequence<sizeof...(T)>;
 
 #if !defined(_MDSPAN_USE_STANDARD_TRAIT_ALIASES) || !_MDSPAN_USE_STANDARD_TRAIT_ALIASES
 
-namespace std {
+namespace MDSPAN_IMPL_STANDARD_NAMESPACE {
 
 #define _MDSPAN_BACKPORT_TRAIT_ALIAS(TRAIT) \
   template <class... Args> using TRAIT##_t = typename TRAIT<Args...>::type;
@@ -150,7 +122,7 @@ using enable_if_t = typename enable_if<_B, _T>::type;
 
 #undef _MDSPAN_BACKPORT_TRAIT_ALIAS
 
-} // end namespace std
+} // end namespace MDSPAN_IMPL_STANDARD_NAMESPACE
 
 #endif
 
diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/type_list.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/type_list.hpp
index 7de72e65374cb7efda519bd0cfc8e9c99cebad62..deca7c15d095857a805fa00cbd1947ce8c434d65 100644
--- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/type_list.hpp
+++ b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/type_list.hpp
@@ -1,52 +1,23 @@
-/*
 //@HEADER
 // ************************************************************************
 //
-//                        Kokkos v. 2.0
-//              Copyright (2019) Sandia Corporation
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
 //
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// Under the terms of Contract DE-NA0003525 with NTESS,
 // the U.S. Government retains certain rights in this software.
 //
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
 //@HEADER
-*/
-
 #include "macros.hpp"
 
 #include "trait_backports.hpp" // make_index_sequence
 
-namespace std {
-namespace experimental {
+namespace MDSPAN_IMPL_STANDARD_NAMESPACE {
 
 //==============================================================================
 
@@ -56,7 +27,7 @@ template <class... _Ts> struct __type_list { static constexpr auto __size = size
 
 // Implementation of type_list at() that's heavily optimized for small typelists
 template <size_t, class> struct __type_at;
-template <size_t, class _Seq, class=make_index_sequence<_Seq::__size>> struct __type_at_large_impl;
+template <size_t, class _Seq, class=std::make_index_sequence<_Seq::__size>> struct __type_at_large_impl;
 
 template <size_t _I, size_t _Idx, class _T>
 struct __type_at_entry { };
@@ -76,7 +47,7 @@ struct __type_at_assign_op_impl {
 };
 
 template <size_t _I, class... _Ts, size_t... _Idxs>
-struct __type_at_large_impl<_I, __type_list<_Ts...>, integer_sequence<size_t, _Idxs...>>
+struct __type_at_large_impl<_I, __type_list<_Ts...>, std::integer_sequence<size_t, _Idxs...>>
   : decltype(
       _MDSPAN_FOLD_ASSIGN_LEFT(__type_at_assign_op_impl{}, /* = ... = */ __type_at_entry<_I, _Idxs, _Ts>{})
     )
@@ -112,6 +83,5 @@ struct __type_at<3, __type_list<_T0, _T1, _T2, _T3, _Ts...>> {
 
 //==============================================================================
 
-} // end namespace experimental
-} // end namespace std
+} // end namespace MDSPAN_IMPL_STANDARD_NAMESPACE
 
diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p1684_bits/mdarray.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p1684_bits/mdarray.hpp
index b27a25c9fe4c407a1a7c0ce6750c8a2a4310c931..3950273a83dc114809a4f44151150ed4b3430fbb 100644
--- a/packages/kokkos/tpls/mdspan/include/experimental/__p1684_bits/mdarray.hpp
+++ b/packages/kokkos/tpls/mdspan/include/experimental/__p1684_bits/mdarray.hpp
@@ -1,46 +1,18 @@
-/*
 //@HEADER
 // ************************************************************************
 //
-//                        Kokkos v. 2.0
-//              Copyright (2019) Sandia Corporation
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
 //
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// Under the terms of Contract DE-NA0003525 with NTESS,
 // the U.S. Government retains certain rights in this software.
 //
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
 //@HEADER
-*/
-
 
 #pragma once
 
@@ -48,8 +20,8 @@
 #include <cassert>
 #include <vector>
 
-namespace std {
-namespace experimental {
+namespace MDSPAN_IMPL_STANDARD_NAMESPACE {
+namespace MDSPAN_IMPL_PROPOSED_NAMESPACE {
 
 namespace {
   template<class Extents>
@@ -68,14 +40,14 @@ namespace {
 
 namespace {
   template<class C>
-  struct container_is_array : false_type {
+  struct container_is_array :  std::false_type {
     template<class M>
     static constexpr C construct(const M& m) { return C(m.required_span_size()); }
   };
   template<class T, size_t N>
-  struct container_is_array<array<T,N>> : true_type {
+  struct container_is_array<std::array<T,N>> : std::true_type {
     template<class M>
-    static constexpr array<T,N> construct(const M&) { return array<T,N>(); }
+    static constexpr std::array<T,N> construct(const M&) { return std::array<T,N>(); }
   };
 }
 
@@ -83,12 +55,12 @@ template <
   class ElementType,
   class Extents,
   class LayoutPolicy = layout_right,
-  class Container = vector<ElementType>
+  class Container = std::vector<ElementType>
 >
 class mdarray {
 private:
-  static_assert(detail::__is_extents_v<Extents>, "std::experimental::mdspan's Extents template parameter must be a specialization of std::experimental::extents.");
-
+  static_assert(::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::__is_extents_v<Extents>,
+                MDSPAN_IMPL_PROPOSED_NAMESPACE_STRING "::mdspan's Extents template parameter must be a specialization of " MDSPAN_IMPL_STANDARD_NAMESPACE_STRING "::extents.");
 
 public:
 
@@ -100,8 +72,12 @@ public:
   using container_type = Container;
   using mapping_type = typename layout_type::template mapping<extents_type>;
   using element_type = ElementType;
-  using value_type = remove_cv_t<element_type>;
+  using mdspan_type = mdspan<element_type, extents_type, layout_type>;
+  using const_mdspan_type = mdspan<const element_type, extents_type, layout_type>;
+  using value_type = std::remove_cv_t<element_type>;
   using index_type = typename Extents::index_type;
+  using size_type = typename Extents::size_type;
+  using rank_type = typename Extents::rank_type;
   using pointer = typename container_type::pointer;
   using reference = typename container_type::reference;
   using const_pointer = typename container_type::const_pointer;
@@ -127,10 +103,10 @@ public:
   MDSPAN_TEMPLATE_REQUIRES(
     class... SizeTypes,
     /* requires */ (
-      _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(is_convertible, SizeTypes, index_type) /* && ... */) &&
-      _MDSPAN_TRAIT(is_constructible, extents_type, SizeTypes...) &&
-      _MDSPAN_TRAIT(is_constructible, mapping_type, extents_type) &&
-      (_MDSPAN_TRAIT(is_constructible, container_type, size_t) ||
+      _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) &&
+      _MDSPAN_TRAIT( std::is_constructible, extents_type, SizeTypes...) &&
+      _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type) &&
+      (_MDSPAN_TRAIT( std::is_constructible, container_type, size_t) ||
        container_is_array<container_type>::value) &&
       (extents_type::rank()>0 || extents_type::rank_dynamic()==0)
     )
@@ -143,16 +119,16 @@ public:
   MDSPAN_FUNCTION_REQUIRES(
     (MDSPAN_INLINE_FUNCTION constexpr),
     mdarray, (const extents_type& exts), ,
-    /* requires */ ((_MDSPAN_TRAIT(is_constructible, container_type, size_t) ||
+    /* requires */ ((_MDSPAN_TRAIT( std::is_constructible, container_type, size_t) ||
                      container_is_array<container_type>::value) &&
-                    _MDSPAN_TRAIT(is_constructible, mapping_type, extents_type))
+                    _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type))
   ) : map_(exts), ctr_(container_is_array<container_type>::construct(map_))
   { }
 
   MDSPAN_FUNCTION_REQUIRES(
     (MDSPAN_INLINE_FUNCTION constexpr),
     mdarray, (const mapping_type& m), ,
-    /* requires */ (_MDSPAN_TRAIT(is_constructible, container_type, size_t) ||
+    /* requires */ (_MDSPAN_TRAIT( std::is_constructible, container_type, size_t) ||
                     container_is_array<container_type>::value)
   ) : map_(m), ctr_(container_is_array<container_type>::construct(map_))
   { }
@@ -161,9 +137,9 @@ public:
   MDSPAN_TEMPLATE_REQUIRES(
     class... SizeTypes,
     /* requires */ (
-      _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(is_convertible, SizeTypes, index_type) /* && ... */) &&
-      _MDSPAN_TRAIT(is_constructible, extents_type, SizeTypes...) &&
-      _MDSPAN_TRAIT(is_constructible, mapping_type, extents_type)
+      _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) &&
+      _MDSPAN_TRAIT( std::is_constructible, extents_type, SizeTypes...) &&
+      _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type)
     )
   )
   MDSPAN_INLINE_FUNCTION
@@ -175,7 +151,7 @@ public:
   MDSPAN_FUNCTION_REQUIRES(
     (MDSPAN_INLINE_FUNCTION constexpr),
     mdarray, (const container_type& ctr, const extents_type& exts), ,
-    /* requires */ (_MDSPAN_TRAIT(is_constructible, mapping_type, extents_type))
+    /* requires */ (_MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type))
   ) : map_(exts), ctr_(ctr)
   { assert(ctr.size() >= static_cast<size_t>(map_.required_span_size())); }
 
@@ -188,9 +164,9 @@ public:
   MDSPAN_TEMPLATE_REQUIRES(
     class... SizeTypes,
     /* requires */ (
-      _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(is_convertible, SizeTypes, index_type) /* && ... */) &&
-      _MDSPAN_TRAIT(is_constructible, extents_type, SizeTypes...) &&
-      _MDSPAN_TRAIT(is_constructible, mapping_type, extents_type)
+      _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) &&
+      _MDSPAN_TRAIT( std::is_constructible, extents_type, SizeTypes...) &&
+      _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type)
     )
   )
   MDSPAN_INLINE_FUNCTION
@@ -202,7 +178,7 @@ public:
   MDSPAN_FUNCTION_REQUIRES(
     (MDSPAN_INLINE_FUNCTION constexpr),
     mdarray, (container_type&& ctr, const extents_type& exts), ,
-    /* requires */ (_MDSPAN_TRAIT(is_constructible, mapping_type, extents_type))
+    /* requires */ (_MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type))
   ) : map_(exts), ctr_(std::move(ctr))
   { assert(ctr_.size() >= static_cast<size_t>(map_.required_span_size())); }
 
@@ -215,22 +191,22 @@ public:
   MDSPAN_TEMPLATE_REQUIRES(
     class OtherElementType, class OtherExtents, class OtherLayoutPolicy, class OtherContainer,
     /* requires */ (
-      _MDSPAN_TRAIT(is_constructible, mapping_type, typename OtherLayoutPolicy::template mapping<OtherExtents>) &&
-      _MDSPAN_TRAIT(is_constructible, container_type, OtherContainer)
+      _MDSPAN_TRAIT( std::is_constructible, mapping_type, typename OtherLayoutPolicy::template mapping<OtherExtents>) &&
+      _MDSPAN_TRAIT( std::is_constructible, container_type, OtherContainer)
     )
   )
   MDSPAN_INLINE_FUNCTION
   constexpr mdarray(const mdarray<OtherElementType, OtherExtents, OtherLayoutPolicy, OtherContainer>& other)
     : map_(other.mapping()), ctr_(other.container())
   {
-    static_assert(is_constructible<extents_type, OtherExtents>::value, "");
+    static_assert( std::is_constructible<extents_type, OtherExtents>::value, "");
   }
 
   // Constructors for container types constructible from a size and allocator
   MDSPAN_TEMPLATE_REQUIRES(
     class Alloc,
-    /* requires */ (_MDSPAN_TRAIT(is_constructible, container_type, size_t, Alloc) &&
-                    _MDSPAN_TRAIT(is_constructible, mapping_type, extents_type))
+    /* requires */ (_MDSPAN_TRAIT( std::is_constructible, container_type, size_t, Alloc) &&
+                    _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type))
   )
   MDSPAN_INLINE_FUNCTION
   constexpr mdarray(const extents_type& exts, const Alloc& a)
@@ -239,7 +215,7 @@ public:
 
   MDSPAN_TEMPLATE_REQUIRES(
     class Alloc,
-    /* requires */ (_MDSPAN_TRAIT(is_constructible, container_type, size_t, Alloc))
+    /* requires */ (_MDSPAN_TRAIT( std::is_constructible, container_type, size_t, Alloc))
   )
   MDSPAN_INLINE_FUNCTION
   constexpr mdarray(const mapping_type& map, const Alloc& a)
@@ -249,8 +225,8 @@ public:
   // Constructors for container types constructible from a container and allocator
   MDSPAN_TEMPLATE_REQUIRES(
     class Alloc,
-    /* requires */ (_MDSPAN_TRAIT(is_constructible, container_type, container_type, Alloc) &&
-                    _MDSPAN_TRAIT(is_constructible, mapping_type, extents_type))
+    /* requires */ (_MDSPAN_TRAIT( std::is_constructible, container_type, container_type, Alloc) &&
+                    _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type))
   )
   MDSPAN_INLINE_FUNCTION
   constexpr mdarray(const container_type& ctr, const extents_type& exts, const Alloc& a)
@@ -259,7 +235,7 @@ public:
 
   MDSPAN_TEMPLATE_REQUIRES(
     class Alloc,
-    /* requires */ (_MDSPAN_TRAIT(is_constructible, container_type, size_t, Alloc))
+    /* requires */ (_MDSPAN_TRAIT( std::is_constructible, container_type, size_t, Alloc))
   )
   MDSPAN_INLINE_FUNCTION
   constexpr mdarray(const container_type& ctr, const mapping_type& map, const Alloc& a)
@@ -268,8 +244,8 @@ public:
 
   MDSPAN_TEMPLATE_REQUIRES(
     class Alloc,
-    /* requires */ (_MDSPAN_TRAIT(is_constructible, container_type, container_type, Alloc) &&
-                    _MDSPAN_TRAIT(is_constructible, mapping_type, extents_type))
+    /* requires */ (_MDSPAN_TRAIT( std::is_constructible, container_type, container_type, Alloc) &&
+                    _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type))
   )
   MDSPAN_INLINE_FUNCTION
   constexpr mdarray(container_type&& ctr, const extents_type& exts, const Alloc& a)
@@ -278,7 +254,7 @@ public:
 
   MDSPAN_TEMPLATE_REQUIRES(
     class Alloc,
-    /* requires */ (_MDSPAN_TRAIT(is_constructible, container_type, size_t, Alloc))
+    /* requires */ (_MDSPAN_TRAIT( std::is_constructible, container_type, size_t, Alloc))
   )
   MDSPAN_INLINE_FUNCTION
   constexpr mdarray(container_type&& ctr, const mapping_type& map, const Alloc& a)
@@ -288,17 +264,19 @@ public:
   MDSPAN_TEMPLATE_REQUIRES(
     class OtherElementType, class OtherExtents, class OtherLayoutPolicy, class OtherContainer, class Alloc,
     /* requires */ (
-      _MDSPAN_TRAIT(is_constructible, mapping_type, typename OtherLayoutPolicy::template mapping<OtherExtents>) &&
-      _MDSPAN_TRAIT(is_constructible, container_type, OtherContainer, Alloc)
+      _MDSPAN_TRAIT( std::is_constructible, mapping_type, typename OtherLayoutPolicy::template mapping<OtherExtents>) &&
+      _MDSPAN_TRAIT( std::is_constructible, container_type, OtherContainer, Alloc)
     )
   )
   MDSPAN_INLINE_FUNCTION
   constexpr mdarray(const mdarray<OtherElementType, OtherExtents, OtherLayoutPolicy, OtherContainer>& other, const Alloc& a)
     : map_(other.mapping()), ctr_(other.container(), a)
   {
-    static_assert(is_constructible<extents_type, OtherExtents>::value, "");
+    static_assert( std::is_constructible<extents_type, OtherExtents>::value, "");
   }
 
+  MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mdarray& operator= (const mdarray&) = default;
+  MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mdarray& operator= (mdarray&&) = default;
   MDSPAN_INLINE_FUNCTION_DEFAULTED
   ~mdarray() = default;
 
@@ -309,27 +287,27 @@ public:
   MDSPAN_TEMPLATE_REQUIRES(
     class... SizeTypes,
     /* requires */ (
-      _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(is_convertible, SizeTypes, index_type) /* && ... */) &&
+      _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) &&
       extents_type::rank() == sizeof...(SizeTypes)
     )
   )
   MDSPAN_FORCE_INLINE_FUNCTION
   constexpr const_reference operator[](SizeTypes... indices) const noexcept
   {
-    return ctr_[map_(index_type(indices)...)];
+    return ctr_[map_(static_cast<index_type>(std::move(indices))...)];
   }
 
   MDSPAN_TEMPLATE_REQUIRES(
     class... SizeTypes,
     /* requires */ (
-      _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(is_convertible, SizeTypes, index_type) /* && ... */) &&
+      _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) &&
       extents_type::rank() == sizeof...(SizeTypes)
     )
   )
   MDSPAN_FORCE_INLINE_FUNCTION
   constexpr reference operator[](SizeTypes... indices) noexcept
   {
-    return ctr_[map_(index_type(indices)...)];
+    return ctr_[map_(static_cast<index_type>(std::move(indices))...)];
   }
   #endif
 
@@ -337,12 +315,12 @@ public:
   MDSPAN_TEMPLATE_REQUIRES(
     class SizeType, size_t N,
     /* requires */ (
-      _MDSPAN_TRAIT(is_convertible, SizeType, index_type) &&
+      _MDSPAN_TRAIT( std::is_convertible, SizeType, index_type) &&
       N == extents_type::rank()
     )
   )
   MDSPAN_FORCE_INLINE_FUNCTION
-  constexpr const_reference operator[](const array<SizeType, N>& indices) const noexcept
+  constexpr const_reference operator[](const std::array<SizeType, N>& indices) const noexcept
   {
     return __impl::template __callop<reference>(*this, indices);
   }
@@ -350,12 +328,12 @@ public:
   MDSPAN_TEMPLATE_REQUIRES(
     class SizeType, size_t N,
     /* requires */ (
-      _MDSPAN_TRAIT(is_convertible, SizeType, index_type) &&
+      _MDSPAN_TRAIT( std::is_convertible, SizeType, index_type) &&
       N == extents_type::rank()
     )
   )
   MDSPAN_FORCE_INLINE_FUNCTION
-  constexpr reference operator[](const array<SizeType, N>& indices) noexcept
+  constexpr reference operator[](const std::array<SizeType, N>& indices) noexcept
   {
     return __impl::template __callop<reference>(*this, indices);
   }
@@ -366,38 +344,38 @@ public:
   MDSPAN_TEMPLATE_REQUIRES(
     class... SizeTypes,
     /* requires */ (
-      _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(is_convertible, SizeTypes, index_type) /* && ... */) &&
+      _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) &&
       extents_type::rank() == sizeof...(SizeTypes)
     )
   )
   MDSPAN_FORCE_INLINE_FUNCTION
   constexpr const_reference operator()(SizeTypes... indices) const noexcept
   {
-    return ctr_[map_(index_type(indices)...)];
+    return ctr_[map_(static_cast<index_type>(std::move(indices))...)];
   }
   MDSPAN_TEMPLATE_REQUIRES(
     class... SizeTypes,
     /* requires */ (
-      _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(is_convertible, SizeTypes, index_type) /* && ... */) &&
+      _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) &&
       extents_type::rank() == sizeof...(SizeTypes)
     )
   )
   MDSPAN_FORCE_INLINE_FUNCTION
   constexpr reference operator()(SizeTypes... indices) noexcept
   {
-    return ctr_[map_(index_type(indices)...)];
+    return ctr_[map_(static_cast<index_type>(std::move(indices))...)];
   }
 
 #if 0
   MDSPAN_TEMPLATE_REQUIRES(
     class SizeType, size_t N,
     /* requires */ (
-      _MDSPAN_TRAIT(is_convertible, SizeType, index_type) &&
+      _MDSPAN_TRAIT( std::is_convertible, SizeType, index_type) &&
       N == extents_type::rank()
     )
   )
   MDSPAN_FORCE_INLINE_FUNCTION
-  constexpr const_reference operator()(const array<SizeType, N>& indices) const noexcept
+  constexpr const_reference operator()(const std::array<SizeType, N>& indices) const noexcept
   {
     return __impl::template __callop<reference>(*this, indices);
   }
@@ -405,12 +383,12 @@ public:
   MDSPAN_TEMPLATE_REQUIRES(
     class SizeType, size_t N,
     /* requires */ (
-      _MDSPAN_TRAIT(is_convertible, SizeType, index_type) &&
+      _MDSPAN_TRAIT( std::is_convertible, SizeType, index_type) &&
       N == extents_type::rank()
     )
   )
   MDSPAN_FORCE_INLINE_FUNCTION
-  constexpr reference operator()(const array<SizeType, N>& indices) noexcept
+  constexpr reference operator()(const std::array<SizeType, N>& indices) noexcept
   {
     return __impl::template __callop<reference>(*this, indices);
   }
@@ -425,11 +403,11 @@ public:
   //--------------------------------------------------------------------------------
   // [mdspan.basic.domobs], mdspan observers of the domain multidimensional index space
 
-  MDSPAN_INLINE_FUNCTION static constexpr size_t rank() noexcept { return extents_type::rank(); }
-  MDSPAN_INLINE_FUNCTION static constexpr size_t rank_dynamic() noexcept { return extents_type::rank_dynamic(); }
-  MDSPAN_INLINE_FUNCTION static constexpr index_type static_extent(size_t r) noexcept { return extents_type::static_extent(r); }
+  MDSPAN_INLINE_FUNCTION static constexpr rank_type rank() noexcept { return extents_type::rank(); }
+  MDSPAN_INLINE_FUNCTION static constexpr rank_type rank_dynamic() noexcept { return extents_type::rank_dynamic(); }
+  MDSPAN_INLINE_FUNCTION static constexpr size_t static_extent(size_t r) noexcept { return extents_type::static_extent(r); }
 
-  MDSPAN_INLINE_FUNCTION constexpr extents_type extents() const noexcept { return map_.extents(); };
+  MDSPAN_INLINE_FUNCTION constexpr const extents_type& extents() const noexcept { return map_.extents(); };
   MDSPAN_INLINE_FUNCTION constexpr index_type extent(size_t r) const noexcept { return map_.extents().extent(r); };
   MDSPAN_INLINE_FUNCTION constexpr index_type size() const noexcept {
 //    return __impl::__size(*this);
@@ -444,12 +422,61 @@ public:
   MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { return mapping_type::is_always_exhaustive(); };
   MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { return mapping_type::is_always_strided(); };
 
-  MDSPAN_INLINE_FUNCTION constexpr mapping_type mapping() const noexcept { return map_; };
+  MDSPAN_INLINE_FUNCTION constexpr const mapping_type& mapping() const noexcept { return map_; };
   MDSPAN_INLINE_FUNCTION constexpr bool is_unique() const noexcept { return map_.is_unique(); };
   MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const noexcept { return map_.is_exhaustive(); };
   MDSPAN_INLINE_FUNCTION constexpr bool is_strided() const noexcept { return map_.is_strided(); };
   MDSPAN_INLINE_FUNCTION constexpr index_type stride(size_t r) const { return map_.stride(r); };
 
+  // Converstion to mdspan
+  MDSPAN_TEMPLATE_REQUIRES(
+    class OtherElementType, class OtherExtents,
+    class OtherLayoutType, class OtherAccessorType,
+    /* requires */ (
+      _MDSPAN_TRAIT(std::is_assignable, mdspan_type,
+                       mdspan<OtherElementType, OtherExtents, OtherLayoutType, OtherAccessorType>)
+    )
+  )
+  constexpr operator mdspan<OtherElementType, OtherExtents, OtherLayoutType, OtherAccessorType> () {
+    return mdspan_type(data(), map_);
+  }
+
+  MDSPAN_TEMPLATE_REQUIRES(
+    class OtherElementType, class OtherExtents,
+    class OtherLayoutType, class OtherAccessorType,
+    /* requires */ (
+      _MDSPAN_TRAIT(std::is_assignable, const_mdspan_type,
+                      mdspan<OtherElementType, OtherExtents, OtherLayoutType, OtherAccessorType>)
+    )
+  )
+  constexpr operator mdspan<OtherElementType, OtherExtents, OtherLayoutType, OtherAccessorType> () const {
+    return const_mdspan_type(data(), map_);
+  }
+
+  MDSPAN_TEMPLATE_REQUIRES(
+    class OtherAccessorType = default_accessor<element_type>,
+    /* requires */ (
+      _MDSPAN_TRAIT(std::is_assignable, mdspan_type,
+                      mdspan<element_type, extents_type, layout_type, OtherAccessorType>)
+    )
+  )
+  constexpr mdspan<element_type, extents_type, layout_type, OtherAccessorType>
+    to_mdspan(const OtherAccessorType& a = default_accessor<element_type>()) {
+      return mdspan<element_type, extents_type, layout_type, OtherAccessorType>(data(), map_, a);
+  }
+
+  MDSPAN_TEMPLATE_REQUIRES(
+    class OtherAccessorType = default_accessor<const element_type>,
+    /* requires */ (
+      _MDSPAN_TRAIT(std::is_assignable, const_mdspan_type,
+                      mdspan<const element_type, extents_type, layout_type, OtherAccessorType>)
+    )
+  )
+  constexpr mdspan<const element_type, extents_type, layout_type, OtherAccessorType>
+    to_mdspan(const OtherAccessorType& a = default_accessor<const element_type>()) const {
+      return mdspan<const element_type, extents_type, layout_type, OtherAccessorType>(data(), map_, a);
+  }
+
 private:
   mapping_type map_;
   container_type ctr_;
@@ -459,5 +486,5 @@ private:
 };
 
 
-} // end namespace experimental
-} // end namespace std
+} // end namespace MDSPAN_IMPL_PROPOSED_NAMESPACE
+} // end namespace MDSPAN_IMPL_STANDARD_NAMESPACE
diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/strided_slice.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/strided_slice.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..58f38620ba1a484e6cf1ea547232d02febd08bbd
--- /dev/null
+++ b/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/strided_slice.hpp
@@ -0,0 +1,49 @@
+
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#pragma once
+
+#include <type_traits>
+
+namespace MDSPAN_IMPL_STANDARD_NAMESPACE {
+namespace MDSPAN_IMPL_PROPOSED_NAMESPACE {
+
+namespace {
+  template<class T>
+  struct __mdspan_is_integral_constant: std::false_type {};
+
+  template<class T, T val>
+  struct __mdspan_is_integral_constant<std::integral_constant<T,val>>: std::true_type {};
+}
+// Slice Specifier allowing for strides and compile time extent
+template <class OffsetType, class ExtentType, class StrideType>
+struct strided_slice {
+  using offset_type = OffsetType;
+  using extent_type = ExtentType;
+  using stride_type = StrideType;
+
+  OffsetType offset;
+  ExtentType extent;
+  StrideType stride;
+
+  static_assert(std::is_integral_v<OffsetType> || __mdspan_is_integral_constant<OffsetType>::value);
+  static_assert(std::is_integral_v<ExtentType> || __mdspan_is_integral_constant<ExtentType>::value);
+  static_assert(std::is_integral_v<StrideType> || __mdspan_is_integral_constant<StrideType>::value);
+};
+
+} // MDSPAN_IMPL_PROPOSED_NAMESPACE
+} // MDSPAN_IMPL_STANDARD_NAMESPACE
diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b9672b7f9ac357834347b9df2ab25c63c5862acf
--- /dev/null
+++ b/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan.hpp
@@ -0,0 +1,42 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#pragma once
+
+#include "submdspan_extents.hpp"
+#include "submdspan_mapping.hpp"
+
+namespace MDSPAN_IMPL_STANDARD_NAMESPACE {
+namespace MDSPAN_IMPL_PROPOSED_NAMESPACE {
+template <class ElementType, class Extents, class LayoutPolicy,
+          class AccessorPolicy, class... SliceSpecifiers>
+MDSPAN_INLINE_FUNCTION
+constexpr auto
+submdspan(const mdspan<ElementType, Extents, LayoutPolicy, AccessorPolicy> &src,
+          SliceSpecifiers... slices) {
+  const auto sub_mapping_offset = submdspan_mapping(src.mapping(), slices...);
+  // NVCC has a problem with the deduction so lets figure out the type
+  using sub_mapping_t = std::remove_cv_t<decltype(sub_mapping_offset.mapping)>;
+  using sub_extents_t = typename sub_mapping_t::extents_type;
+  using sub_layout_t = typename sub_mapping_t::layout_type;
+  using sub_accessor_t = typename AccessorPolicy::offset_policy;
+  return mdspan<ElementType, sub_extents_t, sub_layout_t, sub_accessor_t>(
+      src.accessor().offset(src.data_handle(), sub_mapping_offset.offset),
+      sub_mapping_offset.mapping,
+      sub_accessor_t(src.accessor()));
+}
+} // namespace MDSPAN_IMPL_PROPOSED_NAMESPACE
+} // namespace MDSPAN_IMPL_STANDARD_NAMESPACE
diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_extents.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_extents.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f56ce023f1652d7f01e982448faafb8d5a12c542
--- /dev/null
+++ b/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_extents.hpp
@@ -0,0 +1,323 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#pragma once
+
+#include <tuple>
+
+#include "strided_slice.hpp"
+namespace MDSPAN_IMPL_STANDARD_NAMESPACE {
+namespace MDSPAN_IMPL_PROPOSED_NAMESPACE {
+namespace detail {
+
+// Mapping from submapping ranks to srcmapping ranks
+// InvMapRank is an index_sequence, which we build recursively
+// to contain the mapped indices.
+// end of recursion specialization containing the final index_sequence
+template <size_t Counter, size_t... MapIdxs>
+MDSPAN_INLINE_FUNCTION
+constexpr auto inv_map_rank(std::integral_constant<size_t, Counter>, std::index_sequence<MapIdxs...>) {
+  return std::index_sequence<MapIdxs...>();
+}
+
+// specialization reducing rank by one (i.e., integral slice specifier)
+template<size_t Counter, class Slice, class... SliceSpecifiers, size_t... MapIdxs>
+MDSPAN_INLINE_FUNCTION
+constexpr auto inv_map_rank(std::integral_constant<size_t, Counter>, std::index_sequence<MapIdxs...>, Slice,
+                  SliceSpecifiers... slices) {
+  using next_idx_seq_t = std::conditional_t<std::is_convertible_v<Slice, size_t>,
+                                       std::index_sequence<MapIdxs...>,
+                                       std::index_sequence<MapIdxs..., Counter>>;
+
+  return inv_map_rank(std::integral_constant<size_t,Counter + 1>(), next_idx_seq_t(),
+                                     slices...);
+}
+
+// Helper for identifying strided_slice
+template <class T> struct is_strided_slice : std::false_type {};
+
+template <class OffsetType, class ExtentType, class StrideType>
+struct is_strided_slice<
+    strided_slice<OffsetType, ExtentType, StrideType>> : std::true_type {};
+
+// first_of(slice): getting begin of slice specifier range
+MDSPAN_TEMPLATE_REQUIRES(
+  class Integral,
+  /* requires */(std::is_convertible_v<Integral, size_t>)
+)
+MDSPAN_INLINE_FUNCTION
+constexpr Integral first_of(const Integral &i) {
+  return i;
+}
+
+MDSPAN_INLINE_FUNCTION
+constexpr std::integral_constant<size_t, 0>
+first_of(const ::MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent_t &) {
+  return std::integral_constant<size_t, 0>();
+}
+
+MDSPAN_TEMPLATE_REQUIRES(
+  class Slice,
+  /* requires */(std::is_convertible_v<Slice, std::tuple<size_t, size_t>>)
+)
+MDSPAN_INLINE_FUNCTION
+constexpr auto first_of(const Slice &i) {
+  return std::get<0>(i);
+}
+
+template <class OffsetType, class ExtentType, class StrideType>
+MDSPAN_INLINE_FUNCTION
+constexpr OffsetType
+first_of(const strided_slice<OffsetType, ExtentType, StrideType> &r) {
+  return r.offset;
+}
+
+// last_of(slice): getting end of slice specifier range
+// We need however not just the slice but also the extents
+// of the original view and which rank from the extents.
+// This is needed in the case of slice being full_extent_t.
+MDSPAN_TEMPLATE_REQUIRES(
+  size_t k, class Extents, class Integral,
+  /* requires */(std::is_convertible_v<Integral, size_t>)
+)
+MDSPAN_INLINE_FUNCTION
+constexpr Integral
+    last_of(std::integral_constant<size_t, k>, const Extents &, const Integral &i) {
+  return i;
+}
+
+MDSPAN_TEMPLATE_REQUIRES(
+  size_t k, class Extents, class Slice,
+  /* requires */(std::is_convertible_v<Slice, std::tuple<size_t, size_t>>)
+)
+MDSPAN_INLINE_FUNCTION
+constexpr auto last_of(std::integral_constant<size_t, k>, const Extents &,
+                       const Slice &i) {
+  return std::get<1>(i);
+}
+
+// Suppress spurious warning with NVCC about no return statement.
+// This is a known issue in NVCC and NVC++
+// Depending on the CUDA and GCC version we need both the builtin
+// and the diagnostic push. I tried really hard to find something shorter
+// but no luck ...
+#if defined __NVCC__
+    #ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
+        #pragma nv_diagnostic push
+        #pragma nv_diag_suppress = implicit_return_from_non_void_function
+    #else
+      #ifdef __CUDA_ARCH__
+        #pragma diagnostic push
+        #pragma diag_suppress implicit_return_from_non_void_function
+      #endif
+    #endif
+#elif defined __NVCOMPILER
+    #pragma    diagnostic push
+    #pragma    diag_suppress = implicit_return_from_non_void_function
+#endif
+template <size_t k, class Extents>
+MDSPAN_INLINE_FUNCTION
+constexpr auto last_of(std::integral_constant<size_t, k>, const Extents &ext,
+                       ::MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent_t) {
+  if constexpr (Extents::static_extent(k) == dynamic_extent) {
+    return ext.extent(k);
+  } else {
+    return std::integral_constant<size_t, Extents::static_extent(k)>();
+  }
+#if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__)
+  // Even with CUDA_ARCH protection this thing warns about calling host function
+  __builtin_unreachable();
+#endif
+}
+#if defined __NVCC__
+    #ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
+        #pragma nv_diagnostic pop
+    #else
+      #ifdef __CUDA_ARCH__
+        #pragma diagnostic pop
+      #endif
+    #endif
+#elif defined __NVCOMPILER
+    #pragma    diagnostic pop
+#endif
+
+template <size_t k, class Extents, class OffsetType, class ExtentType,
+          class StrideType>
+MDSPAN_INLINE_FUNCTION
+constexpr OffsetType
+last_of(std::integral_constant<size_t, k>, const Extents &,
+        const strided_slice<OffsetType, ExtentType, StrideType> &r) {
+  return r.extent;
+}
+
+// get stride of slices
+template <class T>
+MDSPAN_INLINE_FUNCTION
+constexpr auto stride_of(const T &) {
+  return std::integral_constant<size_t, 1>();
+}
+
+template <class OffsetType, class ExtentType, class StrideType>
+MDSPAN_INLINE_FUNCTION
+constexpr auto
+stride_of(const strided_slice<OffsetType, ExtentType, StrideType> &r) {
+  return r.stride;
+}
+
+// divide which can deal with integral constant preservation
+template <class IndexT, class T0, class T1>
+MDSPAN_INLINE_FUNCTION
+constexpr auto divide(const T0 &v0, const T1 &v1) {
+  return IndexT(v0) / IndexT(v1);
+}
+
+template <class IndexT, class T0, T0 v0, class T1, T1 v1>
+MDSPAN_INLINE_FUNCTION
+constexpr auto divide(const std::integral_constant<T0, v0> &,
+                      const std::integral_constant<T1, v1> &) {
+  // cutting short division by zero
+  // this is used for strided_slice with zero extent/stride
+  return std::integral_constant<IndexT, v0 == 0 ? 0 : v0 / v1>();
+}
+
+// multiply which can deal with integral constant preservation
+template <class IndexT, class T0, class T1>
+MDSPAN_INLINE_FUNCTION
+constexpr auto multiply(const T0 &v0, const T1 &v1) {
+  return IndexT(v0) * IndexT(v1);
+}
+
+template <class IndexT, class T0, T0 v0, class T1, T1 v1>
+MDSPAN_INLINE_FUNCTION
+constexpr auto multiply(const std::integral_constant<T0, v0> &,
+                        const std::integral_constant<T1, v1> &) {
+  return std::integral_constant<IndexT, v0 * v1>();
+}
+
+// compute new static extent from range, preserving static knowledge
+template <class Arg0, class Arg1> struct StaticExtentFromRange {
+  constexpr static size_t value = dynamic_extent;
+};
+
+template <class Integral0, Integral0 val0, class Integral1, Integral1 val1>
+struct StaticExtentFromRange<std::integral_constant<Integral0, val0>,
+                             std::integral_constant<Integral1, val1>> {
+  constexpr static size_t value = val1 - val0;
+};
+
+// compute new static extent from strided_slice, preserving static
+// knowledge
+template <class Arg0, class Arg1> struct StaticExtentFromStridedRange {
+  constexpr static size_t value = dynamic_extent;
+};
+
+template <class Integral0, Integral0 val0, class Integral1, Integral1 val1>
+struct StaticExtentFromStridedRange<std::integral_constant<Integral0, val0>,
+                                    std::integral_constant<Integral1, val1>> {
+  constexpr static size_t value = val0 > 0 ? 1 + (val0 - 1) / val1 : 0;
+};
+
+// creates new extents through recursive calls to next_extent member function
+// next_extent has different overloads for different types of stride specifiers
+template <size_t K, class Extents, size_t... NewExtents>
+struct extents_constructor {
+  MDSPAN_TEMPLATE_REQUIRES(
+    class Slice, class... SlicesAndExtents,
+    /* requires */(!std::is_convertible_v<Slice, size_t> &&
+                   !is_strided_slice<Slice>::value)
+  )
+  MDSPAN_INLINE_FUNCTION
+  constexpr static auto next_extent(const Extents &ext, const Slice &sl,
+                                    SlicesAndExtents... slices_and_extents) {
+    constexpr size_t new_static_extent = StaticExtentFromRange<
+        decltype(first_of(std::declval<Slice>())),
+        decltype(last_of(std::integral_constant<size_t, Extents::rank() - K>(),
+                         std::declval<Extents>(),
+                         std::declval<Slice>()))>::value;
+
+    using next_t =
+        extents_constructor<K - 1, Extents, NewExtents..., new_static_extent>;
+    using index_t = typename Extents::index_type;
+    return next_t::next_extent(
+        ext, slices_and_extents...,
+        index_t(last_of(std::integral_constant<size_t, Extents::rank() - K>(), ext,
+                        sl)) -
+            index_t(first_of(sl)));
+  }
+
+  MDSPAN_TEMPLATE_REQUIRES(
+    class Slice, class... SlicesAndExtents,
+    /* requires */ (std::is_convertible_v<Slice, size_t>)
+  )
+  MDSPAN_INLINE_FUNCTION
+  constexpr static auto next_extent(const Extents &ext, const Slice &,
+                                    SlicesAndExtents... slices_and_extents) {
+    using next_t = extents_constructor<K - 1, Extents, NewExtents...>;
+    return next_t::next_extent(ext, slices_and_extents...);
+  }
+
+  template <class OffsetType, class ExtentType, class StrideType,
+            class... SlicesAndExtents>
+  MDSPAN_INLINE_FUNCTION
+  constexpr static auto
+  next_extent(const Extents &ext,
+              const strided_slice<OffsetType, ExtentType, StrideType> &r,
+              SlicesAndExtents... slices_and_extents) {
+    using index_t = typename Extents::index_type;
+    using new_static_extent_t =
+        StaticExtentFromStridedRange<ExtentType, StrideType>;
+    if constexpr (new_static_extent_t::value == dynamic_extent) {
+      using next_t =
+          extents_constructor<K - 1, Extents, NewExtents..., dynamic_extent>;
+      return next_t::next_extent(
+          ext, slices_and_extents...,
+          r.extent > 0 ? 1 + divide<index_t>(r.extent - 1, r.stride) : 0);
+    } else {
+      constexpr size_t new_static_extent = new_static_extent_t::value;
+      using next_t =
+          extents_constructor<K - 1, Extents, NewExtents..., new_static_extent>;
+      return next_t::next_extent(
+          ext, slices_and_extents..., index_t(divide<index_t>(ExtentType(), StrideType())));
+    }
+  }
+};
+
+template <class Extents, size_t... NewStaticExtents>
+struct extents_constructor<0, Extents, NewStaticExtents...> {
+
+  template <class... NewExtents>
+  MDSPAN_INLINE_FUNCTION
+  constexpr static auto next_extent(const Extents &, NewExtents... new_exts) {
+    return extents<typename Extents::index_type, NewStaticExtents...>(
+        new_exts...);
+  }
+};
+
+} // namespace detail
+
+// submdspan_extents creates new extents given src extents and submdspan slice
+// specifiers
+template <class IndexType, size_t... Extents, class... SliceSpecifiers>
+MDSPAN_INLINE_FUNCTION
+constexpr auto submdspan_extents(const extents<IndexType, Extents...> &src_exts,
+                                 SliceSpecifiers... slices) {
+
+  using ext_t = extents<IndexType, Extents...>;
+  return detail::extents_constructor<ext_t::rank(), ext_t>::next_extent(
+      src_exts, slices...);
+}
+} // namespace MDSPAN_IMPL_PROPOSED_NAMESPACE
+} // namespace MDSPAN_IMPL_STANDARD_NAMESPACE
diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..48778d57e75fcc1134504fb2b9902599357bd536
--- /dev/null
+++ b/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp
@@ -0,0 +1,299 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#pragma once
+
+#include <array>
+#include <type_traits>
+#include <tuple>
+#include <utility> // index_sequence
+
+namespace MDSPAN_IMPL_STANDARD_NAMESPACE {
+namespace MDSPAN_IMPL_PROPOSED_NAMESPACE {
+//******************************************
+// Return type of submdspan_mapping overloads
+//******************************************
+template <class Mapping> struct mapping_offset {
+  Mapping mapping;
+  size_t offset;
+};
+} // namespace MDSPAN_IMPL_PROPOSED_NAMESPACE
+
+namespace detail {
+using MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::first_of;
+using MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::stride_of;
+using MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::inv_map_rank;
+
+// constructs sub strides
+template <class SrcMapping, class... slice_strides, size_t... InvMapIdxs>
+MDSPAN_INLINE_FUNCTION
+constexpr auto
+construct_sub_strides(const SrcMapping &src_mapping,
+                      std::index_sequence<InvMapIdxs...>,
+                      const std::tuple<slice_strides...> &slices_stride_factor) {
+  using index_type = typename SrcMapping::index_type;
+  return std::array<typename SrcMapping::index_type, sizeof...(InvMapIdxs)>{
+      (static_cast<index_type>(src_mapping.stride(InvMapIdxs)) *
+       static_cast<index_type>(std::get<InvMapIdxs>(slices_stride_factor)))...};
+}
+} // namespace detail
+
+//**********************************
+// layout_left submdspan_mapping
+//*********************************
+namespace detail {
+
+// Figure out whether to preserve layout_left
+template <class IndexSequence, size_t SubRank, class... SliceSpecifiers>
+struct preserve_layout_left_mapping;
+
+template <class... SliceSpecifiers, size_t... Idx, size_t SubRank>
+struct preserve_layout_left_mapping<std::index_sequence<Idx...>, SubRank,
+                                    SliceSpecifiers...> {
+  constexpr static bool value =
+      // Preserve layout for rank 0
+      (SubRank == 0) ||
+      (
+          // Slice specifiers up to subrank need to be full_extent_t - except
+          // for the last one which could also be tuple but not a strided index
+          // range slice specifiers after subrank are integrals
+          ((Idx > SubRank - 1) || // these are only integral slice specifiers
+           (std::is_same_v<SliceSpecifiers, full_extent_t>) ||
+           ((Idx == SubRank - 1) &&
+            std::is_convertible_v<SliceSpecifiers, std::tuple<size_t, size_t>>)) &&
+          ...);
+};
+} // namespace detail
+
+// Suppress spurious warning with NVCC about no return statement.
+// This is a known issue in NVCC and NVC++
+// Depending on the CUDA and GCC version we need both the builtin
+// and the diagnostic push. I tried really hard to find something shorter
+// but no luck ...
+#if defined __NVCC__
+    #ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
+        #pragma nv_diagnostic push
+        #pragma nv_diag_suppress = implicit_return_from_non_void_function
+    #else
+      #ifdef __CUDA_ARCH__
+        #pragma diagnostic push
+        #pragma diag_suppress implicit_return_from_non_void_function
+      #endif
+    #endif
+#elif defined __NVCOMPILER
+    #pragma    diagnostic push
+    #pragma    diag_suppress = implicit_return_from_non_void_function
+#endif
+// Actual submdspan mapping call
+template <class Extents, class... SliceSpecifiers>
+MDSPAN_INLINE_FUNCTION
+constexpr auto
+submdspan_mapping(const layout_left::mapping<Extents> &src_mapping,
+                  SliceSpecifiers... slices) {
+  using MDSPAN_IMPL_PROPOSED_NAMESPACE::submdspan_extents;
+  using MDSPAN_IMPL_PROPOSED_NAMESPACE::mapping_offset;
+
+  // compute sub extents
+  using src_ext_t = Extents;
+  auto dst_ext = submdspan_extents(src_mapping.extents(), slices...);
+  using dst_ext_t = decltype(dst_ext);
+
+  // figure out sub layout type
+  constexpr bool preserve_layout = detail::preserve_layout_left_mapping<
+      decltype(std::make_index_sequence<src_ext_t::rank()>()), dst_ext_t::rank(),
+      SliceSpecifiers...>::value;
+  using dst_layout_t =
+      std::conditional_t<preserve_layout, layout_left, layout_stride>;
+  using dst_mapping_t = typename dst_layout_t::template mapping<dst_ext_t>;
+
+  if constexpr (std::is_same_v<dst_layout_t, layout_left>) {
+    // layout_left case
+    return mapping_offset<dst_mapping_t>{
+        dst_mapping_t(dst_ext),
+        static_cast<size_t>(src_mapping(detail::first_of(slices)...))};
+  } else {
+    // layout_stride case
+    auto inv_map = detail::inv_map_rank(
+      std::integral_constant<size_t,0>(),
+      std::index_sequence<>(),
+      slices...);
+    return mapping_offset<dst_mapping_t>{
+        dst_mapping_t(dst_ext, detail::construct_sub_strides(
+                                   src_mapping, inv_map,
+    // HIP needs deduction guides to have markups so we need to be explicit
+    // NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have the issue
+    #if defined(_MDSPAN_HAS_HIP) || (defined(__NVCC__) && (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__ * 10) < 1120)
+                                   std::tuple<decltype(detail::stride_of(slices))...>{detail::stride_of(slices)...})),
+    #else
+                                   std::tuple{detail::stride_of(slices)...})),
+    #endif
+        static_cast<size_t>(src_mapping(detail::first_of(slices)...))};
+  }
+#if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__)
+  __builtin_unreachable();
+#endif
+}
+#if defined __NVCC__
+    #ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
+        #pragma nv_diagnostic pop
+    #else
+      #ifdef __CUDA_ARCH__
+        #pragma diagnostic pop
+      #endif
+    #endif
+#elif defined __NVCOMPILER
+    #pragma    diagnostic pop
+#endif
+
+//**********************************
+// layout_right submdspan_mapping
+//*********************************
+namespace detail {
+
+// Figure out whether to preserve layout_right
+template <class IndexSequence, size_t SubRank, class... SliceSpecifiers>
+struct preserve_layout_right_mapping;
+
+template <class... SliceSpecifiers, size_t... Idx, size_t SubRank>
+struct preserve_layout_right_mapping<std::index_sequence<Idx...>, SubRank,
+                                     SliceSpecifiers...> {
+  constexpr static size_t SrcRank = sizeof...(SliceSpecifiers);
+  constexpr static bool value =
+      // Preserve layout for rank 0
+      (SubRank == 0) ||
+      (
+          // The last subrank slice specifiers need to be full_extent_t - except
+          // for the srcrank-subrank one which could also be tuple but not a
+          // strided index range slice specifiers before srcrank-subrank are
+          // integrals
+          ((Idx <
+            SrcRank - SubRank) || // these are only integral slice specifiers
+           (std::is_same_v<SliceSpecifiers, full_extent_t>) ||
+           ((Idx == SrcRank - SubRank) &&
+            std::is_convertible_v<SliceSpecifiers, std::tuple<size_t, size_t>>)) &&
+          ...);
+};
+} // namespace detail
+
+// Suppress spurious warning with NVCC about no return statement.
+// This is a known issue in NVCC and NVC++
+// Depending on the CUDA and GCC version we need both the builtin
+// and the diagnostic push. I tried really hard to find something shorter
+// but no luck ...
+#if defined __NVCC__
+    #ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
+        #pragma nv_diagnostic push
+        #pragma nv_diag_suppress = implicit_return_from_non_void_function
+    #else
+      #ifdef __CUDA_ARCH__
+        #pragma diagnostic push
+        #pragma diag_suppress implicit_return_from_non_void_function
+      #endif
+    #endif
+#elif defined __NVCOMPILER
+    #pragma    diagnostic push
+    #pragma    diag_suppress = implicit_return_from_non_void_function
+#endif
+template <class Extents, class... SliceSpecifiers>
+MDSPAN_INLINE_FUNCTION
+constexpr auto
+submdspan_mapping(const layout_right::mapping<Extents> &src_mapping,
+                  SliceSpecifiers... slices) {
+  using MDSPAN_IMPL_PROPOSED_NAMESPACE::submdspan_extents;
+  using MDSPAN_IMPL_PROPOSED_NAMESPACE::mapping_offset;
+
+  // get sub extents
+  using src_ext_t = Extents;
+  auto dst_ext = submdspan_extents(src_mapping.extents(), slices...);
+  using dst_ext_t = decltype(dst_ext);
+
+  // determine new layout type
+  constexpr bool preserve_layout = detail::preserve_layout_right_mapping<
+      decltype(std::make_index_sequence<src_ext_t::rank()>()), dst_ext_t::rank(),
+      SliceSpecifiers...>::value;
+  using dst_layout_t =
+      std::conditional_t<preserve_layout, layout_right, layout_stride>;
+  using dst_mapping_t = typename dst_layout_t::template mapping<dst_ext_t>;
+
+  if constexpr (std::is_same_v<dst_layout_t, layout_right>) {
+    // layout_right case
+    return mapping_offset<dst_mapping_t>{
+        dst_mapping_t(dst_ext),
+        static_cast<size_t>(src_mapping(detail::first_of(slices)...))};
+  } else {
+    // layout_stride case
+    auto inv_map = detail::inv_map_rank(
+      std::integral_constant<size_t,0>(),
+      std::index_sequence<>(),
+      slices...);
+    return mapping_offset<dst_mapping_t>{
+        dst_mapping_t(dst_ext, detail::construct_sub_strides(
+                                   src_mapping, inv_map,
+    // HIP needs deduction guides to have markups so we need to be explicit
+    // NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have the issue
+    #if defined(_MDSPAN_HAS_HIP) || (defined(__NVCC__) && (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__ * 10) < 1120)
+                                   std::tuple<decltype(detail::stride_of(slices))...>{detail::stride_of(slices)...})),
+    #else
+                                   std::tuple{detail::stride_of(slices)...})),
+    #endif
+        static_cast<size_t>(src_mapping(detail::first_of(slices)...))};
+  }
+#if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__)
+  __builtin_unreachable();
+#endif
+}
+#if defined __NVCC__
+    #ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
+        #pragma nv_diagnostic pop
+    #else
+      #ifdef __CUDA_ARCH__
+        #pragma diagnostic pop
+      #endif
+    #endif
+#elif defined __NVCOMPILER
+    #pragma    diagnostic pop
+#endif
+
+//**********************************
+// layout_stride submdspan_mapping
+//*********************************
+template <class Extents, class... SliceSpecifiers>
+MDSPAN_INLINE_FUNCTION
+constexpr auto
+submdspan_mapping(const layout_stride::mapping<Extents> &src_mapping,
+                  SliceSpecifiers... slices) {
+  using MDSPAN_IMPL_PROPOSED_NAMESPACE::submdspan_extents;
+  using MDSPAN_IMPL_PROPOSED_NAMESPACE::mapping_offset;
+  auto dst_ext = submdspan_extents(src_mapping.extents(), slices...);
+  using dst_ext_t = decltype(dst_ext);
+  auto inv_map = detail::inv_map_rank(
+      std::integral_constant<size_t,0>(),
+      std::index_sequence<>(),
+      slices...);
+  using dst_mapping_t = typename layout_stride::template mapping<dst_ext_t>;
+  return mapping_offset<dst_mapping_t>{
+      dst_mapping_t(dst_ext, detail::construct_sub_strides(
+                                 src_mapping, inv_map,
+    // HIP needs deduction guides to have markups so we need to be explicit
+    // NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have the issue
+    #if defined(_MDSPAN_HAS_HIP) || (defined(__NVCC__) && (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__ * 10) < 1120)
+                                 std::tuple<decltype(detail::stride_of(slices))...>(detail::stride_of(slices)...))),
+#else
+                                 std::tuple(detail::stride_of(slices)...))),
+#endif
+      static_cast<size_t>(src_mapping(detail::first_of(slices)...))};
+}
+} // namespace MDSPAN_IMPL_STANDARD_NAMESPACE
diff --git a/packages/kokkos/tpls/mdspan/include/experimental/mdarray b/packages/kokkos/tpls/mdspan/include/experimental/mdarray
deleted file mode 100644
index fa710a59b66616665d94dd11bb7e83ce7bddfe0a..0000000000000000000000000000000000000000
--- a/packages/kokkos/tpls/mdspan/include/experimental/mdarray
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 2.0
-//              Copyright (2019) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#pragma once
-
-#include "mdspan"
-#include "__p1684_bits/mdarray.hpp"
-
diff --git a/packages/kokkos/tpls/mdspan/include/experimental/mdspan b/packages/kokkos/tpls/mdspan/include/experimental/mdspan
deleted file mode 100644
index ca6f6b8686cd19fa0142aa158cb55b204aef7760..0000000000000000000000000000000000000000
--- a/packages/kokkos/tpls/mdspan/include/experimental/mdspan
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 2.0
-//              Copyright (2019) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#pragma once
-
-#include "__p0009_bits/default_accessor.hpp"
-#include "__p0009_bits/full_extent_t.hpp"
-#include "__p0009_bits/mdspan.hpp"
-#include "__p0009_bits/dynamic_extent.hpp"
-#include "__p0009_bits/extents.hpp"
-#include "__p0009_bits/layout_stride.hpp"
-#include "__p0009_bits/layout_left.hpp"
-#include "__p0009_bits/layout_right.hpp"
-#include "__p0009_bits/macros.hpp"
-#include "__p0009_bits/static_array.hpp"
-#include "__p0009_bits/submdspan.hpp"
diff --git a/packages/kokkos/tpls/mdspan/include/mdspan/mdarray.hpp b/packages/kokkos/tpls/mdspan/include/mdspan/mdarray.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..fd8f61c52f1b1836aea89f653e263cd33537e218
--- /dev/null
+++ b/packages/kokkos/tpls/mdspan/include/mdspan/mdarray.hpp
@@ -0,0 +1,31 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef MDARRAY_HPP_
+#define MDARRAY_HPP_
+
+#ifndef MDSPAN_IMPL_STANDARD_NAMESPACE
+  #define MDSPAN_IMPL_STANDARD_NAMESPACE Kokkos
+#endif
+
+#ifndef MDSPAN_IMPL_PROPOSED_NAMESPACE
+  #define MDSPAN_IMPL_PROPOSED_NAMESPACE Experimental
+#endif
+
+#include "mdspan.hpp"
+#include "../experimental/__p1684_bits/mdarray.hpp"
+
+#endif // MDARRAY_HPP_
diff --git a/packages/kokkos/tpls/mdspan/include/mdspan/mdspan.hpp b/packages/kokkos/tpls/mdspan/include/mdspan/mdspan.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b440873526ab488857766d8e893343f951920ccc
--- /dev/null
+++ b/packages/kokkos/tpls/mdspan/include/mdspan/mdspan.hpp
@@ -0,0 +1,41 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef MDSPAN_HPP_
+#define MDSPAN_HPP_
+
+#ifndef MDSPAN_IMPL_STANDARD_NAMESPACE
+  #define MDSPAN_IMPL_STANDARD_NAMESPACE Kokkos
+#endif
+
+#ifndef MDSPAN_IMPL_PROPOSED_NAMESPACE
+  #define MDSPAN_IMPL_PROPOSED_NAMESPACE Experimental
+#endif
+
+#include "../experimental/__p0009_bits/default_accessor.hpp"
+#include "../experimental/__p0009_bits/full_extent_t.hpp"
+#include "../experimental/__p0009_bits/mdspan.hpp"
+#include "../experimental/__p0009_bits/dynamic_extent.hpp"
+#include "../experimental/__p0009_bits/extents.hpp"
+#include "../experimental/__p0009_bits/layout_stride.hpp"
+#include "../experimental/__p0009_bits/layout_left.hpp"
+#include "../experimental/__p0009_bits/layout_right.hpp"
+#include "../experimental/__p0009_bits/macros.hpp"
+#if MDSPAN_HAS_CXX_17
+#include "../experimental/__p2630_bits/submdspan.hpp"
+#endif
+
+#endif // MDSPAN_HPP_
diff --git a/packages/org-themes/.gitrepo b/packages/org-themes/.gitrepo
index eab1e017cbd91a54df4a9e824fb0ff9de8b8ecbd..e5f7a63729d2cb02d019a765fe7937b800d0683d 100644
--- a/packages/org-themes/.gitrepo
+++ b/packages/org-themes/.gitrepo
@@ -6,7 +6,7 @@
 [subrepo]
 	remote = git@gitlab.com:OlMon/org-themes.git
 	branch = master
-	commit = b9bd14cbd716e8e8efebfa8e42a6c50176e9b2cd
-	parent = 1994bb4f069142aa3f886d30aa0585ed9117eed1
+	commit = 09d001cd3d4a91a302a2d51127fc17a9967eec67
+	parent = 6ce4aaa2a1607be6c147d96b335fc88e0edc9810
 	method = merge
-	cmdver = 0.4.3
+	cmdver = 0.4.6
diff --git a/packages/org-themes/public/gray/gray.html b/packages/org-themes/public/gray/gray.html
new file mode 100644
index 0000000000000000000000000000000000000000..f630ef1a7191cae180ee318e68a20bdb45809d04
--- /dev/null
+++ b/packages/org-themes/public/gray/gray.html
@@ -0,0 +1,904 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
+<head>
+<!-- 2022-06-24 Fri 19:42 -->
+<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
+<meta name="viewport" content="width=device-width, initial-scale=1" />
+<title>Gray Theme</title>
+<meta name="author" content="Lee Hinman" />
+<meta name="generator" content="Org Mode" />
+<style type="text/css">
+body {
+font-family: Helvetica, Arial, sans-serif;
+font-size: 16px;
+line-height: 1.4;
+color: #33333f;
+}
+
+code {
+font-family: "Inconsolata", "monospace";
+font-size: 16px;
+}
+
+p>code, li>code {
+background-color: #eee;
+padding: 0.25em;
+}
+
+h1, h2, h3 {
+font-family: "Roboto Slab", Helvetica, Arial, sans-serif;
+}
+
+h2 {
+border-bottom: 1px solid #f0c;
+padding-bottom: 0.5em;
+font-size: 1.75em;
+}
+
+h3 {
+margin-top: 2em;
+font-size: 1.5em;
+}
+
+h4 {
+font-size: 1.25em;
+}
+
+h5 {
+font-size: 1em;
+}
+
+h2 code, h3 code, h4 code, h5 code, td code {
+font-family: inherit !important;
+font-size: inherit !important;
+}
+
+td code {
+font-weight: bold;
+}
+
+a:link, a:hover, a:visited {
+text-decoration: none;
+color: black;
+}
+
+a:link {
+background: #ff8;
+}
+
+a:visited {
+color: #666;
+background: #ffc;
+}
+
+a:link:hover,
+a:visited:hover {
+background: #ff0;
+}
+
+a[href^="http"] {
+background: #bff;
+}
+
+a[href^="http"]:visited {
+background: #dff;
+}
+
+a[href^="http"]:link:hover,
+a[href^="http"]:visited:hover {
+background: #0ff;
+}
+
+a[href^="http"]:after {
+content: "\21B3";
+background: white;
+padding-left: 0.2em;
+}
+
+#meta {
+margin-top: 2em;
+}
+
+#table-of-contents a:link,
+#table-of-contents a:visited {
+color: black;
+background: transparent;
+}
+
+#table-of-contents {
+line-height: 1.2;
+}
+#table-of-contents h2 {
+border-bottom: 0;
+}
+
+#table-of-contents ul {
+list-style: none;
+padding-left: 0.5em;
+font-weight: normal;
+}
+
+#table-of-contents div>ul>li {
+margin-top: 1em;
+font-weight: bold;
+}
+
+#table-of-contents .tag {
+display: none;
+}
+
+#table-of-contents .todo,
+#table-of-contents .done {
+font-size: 80%;
+}
+
+#table-of-contents ol>li {
+margin-top: 1em;
+}
+
+table {
+width: 100%;
+}
+
+table, th, td {
+border: 1px solid #666;
+}
+
+th, td {
+padding: 0.5em;
+text-align: left;
+}
+
+tbody tr:nth-child(odd) {
+background-color: #eee;
+}
+
+img {
+max-width: 90%;
+}
+
+div.notice {
+position: relative;
+margin: 0 1.2em;
+padding: 0.25em 1em;
+border-left: 4px solid;
+}
+
+table + div.notice {
+margin-top: 2em;
+}
+
+div.notice a {
+background: transparent !important;
+border-bottom: 1px dotted;
+}
+
+div.notice a[href^="http"]:after {
+background: transparent !important;
+}
+
+div.notice:before {
+position: absolute;
+top: 0;
+right: 0;
+padding: 0.25em 0.5em 0;
+font-size: 60%;
+border-bottom-left-radius: 0.5em;
+}
+
+.notice-warning {
+background: #fcc;
+color: #600;
+}
+
+.notice-example {
+background: #def;
+color: #069;
+}
+
+.notice-info {
+background: #efe;
+color: #060;
+}
+
+.notice-warning a {
+color: #600;
+}
+
+.notice-example a {
+color: #069;
+}
+
+.notice-info a {
+color: #060;
+}
+
+div.notice-warning:before {
+content: "WARNING";
+background: #c99;
+color: #fcc;
+}
+
+div.notice-example:before {
+content: "EXAMPLE";
+background: #abc;
+color: #def;
+}
+
+div.notice-info:before {
+content: "INFO";
+background: #9c9;
+color: #efe;
+}
+
+/* things inside the #+BEGIN_NOTE...#+END_NOTE block */
+div.NOTE a {
+background: transparent !important;
+border-bottom: 1px dotted;
+}
+
+div.NOTE {
+position: relative;
+margin: 0 1.2em;
+padding: 0.25em 1em;
+border-left: 4px solid;
+margin-top: 2em;
+background: #efe;
+color: #060;
+}
+
+div.NOTE:before {
+position: absolute;
+top: 0;
+right: 0;
+padding: 0.25em 0.5em 0;
+font-size: 60%;
+border-bottom-left-radius: 0.5em;
+content: "NOTE";
+background: #9c9;
+color: #efe;
+}
+
+blockquote {
+padding: 0px 10px 0px 10px;
+border: 1px solid #ddd;
+background: #eee;
+box-shadow: 5px 5px 5px #eee;
+border-radius: 2px;
+line-height: 1.2em;
+}
+
+pre {
+font-family: "Inconsolata", "monospace";
+font-size: 100%;
+border: 0;
+box-shadow: none;
+overflow: auto;
+}
+
+pre.example:before {
+content: "EXAMPLE";
+display: block;
+border-bottom: 1px dotted;
+margin-bottom: 1em;
+}
+
+pre.example {
+background: #fec;
+color: #666;
+font-size: 0.85em;
+}
+
+pre {
+background-color: #f8f8f8;
+background-size: 8px 8px;
+background-image: linear-gradient(135deg, transparent 25%, rgba(0, 0, 0, 0.02) 25%, rgba(0, 0, 0, 0.02) 50%, transparent 50%, transparent 75%, rgba(0, 0, 0, 0.02) 75%, rgba(0, 0, 0, 0.02));
+}
+
+pre.src {
+padding: 0.5em;
+}
+
+pre.src:before {
+display: block;
+position: absolute;
+background-color: #ccccd0;
+top: 0;
+right: 0;
+padding: 0.25em 0.5em;
+border-bottom-left-radius: 8px;
+border: 0;
+color: white;
+font-size: 80%;
+}
+
+pre.src-plantuml:before {
+content: "UML";
+}
+
+pre.src-javascript:before {
+content: "JS";
+}
+
+pre.src-clojure:before {
+content: "CLJ";
+}
+
+pre.src-c:before {
+content: "C";
+}
+
+pre.src-sh:before {
+content: "Shell";
+}
+
+pre.src-es:before {
+content: "ES";
+}
+
+span.org-string {
+color: #f94;
+}
+
+span.org-keyword {
+color: #c07;
+}
+
+span.org-variable-name {
+color: #f04;
+}
+
+span.org-clojure-keyword {
+color: #09f;
+}
+
+span.org-comment, span.org-comment-delimiter {
+color: #999;
+}
+
+span.org-rainbow-delimiters-depth-1, span.org-rainbow-delimiters-depth-5 {
+color: #666;
+}
+
+span.org-rainbow-delimiters-depth-2, span.org-rainbow-delimiters-depth-6 {
+color: #888;
+}
+
+span.org-rainbow-delimiters-depth-3, span.org-rainbow-delimiters-depth-7 {
+color: #aaa;
+}
+
+span.org-rainbow-delimiters-depth-4, span.org-rainbow-delimiters-depth-8 {
+color: #ccc;
+}
+
+div.figure {
+font-size: 0.85em;
+}
+
+.tag {
+font-family: "Roboto Slab", Helvetica, Arial, sans-serif;
+font-size: 11px;
+font-weight: normal;
+float: right;
+margin-top: 1em;
+background: transparent;
+}
+
+.tag span {
+background: #ccc;
+padding: 0 0.5em;
+border-radius: 0.2em;
+color: white;
+}
+
+.todo, .done {
+font-family: "Roboto Slab", Helvetica, Arial, sans-serif;
+font-weight: normal;
+padding: 0 0.25em;
+border-radius: 0.2em;
+}
+
+.todo {
+background: #f04;
+color: white;
+}
+
+.done {
+background: #5f7;
+color: white;
+}
+
+@media screen {
+h1.title {
+text-align: left;
+margin: 0.5em 0 1em 0;
+}
+
+h2 {
+margin-top: 3em;
+}
+
+#table-of-contents {
+position: fixed;
+top: 0;
+left: 0;
+padding: 2em 0 2em 2em;
+width: 290px;
+height: 100vh;
+font-size: 11px;
+background: #eee;
+overlow-x: hidden;
+overlow-y: auto;
+}
+
+#table-of-contents h2 {
+margin-top: 0;
+}
+
+#table-of-contents code {
+font-size: 12px;
+}
+
+div#content {
+margin-left: 320px;
+max-width: 1100px;
+}
+div#postamble {
+margin-left: 320px;
+max-width: 1100px;
+}
+}
+
+@media screen and (max-width: 1024px) {
+html, body {
+font-size: 14px;
+}
+
+#table-of-contents {
+display: none;
+}
+
+h1.title {
+margin-left: 0%;
+}
+
+div#content {
+margin-left: 5%;
+max-width: 90%;
+}
+div#postamble {
+margin-left: 5%;
+max-width: 90%;
+}
+}
+
+@media print {
+
+body {
+color: black;
+}
+
+@page {
+margin: 25mm;
+}
+
+h2, h3 {
+page-break-before: always;
+margin-top: 0;
+}
+
+table {
+page-break-inside: avoid;
+}
+
+a:visited {
+color: black;
+background: #ff8;
+}
+
+a[href^="http"]:visited {
+background: #bff;
+}
+
+div.notice:before {
+display: none;
+}
+}
+</style>
+</head>
+<body>
+<div id="content" class="content">
+<h1 class="title">Gray Theme</h1>
+<div id="table-of-contents" role="doc-toc">
+<h2>Table of Contents</h2>
+<div id="text-table-of-contents" role="doc-toc">
+<ul>
+<li><a href="#org0ccedaf">1. Plain Org</a></li>
+<li><a href="#orgfd19679">2. Lists</a>
+<ul>
+<li><a href="#org407afbb">2.1. Todo List</a>
+<ul>
+<li><a href="#orgc418707">2.1.1. <span class="todo TODO">TODO</span> First todo</a></li>
+<li><a href="#orgc18d408">2.1.2. <span class="done DONE">DONE</span> First Done with Date</a></li>
+<li><a href="#org5a0d070">2.1.3. <span class="todo TODO">TODO</span> Scheduled</a></li>
+<li><a href="#org5671973">2.1.4. <span class="todo TODO">TODO</span> Deadline</a></li>
+<li><a href="#orgf5ede8b">2.1.5. <span class="todo TODO">TODO</span> Date</a></li>
+</ul>
+</li>
+<li><a href="#org78a53a9">2.2. Simple list</a></li>
+<li><a href="#org4bc1f93">2.3. Sorted List</a></li>
+<li><a href="#org164c018">2.4. Checkbox</a></li>
+</ul>
+</li>
+<li><a href="#org0d19632">3. H1</a>
+<ul>
+<li><a href="#orgc6b5857">3.1. H2</a>
+<ul>
+<li><a href="#org8c8b338">3.1.1. H3</a></li>
+</ul>
+</li>
+</ul>
+</li>
+<li><a href="#org13b6831">4. Table</a></li>
+<li><a href="#org951db71">5. Blocks</a>
+<ul>
+<li><a href="#org2a66a12">5.1. Center</a></li>
+<li><a href="#org9d343ad">5.2. Comment</a></li>
+<li><a href="#org5594273">5.3. Example</a></li>
+<li><a href="#orgf723821">5.4. Quote</a></li>
+<li><a href="#orgee741a4">5.5. Verse</a></li>
+<li><a href="#orgddc0891">5.6. Source Blocks</a>
+<ul>
+<li><a href="#org13f87f7">5.6.1. Python</a></li>
+<li><a href="#org26e47e0">5.6.2. Elisp</a></li>
+</ul>
+</li>
+</ul>
+</li>
+</ul>
+</div>
+</div>
+
+<div id="outline-container-org0ccedaf" class="outline-2">
+<h2 id="org0ccedaf"><span class="section-number-2">1.</span> Plain Org</h2>
+<div class="outline-text-2" id="text-1">
+<p>
+Includes:
+</p>
+<ul class="org-ul">
+<li class="on"><code>[X]</code> CSS</li>
+<li class="off"><code>[&#xa0;]</code> JAVASCRIPT</li>
+</ul>
+
+<p>
+Available as:
+</p>
+<ul class="org-ul">
+<li class="on"><code>[X]</code> CSS FILE</li>
+<li class="off"><code>[&#xa0;]</code> JS FILE</li>
+<li class="on"><code>[X]</code> SETUPFILE</li>
+</ul>
+</div>
+</div>
+
+<div id="outline-container-orgfd19679" class="outline-2">
+<h2 id="orgfd19679"><span class="section-number-2">2.</span> Lists</h2>
+<div class="outline-text-2" id="text-2">
+</div>
+<div id="outline-container-org407afbb" class="outline-3">
+<h3 id="org407afbb"><span class="section-number-3">2.1.</span> Todo List</h3>
+<div class="outline-text-3" id="text-2-1">
+</div>
+<div id="outline-container-orgc418707" class="outline-4">
+<h4 id="orgc418707"><span class="section-number-4">2.1.1.</span> <span class="todo TODO">TODO</span> First todo</h4>
+</div>
+<div id="outline-container-orgc18d408" class="outline-4">
+<h4 id="orgc18d408"><span class="section-number-4">2.1.2.</span> <span class="done DONE">DONE</span> First Done with Date</h4>
+<div class="outline-text-4" id="text-2-1-2">
+</div>
+</div>
+<div id="outline-container-org5a0d070" class="outline-4">
+<h4 id="org5a0d070"><span class="section-number-4">2.1.3.</span> <span class="todo TODO">TODO</span> Scheduled</h4>
+<div class="outline-text-4" id="text-2-1-3">
+</div>
+</div>
+<div id="outline-container-org5671973" class="outline-4">
+<h4 id="org5671973"><span class="section-number-4">2.1.4.</span> <span class="todo TODO">TODO</span> Deadline</h4>
+<div class="outline-text-4" id="text-2-1-4">
+</div>
+</div>
+<div id="outline-container-orgf5ede8b" class="outline-4">
+<h4 id="orgf5ede8b"><span class="section-number-4">2.1.5.</span> <span class="todo TODO">TODO</span> Date</h4>
+<div class="outline-text-4" id="text-2-1-5">
+<p>
+<span class="timestamp-wrapper"><span class="timestamp">&lt;2021-02-18 Thu&gt;</span></span>
+</p>
+</div>
+</div>
+</div>
+
+<div id="outline-container-org78a53a9" class="outline-3">
+<h3 id="org78a53a9"><span class="section-number-3">2.2.</span> Simple list</h3>
+<div class="outline-text-3" id="text-2-2">
+<ul class="org-ul">
+<li>List item</li>
+<li>List item</li>
+<li>List item</li>
+<li>List item</li>
+</ul>
+</div>
+</div>
+
+<div id="outline-container-org4bc1f93" class="outline-3">
+<h3 id="org4bc1f93"><span class="section-number-3">2.3.</span> Sorted List</h3>
+<div class="outline-text-3" id="text-2-3">
+<ol class="org-ol">
+<li>List item</li>
+<li>List item</li>
+<li>List item</li>
+<li>List item</li>
+</ol>
+</div>
+</div>
+
+<div id="outline-container-org164c018" class="outline-3">
+<h3 id="org164c018"><span class="section-number-3">2.4.</span> Checkbox</h3>
+<div class="outline-text-3" id="text-2-4">
+<ul class="org-ul">
+<li class="off"><code>[&#xa0;]</code> List item</li>
+<li class="on"><code>[X]</code> List item</li>
+<li class="off"><code>[&#xa0;]</code> List item</li>
+<li class="on"><code>[X]</code> List item</li>
+</ul>
+</div>
+</div>
+</div>
+
+
+<div id="outline-container-org0d19632" class="outline-2">
+<h2 id="org0d19632"><span class="section-number-2">3.</span> H1</h2>
+<div class="outline-text-2" id="text-3">
+<p>
+H1 Text
+</p>
+</div>
+<div id="outline-container-orgc6b5857" class="outline-3">
+<h3 id="orgc6b5857"><span class="section-number-3">3.1.</span> H2</h3>
+<div class="outline-text-3" id="text-3-1">
+<p>
+H2 Text
+</p>
+</div>
+<div id="outline-container-org8c8b338" class="outline-4">
+<h4 id="org8c8b338"><span class="section-number-4">3.1.1.</span> H3</h4>
+<div class="outline-text-4" id="text-3-1-1">
+<p>
+H3 Text
+</p>
+</div>
+<ol class="org-ol">
+<li><a id="orge06b470"></a>H4<br />
+<div class="outline-text-5" id="text-3-1-1-1">
+<p>
+H4 Text
+</p>
+</div>
+<ol class="org-ol">
+<li><a id="org3ea6e39"></a>H5<br />
+<div class="outline-text-6" id="text-3-1-1-1-1">
+<p>
+H5 Text
+</p>
+</div>
+<ol class="org-ol">
+<li><a id="org48406a9"></a>H6<br />
+<div class="outline-text-7" id="text-3-1-1-1-1-1">
+<p>
+H6 Text
+</p>
+</div>
+<ol class="org-ol">
+<li><a id="orgee78efe"></a>H7<br />
+<div class="outline-text-8" id="text-3-1-1-1-1-1-1">
+<p>
+H7 Text
+</p>
+</div>
+</li>
+</ol>
+</li>
+</ol>
+</li>
+</ol>
+</li>
+</ol>
+</div>
+</div>
+</div>
+
+
+
+<div id="outline-container-org13b6831" class="outline-2">
+<h2 id="org13b6831"><span class="section-number-2">4.</span> Table</h2>
+<div class="outline-text-2" id="text-4">
+<table border="2" cellspacing="0" cellpadding="6" rules="groups" frame="hsides">
+
+
+<colgroup>
+<col  class="org-right" />
+
+<col  class="org-right" />
+
+<col  class="org-right" />
+
+<col  class="org-right" />
+
+<col  class="org-right" />
+
+<col  class="org-right" />
+</colgroup>
+<thead>
+<tr>
+<th scope="col" class="org-right">a</th>
+<th scope="col" class="org-right">b</th>
+<th scope="col" class="org-right">c</th>
+<th scope="col" class="org-right">d</th>
+<th scope="col" class="org-right">e</th>
+<th scope="col" class="org-right">f</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td class="org-right">0</td>
+<td class="org-right">1</td>
+<td class="org-right">2</td>
+<td class="org-right">3</td>
+<td class="org-right">4</td>
+<td class="org-right">5</td>
+</tr>
+
+<tr>
+<td class="org-right">6</td>
+<td class="org-right">7</td>
+<td class="org-right">8</td>
+<td class="org-right">9</td>
+<td class="org-right">10</td>
+<td class="org-right">11</td>
+</tr>
+
+<tr>
+<td class="org-right">12</td>
+<td class="org-right">13</td>
+<td class="org-right">14</td>
+<td class="org-right">15</td>
+<td class="org-right">16</td>
+<td class="org-right">17</td>
+</tr>
+
+<tr>
+<td class="org-right">18</td>
+<td class="org-right">19</td>
+<td class="org-right">20</td>
+<td class="org-right">21</td>
+<td class="org-right">22</td>
+<td class="org-right">23</td>
+</tr>
+
+<tr>
+<td class="org-right">24</td>
+<td class="org-right">25</td>
+<td class="org-right">26</td>
+<td class="org-right">27</td>
+<td class="org-right">28</td>
+<td class="org-right">29</td>
+</tr>
+</tbody>
+</table>
+</div>
+</div>
+
+
+<div id="outline-container-org951db71" class="outline-2">
+<h2 id="org951db71"><span class="section-number-2">5.</span> Blocks</h2>
+<div class="outline-text-2" id="text-5">
+</div>
+<div id="outline-container-org2a66a12" class="outline-3">
+<h3 id="org2a66a12"><span class="section-number-3">5.1.</span> Center</h3>
+<div class="outline-text-3" id="text-5-1">
+<div class="org-center">
+<p>
+This is a center block
+</p>
+</div>
+</div>
+</div>
+
+<div id="outline-container-org9d343ad" class="outline-3">
+<h3 id="org9d343ad"><span class="section-number-3">5.2.</span> Comment</h3>
+<div class="outline-text-3" id="text-5-2">
+</div>
+</div>
+
+<div id="outline-container-org5594273" class="outline-3">
+<h3 id="org5594273"><span class="section-number-3">5.3.</span> Example</h3>
+<div class="outline-text-3" id="text-5-3">
+<pre class="example" id="org8369dc6">
+This is an example block
+</pre>
+</div>
+</div>
+
+<div id="outline-container-orgf723821" class="outline-3">
+<h3 id="orgf723821"><span class="section-number-3">5.4.</span> Quote</h3>
+<div class="outline-text-3" id="text-5-4">
+<blockquote>
+<p>
+This is a quote block
+</p>
+</blockquote>
+</div>
+</div>
+
+<div id="outline-container-orgee741a4" class="outline-3">
+<h3 id="orgee741a4"><span class="section-number-3">5.5.</span> Verse</h3>
+<div class="outline-text-3" id="text-5-5">
+<p class="verse">
+This is a verse block<br />
+</p>
+</div>
+</div>
+
+
+<div id="outline-container-orgddc0891" class="outline-3">
+<h3 id="orgddc0891"><span class="section-number-3">5.6.</span> Source Blocks</h3>
+<div class="outline-text-3" id="text-5-6">
+</div>
+<div id="outline-container-org13f87f7" class="outline-4">
+<h4 id="org13f87f7"><span class="section-number-4">5.6.1.</span> Python</h4>
+<div class="outline-text-4" id="text-5-6-1">
+<div class="org-src-container">
+<pre class="src src-python"><span style="color: #5180b3;">for</span> i <span style="color: #5180b3;">in</span> <span style="color: #528fd1;">range</span><span style="color: #ff8c00;">(</span>10<span style="color: #ff8c00;">)</span>:
+    <span style="color: #528fd1;">print</span><span style="color: #ff8c00;">(</span>i<span style="color: #ff8c00;">)</span>
+</pre>
+</div>
+
+<pre class="example" id="orgdf8a796">
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+</pre>
+</div>
+</div>
+
+<div id="outline-container-org26e47e0" class="outline-4">
+<h4 id="org26e47e0"><span class="section-number-4">5.6.2.</span> Elisp</h4>
+<div class="outline-text-4" id="text-5-6-2">
+<div class="org-src-container">
+<pre class="src src-emacs-lisp"><span style="color: #ff8c00;">(</span>car '<span style="color: #ff1493;">(</span>a b c d<span style="color: #ff1493;">)</span><span style="color: #ff8c00;">)</span>
+</pre>
+</div>
+
+<pre class="example">
+a
+</pre>
+</div>
+</div>
+</div>
+</div>
+</div>
+<div id="postamble" class="status">
+<p class="author">Author: Lee Hinman (<a href="mailto:lee@writequit.org">lee@writequit.org</a>)</p>
+  				      <p class="update">Last Update: 2022-06-24 Fri 19:42</p>
+</div>
+</body>
+</html>
diff --git a/packages/org-themes/public/index.html b/packages/org-themes/public/index.html
index 18d098421f216eea18744ff7d8e20cf8fe47639d..d75e998148f0334da58164242db11682f72da144 100644
--- a/packages/org-themes/public/index.html
+++ b/packages/org-themes/public/index.html
@@ -3,14 +3,14 @@
 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
 <html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
 <head>
-<!-- 2021-03-01 Mon 23:51 -->
+<!-- 2022-06-24 Fri 19:42 -->
 <meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
 <meta name="viewport" content="width=device-width, initial-scale=1" />
 <title>Org Themes collection</title>
 <meta name="author" content="Marco PawΕ‚owski" />
 <meta name="generator" content="Org Mode" />
-<style type="text/css">
- <!--/*--><![CDATA[/*><!--*/
+<style>
+  #content { max-width: 60em; margin: auto; }
   .title  { text-align: center;
              margin-bottom: .2em; }
   .subtitle { text-align: center;
@@ -31,8 +31,9 @@
   #postamble p, #preamble p { font-size: 90%; margin: .2em; }
   p.verse { margin-left: 3%; }
   pre {
-    border: 1px solid #ccc;
-    box-shadow: 3px 3px 3px #eee;
+    border: 1px solid #e6e6e6;
+    border-radius: 3px;
+    background-color: #f2f2f2;
     padding: 8pt;
     font-family: monospace;
     overflow: auto;
@@ -41,21 +42,21 @@
   pre.src {
     position: relative;
     overflow: auto;
-    padding-top: 1.2em;
   }
   pre.src:before {
     display: none;
     position: absolute;
-    background-color: white;
-    top: -10px;
-    right: 10px;
+    top: -8px;
+    right: 12px;
     padding: 3px;
-    border: 1px solid black;
+    color: #555;
+    background-color: #f2f2f299;
   }
   pre.src:hover:before { display: inline; margin-top: 14px;}
   /* Languages per Org manual */
   pre.src-asymptote:before { content: 'Asymptote'; }
   pre.src-awk:before { content: 'Awk'; }
+  pre.src-authinfo::before { content: 'Authinfo'; }
   pre.src-C:before { content: 'C'; }
   /* pre.src-C++ doesn't work in CSS */
   pre.src-clojure:before { content: 'Clojure'; }
@@ -190,39 +191,16 @@
     { font-size: 10px; font-weight: bold; white-space: nowrap; }
   .org-info-js_search-highlight
     { background-color: #ffff00; color: #000000; font-weight: bold; }
-  .org-svg { width: 90%; }
-  /*]]>*/-->
+  .org-svg { }
 </style>
-<script type="text/javascript">
-// @license magnet:?xt=urn:btih:e95b018ef3580986a04669f1b5879592219e2a7a&dn=public-domain.txt Public Domain
-<!--/*--><![CDATA[/*><!--*/
-     function CodeHighlightOn(elem, id)
-     {
-       var target = document.getElementById(id);
-       if(null != target) {
-         elem.classList.add("code-highlighted");
-         target.classList.add("code-highlighted");
-       }
-     }
-     function CodeHighlightOff(elem, id)
-     {
-       var target = document.getElementById(id);
-       if(null != target) {
-         elem.classList.remove("code-highlighted");
-         target.classList.remove("code-highlighted");
-       }
-     }
-    /*]]>*///-->
-// @license-end
-</script>
 </head>
 <body>
-<div id="content">
+<div id="content" class="content">
 <h1 class="title">Org Themes collection</h1>
 
 
 <div id="outline-container-org49cbd84" class="outline-2">
-<h2 id="org49cbd84"><span class="section-number-2">1</span> Introduction</h2>
+<h2 id="org49cbd84"><span class="section-number-2">1.</span> Introduction</h2>
 <div class="outline-text-2" id="text-1">
 <p>
 This is a collection of org html themes.<br />
@@ -254,7 +232,7 @@ The git repository with all files is here: <a href="https://gitlab.com/OlMon/org
 </div>
 
 <div id="outline-container-orgb75f03f" class="outline-3">
-<h3 id="orgb75f03f"><span class="section-number-3">1.1</span> Credits and apology</h3>
+<h3 id="orgb75f03f"><span class="section-number-3">1.1.</span> Credits and apology</h3>
 <div class="outline-text-3" id="text-1-1">
 <p>
 I used them mainly embedded, with custom created SETUPFILES.<br />
@@ -266,7 +244,7 @@ without your name. Please contact me, so I can credit you.<br />
 </div>
 
 <div id="outline-container-org13f0fb5" class="outline-3">
-<h3 id="org13f0fb5"><span class="section-number-3">1.2</span> Contributions</h3>
+<h3 id="org13f0fb5"><span class="section-number-3">1.2.</span> Contributions</h3>
 <div class="outline-text-3" id="text-1-2">
 <p>
 I am happy to include any theme.<br />
@@ -275,7 +253,7 @@ I am happy to include any theme.<br />
 </div>
 
 <div id="outline-container-orgb0c111f" class="outline-3">
-<h3 id="orgb0c111f"><span class="section-number-3">1.3</span> The list of Themes:</h3>
+<h3 id="orgb0c111f"><span class="section-number-3">1.3.</span> The list of Themes:</h3>
 <div class="outline-text-3" id="text-1-3">
 <table border="2" cellspacing="0" cellpadding="6" rules="groups" frame="hsides">
 
@@ -296,6 +274,10 @@ I am happy to include any theme.<br />
 <td class="org-left"><a href="darksun/darksun.html">darksun</a></td>
 </tr>
 
+<tr>
+<td class="org-left"><a href="gray/gray.html">gray</a></td>
+</tr>
+
 <tr>
 <td class="org-left"><a href="imagine_light/imagine_light.html">imagine light</a></td>
 </tr>
@@ -358,7 +340,7 @@ I am happy to include any theme.<br />
 
 
 <div id="outline-container-org87b5065" class="outline-3">
-<h3 id="org87b5065"><span class="section-number-3">1.4</span> Irony</h3>
+<h3 id="org87b5065"><span class="section-number-3">1.4.</span> Irony</h3>
 <div class="outline-text-3" id="text-1-4">
 <p>
 There is a reason that this page has no custom css.<br />
@@ -369,7 +351,7 @@ There is a reason that this page has no custom css.<br />
 </div>
 <div id="postamble" class="status">
 <p class="author">Author: Marco PawΕ‚owski (<a href="mailto:pawlowski.marco@gmail.com">pawlowski.marco@gmail.com</a>)</p>
-                                      <p class="update">Last Update: 2021-03-01 Mon 23:51</p>
+  				      <p class="update">Last Update: 2022-06-24 Fri 19:40</p>
 </div>
 </body>
 </html>
diff --git a/packages/org-themes/src/gray/gray.css b/packages/org-themes/src/gray/gray.css
new file mode 100644
index 0000000000000000000000000000000000000000..a0da868ca809e561de74b4a8acefaaeace175cf5
--- /dev/null
+++ b/packages/org-themes/src/gray/gray.css
@@ -0,0 +1,497 @@
+body {
+    font-family: Helvetica, Arial, sans-serif;
+    font-size: 16px;
+    line-height: 1.4;
+    color: #33333f;
+    }
+
+code {
+        font-family: "Inconsolata", "monospace";
+        font-size: 16px;
+    }
+
+p>code, li>code {
+        background-color: #eee;
+        padding: 0.25em;
+    }
+
+h1, h2, h3 {
+        font-family: "Roboto Slab", Helvetica, Arial, sans-serif;
+    }
+
+h2 {
+        border-bottom: 1px solid #f0c;
+        padding-bottom: 0.5em;
+        font-size: 1.75em;
+    }
+
+h3 {
+        margin-top: 2em;
+        font-size: 1.5em;
+    }
+
+h4 {
+        font-size: 1.25em;
+    }
+
+h5 {
+        font-size: 1em;
+    }
+
+h2 code, h3 code, h4 code, h5 code, td code {
+        font-family: inherit !important;
+        font-size: inherit !important;
+    }
+
+td code {
+        font-weight: bold;
+    }
+
+a:link, a:hover, a:visited {
+                         text-decoration: none;
+                         color: black;
+                     }
+
+a:link {
+        background: #ff8;
+    }
+
+a:visited {
+        color: #666;
+        background: #ffc;
+    }
+
+a:link:hover,
+a:visited:hover {
+        background: #ff0;
+    }
+
+a[href^="http"] {
+        background: #bff;
+    }
+
+a[href^="http"]:visited {
+        background: #dff;
+    }
+
+a[href^="http"]:link:hover,
+a[href^="http"]:visited:hover {
+        background: #0ff;
+    }
+
+a[href^="http"]:after {
+        content: "\21B3";
+        background: white;
+        padding-left: 0.2em;
+    }
+
+#meta {
+        margin-top: 2em;
+    }
+
+#table-of-contents a:link,
+#table-of-contents a:visited {
+        color: black;
+        background: transparent;
+    }
+
+#table-of-contents {
+        line-height: 1.2;
+    }
+#table-of-contents h2 {
+        border-bottom: 0;
+    }
+
+#table-of-contents ul {
+        list-style: none;
+        padding-left: 0.5em;
+        font-weight: normal;
+    }
+
+#table-of-contents div>ul>li {
+        margin-top: 1em;
+        font-weight: bold;
+    }
+
+#table-of-contents .tag {
+        display: none;
+    }
+
+#table-of-contents .todo,
+#table-of-contents .done {
+        font-size: 80%;
+    }
+
+#table-of-contents ol>li {
+        margin-top: 1em;
+    }
+
+table {
+        width: 100%;
+    }
+
+table, th, td {
+        border: 1px solid #666;
+    }
+
+th, td {
+        padding: 0.5em;
+        text-align: left;
+    }
+
+tbody tr:nth-child(odd) {
+        background-color: #eee;
+    }
+
+img {
+        max-width: 90%;
+    }
+
+div.notice {
+        position: relative;
+        margin: 0 1.2em;
+        padding: 0.25em 1em;
+        border-left: 4px solid;
+    }
+
+table + div.notice {
+        margin-top: 2em;
+    }
+
+div.notice a {
+        background: transparent !important;
+        border-bottom: 1px dotted;
+    }
+
+div.notice a[href^="http"]:after {
+        background: transparent !important;
+    }
+
+div.notice:before {
+        position: absolute;
+        top: 0;
+        right: 0;
+        padding: 0.25em 0.5em 0;
+        font-size: 60%;
+        border-bottom-left-radius: 0.5em;
+    }
+
+.notice-warning {
+        background: #fcc;
+        color: #600;
+    }
+
+.notice-example {
+        background: #def;
+        color: #069;
+    }
+
+.notice-info {
+        background: #efe;
+        color: #060;
+    }
+
+.notice-warning a {
+        color: #600;
+    }
+
+.notice-example a {
+        color: #069;
+    }
+
+.notice-info a {
+        color: #060;
+    }
+
+div.notice-warning:before {
+        content: "WARNING";
+        background: #c99;
+        color: #fcc;
+    }
+
+div.notice-example:before {
+        content: "EXAMPLE";
+        background: #abc;
+        color: #def;
+    }
+
+div.notice-info:before {
+        content: "INFO";
+        background: #9c9;
+        color: #efe;
+    }
+
+/* things inside the #+BEGIN_NOTE...#+END_NOTE block */
+div.NOTE a {
+        background: transparent !important;
+        border-bottom: 1px dotted;
+    }
+
+div.NOTE {
+        position: relative;
+        margin: 0 1.2em;
+        padding: 0.25em 1em;
+        border-left: 4px solid;
+        margin-top: 2em;
+        background: #efe;
+        color: #060;
+    }
+
+div.NOTE:before {
+        position: absolute;
+        top: 0;
+        right: 0;
+        padding: 0.25em 0.5em 0;
+        font-size: 60%;
+        border-bottom-left-radius: 0.5em;
+        content: "NOTE";
+        background: #9c9;
+        color: #efe;
+    }
+
+blockquote {
+        padding: 0px 10px 0px 10px;
+        border: 1px solid #ddd;
+        background: #eee;
+        box-shadow: 5px 5px 5px #eee;
+        border-radius: 2px;
+        line-height: 1.2em;
+    }
+
+pre {
+        font-family: "Inconsolata", "monospace";
+        font-size: 100%;
+        border: 0;
+        box-shadow: none;
+        overflow: auto;
+    }
+
+pre.example:before {
+        content: "EXAMPLE";
+        display: block;
+        border-bottom: 1px dotted;
+        margin-bottom: 1em;
+    }
+
+pre.example {
+        background: #fec;
+        color: #666;
+        font-size: 0.85em;
+    }
+
+pre {
+        background-color: #f8f8f8;
+        background-size: 8px 8px;
+        background-image: linear-gradient(135deg, transparent 25%, rgba(0, 0, 0, 0.02) 25%, rgba(0, 0, 0, 0.02) 50%, transparent 50%, transparent 75%, rgba(0, 0, 0, 0.02) 75%, rgba(0, 0, 0, 0.02));
+    }
+
+pre.src {
+        padding: 0.5em;
+    }
+
+pre.src:before {
+        display: block;
+        position: absolute;
+        background-color: #ccccd0;
+        top: 0;
+        right: 0;
+        padding: 0.25em 0.5em;
+        border-bottom-left-radius: 8px;
+        border: 0;
+        color: white;
+        font-size: 80%;
+    }
+
+pre.src-plantuml:before {
+        content: "UML";
+    }
+
+pre.src-javascript:before {
+        content: "JS";
+    }
+
+pre.src-clojure:before {
+        content: "CLJ";
+    }
+
+pre.src-c:before {
+        content: "C";
+    }
+
+pre.src-sh:before {
+        content: "Shell";
+    }
+
+pre.src-es:before {
+        content: "ES";
+    }
+
+span.org-string {
+        color: #f94;
+    }
+
+span.org-keyword {
+        color: #c07;
+    }
+
+span.org-variable-name {
+        color: #f04;
+    }
+
+span.org-clojure-keyword {
+        color: #09f;
+    }
+
+span.org-comment, span.org-comment-delimiter {
+        color: #999;
+    }
+
+span.org-rainbow-delimiters-depth-1, span.org-rainbow-delimiters-depth-5 {
+        color: #666;
+    }
+
+span.org-rainbow-delimiters-depth-2, span.org-rainbow-delimiters-depth-6 {
+        color: #888;
+    }
+
+span.org-rainbow-delimiters-depth-3, span.org-rainbow-delimiters-depth-7 {
+        color: #aaa;
+    }
+
+span.org-rainbow-delimiters-depth-4, span.org-rainbow-delimiters-depth-8 {
+        color: #ccc;
+    }
+
+div.figure {
+        font-size: 0.85em;
+    }
+
+.tag {
+        font-family: "Roboto Slab", Helvetica, Arial, sans-serif;
+        font-size: 11px;
+        font-weight: normal;
+        float: right;
+        margin-top: 1em;
+        background: transparent;
+    }
+
+.tag span {
+        background: #ccc;
+        padding: 0 0.5em;
+        border-radius: 0.2em;
+        color: white;
+    }
+
+.todo, .done {
+        font-family: "Roboto Slab", Helvetica, Arial, sans-serif;
+        font-weight: normal;
+        padding: 0 0.25em;
+        border-radius: 0.2em;
+    }
+
+.todo {
+        background: #f04;
+        color: white;
+    }
+
+.done {
+        background: #5f7;
+        color: white;
+    }
+
+@media screen {
+        h1.title {
+                text-align: left;
+                margin: 0.5em 0 1em 0;
+            }
+    
+        h2 {
+                margin-top: 3em;
+            }
+    
+        #table-of-contents {
+                position: fixed;
+                top: 0;
+                left: 0;
+                padding: 2em 0 2em 2em;
+                width: 290px;
+                height: 100vh;
+                font-size: 11px;
+                background: #eee;
+                overlow-x: hidden;
+                overlow-y: auto;
+            }
+    
+        #table-of-contents h2 {
+                margin-top: 0;
+            }
+    
+        #table-of-contents code {
+                font-size: 12px;
+            }
+    
+        div#content {
+                margin-left: 320px;
+                max-width: 1100px;
+            }
+        div#postamble {
+                margin-left: 320px;
+                max-width: 1100px;
+            }
+    }
+
+@media screen and (max-width: 1024px) {
+        html, body {
+                font-size: 14px;
+            }
+    
+        #table-of-contents {
+                display: none;
+            }
+    
+        h1.title {
+                margin-left: 0%;
+            }
+    
+        div#content {
+                margin-left: 5%;
+                max-width: 90%;
+            }
+        div#postamble {
+                margin-left: 5%;
+                max-width: 90%;
+            }
+    }
+
+@media print {
+    
+        body {
+                color: black;
+            }
+    
+        @page {
+                margin: 25mm;
+            }
+    
+        h2, h3 {
+                page-break-before: always;
+                margin-top: 0;
+            }
+    
+        table {
+                page-break-inside: avoid;
+            }
+    
+        a:visited {
+                color: black;
+                background: #ff8;
+            }
+    
+        a[href^="http"]:visited {
+            background: #bff;
+        }
+    
+        div.notice:before {
+            display: none;
+        }
+}
diff --git a/packages/org-themes/src/gray/gray.org b/packages/org-themes/src/gray/gray.org
new file mode 100644
index 0000000000000000000000000000000000000000..1b35cb3c34c5c6ee47666b65e06bdfa79f2ba548
--- /dev/null
+++ b/packages/org-themes/src/gray/gray.org
@@ -0,0 +1,132 @@
+#+SETUPFILE: ~/.emacs.d/org-styles/html/gray.theme
+#+TITLE: Gray Theme
+#+AUTHOR: Lee Hinman
+#+EMAIL: lee@writequit.org
+#+OPTIONS: ^:nil <:t
+
+* Plain Org
+Includes:
+- [X] CSS
+- [ ] JAVASCRIPT
+
+Available as:
+- [X] CSS FILE
+- [ ] JS FILE
+- [X] SETUPFILE
+
+* Lists
+** Todo List
+*** TODO First todo
+*** DONE First Done with Date
+CLOSED: [2021-02-18 Thu 10:12]
+*** TODO Scheduled
+SCHEDULED: <2021-02-18 Thu>
+*** TODO Deadline
+DEADLINE: <2021-02-18 Thu>
+*** TODO Date
+<2021-02-18 Thu>
+
+** Simple list
+- List item
+- List item
+- List item
+- List item
+
+** Sorted List
+1. List item
+2. List item
+3. List item
+4. List item
+
+** Checkbox
+- [ ] List item
+- [X] List item
+- [ ] List item
+- [X] List item
+
+
+* H1
+H1 Text
+** H2
+H2 Text
+*** H3
+H3 Text
+**** H4
+H4 Text
+***** H5
+H5 Text
+****** H6
+H6 Text
+******* H7
+H7 Text
+
+
+
+* Table
+
+|  a |  b |  c |  d |  e |  f |
+|----+----+----+----+----+----|
+|  0 |  1 |  2 |  3 |  4 |  5 |
+|  6 |  7 |  8 |  9 | 10 | 11 |
+| 12 | 13 | 14 | 15 | 16 | 17 |
+| 18 | 19 | 20 | 21 | 22 | 23 |
+| 24 | 25 | 26 | 27 | 28 | 29 |
+
+
+* Blocks
+** Center
+#+begin_center
+This is a center block
+#+end_center
+
+** Comment
+#+begin_comment
+This is a comment block
+#+end_comment
+
+** Example
+#+begin_example
+This is an example block
+#+end_example
+
+** Quote
+#+begin_quote
+This is a quote block
+#+end_quote
+
+** Verse
+#+begin_verse
+This is a verse block
+#+end_verse
+
+
+** Source Blocks
+*** Python
+#+begin_src python :results output :exports both
+for i in range(10):
+    print(i)
+#+end_src
+
+#+RESULTS:
+#+begin_example
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+#+end_example
+
+*** Elisp
+#+begin_src emacs-lisp :exports both
+(car '(a b c d))
+#+end_src
+
+#+RESULTS:
+: a
+
+
diff --git a/packages/org-themes/src/gray/gray.theme b/packages/org-themes/src/gray/gray.theme
new file mode 100644
index 0000000000000000000000000000000000000000..f5b0c39dfbe653f3f77e6cd7fc413ca8097cbd0f
--- /dev/null
+++ b/packages/org-themes/src/gray/gray.theme
@@ -0,0 +1,502 @@
+# Turn off default internal styles
+#+OPTIONS: html-style:nil
+
+#+HTML_HEAD: <style type="text/css">
+#+HTML_HEAD: body {
+#+HTML_HEAD:     font-family: Helvetica, Arial, sans-serif;
+#+HTML_HEAD:     font-size: 16px;
+#+HTML_HEAD:     line-height: 1.4;
+#+HTML_HEAD:     color: #33333f;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: code {
+#+HTML_HEAD:     font-family: "Inconsolata", "monospace";
+#+HTML_HEAD:     font-size: 16px;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: p>code, li>code {
+#+HTML_HEAD:     background-color: #eee;
+#+HTML_HEAD:     padding: 0.25em;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: h1, h2, h3 {
+#+HTML_HEAD:     font-family: "Roboto Slab", Helvetica, Arial, sans-serif;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: h2 {
+#+HTML_HEAD:     border-bottom: 1px solid #f0c;
+#+HTML_HEAD:     padding-bottom: 0.5em;
+#+HTML_HEAD:     font-size: 1.75em;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: h3 {
+#+HTML_HEAD:     margin-top: 2em;
+#+HTML_HEAD:     font-size: 1.5em;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: h4 {
+#+HTML_HEAD:     font-size: 1.25em;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: h5 {
+#+HTML_HEAD:     font-size: 1em;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: h2 code, h3 code, h4 code, h5 code, td code {
+#+HTML_HEAD:     font-family: inherit !important;
+#+HTML_HEAD:     font-size: inherit !important;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: td code {
+#+HTML_HEAD:     font-weight: bold;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: a:link, a:hover, a:visited {
+#+HTML_HEAD:                      text-decoration: none;
+#+HTML_HEAD:                      color: black;
+#+HTML_HEAD:                  }
+#+HTML_HEAD: 
+#+HTML_HEAD: a:link {
+#+HTML_HEAD:     background: #ff8;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: a:visited {
+#+HTML_HEAD:     color: #666;
+#+HTML_HEAD:     background: #ffc;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: a:link:hover,
+#+HTML_HEAD: a:visited:hover {
+#+HTML_HEAD:     background: #ff0;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: a[href^="http"] {
+#+HTML_HEAD:     background: #bff;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: a[href^="http"]:visited {
+#+HTML_HEAD:     background: #dff;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: a[href^="http"]:link:hover,
+#+HTML_HEAD: a[href^="http"]:visited:hover {
+#+HTML_HEAD:     background: #0ff;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: a[href^="http"]:after {
+#+HTML_HEAD:     content: "\21B3";
+#+HTML_HEAD:     background: white;
+#+HTML_HEAD:     padding-left: 0.2em;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: #meta {
+#+HTML_HEAD:     margin-top: 2em;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: #table-of-contents a:link,
+#+HTML_HEAD: #table-of-contents a:visited {
+#+HTML_HEAD:     color: black;
+#+HTML_HEAD:     background: transparent;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: #table-of-contents {
+#+HTML_HEAD:     line-height: 1.2;
+#+HTML_HEAD: }
+#+HTML_HEAD: #table-of-contents h2 {
+#+HTML_HEAD:     border-bottom: 0;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: #table-of-contents ul {
+#+HTML_HEAD:     list-style: none;
+#+HTML_HEAD:     padding-left: 0.5em;
+#+HTML_HEAD:     font-weight: normal;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: #table-of-contents div>ul>li {
+#+HTML_HEAD:     margin-top: 1em;
+#+HTML_HEAD:     font-weight: bold;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: #table-of-contents .tag {
+#+HTML_HEAD:     display: none;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: #table-of-contents .todo,
+#+HTML_HEAD: #table-of-contents .done {
+#+HTML_HEAD:     font-size: 80%;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: #table-of-contents ol>li {
+#+HTML_HEAD:     margin-top: 1em;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: table {
+#+HTML_HEAD:     width: 100%;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: table, th, td {
+#+HTML_HEAD:     border: 1px solid #666;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: th, td {
+#+HTML_HEAD:     padding: 0.5em;
+#+HTML_HEAD:     text-align: left;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: tbody tr:nth-child(odd) {
+#+HTML_HEAD:     background-color: #eee;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: img {
+#+HTML_HEAD:     max-width: 90%;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: div.notice {
+#+HTML_HEAD:     position: relative;
+#+HTML_HEAD:     margin: 0 1.2em;
+#+HTML_HEAD:     padding: 0.25em 1em;
+#+HTML_HEAD:     border-left: 4px solid;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: table + div.notice {
+#+HTML_HEAD:     margin-top: 2em;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: div.notice a {
+#+HTML_HEAD:     background: transparent !important;
+#+HTML_HEAD:     border-bottom: 1px dotted;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: div.notice a[href^="http"]:after {
+#+HTML_HEAD:     background: transparent !important;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: div.notice:before {
+#+HTML_HEAD:     position: absolute;
+#+HTML_HEAD:     top: 0;
+#+HTML_HEAD:     right: 0;
+#+HTML_HEAD:     padding: 0.25em 0.5em 0;
+#+HTML_HEAD:     font-size: 60%;
+#+HTML_HEAD:     border-bottom-left-radius: 0.5em;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: .notice-warning {
+#+HTML_HEAD:     background: #fcc;
+#+HTML_HEAD:     color: #600;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: .notice-example {
+#+HTML_HEAD:     background: #def;
+#+HTML_HEAD:     color: #069;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: .notice-info {
+#+HTML_HEAD:     background: #efe;
+#+HTML_HEAD:     color: #060;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: .notice-warning a {
+#+HTML_HEAD:     color: #600;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: .notice-example a {
+#+HTML_HEAD:     color: #069;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: .notice-info a {
+#+HTML_HEAD:     color: #060;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: div.notice-warning:before {
+#+HTML_HEAD:     content: "WARNING";
+#+HTML_HEAD:     background: #c99;
+#+HTML_HEAD:     color: #fcc;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: div.notice-example:before {
+#+HTML_HEAD:     content: "EXAMPLE";
+#+HTML_HEAD:     background: #abc;
+#+HTML_HEAD:     color: #def;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: div.notice-info:before {
+#+HTML_HEAD:     content: "INFO";
+#+HTML_HEAD:     background: #9c9;
+#+HTML_HEAD:     color: #efe;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: /* things inside the #+BEGIN_NOTE...#+END_NOTE block */
+#+HTML_HEAD: div.NOTE a {
+#+HTML_HEAD:     background: transparent !important;
+#+HTML_HEAD:     border-bottom: 1px dotted;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: div.NOTE {
+#+HTML_HEAD:     position: relative;
+#+HTML_HEAD:     margin: 0 1.2em;
+#+HTML_HEAD:     padding: 0.25em 1em;
+#+HTML_HEAD:     border-left: 4px solid;
+#+HTML_HEAD:     margin-top: 2em;
+#+HTML_HEAD:     background: #efe;
+#+HTML_HEAD:     color: #060;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: div.NOTE:before {
+#+HTML_HEAD:     position: absolute;
+#+HTML_HEAD:     top: 0;
+#+HTML_HEAD:     right: 0;
+#+HTML_HEAD:     padding: 0.25em 0.5em 0;
+#+HTML_HEAD:     font-size: 60%;
+#+HTML_HEAD:     border-bottom-left-radius: 0.5em;
+#+HTML_HEAD:     content: "NOTE";
+#+HTML_HEAD:     background: #9c9;
+#+HTML_HEAD:     color: #efe;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: blockquote {
+#+HTML_HEAD:     padding: 0px 10px 0px 10px;
+#+HTML_HEAD:     border: 1px solid #ddd;
+#+HTML_HEAD:     background: #eee;
+#+HTML_HEAD:     box-shadow: 5px 5px 5px #eee;
+#+HTML_HEAD:     border-radius: 2px;
+#+HTML_HEAD:     line-height: 1.2em;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: pre {
+#+HTML_HEAD:     font-family: "Inconsolata", "monospace";
+#+HTML_HEAD:     font-size: 100%;
+#+HTML_HEAD:     border: 0;
+#+HTML_HEAD:     box-shadow: none;
+#+HTML_HEAD:     overflow: auto;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: pre.example:before {
+#+HTML_HEAD:     content: "EXAMPLE";
+#+HTML_HEAD:     display: block;
+#+HTML_HEAD:     border-bottom: 1px dotted;
+#+HTML_HEAD:     margin-bottom: 1em;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: pre.example {
+#+HTML_HEAD:     background: #fec;
+#+HTML_HEAD:     color: #666;
+#+HTML_HEAD:     font-size: 0.85em;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: pre {
+#+HTML_HEAD:     background-color: #f8f8f8;
+#+HTML_HEAD:     background-size: 8px 8px;
+#+HTML_HEAD:     background-image: linear-gradient(135deg, transparent 25%, rgba(0, 0, 0, 0.02) 25%, rgba(0, 0, 0, 0.02) 50%, transparent 50%, transparent 75%, rgba(0, 0, 0, 0.02) 75%, rgba(0, 0, 0, 0.02));
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: pre.src {
+#+HTML_HEAD:     padding: 0.5em;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: pre.src:before {
+#+HTML_HEAD:     display: block;
+#+HTML_HEAD:     position: absolute;
+#+HTML_HEAD:     background-color: #ccccd0;
+#+HTML_HEAD:     top: 0;
+#+HTML_HEAD:     right: 0;
+#+HTML_HEAD:     padding: 0.25em 0.5em;
+#+HTML_HEAD:     border-bottom-left-radius: 8px;
+#+HTML_HEAD:     border: 0;
+#+HTML_HEAD:     color: white;
+#+HTML_HEAD:     font-size: 80%;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: pre.src-plantuml:before {
+#+HTML_HEAD:     content: "UML";
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: pre.src-javascript:before {
+#+HTML_HEAD:     content: "JS";
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: pre.src-clojure:before {
+#+HTML_HEAD:     content: "CLJ";
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: pre.src-c:before {
+#+HTML_HEAD:     content: "C";
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: pre.src-sh:before {
+#+HTML_HEAD:     content: "Shell";
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: pre.src-es:before {
+#+HTML_HEAD:     content: "ES";
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: span.org-string {
+#+HTML_HEAD:     color: #f94;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: span.org-keyword {
+#+HTML_HEAD:     color: #c07;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: span.org-variable-name {
+#+HTML_HEAD:     color: #f04;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: span.org-clojure-keyword {
+#+HTML_HEAD:     color: #09f;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: span.org-comment, span.org-comment-delimiter {
+#+HTML_HEAD:     color: #999;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: span.org-rainbow-delimiters-depth-1, span.org-rainbow-delimiters-depth-5 {
+#+HTML_HEAD:     color: #666;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: span.org-rainbow-delimiters-depth-2, span.org-rainbow-delimiters-depth-6 {
+#+HTML_HEAD:     color: #888;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: span.org-rainbow-delimiters-depth-3, span.org-rainbow-delimiters-depth-7 {
+#+HTML_HEAD:     color: #aaa;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: span.org-rainbow-delimiters-depth-4, span.org-rainbow-delimiters-depth-8 {
+#+HTML_HEAD:     color: #ccc;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: div.figure {
+#+HTML_HEAD:     font-size: 0.85em;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: .tag {
+#+HTML_HEAD:     font-family: "Roboto Slab", Helvetica, Arial, sans-serif;
+#+HTML_HEAD:     font-size: 11px;
+#+HTML_HEAD:     font-weight: normal;
+#+HTML_HEAD:     float: right;
+#+HTML_HEAD:     margin-top: 1em;
+#+HTML_HEAD:     background: transparent;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: .tag span {
+#+HTML_HEAD:     background: #ccc;
+#+HTML_HEAD:     padding: 0 0.5em;
+#+HTML_HEAD:     border-radius: 0.2em;
+#+HTML_HEAD:     color: white;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: .todo, .done {
+#+HTML_HEAD:     font-family: "Roboto Slab", Helvetica, Arial, sans-serif;
+#+HTML_HEAD:     font-weight: normal;
+#+HTML_HEAD:     padding: 0 0.25em;
+#+HTML_HEAD:     border-radius: 0.2em;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: .todo {
+#+HTML_HEAD:     background: #f04;
+#+HTML_HEAD:     color: white;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: .done {
+#+HTML_HEAD:     background: #5f7;
+#+HTML_HEAD:     color: white;
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: @media screen {
+#+HTML_HEAD:     h1.title {
+#+HTML_HEAD:         text-align: left;
+#+HTML_HEAD:         margin: 0.5em 0 1em 0;
+#+HTML_HEAD:     }
+#+HTML_HEAD: 
+#+HTML_HEAD:     h2 {
+#+HTML_HEAD:         margin-top: 3em;
+#+HTML_HEAD:     }
+#+HTML_HEAD: 
+#+HTML_HEAD:     #table-of-contents {
+#+HTML_HEAD:         position: fixed;
+#+HTML_HEAD:         top: 0;
+#+HTML_HEAD:         left: 0;
+#+HTML_HEAD:         padding: 2em 0 2em 2em;
+#+HTML_HEAD:         width: 290px;
+#+HTML_HEAD:         height: 100vh;
+#+HTML_HEAD:         font-size: 11px;
+#+HTML_HEAD:         background: #eee;
+#+HTML_HEAD:         overlow-x: hidden;
+#+HTML_HEAD:         overlow-y: auto;
+#+HTML_HEAD:     }
+#+HTML_HEAD: 
+#+HTML_HEAD:     #table-of-contents h2 {
+#+HTML_HEAD:         margin-top: 0;
+#+HTML_HEAD:     }
+#+HTML_HEAD: 
+#+HTML_HEAD:     #table-of-contents code {
+#+HTML_HEAD:         font-size: 12px;
+#+HTML_HEAD:     }
+#+HTML_HEAD: 
+#+HTML_HEAD:     div#content {
+#+HTML_HEAD:         margin-left: 320px;
+#+HTML_HEAD:         max-width: 1100px;
+#+HTML_HEAD:     }
+#+HTML_HEAD:     div#postamble {
+#+HTML_HEAD:         margin-left: 320px;
+#+HTML_HEAD:         max-width: 1100px;
+#+HTML_HEAD:     }
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: @media screen and (max-width: 1024px) {
+#+HTML_HEAD:     html, body {
+#+HTML_HEAD:         font-size: 14px;
+#+HTML_HEAD:     }
+#+HTML_HEAD: 
+#+HTML_HEAD:     #table-of-contents {
+#+HTML_HEAD:         display: none;
+#+HTML_HEAD:     }
+#+HTML_HEAD: 
+#+HTML_HEAD:     h1.title {
+#+HTML_HEAD:         margin-left: 0%;
+#+HTML_HEAD:     }
+#+HTML_HEAD: 
+#+HTML_HEAD:     div#content {
+#+HTML_HEAD:         margin-left: 5%;
+#+HTML_HEAD:         max-width: 90%;
+#+HTML_HEAD:     }
+#+HTML_HEAD:     div#postamble {
+#+HTML_HEAD:         margin-left: 5%;
+#+HTML_HEAD:         max-width: 90%;
+#+HTML_HEAD:     }
+#+HTML_HEAD: }
+#+HTML_HEAD: 
+#+HTML_HEAD: @media print {
+#+HTML_HEAD: 
+#+HTML_HEAD:     body {
+#+HTML_HEAD:         color: black;
+#+HTML_HEAD:     }
+#+HTML_HEAD: 
+#+HTML_HEAD:     @page {
+#+HTML_HEAD:         margin: 25mm;
+#+HTML_HEAD:     }
+#+HTML_HEAD: 
+#+HTML_HEAD:     h2, h3 {
+#+HTML_HEAD:         page-break-before: always;
+#+HTML_HEAD:         margin-top: 0;
+#+HTML_HEAD:     }
+#+HTML_HEAD: 
+#+HTML_HEAD:     table {
+#+HTML_HEAD:         page-break-inside: avoid;
+#+HTML_HEAD:     }
+#+HTML_HEAD: 
+#+HTML_HEAD:     a:visited {
+#+HTML_HEAD:         color: black;
+#+HTML_HEAD:         background: #ff8;
+#+HTML_HEAD:     }
+#+HTML_HEAD: 
+#+HTML_HEAD:     a[href^="http"]:visited {
+#+HTML_HEAD:         background: #bff;
+#+HTML_HEAD:     }
+#+HTML_HEAD: 
+#+HTML_HEAD:     div.notice:before {
+#+HTML_HEAD:         display: none;
+#+HTML_HEAD:     }
+#+HTML_HEAD: }
+#+HTML_HEAD: </style>
\ No newline at end of file
diff --git a/packages/org-themes/src/index.org b/packages/org-themes/src/index.org
index 25a3511ee261187b67669ba00f5ebaf1e2249b7b..65829b94f3da557151ee7d1ec5c2ee2fd35028f1 100644
--- a/packages/org-themes/src/index.org
+++ b/packages/org-themes/src/index.org
@@ -16,7 +16,7 @@ inside =style= and =script= tags.
 
 Include it by using:
 #+begin_src emacs-lisp
-#+SETUPFILE: <path to .theme file>
+,#+SETUPFILE: <path to .theme file>
 #+end_src
   
 Every theme is rendered with the same org file. A plane version is also included.
@@ -33,7 +33,7 @@ I am happy to include any theme.
 
 ** The list of Themes:
 #+NAME: Themes list
-#+begin_src emacs-lisp :exports results
+#+begin_src emacs-lisp :exports none
 ;; https://kitchingroup.cheme.cmu.edu/blog/2014/03/23/Make-a-list-of-org-files-in-all-the-subdirectories-of-the-current-working-directory/
 (defun os-walk (root)
   (let ((files '()) ;empty list to store results
@@ -61,17 +61,15 @@ I am happy to include any theme.
 
 (mapcar 
  (lambda (x) (princ (format "[[%s][%s]]\n"  x (replace-regexp-in-string "\.org$" "" (replace-regexp-in-string "_" " " (replace-regexp-in-string ".*/" "" (file-relative-name x ".")))))))
- (remove-if-not 
+ (cl-remove-if-not 
   (lambda (x) (and (string= (file-name-extension x) "org") (not (string-match "example" x)) (not (string-match "index" x))))
   (os-walk "")))
 #+end_src
 
-
-
-#+RESULTS: Themes list
 | [[file:/home/olmon/Workplace/Org/Themes/src/bigblow_inline/bigblow.org][bigblow]]           |
 | [[file:/home/olmon/Workplace/Org/Themes/src/comfy_inline/comfy_inline.org][comfy inline]]      |
 | [[file:/home/olmon/Workplace/Org/Themes/src/darksun/darksun.org][darksun]]           |
+| [[file:/home/olmon/Workplace/Org/Themes/src/gray/gray.org][gray]]              |
 | [[file:/home/olmon/Workplace/Org/Themes/src/imagine_light/imagine_light.org][imagine light]]     |
 | [[file:/home/olmon/Workplace/Org/Themes/src/latexcss/latexcss.org][latexcss]]          |
 | [[file:/home/olmon/Workplace/Org/Themes/src/plain/plain.org][plain]]             |
diff --git a/src/language/utils/PugsFunctionAdapter.hpp b/src/language/utils/PugsFunctionAdapter.hpp
index a4d1f76bd45285294008bfbaa24b6172a2b0b774..b4a9de405dfbaeee2c55543678db3d7096785ecf 100644
--- a/src/language/utils/PugsFunctionAdapter.hpp
+++ b/src/language/utils/PugsFunctionAdapter.hpp
@@ -128,7 +128,7 @@ class PugsFunctionAdapter<OutputType(InputType...)>
   [[nodiscard]] PUGS_INLINE static auto
   getContextList(const ASTNode& expression)
   {
-    SmallArray<ExecutionPolicy> context_list(Kokkos::DefaultExecutionSpace::impl_thread_pool_size());
+    SmallArray<ExecutionPolicy> context_list(Kokkos::HostSpace::execution_space().impl_thread_pool_size());
     auto& context = expression.m_symbol_table->context();
 
     for (size_t i = 0; i < context_list.size(); ++i) {
@@ -187,7 +187,7 @@ class PugsFunctionAdapter<OutputType(InputType...)>
           };
         } else {
           // If this point is reached must be a 0 vector
-          return [](DataVariant &&) -> OutputType { return OutputType{ZeroType{}}; };
+          return [](DataVariant&&) -> OutputType { return OutputType{ZeroType{}}; };
         }
       }
       case ASTNodeDataType::double_t: {
@@ -242,7 +242,7 @@ class PugsFunctionAdapter<OutputType(InputType...)>
           };
         } else {
           // If this point is reached must be a 0 matrix
-          return [](DataVariant &&) -> OutputType { return OutputType{ZeroType{}}; };
+          return [](DataVariant&&) -> OutputType { return OutputType{ZeroType{}}; };
         }
       }
       case ASTNodeDataType::double_t: {